[Bugfix][CI] fix typos (#34934)

Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Bugfix][CI] fix typos (#34934)
Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
6a895197 · Jiayi Yan · GitHub · 8c760b6a · 6a895197 · 6a895197
Unverified Commit 6a895197 authored Mar 06, 2026 by Jiayi Yan Committed by GitHub Mar 05, 2026
20 changed files
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -72,7 +72,7 @@ obj_json="objects.json"
 aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
 mkdir -p "$INDICES_OUTPUT_DIR"
-# call script to generate indicies for all existing wheels
+# call script to generate indices for all existing wheels
 # these indices have relative paths that could work as long as it is next to the wheel directory in s3
 # i.e., the wheels are always in s3://vllm-wheels/<commit>/
 # and indices can be placed in /<commit>/, or /nightly/, or /<version>/

--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -467,7 +467,7 @@ steps:
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-# TODO: Add the "V1 Test attetion (MI300)" test group
+# TODO: Add the "V1 Test attention (MI300)" test group
 - label: V1 Test attention (H100) # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
@@ -2174,7 +2174,7 @@ steps:
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-# TODO: Add the "V1 Test attetion (MI300)" test group
+# TODO: Add the "V1 Test attention (MI300)" test group
 - label: V1 Test attention (H100) # 10min
  mirror_hardwares: [amdexperimental]

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
    args: [--output-format, github, --fix]
  - id: ruff-format
 - repo: https://github.com/crate-ci/typos
-  rev: v1.38.1
+  rev: v1.43.5
  hooks:
  - id: typos
    args: [--force-exclude]

--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
        max_kv_len = max(r.kv_len for r in requests) if requests else 0
        return (batch_size, max_q_len, max_kv_len)
    except Exception:
-        # Fallback for unparseable specs
+        # Fallback for unparsable specs
        return (0, 0, 0)

--- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
    # reference output
    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
-    # test ouptut
+    # test output
    out_q, out_s = output_from_impl(
        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
    )

--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
      const int64_t block_size, const int64_t block_size_stride) {
    // For AMX 2D tiles, size of each line is 64 bytes
    constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
-    // For AMX B martix, N always is 16
+    // For AMX B matrix, N always is 16
    constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
    constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
    // For now suppose block_size is divisible by amx_tile_column_num

--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -4,7 +4,7 @@
 #include <torch/library.h>
-// Note: overwrite the external defination for sharing same name between
+// Note: overwrite the external definition for sharing same name between
 // libraries use different ISAs.
 #define TORCH_EXTENSION_NAME _C

--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
  int32_t const block_ids_size = sorted_ids_size / block_size;
  int32_t const SENTINEL =
      num_batches * max_tokens_per_batch;  // To denote invalid entries.
-  // Intialize sorted_ids
+  // Initialize sorted_ids
  for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
    sorted_ids[i] = SENTINEL;
  }
-  // Intialize expert_ids with -1
+  // Initialize expert_ids with -1
  for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
    block_ids[i] = -1;
  }

--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
      if (!lane_id) {
        // Store scales.
        if constexpr (std::is_same<scale_t, uint8_t>::value) {
-          // Packed UE8MO format. Remove Mantissa.
+          // Packed UE8M0 format. Remove Mantissa.
          *y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
          bool const jump_pack = (current_group_id + 1) % 4 == 0;

--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -1476,7 +1476,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  #endif
    // B[] staging is cooperative across GrpsShrB, so sync here before reading
-    // back. This wait is currently inserted by compiler, but not gauranteed.
+    // back. This wait is currently inserted by compiler, but not guaranteed.
    asm volatile("s_waitcnt 0");
    __syncthreads();

--- a/docs/design/cuda_graphs.md
+++ b/docs/design/cuda_graphs.md
@@ -98,7 +98,7 @@ The goal of this structure is to uniquely identify a (padded) batch with minimal
 ### `CudagraphDispatcher`
-The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWarpper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
+The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWrapper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
 The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.

--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@@ -47,7 +47,7 @@ The TopK Weight Application and Reduction components happen right after the Unpe
 Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
 `FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
-The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
+The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
 * `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself.
 * `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction.

--- a/docs/design/logits_processors.md
+++ b/docs/design/logits_processors.md
@@ -352,7 +352,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests,
        (s, d, UNIDIRECTIONAL or SWAP)
        ```
-    * If the Move specifies `UNIDRECTIONAL`:
+    * If the Move specifies `UNIDIRECTIONAL`:
        * The request at index `s` is moved to index `d`; index `s` becomes an empty slot

--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -141,7 +141,7 @@ Every plugin has three parts:
    - triton ops
      Custom way doesn't work for triton ops now.
-7. (optional) Implement other plugable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
+7. (optional) Implement other pluggable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
 ## Compatibility Guarantee

--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -641,7 +641,7 @@ Then you obtain the sparse embeddings like this:
 curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
     "model": "BAAI/bge-m3",
     "task": "token_classify",
-     "input": ["What is BGE M3?", "Defination of BM25"]
+     "input": ["What is BGE M3?", "Definition of BM25"]
 }'
 ```
@@ -657,7 +657,7 @@ You can obtain the colbert embeddings like this:
 curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
     "model": "BAAI/bge-m3",
     "task": "token_embed",
-     "input": ["What is BGE M3?", "Defination of BM25"]
+     "input": ["What is BGE M3?", "Definition of BM25"]
 }'
 ```

--- a/examples/online_serving/dashboards/grafana/query_statistics.json
+++ b/examples/online_serving/dashboards/grafana/query_statistics.json
@@ -349,7 +349,7 @@
        "defaults": {
          "color": { "mode": "thresholds" },
          "mappings": [
-            { "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
+            { "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
          ],
          "thresholds": {
            "mode": "absolute",

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -124,193 +124,54 @@ python = "./.venv"
 [tool.typos.files]
 # these files may be written in non english words
-extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
+extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", "tests/tokenizers_/*",
-    "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
+    "benchmarks/sonnet.txt", "tests/lora/data/*", "examples/pooling/token_embed/*", "build/*",
-    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", 
+    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/test_transcription_validation.py",
-    "docs/governance/process.md"]
+    "docs/governance/process.md", "tests/v1/engine/test_fast_incdec_prefix_err.py", ".git/*"]
-ignore-hidden = true
+ignore-hidden = false
-ignore-files = true
-ignore-dot = true
-ignore-vcs = true
-ignore-global = true
-ignore-parent = true
 [tool.typos.default]
-binary = false
+extend-ignore-identifiers-re = [".*[Uu][Ee][0-9][Mm][0-9].*"]
-check-filename = false
-check-file = true
-unicode = true
-ignore-hex = true
-identifier-leading-digits = false
-locale = "en"
-extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
-    ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
-     ".*[Tt]h[rR].*"]
-extend-ignore-words-re = []
-extend-ignore-re = []
 [tool.typos.default.extend-identifiers]
 bbc5b7ede = "bbc5b7ede"
-womens_doubles = "womens_doubles"
-v_2nd = "v_2nd"
-# splitted_input = "splitted_input"
 NOOPs = "NOOPs"
-typ = "typ"
 nin_shortcut = "nin_shortcut"
-UperNetDecoder = "UperNetDecoder"
-subtile = "subtile"
 cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
-SFOuput = "SFOuput"
-# huggingface transformers repo uses these words
 depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
-DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
+pard_token = "pard_token"
-depthwise_seperable_CNN = "depthwise_seperable_CNN"
+ptd_token_id = "ptd_token_id"
+ser_de = "ser_de"
+shared_memory_per_block_optin = "shared_memory_per_block_optin"
+FoPE = "FoPE"
+k_ot = "k_ot"
+view_seperator = "view_seperator"
+inverse_std_variences = "inverse_std_variences"
 [tool.typos.default.extend-words]
 iy = "iy"
-tendencias = "tendencias"
 indx = "indx"
 # intel cpu features
 tme = "tme"
 dout = "dout"
 Pn = "Pn"
 arange = "arange"
+thw = "thw"
+subtile = "subtile"
+HSA = "HSA"
+setp = "setp"
+CPY = "CPY"
+thr = "thr"
+Thr = "Thr"
 PARD = "PARD"
 pard = "pard"
 AKS = "AKS"
-[tool.typos.type.py]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.py.extend-identifiers]
-arange = "arange"
-NDArray = "NDArray"
-EOFError = "EOFError"
-fo = "fo"
 ba = "ba"
+fo = "fo"
-[tool.typos.type.py.extend-words]
-ba = "ba"
-nd = "nd"
-[tool.typos.type.cpp]
-extend-glob = ["*.cu"]
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.cpp.extend-identifiers]
-countr_one = "countr_one"
-k_ot = "k_ot"
-ot = "ot"
-[tool.typos.type.cpp.extend-words]
-[tool.typos.type.rust]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.rust.extend-identifiers]
-flate2 = "flate2"
-[tool.typos.type.rust.extend-words]
-ser = "ser"
-[tool.typos.type.lock]
-extend-glob = []
-check-file = false
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.lock.extend-identifiers]
-[tool.typos.type.lock.extend-words]
-[tool.typos.type.jl]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.jl.extend-identifiers]
-[tool.typos.type.jl.extend-words]
-modul = "modul"
-egals = "egals"
-usig = "usig"
-egal = "egal"
-[tool.typos.type.go]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.go.extend-identifiers]
-flate = "flate"
-[tool.typos.type.go.extend-words]
-[tool.typos.type.css]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.css.extend-identifiers]
 nd = "nd"
+eles = "eles"
-[tool.typos.type.css.extend-words]
+datas = "datas"
-[tool.typos.type.man]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.man.extend-identifiers]
-Nd = "Nd"
-[tool.typos.type.man.extend-words]
-[tool.typos.type.cert]
-extend-glob = []
-check-file = false
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.cert.extend-identifiers]
-[tool.typos.type.cert.extend-words]
-[tool.typos.type.sh]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.sh.extend-identifiers]
-ot = "ot"
-[tool.typos.type.sh.extend-words]
-[tool.typos.type.vimscript]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-[tool.typos.type.vimscript.extend-identifiers]
-windo = "windo"
-[tool.typos.type.vimscript.extend-words]
 [tool.uv]
 no-build-isolation-package = ["torch"]
--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
        expected_num_backend_compilations = 4
    # A has support_torch_compile but enable_if fn returns False
-    # enalbe_if will be True for B, so we expect mod1 and mod2
+    # enable_if will be True for B, so we expect mod1 and mod2
    # to be compiled
    with compilation_counter.expect(
        num_graphs_seen=2,

--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
            f"Expected {expected1}, got {result1}"
        )
-        # Second call should triger another compilation
+        # Second call should trigger another compilation
        x2 = torch.tensor([1, 2, 3])
        result2 = wrapper(x2)
        expected2 = torch.tensor([100, 200, 300])

--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -444,7 +444,7 @@ def ref_multi_query_kv_attention(
 @pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention])
-def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
+def test_num_heads_not_divisible_by_num_kv_heads(attention_cls: type) -> None:
    head_size = 64
    scale = float(1.0 / (head_size**0.5))
    num_heads = 16