[Bugfix][CI] fix typos (#34934)

Signed-off-by: 1195343015 <1195343015@qq.com> Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Bugfix][CI] fix typos (#34934)
Signed-off-by: 1195343015 <1195343015@qq.com> Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
6a895197 · Jiayi Yan · GitHub · 8c760b6a · 6a895197 · 6a895197
Unverified Commit 6a895197 authored Mar 06, 2026 by Jiayi Yan Committed by GitHub Mar 05, 2026
18 changed files
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -916,7 +916,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
        self, max_pixels: int | None = None
    ) -> ImageSize:
        # NOTE: Simply processing a huge size with _get_vision_info might not give a
-        # size that maximizes the number of featrues, i.e., the number of (merged)
+        # size that maximizes the number of features, i.e., the number of (merged)
        # patches. This is because the number of patches limits the allowed aspect
        # ratios. For example, suppose the maximum number of patches is 1280. A square
        # image cannot be broken down into 1280 patches, so feeding a giant square image

--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -459,14 +459,14 @@ class Step3VLProcessor:
            image_inputs = {}
            text_inputs = self.tokenizer(text)
        else:
-            splitted_images_data = self._split_images(images)
+            split_images_data = self._split_images(images)
            pixel_values_lst = []
            patch_pixel_values_lst = []
            patch_newline_mask_lst = []
            image_repl_str_lst = []
            image_repl_ids_lst = []
            num_patches = []
-            for raw_img, img_patches, patch_newline_mask in splitted_images_data:
+            for raw_img, img_patches, patch_newline_mask in split_images_data:
                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
                if len(img_patches) > 0:

--- a/vllm/model_executor/models/step3p5.py
+++ b/vllm/model_executor/models/step3p5.py
@@ -353,7 +353,7 @@ class FusedMoEBlock(nn.Module):
        if swiglu_limit not in (None, 0):
            swiglu_limit = float(swiglu_limit)
            assert swiglu_limit == 7.0, (
-                "Swiglu limit in fused moe block only suport 7.0 now."
+                "Swiglu limit in fused moe block only support 7.0 now."
            )
            activation = "swiglustep"
            logger.debug(

--- a/vllm/reasoning/ernie45_reasoning_parser.py
+++ b/vllm/reasoning/ernie45_reasoning_parser.py
@@ -18,7 +18,7 @@ logger = init_logger(__name__)
 class Ernie45ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for Ernie45 thinking model.
-    The Ernie45 thinking model ouput format is
+    The Ernie45 thinking model output format is
        abc\n</think>\n\n<response>\ndef\n</response>\n
    or  abc\n</think>\ndef
    """
@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
-        The Ernie45 thinking model ouput format is
+        The Ernie45 thinking model output format is
            abc\n</think>\n\n<response>\ndef\n</response>\n
        or  abc\n</think>\ndef
        - 'abc' goes to reasoning
@@ -148,7 +148,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.
-        The Ernie45 thinking model ouput format is
+        The Ernie45 thinking model output format is
            abc\n</think>\n\n\n<response>\ndef\n</response>\n
        or  abc\n</think>\ndef
        - 'abc' goes to reasoning

--- a/vllm/renderers/hf.py
+++ b/vllm/renderers/hf.py
@@ -564,7 +564,7 @@ def replace_vision_chunk_video_placeholder(
    mm_data: "MultiModalDataDict",
    video_placeholder: str | None,
 ) -> str | list[int]:
-    # get video placehoder, replace it with runtime video-chunk prompts
+    # get video placeholder, replace it with runtime video-chunk prompts
    if video_placeholder and isinstance(prompt_raw, str):
        video_prompts = build_video_prompts_from_mm_data(mm_data)

--- a/vllm/renderers/inputs/preprocess.py
+++ b/vllm/renderers/inputs/preprocess.py
 """
-Schemas and utilites for preprocessing inputs.
+Schemas and utilities for preprocessing inputs.
 """
 # SPDX-License-Identifier: Apache-2.0

--- a/vllm/renderers/inputs/tokenize.py
+++ b/vllm/renderers/inputs/tokenize.py
 """
-Schemas and utilites for tokenization inputs.
+Schemas and utilities for tokenization inputs.
 """
 # SPDX-License-Identifier: Apache-2.0

--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -169,7 +169,7 @@ def _prepare_apply_chat_template_tools_and_messages(
                    tool.pop(tool_key)
                    logger.warning_once(
                        f"'{tool_key}' is not supported by mistral-common for tools. "
-                        "It has been poped from the tool definition."
+                        "It has been popped from the tool definition."
                    )
                if tool["type"] == "function":
                    function_keys = list(tool["function"].keys())
@@ -178,7 +178,7 @@ def _prepare_apply_chat_template_tools_and_messages(
                            tool["function"].pop(function_key)
                            logger.warning_once(
                                f"'{function_key}' is not supported by mistral-common "
-                                "for function tools. It has been poped from the "
+                                "for function tools. It has been popped from the "
                                "function definition."
                            )
                else:

--- a/vllm/transformers_utils/processors/ovis2_5.py
+++ b/vllm/transformers_utils/processors/ovis2_5.py
@@ -402,7 +402,7 @@ class Ovis2_5Processor(ProcessorMixin):
                images = [images]
        elif video is not None:
            is_video = True
-            # type of vidoe in dummy_mm_data is np.ndarray
+            # type of video in dummy_mm_data is np.ndarray
            if isinstance(video, np.ndarray):
                images = []
                for i in range(video.shape[0]):

--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -174,7 +174,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
            query_start_loc = query_start_loc[: num_decodes + 1]
            block_table_tensor = block_table_tensor[:num_decodes]
-        sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
+        scheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
            num_reqs=num_reqs,
            num_heads=self.num_heads,
            num_kv_heads=self.num_kv_heads,
@@ -197,7 +197,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
            seq_lens=seq_lens,
            block_table=block_table_tensor,
            slot_mapping=slot_mapping,
-            scheduler_metadata=sheduler_metadata,
+            scheduler_metadata=scheduler_metadata,
            causal=causal,
            use_sdpa_prefill=self.use_sdpa_prefill,
            num_decode_tokens=num_decode_tokens,

--- a/vllm/v1/attention/backends/mamba_attn.py
+++ b/vllm/v1/attention/backends/mamba_attn.py
@@ -383,7 +383,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
            # Return a tensor of shape (#requests, #max blocks)
            state_indices_tensor = common_attn_metadata.block_table_tensor
-            # Additional cache-related varaiables:
+            # Additional cache-related variables:
            mamba_block_size = self.kv_cache_spec.block_size
            (
                block_idx_last_computed_token,

--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -49,14 +49,14 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
-# For FP8 sparse attention we have two impelementations:
+# For FP8 sparse attention we have two implementations:
 # 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode; this is
 #    done by treating all tokens as a single batch.
 # 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill
 #    (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using
 #    the FP8 decode kernel for decode.
 # Currently we use #1 when the number of heads per rank is low (i.e. TP) since the BF16
-# prefill kernel requires padding the numer of heads to 128 while the decode does not
+# prefill kernel requires padding the number of heads to 128 while the decode does not
 # so when the per rank head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed
 # batch mode (#1).
 MIN_HEADS_FOR_BF16_PREFILL = 32
@@ -126,7 +126,7 @@ class FlashMLASparseBackend(AttentionBackend):
        cache_dtype_str: str = "auto",
    ) -> tuple[int, ...]:
        if cache_dtype_str == "fp8_ds_mla":
-            # custom storage fromat is 656 bytes
+            # custom storage format is 656 bytes
            #  see FlashMLA readme.md for details
            return (num_blocks, block_size, 656)
        else:

--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -370,7 +370,7 @@ class AiterFlashAttentionMetadata:
    slot_mapping: torch.Tensor
    block_table: torch.Tensor
-    # prefill and deocde split
+    # prefill and decode split
    num_decodes: int
    num_decode_tokens: int
    num_prefills: int
@@ -1099,7 +1099,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
                extend_tokens_slice = slice(
                    num_decode_tokens, num_decode_tokens + num_extend_tokens
                )
-                extend_querys = query[extend_tokens_slice]
+                extend_queries = query[extend_tokens_slice]
                extend_keys = key[extend_tokens_slice]
                extend_values = value[extend_tokens_slice]
                extend_outputs = output[extend_tokens_slice]
@@ -1110,7 +1110,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
                    v_scale = attn_metadata.v_scale
                self.extend_forward(
                    attn_metadata=attn_metadata,
-                    query=extend_querys,
+                    query=extend_queries,
                    key=extend_keys,
                    value=extend_values,
                    key_cache=key_cache,

--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -863,7 +863,7 @@ class MambaManager(SingleTypeKVCacheManager):
        ):
            # Mamba can't rely on blocks generated by other requests in the current step
            # To put it in the next step, we return num_gpu_blocks + 1 so
-            # that kv_cache_manager will think there is no enough blocks to allocte now
+            # that kv_cache_manager will think there is no enough blocks to allocate now
            # and don't schedule it in the current step.
            return self.block_pool.num_gpu_blocks + 1
        if self.mamba_cache_mode != "align":

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1724,11 +1724,11 @@ class DPEngineCoreProc(EngineCoreProc):
        """
        Send notifications to EngineCoreClient, which can then forward
        the notifications to other engine core processes. It is used for:
-        1) In scale up: new core engines to notify exisiting core engines
+        1) In scale up: new core engines to notify existing core engines
           that they are ready;
        2) In scale down: removing core engines to notify EngineCoreClient
           so EngineCoreClient can release their ray placement groups;
-        3) Both scale up/down: to notify EngineCoreClient that exisiting
+        3) Both scale up/down: to notify EngineCoreClient that existing
           core engines have already switched to the new parallel setup.
        """
        if vllm_config is None:

--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -194,7 +194,7 @@ class InputProcessor:
    @staticmethod
    def assign_request_id(request: EngineCoreRequest):
        """Replace the externally supplied request ID with an internal request ID
-        that adds 8 random characters in order to ensure uniquness.
+        that adds 8 random characters in order to ensure uniqueness.
        """
        if request.external_req_id is not None:
            raise ValueError(

--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -197,7 +197,7 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
            transfer = self._transfers.popleft()
            transfer_time = (
                transfer.start_event.elapsed_time(transfer.end_event) * 1e-3
-            )  # elapsed_time is in miliseconds
+            )  # elapsed_time is in milliseconds
            result = TransferResult(
                job_id=transfer.job_id,
                success=True,

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -905,7 +905,7 @@ class GPUModelRunner(
        Args:
            scheduler_output: The scheduler output.
        """
-        # Attention free models have zero kv_cache_goups, however models
+        # Attention free models have zero kv_cache_groups, however models
        # like Mamba are also attention free but use the kv_cache for
        # keeping its internal state. This is why we check the number
        # of kv_cache groups instead of solely checking
@@ -1065,7 +1065,7 @@ class GPUModelRunner(
                # of the request. for example:
                # first step: num_computed_tokens = 0, spec_tokens = [],
                # prev_num_draft_len = 0.
-                # second step: num_computed_tokens = 100(prompt lenth),
+                # second step: num_computed_tokens = 100(prompt length),
                # spec_tokens = [a,b], prev_num_draft_len = 0.
                # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d],
                # prev_num_draft_len = 2.
@@ -1412,30 +1412,30 @@ class GPUModelRunner(
                prev_draft_token_indices.extend(range(start, start + draft_len))
                indices_match &= prev_index == flattened_index
                max_flattened_index = max(max_flattened_index, flattened_index)
-        num_commmon_tokens = len(sample_flattened_indices)
+        num_common_tokens = len(sample_flattened_indices)
        total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
-        if num_commmon_tokens < total_without_spec:
+        if num_common_tokens < total_without_spec:
            # If not all requests are decodes from the last iteration,
            # We need to copy the input_ids_cpu to the GPU first.
            self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
            if self.enable_prompt_embeds:
                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
-        if num_commmon_tokens == 0:
+        if num_common_tokens == 0:
            # No requests in common with the previous iteration
            # So input_ids.cpu will have all the input ids.
            return
-        if indices_match and max_flattened_index == (num_commmon_tokens - 1):
+        if indices_match and max_flattened_index == (num_common_tokens - 1):
            # Common-case optimization: the batch is unchanged
            # and no reordering happened.
            # The indices are both the same permutation of 0..N-1 so
            # we can copy directly using a single slice.
-            self.input_ids.gpu[:num_commmon_tokens].copy_(
+            self.input_ids.gpu[:num_common_tokens].copy_(
-                self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0],
+                self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0],
                non_blocking=True,
            )
            if self.enable_prompt_embeds:
-                self.is_token_ids.gpu[:num_commmon_tokens] = True
+                self.is_token_ids.gpu[:num_common_tokens] = True
            return
        # Upload the index tensors asynchronously so the scatter can be non-blocking.
        sampled_tokens_index_tensor = torch.tensor(
@@ -4383,7 +4383,7 @@ class GPUModelRunner(
            self.model.compile(fullgraph=True, backend=backend)
            return
        # for other compilation modes, cudagraph behavior is controlled by
-        # CudagraphWraper and CudagraphDispatcher of vllm.
+        # CudagraphWrapper and CudagraphDispatcher of vllm.
        # wrap the model with full cudagraph wrapper if needed.
        cudagraph_mode = self.compilation_config.cudagraph_mode
@@ -4444,7 +4444,7 @@ class GPUModelRunner(
        :param weights_path: path to load weights from if weights_iterator is not
            provided. Use path of original model if neither is provided.
        :param is_checkpoint_format: set to False if weights have already been processed
-            into kernel format (repacking, renaming, ect.)
+            into kernel format (repacking, renaming, etc.)
        """
        # TODO(@kylesayrs): generalize to all runners and loaders
        # argument validation