[Bugfix][CI] fix typos (#34934)

Signed-off-by: 1195343015 <1195343015@qq.com> Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Bugfix][CI] fix typos (#34934)
Signed-off-by: 1195343015 <1195343015@qq.com> Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
6a895197 · Jiayi Yan · GitHub · 8c760b6a · 6a895197 · 6a895197
Unverified Commit 6a895197 authored Mar 06, 2026 by Jiayi Yan Committed by GitHub Mar 05, 2026
20 changed files
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -3106,7 +3106,7 @@ def cpu_attn_get_scheduler_metadata(
    isa: str,
    enable_kv_split: bool,
 ) -> torch.Tensor:
-    sheduler_metadata = torch.ops._C.get_scheduler_metadata(
+    scheduler_metadata = torch.ops._C.get_scheduler_metadata(
        num_reqs,
        num_heads,
        num_kv_heads,
@@ -3119,7 +3119,7 @@ def cpu_attn_get_scheduler_metadata(
        isa,
        enable_kv_split,
    )
-    return sheduler_metadata
+    return scheduler_metadata


 def cpu_attn_reshape_and_cache(

--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -872,7 +872,7 @@ class CompilationConfig:
                )

        # Currently only eager and inductor backend are supported.
-        # for piecewise compilation. Custom backends are not suppported for
+        # for piecewise compilation. Custom backends are not supported for
        # piecewise compilation. Update when more backends are supported.
        if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
            "",

--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -59,7 +59,7 @@ class ObservabilityConfig:

    enable_layerwise_nvtx_tracing: bool = False
    """Enable layerwise NVTX tracing. This traces the execution of each layer or
-    module in the model and attach informations such as input/output shapes to
+    module in the model and attach information such as input/output shapes to
    nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""

    enable_mfu_metrics: bool = False

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -592,7 +592,7 @@ class VllmConfig:

        If the user configuration does not specify a value for a default field
        and if the default field is still None after all user selections are
-        applied, then default values will be applied to the field. User speciied
+        applied, then default values will be applied to the field. User specified
        fields will not be overridden by the default.

        Args:

--- a/vllm/distributed/eplb/policy/default.py
+++ b/vllm/distributed/eplb/policy/default.py
@@ -44,7 +44,7 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
            rank_in_pack = np.zeros_like(pack_index, dtype=np.int64)
            return pack_index, rank_in_pack

-        # Sort and get indices in decending order
+        # Sort and get indices in descending order
        indices = np.argsort(-weight, axis=-1)

        pack_index = np.full((num_layers, num_groups), -1, dtype=np.int64)

--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -129,7 +129,7 @@ class KVConnectorRole(enum.Enum):
 class KVConnectorHandshakeMetadata(ABC):  # noqa: B024
    """
    Metadata used for out of band connector handshake between
-    P/D workers. This needs to serializeable.
+    P/D workers. This needs to be serializable.
    """

    pass

--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -398,7 +398,7 @@ class ReqMeta:
        )


-def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig):
+def need_gpu_interim_buffer(lmcache_config: LMCacheEngineConfig):
    return not lmcache_config.enable_pd


@@ -497,7 +497,7 @@ def _init_lmcache_engine(
        use_mla,
    )

-    use_gpu = need_gpu_interm_buffer(lmcache_config)
+    use_gpu = need_gpu_interim_buffer(lmcache_config)
    vllm_gpu_connector: (
        VLLMBufferLayerwiseGPUConnector
        | VLLMPagedMemGPUConnectorV2

--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
@@ -481,7 +481,7 @@ class MooncakeConnectorWorker:
        )

        self._remote_agents: dict[EngineId, dict[int, dict[int, str]]] = {}
-        self._pending_bootstrap_querys: dict[str, asyncio.Event] = {}
+        self._pending_bootstrap_queries: dict[str, asyncio.Event] = {}
        self.side_channel_port: int = 0  # we will bind it in register_kv_caches()
        self.engine_id: EngineId = engine_id
        self.tp_rank = get_tensor_model_parallel_rank()
@@ -1077,7 +1077,7 @@ class MooncakeConnectorWorker:
                    response = self._xfer_resp_decoder.decode(ret_msg)
                    if response.status == MooncakeXferResponseStatus.ERROR:
                        logger.error(
-                            "Error happens during tranfering kvcache for %s: %s",
+                            "Error happens during transferring kvcache for %s: %s",
                            req_ids,
                            response.err_msg,
                        )
@@ -1140,8 +1140,8 @@ class MooncakeConnectorWorker:
            )

        # Always notify others regardless of connection success or failure.
-        self._pending_bootstrap_querys[remote_bootstrap_addr].set()
-        del self._pending_bootstrap_querys[remote_bootstrap_addr]
+        self._pending_bootstrap_queries[remote_bootstrap_addr].set()
+        del self._pending_bootstrap_queries[remote_bootstrap_addr]

    def receive_kv(
        self,
@@ -1171,11 +1171,11 @@ class MooncakeConnectorWorker:
        pull_metas: dict[ReqId, PullReqMeta],
    ):
        remote_bootstrap_addr = next(iter(pull_metas.values())).remote_bootstrap_addr
-        if remote_bootstrap_addr not in self._pending_bootstrap_querys:
-            self._pending_bootstrap_querys[remote_bootstrap_addr] = asyncio.Event()
+        if remote_bootstrap_addr not in self._pending_bootstrap_queries:
+            self._pending_bootstrap_queries[remote_bootstrap_addr] = asyncio.Event()
            await self._connect_to_prefiller_bootstrap(remote_bootstrap_addr)
        else:
-            await self._pending_bootstrap_querys[remote_bootstrap_addr].wait()
+            await self._pending_bootstrap_queries[remote_bootstrap_addr].wait()

        if remote_engine_id not in self._remote_agents:
            logger.error(

--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -720,7 +720,7 @@ class OffloadPromMetrics(KVConnectorPromMetrics):
        per_engine_labelvalues: dict[int, list[object]],
    ):
        super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues)
-        # (engine_idx, transfer_tupe) -> (metric with bounded labels)
+        # (engine_idx, transfer_type) -> (metric with bounded labels)
        self.histogram_transfer_size: dict[tuple[int, str], PromMetricT] = {}
        self.counter_kv_bytes: dict[tuple[int, str], PromMetricT] = {}
        self.counter_kv_transfer_time: dict[tuple[int, str], PromMetricT] = {}

--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -1647,9 +1647,9 @@ class OpenAIServingResponses(OpenAIServing):
                # TODO: in streaming, we noticed this bug:
                # https://github.com/vllm-project/vllm/issues/25697
                await self._initialize_tool_sessions(request, context, exit_stack)
-                processer = self._process_harmony_streaming_events
+                processor = self._process_harmony_streaming_events
            else:
-                processer = self._process_simple_streaming_events
+                processor = self._process_simple_streaming_events
            # TODO Hanchen make sampling params to include the structural tag

            initial_response = ResponsesResponse.from_request(
@@ -1677,7 +1677,7 @@ class OpenAIServingResponses(OpenAIServing):
            )

            try:
-                async for event_data in processer(
+                async for event_data in processor(
                    request,
                    sampling_params,
                    result_generator,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1520,7 +1520,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
        os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")
    ),
    # Force DeepEP to use intranode kernel for inter-node communication in
-    # high throughput mode. This is useful archive higher prefill throuhgput
+    # high throughput mode. This is useful to achieve higher prefill throughput
    # on system supports multi-node nvlink (e.g GB200).
    "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool(
        int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0"))

--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -175,7 +175,7 @@ class DPMetadata:
    # Get the cumulative tokens across sequence parallel ranks.
    # In this case the input to the MoEs will be distributed w.r.t both
    # DP and TP rank.
-    # When sp_size==1, this is just the cummulative num tokens across DP.
+    # When sp_size==1, this is just the cumulative num tokens across DP.
    def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
        num_tokens_across_sp_cpu = (
            self.num_tokens_across_dp_cpu - 1 + sp_size

--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -57,10 +57,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
            input_parallel = input_
        else:
            # TODO: simplify code below
-            splitted_input = split_tensor_along_last_dim(
+            split_input = split_tensor_along_last_dim(
                input_, num_partitions=self.tp_size
            )
-            input_parallel = splitted_input[self.tp_rank].contiguous()
+            input_parallel = split_input[self.tp_rank].contiguous()

        # Matrix multiply.
        bias_ = (

--- a/vllm/lora/lora_model.py
+++ b/vllm/lora/lora_model.py
@@ -11,7 +11,7 @@ from vllm.lora.lora_weights import LoRALayerWeights
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import (
    get_lora_id,
-    is_base_embeddding_weights,
+    is_base_embedding_weights,
    parse_fine_tuned_lora_name,
 )
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -86,7 +86,7 @@ class LoRAModel:
        pin_memory = str(device) == "cpu" and is_pin_memory_available()
        loras: dict[str, LoRALayerWeights] = {}
        for tensor_name, tensor in tensors.items():
-            if is_base_embeddding_weights(tensor_name):
+            if is_base_embedding_weights(tensor_name):
                continue
            # Skip modules based on model-defined prefixes (e.g., MTP layers)
            if skip_prefixes and cls._should_skip_module(tensor_name, skip_prefixes):
@@ -162,7 +162,7 @@ class LoRAModel:

        def check_unexpected_modules(modules: dict):
            for lora_module in modules.keys():  # noqa
-                if is_base_embeddding_weights(lora_module):
+                if is_base_embedding_weights(lora_module):
                    continue
                # Handle PEFT file format where experts.base_layer is the
                # gate_up_proj and experts is the down_proj

--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -193,7 +193,7 @@ def parse_fine_tuned_lora_name(
    raise ValueError(f"{name} is unsupported LoRA weight")


-def is_base_embeddding_weights(name: str) -> bool:
+def is_base_embedding_weights(name: str) -> bool:
    # hardcoded subfixes for input & output embedding weights
    embedding_suffixes = (
        ".embed_tokens.base_layer.weight",

--- a/vllm/model_executor/kernels/linear/mixed_precision/cpu.py
+++ b/vllm/model_executor/kernels/linear/mixed_precision/cpu.py
@@ -82,7 +82,7 @@ class CPUWNA16LinearKernel(MPLinearKernel):
        weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous()
        weight = pack_quantized_values_into_int32(weight, self.config.weight_type, 1)
        # make 16 output channel as a block and transpose to the make
-        # the block contigous
+        # the block contiguous
        weight = (
            weight.view(input_size, -1, 16 // pack_factor)
            .permute(1, 0, 2)

--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -2540,7 +2540,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
            )
            # workspace
            # |------- N tokens --------|--------- N*dcp_size tokens ----------|
-            # |<- use for loca_gather ->|<--------- use for allgather -------->|
+            # |<- use for local_gather->|<--------- use for allgather -------->|
            allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
            assert allgather_offset * (dcp_world_size + 1) == workspace.shape[0]
            assert toks <= allgather_offset

--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -394,5 +394,5 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        # No support for LoRA in flashinfer_cutlass_fused_moe.
-        # See TODOs in flashinfer functions runMoe and runMoeMinLantency.
+        # See TODOs in flashinfer functions runMoe and runMoeMinLatency.
        raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -409,7 +409,7 @@ def batched_fused_marlin_moe(
    Note that the moe_align_block_size function indicates,
        - What rows of the A matrix (hidden_states) to access during the
        matmul, via sorted_ids output.
-        - What expert_id to use for each block matmul, via expert_ids ouptut.
+        - What expert_id to use for each block matmul, via expert_ids output.

    In the batched version, the tokens are already grouped/batched by experts
    they subscribe to. Due to this, we can represent the batched hidden_states

--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -606,7 +606,7 @@ class FusedMoEExperts(ABC):
        """
        Whether the kernel supports deployment in particular parallel config.

-        Can be overriden if a kernel does not support EP, SP or some other
+        Can be overridden if a kernel does not support EP, SP or some other
        configuration.
        """
        raise NotImplementedError
@@ -620,7 +620,7 @@ class FusedMoEExperts(ABC):
        """
        Whether the kernel supports a routing method (e.g. GroupedTopK).

-        Can be overriden by monolithic kernels that execute the router
+        Can be overridden by monolithic kernels that execute the router
        in addition to the experts if certain routers are not supported.
        """
        return True
@@ -633,7 +633,7 @@ class FusedMoEExperts(ABC):
        """
        Whether a kernel supports a particular dtype for router logits input.

-        Can be overriden by monolithic kernels that execute the router
+        Can be overridden by monolithic kernels that execute the router
        in addition to the experts if certain dtypes are not supported.
        """
        return True