[Doc]: fix typos in Python scripts (#23828)

Signed-off-by: Didier Durand <durand.didier@gmail.com>

[Doc]: fix typos in Python scripts (#23828)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
d3da2eea · Didier Durand · GitHub · bfab2196 · d3da2eea · d3da2eea
Unverified commit d3da2eea, authored Aug 28, 2025 by Didier Durand; committed by GitHub on Aug 28, 2025
10 changed files
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule,
        outputs.append(
            SplitItem(name, graph_id, (graph_id in split_op_graphs), module))

-    # sort by intetger graph_id, rather than string name
+    # sort by integer graph_id, rather than string name
    outputs.sort(key=lambda x: x.graph_id)

    return split_gm, outputs
@@ -424,7 +424,7 @@ class VllmBackend:

        # if the model is initialized with a non-empty prefix,
        # then usually it's enough to use that prefix,
-        # e.g. launguage_model, vision_model, etc.
+        # e.g. language_model, vision_model, etc.
        # when multiple parts are initialized as independent
        # models, we need to use the model_tag to distinguish
        # them, e.g. backbone (default), eagle_head, etc.

--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -115,7 +115,7 @@ class CacheConfig:

    In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
    some layers can skip tokens corresponding to prefill. This flag enables
-    attention metadata for eligible layers to be overriden with metadata
+    attention metadata for eligible layers to be overridden with metadata
    necessary for implementing this optimization in some models (e.g. Gemma3n)
    """


--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1053,7 +1053,7 @@ class EngineArgs:
                                   self.trust_remote_code, self.revision,
                                   self.code_revision, self.config_format)

-            # if loading a SpeculatorsConfig, load the specualtive_config
+            # if loading a SpeculatorsConfig, load the speculative_config
            # details from the config directly
            # no user input required / expected
            if isinstance(hf_config, SpeculatorsConfig):

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC):
    def __init__(self) -> None:
        super().__init__()

-        # stores model placehodlers list with corresponding
+        # stores model placeholders list with corresponding
        # general MM placeholder:
        # {
        #   "<##IMAGE##>": ["<image>", "<image>", "<image>"],

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE:
            raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
                                detail="Missing 'method' in request body")
        # For security reason, only serialized string args/kwargs are passed.
-        # User-defined `method` is responsible for deseralization if needed.
+        # User-defined `method` is responsible for deserialization if needed.
        args: list[str] = body.get("args", [])
        kwargs: dict[str, str] = body.get("kwargs", {})
        timeout: Optional[float] = body.get("timeout")

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -579,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig):
            format = scheme_dict.get("format")

        # Find the sparsity scheme of the layer
-        # assume that fused layers inerhit first component's sparsity scheme
+        # assume that fused layers inherit first component's sparsity scheme
        sparsity_targets = (self.sparsity_scheme_map.keys() -
                            set(self.sparsity_ignore_list))
        sparsity_scheme: Optional[SparsityCompressionConfig] = None

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
    ) -> "CompressedTensorsMoEMethod":
        # TODO: @dsikka: refactor this to use schemes as other kernels
        # are supported + check if the layer is being ignored.
-        # Check if a using "Linear" to select scheems
+        # Check if using "Linear" to select schemes
        if "Linear" in quant_config.target_scheme_map:
            matched_target = "Linear"
        else:

--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -11,7 +11,7 @@ logger = init_logger(__name__)

 class CudagraphDispatcher:
    """
-    Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs.
+    Runtime cudagraph dispatcher to dispatch keys for multiple sets of cudagraphs.

    The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
    for FULL cudagraph runtime mode. The keys are initialized depending on 
@@ -21,7 +21,7 @@ class CudagraphDispatcher:

    At runtime, the dispatch method generates the runtime cudagraph mode (FULL, 
    PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
-    based on the input key. After dispatching (commuicate via forward context), 
+    based on the input key. After dispatching (communicate via forward context),
    the cudagraph wrappers will trust the dispatch key to do either capturing
    or replaying (if mode matched), or pass through to the underlying runnable 
    without cudagraph (if mode no match or mode is NONE).

--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -110,7 +110,7 @@ class BlockTable:
        self.block_table_cpu.fill_(0)

    def get_device_tensor(self) -> torch.Tensor:
-        """Ruturns the device tensor of the block table."""
+        """Returns the device tensor of the block table."""
        return self.block_table

    def get_cpu_tensor(self) -> torch.Tensor:

--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner):
        Args:
            scheduler_output: The scheduler output.
        """
-        # Attention free models have zero kv_cache_goups, however models
+        # Attention free models have zero kv_cache_groups, however models
        # like Mamba are also attention free but use the kv_cache for
        # keeping its internal state. This is why we check the number
        # of kv_cache groups instead of solely checking