[Doc]: fix typos in Python comments (#24001)

Signed-off-by: Didier Durand <durand.didier@gmail.com>

[Doc]: fix typos in Python comments (#24001)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
9701352e · Didier Durand · GitHub · 749be00a · 9701352e · 9701352e
Unverified commit 9701352e, authored Aug 31, 2025 by Didier Durand; committed by GitHub on Aug 31, 2025.
10 changed files
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -43,7 +43,7 @@ cudagraph_capturing_enabled: bool = True


 def validate_cudagraph_capturing_enabled():
-    # used to monitor whether an cudagraph capturing is legal at runtime.
+    # used to monitor whether a cudagraph capturing is legal at runtime.
    # should be called before any cudagraph capturing.
    # if an illegal cudagraph capturing happens, raise an error.
    global cudagraph_capturing_enabled

--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -76,7 +76,7 @@ class LRUEvictor(Evictor):
    that's recorded in the Block. If there are multiple blocks with
    the same last_accessed time, then the one with the largest num_hashed_tokens
    will be evicted. If two blocks each have the lowest last_accessed time and
-    highest num_hashed_tokens value, then one will be chose arbitrarily
+    highest num_hashed_tokens value, then one will be chosen arbitrarily
    """

    # CLEANUP_THRESHOLD determines the maximum allowable size of the priority

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1239,7 +1239,7 @@ class LLMEngine:

            # Stop the execute model loop in parallel workers until there are
            # more requests to process. This avoids waiting indefinitely in
-            # torch.distributed ops which may otherwise timeout, and unblocks
+            # torch.distributed ops which may otherwise time out, and unblocks
            # the RPC thread in the workers so that they can process any other
            # queued control plane messages, such as add/remove lora adapters.
            logger.debug("Stopping remote worker execution loop.")

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -329,7 +329,7 @@ class LLM:
        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
                When it is a single value, it is applied to every prompt.
@@ -853,7 +853,7 @@ class LLM:
        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
            use_tqdm: If `True`, shows a tqdm progress bar.
@@ -946,7 +946,7 @@ class LLM:
        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
            use_tqdm: If `True`, shows a tqdm progress bar.
@@ -994,7 +994,7 @@ class LLM:
        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
@@ -1038,7 +1038,7 @@ class LLM:
        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.

--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -101,7 +101,7 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
            result_handler.start()
            self.worker_monitor.start()

-        # Set up signal handlers to shutdown the executor cleanly
+        # Set up signal handlers to shut down the executor cleanly
        # sometimes gc does not work well

        self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)

--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -605,7 +605,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):

 class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
    """ColumnParallelLinear layer that is composed of 2 sublayers (slices)
-    packed together (eg. gate_proj + up_proj -> gate_up_proj).
+    packed together (e.g. gate_proj + up_proj -> gate_up_proj).

    This means we have 2 LoRAs, each applied to one half of the layer.


--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -537,7 +537,7 @@ class Platform:

    def get_global_graph_pool(self) -> Any:
        """
-        Return the global graph pool for the this platform.
+        Return the global graph pool for this platform.
        """
        cls = self.__class__
        if cls._global_graph_pool is None:

--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -30,7 +30,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
    Key Features:
        - For non-stream output , Recognizes and extracts reasoning ("think")
         and answer ("answer") sections from text using regular expressions.
-        - For stream process, it require a token id sequences to change the 
+        - For stream process, it requires a token id sequences to change the
          reasoning state and other state so it maintains internal state to 
          manage parsing across multiple token.


--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2734,7 +2734,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                                 layer_names)
            attn_backends = {}
            attn_backend_layers = defaultdict(list)
-            # Dedupe based on full class name; this is a bit safer than using
+            # Dedupe based on full class name; this is a bit safer than
            # using the class itself as the key because when we create dynamic
            # attention backend subclasses (e.g. ChunkedLocalAttention) unless
            # they are cached correctly, there will be different objects per

--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -224,7 +224,7 @@ class Worker(WorkerBase):
        memory can be used for KV cache without OOMs.

        The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the free memory that can be used for KV cache in
+        Then, it calculates the free memory that can be used for KV cache in
        bytes.

        Tip: