[Doc]: fix typos in Python comments (#24294)

Signed-off-by: Didier Durand <durand.didier@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>

[Doc]: fix typos in Python comments (#24294)
Signed-off-by: Didier Durand <durand.didier@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
35bf1938 · Didier Durand · GitHub · 35efa702 · 35bf1938 · 35bf1938
Unverified Commit 35bf1938 authored Sep 06, 2025 by Didier Durand Committed by GitHub Sep 05, 2025
12 changed files
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
            ))
    def prepacked_type_key(prepack_type: PrepackTypeConfig):
-        # For now we we can just use the first accumulator type seen since
+        # For now, we can just use the first accumulator type seen since
        # the tensor core shapes/layouts don't vary based on accumulator
        # type so we can generate less code this way
        return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)

--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
    - Offline Inference: `256 * world_size`
    - Online Serving: `128 * world_size`
-vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
 ### Which quantization configs does vLLM CPU support?

--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -42,7 +42,7 @@ def run_test(
    tensor_parallel_size: int = 1,
    vllm_embeddings: Optional[torch.Tensor] = None,
 ):
-    """Modality agnostic test test executor for comparing HF/vLLM outputs."""
+    """Modality agnostic test executor for comparing HF/vLLM outputs."""
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs

--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -60,7 +60,7 @@ class CustomAllreduce:
            group: the process group to work on. If None, it will use the
                default process group.
            device: the device to bind the CustomAllreduce to. If None,
-                it will be bind to f"cuda:{local_rank}".
+                it will be bound to f"cuda:{local_rank}".
        It is the caller's responsibility to make sure each communicator
        is bind to a unique device, and all communicators in this group
        are in the same node.
@@ -158,7 +158,7 @@ class CustomAllreduce:
        self.disabled = False
        # Buffers memory are owned by this Python class and passed to C++.
-        # Meta data composes of two parts: meta data for synchronization and a
+        # Metadata composes of two parts: metadata for synchronization and a
        # temporary buffer for storing intermediate allreduce results.
        self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
                                                   group=group,

--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser):
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        if request.tools and request.tool_choice != 'none':
            # do not skip special tokens because internlm use the special
-            # tokens to indicated the start and end of the tool calls
+            # tokens to indicate the start and end of the tool calls
            # information.
            request.skip_special_tokens = False
        return request
@@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser):
        if '<|action_start|>' not in current_text:
            self.position = len(current_text)
            return DeltaMessage(content=delta_text)
-        # if the tool call is sended, return a empty delta message
+        # if the tool call is sended, return an empty delta message
-        # to make sure the finish_reason will be send correctly.
+        # to make sure the finish_reason will be sent correctly.
        if self.current_tool_id > 0:
            return DeltaMessage(content='')

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1064,7 +1064,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # vllm should use flashinfer fused allreduce. The variable should be a
    # JSON with the following format:
    #     { <world size>: <max size in mb> }
-    # Unspecified world sizes will fallback to
+    # Unspecified world sizes will fall back to
    #     { 2: 64, 4: 1, <everything else>: 0.5 }
    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
    lambda: json.loads(os.getenv(

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
    EM = sorted_token_ids.size(0)
    if A.size(0) < config["BLOCK_SIZE_M"]:
        # optimize for small batch_size.
-        # We assume that top_ids of each token is unique, so
+        # We assume that top_ids of each token is unique,
        # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
        # and we can skip some invalid blocks.
        EM = min(sorted_token_ids.size(0),

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -710,7 +710,7 @@ def determine_expert_map(
    # Create a tensor of size num_experts filled with -1
    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
-    # Create a expert map for the local experts
+    # Create an expert map for the local experts
    start_idx = ep_rank * base_experts + min(ep_rank, remainder)
    expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
        0, local_num_experts, dtype=torch.int32)
@@ -806,7 +806,7 @@ class FusedMoE(CustomOp):
        self.global_num_experts = num_experts + num_redundant_experts
-        # we padding globally so EP buffer allocation works
+        # we are padding globally so EP buffer allocation works
        if quant_config and quant_config.get_name() == "mxfp4":
            from vllm.model_executor.layers.quantization.mxfp4 import (  # noqa: E501
                should_use_flashinfer_mxfp4)

--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
        )
        layer.register_parameter("w2_scales", w2_scales)
        set_weight_attrs(w2_scales, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
        set_weight_attrs(w2_scales,
                         {"load_full_w2": self.quant_config.desc_act})
        # up_proj scales
@@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
        )
        layer.register_parameter("w2_qzeros", w2_qzeros)
        set_weight_attrs(w2_qzeros, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
        set_weight_attrs(w2_qzeros,
                         {"load_full_w2": self.quant_config.desc_act})
        w13_g_idx = torch.nn.Parameter(

--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -687,7 +687,7 @@ class FlashInferImpl(AttentionImpl):
            else:
                raise ValueError(f"Unsupported output dtype: {output.dtype}")
-            # TRTLLM attn kernel requires o scale to pass as a host scalar,
+            # TRTLLM attn kernel requires to scale to pass as a host scalar,
            # store the o scale as a host scalar in warmup run with cuda graph
            # not enabled
            if layer._o_scale_float is None:

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -439,7 +439,7 @@ class EngineCore:
        """
        # Note on thread safety: no race condition.
        # `mm_receiver_cache` is reset at the end of LLMEngine init,
-        # and will only accessed in the input processing thread afterwards.
+        # and will only be accessed in the input processing thread afterwards.
        if self.mm_receiver_cache is not None and request.mm_features:
            request.mm_features = (
                self.mm_receiver_cache.get_and_update_features(

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2826,7 +2826,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        # Disable cudagraph capturing globally, so any unexpected cudagraph
        # capturing will be detected and raise an error after here.
        # Note: We don't put it into graph_capture context manager because
-        # we may doing lazy capturing in future that still allows capturing
+        # we may do lazy capturing in future that still allows capturing
        # after here.
        set_cudagraph_capturing_enabled(False)