fix some typos (#24071)

Signed-off-by: co63oc <co63oc@users.noreply.github.com>

fix some typos (#24071)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
1bd007f2 · co63oc · GitHub · 136d853e · 1bd007f2 · 1bd007f2
Unverified Commit 1bd007f2 authored Sep 03, 2025 by co63oc Committed by GitHub Sep 02, 2025
12 changed files
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -362,7 +362,7 @@ class ReLUSquaredActivation(CustomOp):
        return torch.square(F.relu(x))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        #TODO : implement cuda kenrels
+        #TODO : implement cuda kernels
        return self.forward_native(x)



--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
@@ -83,7 +83,7 @@ class HadamardTransform(torch.nn.Module):
            # do not fold into weight in order to utilize FWHT
            self.scales[part_id] = 1 / math.sqrt(data.size(0))

-            # FUTURE: avoid runtime tranpose by processing weights
+            # FUTURE: avoid runtime transpose by processing weights
            # prior to apply

    def forward(self, value: Tensor, part_id: int = 0) -> Tensor:

--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -310,7 +310,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
            w13_bias = layer.w13_bias.data.to(torch.float32)
            w2_bias = layer.w2_bias.data.to(torch.float32)

-            # Swap w1 and w3 as the defenition of
+            # Swap w1 and w3 as the definition of
            # swiglu is different in the trtllm-gen
            def swap_every_two_rows(x, axis=-1):
                shape = x.shape

--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -179,7 +179,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
    ) -> BatchFeature:

        # HF Transformers audio processor no longer accepts `audios` key.
-        # We pop `audios` and replace it with `audio` key to surpress
+        # We pop `audios` and replace it with `audio` key to suppress
        # the warning.
        if 'audios' in mm_data:
            mm_data['audio'] = mm_data.pop('audios')

--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -492,7 +492,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
-        # transformers InternVLProcessor uses <IMG_CONTEXT> as the seperator
+        # transformers InternVLProcessor uses <IMG_CONTEXT> as the separator
        # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
        if modality.startswith("image"):
            return '<IMG_CONTEXT>'

--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -3533,7 +3533,7 @@ def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle):
        return []
    elif (ret == NVML_ERROR_INSUFFICIENT_SIZE):
        # typical case
-        # oversize the array incase more processes are created
+        # oversize the array in case more processes are created
        c_count.value = c_count.value * 2 + 5
        proc_array = c_nvmlProcessInfo_v3_t * c_count.value
        c_procs = proc_array()

--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -167,7 +167,7 @@ class FlashAttentionMetadataBuilder(
    # work for mixed prefill-decode and uniform-decode. But for non-spec decodes
    # the graphs would not work for mixed prefill-decode; sorta the inverse
    # of UNIFORM_SINGLE_TOKEN_DECODE.
-    # Theres probably a better way to describe this using `AttentionCGSupport`
+    # There's probably a better way to describe this using `AttentionCGSupport`
    # but for now just set it to `UNIFORM_BATCH` to get us to drop down
    # to FULL_AND_PIECEWISE.
    # TODO(luka, lucas): audit FA2 as part of:

--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -291,7 +291,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                paged_kv_indices_buffer=paged_kv_indices,
                paged_kv_last_page_len_buffer=paged_kv_last_page_len,
                # Tensor cores are enabled by default because the perf would be
-                # atleast as good as cuda cores for all attention ops in latest
+                # at least as good as cuda cores for all attention ops in latest
                # gpus.
                use_tensor_cores=True,
            )

--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -217,7 +217,7 @@ class FreeKVCacheBlockQueue:
        # Create a fake head and a tail block for the doubly linked list to
        # reduce branching in the code
        #
-        # The implementation garenteed that the fake head and tail
+        # The implementation guaranteed that the fake head and tail
        # are NEVER popped, so we can safely assume each real block
        # in the queue has prev and next blocks.
        self.fake_free_list_head = KVCacheBlock(block_id=-1)

--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -584,7 +584,7 @@ class InputBatch:

            if self.is_pooling_model:
                last_req_index -= 1
-                # Samping state not used by pooling models.
+                # Sampling state not used by pooling models.
                continue

            # Autoregressive models require detailed tracking of condense

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2776,7 +2776,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            self.attn_groups.append(
                create_attn_groups(attn_backends, kv_cache_spec))

-        # Calculate reorder batch threshold (if neeeded)
+        # Calculate reorder batch threshold (if needed)
        self.calculate_reorder_batch_threshold()

    def initialize_cudagraph_capture(self) -> None:

--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -82,7 +82,7 @@ class KVConnectorModelRunnerMixin:
            scheduler_output) if has_kv_transfer_group() else nullcontext()

    # This context manager must be used within an active forward context.
-    # It encapsulates the entire KV conector lifecycle within execute_model
+    # It encapsulates the entire KV connector lifecycle within execute_model
    @staticmethod
    @contextmanager
    def _get_kv_connector_output(