[BugFix] fix some typos found by typos. (#16314)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

[BugFix] fix some typos found by typos. (#16314)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
04149cce · yihong · GitHub · 24834f48 · 04149cce · 04149cce
Unverified Commit 04149cce authored Apr 09, 2025 by yihong Committed by GitHub Apr 09, 2025
20 changed files
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -921,7 +921,7 @@ if __name__ == "__main__":
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
@@ -929,7 +929,7 @@ if __name__ == "__main__":
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",

--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -963,7 +963,7 @@ if __name__ == "__main__":
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
@@ -971,7 +971,7 @@ if __name__ == "__main__":
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",

--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
        int final_state_position =  ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
        // in case the final state is separated between the last "smem_exchange" and 
        // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), 
-        // (which occurs when `final_state_position` is a non-positivie index)
+        // (which occurs when `final_state_position` is a non-positive index)
        // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
        if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
            input_t vals_load[kNElts] = {0};

--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
            assert self.use_cuda_graph

        if turn_prefills_into_decodes:
-            # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+            # When Multi-Step is enabled with Chunked-Prefill, prefills and
            # decodes are scheduled together. In the first step, all the
            # prefills turn into decodes. This update reflects that
            # conversion.

--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -152,11 +152,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
                logger.warning("Could not import HPU FusedSDPA kernel. "
                               "vLLM will use native implementation.")

-        suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
-        if head_size not in suppored_head_sizes:
+        supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
            raise ValueError(
                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {suppored_head_sizes}.")
+                f"Supported head sizes are: {supported_head_sizes}.")

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError("Encoder self-attention and "

--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O

 NOTE: in the actual code, 
-    `kv_b_proj` is [W_UK; W_UV] concatnated per head
-    `q_b_proj` is [W_UQ; W_QR] concatnated per head
+    `kv_b_proj` is [W_UK; W_UV] concatenated per head
+    `q_b_proj` is [W_UQ; W_QR] concatenated per head
    `out_proj` is W_O


@@ -667,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
            assert num_seqs > num_queries

        if turn_prefills_into_decodes:
-            # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+            # When Multi-Step is enabled with Chunked-Prefill, prefills and
            # decodes are scheduled together. In the first step, all the
            # prefills turn into decodes. This update reflects that
            # conversion.

--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -414,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

-        suppored_head_sizes = PagedAttention.get_supported_head_sizes()
-        if head_size not in suppored_head_sizes:
+        supported_head_sizes = PagedAttention.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
            raise ValueError(
                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {suppored_head_sizes}.")
+                f"Supported head sizes are: {supported_head_sizes}.")

        self.attn_type = attn_type


--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -446,7 +446,7 @@ def flash_paged_attention(
    IO tensor dtypes:
      - This kernel assumes all IO tensors have the same dtype except for
        block_tables (int32) and mask (int32)
-      - If mixed_percision is True, then all Tensor Engine operation will be
+      - If mixed_precision is True, then all Tensor Engine operation will be
        performed in bfloat16 and accumulation will be performed in float32.
        Otherwise the intermediates will be in the same type as the inputs.


--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )

--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
            externally (before the next schedule() call)
        """
        # Sequences can be in RUNNING or FINISHED_ABORTED state
-        # once scheduled, as a sequence is moved to FINSIHED_ABORTED
+        # once scheduled, as a sequence is moved to FINISHED_ABORTED
        # if a client disconnects from the api server.
        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
        if seqs is None:

--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -98,7 +98,7 @@ def find_all_indices(string: str, substring: str) -> list[int]:


 # partial_json_parser doesn't support extra data and
-# JSONDecorder.raw_decode doesn't support partial JSON
+# JSONDecoder.raw_decode doesn't support partial JSON
 def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
    try:
        return (partial_json_parser.loads(input_str, flags), len(input_str))

--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -29,7 +29,7 @@ def choose_scaled_mm_linear_kernel(
        compute_capability: Optional[int] = None
 ) -> Type[ScaledMMLinearKernel]:
    """
-    Choose an ScalledMMLinearKernel that can implement the given config for the 
+    Choose an ScaledMMLinearKernel that can implement the given config for the 
    given compute capability. Attempts to choose the best kernel in terms of 
    performance.


--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -69,12 +69,12 @@ class CpuPlatform(Platform):

        cache_config = vllm_config.cache_config

-        ipex_avaliable = find_spec("intel_extension_for_pytorch") is not None
+        ipex_available = find_spec("intel_extension_for_pytorch") is not None

        if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 128 if ipex_avaliable else 16
+            cache_config.block_size = 128 if ipex_available else 16

-        if not ipex_avaliable and cache_config.block_size != 16:
+        if not ipex_available and cache_config.block_size != 16:
            raise RuntimeError(
                f"--block-size={cache_config.block_size} requires"
                " intel_extension_for_pytorch")

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -231,7 +231,7 @@ class Platform:
                                parser: Optional[FlexibleArgumentParser] = None
                                ) -> None:
        """
-        Do some pre-registeration or update action for the current platform.
+        Do some pre-registration or update action for the current platform.

        This function is called before global VllmConfig is initialized or cli
        arguments are parsed. It's used for out-of-tree platforms to register or

--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -60,7 +60,7 @@ class GraniteReasoningParser(ReasoningParser):

        Args:
            model_output (str): Output of the model to be parsed.
-            request (ChatCompletionReqest): Request being processed.
+            request (ChatCompletionRequest): Request being processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the

--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -101,7 +101,7 @@ class RequestOutputKind(Enum):
    CUMULATIVE = 0
    # Return only deltas in each RequestOutput
    DELTA = 1
-    # Do not return intermediate RequestOuputs
+    # Do not return intermediate RequestOutput
    FINAL_ONLY = 2



--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -1119,7 +1119,7 @@ class _PrintableStructure(Structure):
    e.g. class that has _field_ 'hex_value', c_uint could be formatted with
      _fmt_ = {"hex_value" : "%08X"}
    to produce nicer output.
-    Default fomratting string for all fields can be set with key "<default>" like:
+    Default formatting string for all fields can be set with key "<default>" like:
      _fmt_ = {"<default>" : "%d MHz"} # e.g all values are numbers in MHz.
    If not set it's assumed to be just "%s"


--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O

 NOTE: in the actual code,
-    `kv_b_proj` is [W_UK; W_UV] concatnated per head
-    `q_b_proj` is [W_UQ; W_QR] concatnated per head
+    `kv_b_proj` is [W_UK; W_UV] concatenated per head
+    `q_b_proj` is [W_UQ; W_QR] concatenated per head
    `out_proj` is W_O



--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -326,7 +326,7 @@ class WorkerProc:
            logger.debug("Worker interrupted.")

        except Exception:
-            # worker_busy_loop sends exceptions exceptons to Executor
+            # worker_busy_loop sends exceptions to Executor
            # for shutdown, but if there is an error in startup or an
            # error with IPC itself, we need to alert the parent.
            psutil.Process().parent().send_signal(signal.SIGUSR1)

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -998,7 +998,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
    ) -> Union[ModelRunnerOutput, torch.Tensor]:
        self._update_states(scheduler_output)
        if not scheduler_output.total_num_scheduled_tokens:
-            # Return empty ModelRunnerOuptut if there's no work to do.
+            # Return empty ModelRunnerOutput if there's no work to do.
            return EMPTY_MODEL_RUNNER_OUTPUT

        if self.is_multimodal_model: