fix: typos (#18151)

Signed-off-by: omahs <73983677+omahs@users.noreply.github.com>

fix: typos (#18151)
Signed-off-by: omahs <73983677+omahs@users.noreply.github.com>
a9944aab · omahs · GitHub · a8f5aec2 · a9944aab · a9944aab
Unverified Commit a9944aab authored May 15, 2025 by omahs Committed by GitHub May 15, 2025
10 changed files
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(

  // Load the query to registers.
  // Each thread in a thread group has a different part of the query.
-  // For example, if the the thread group size is 4, then the first thread in
+  // For example, if the thread group size is 4, then the first thread in
  // the group has 0, 4, 8, ... th vectors of the query, and the second thread
  // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
  // q is split from a qkv tensor, it may not be contiguous.
@@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(

    // Load a key to registers.
    // Each thread in a thread group has a different part of the key.
-    // For example, if the the thread group size is 4, then the first thread in
+    // For example, if the thread group size is 4, then the first thread in
    // the group has 0, 4, 8, ... th vectors of the key, and the second thread
    // has 1, 5, 9, ... th vectors of the key, and so on.
    for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {

--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
            "partly cloudly, with highs in the 90's.")


-tool_funtions = {"get_current_weather": get_current_weather}
+tool_functions = {"get_current_weather": get_current_weather}

 tools = [{
    "type": "function",
@@ -122,7 +122,7 @@ messages.append({
 # above defined function
 tool_calls = json.loads(output)
 tool_answers = [
-    tool_funtions[call['name']](**call['arguments']) for call in tool_calls
+    tool_functions[call['name']](**call['arguments']) for call in tool_calls
 ]

 # append the answer as a tool message and let the LLM give you an answer

--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):

    lora_path = get_adapter_absolute_path(lora_name)

-    # lora loading should work for either absolute path and hugggingface id.
+    # lora loading should work for either absolute path or huggingface id.
    peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
    lora_model = LoRAModel.from_local_checkpoint(
        lora_path,

--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
@@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
    try:
        # enable hf hub transfer if available
        import hf_transfer  # type: ignore # noqa
-        HF_TRANFER_ACTIVE = True
+        HF_TRANSFER_ACTIVE = True
    except ImportError:
-        HF_TRANFER_ACTIVE = False
+        HF_TRANSFER_ACTIVE = False
    assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
-            HF_TRANFER_ACTIVE)
+            HF_TRANSFER_ACTIVE)


 def test_download_weights_from_hf():

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -297,7 +297,7 @@ class ModelConfig:
    - 1K -> 1024\n
    - 25.6k -> 25,600"""
    spec_target_max_model_len: Optional[int] = None
-    """Specify the the maximum length for spec decoding draft models."""
+    """Specify the maximum length for spec decoding draft models."""
    quantization: Optional[QuantizationMethods] = None
    """Method used to quantize the weights. If `None`, we first check the
    `quantization_config` attribute in the model config file. If that is

--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -153,7 +153,7 @@ def _lora_expand(
        lora_token_start_loc (torch.Tensor): A cumulative sum of
            num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
            lora_token_start_loc[i], along with num_tokens_per_lora[i]
-            identifies the the region in token_indices_sorted_by_lora_ids that
+            identifies the region in token_indices_sorted_by_lora_ids that
            LoRA lora_ids[i] should process.
        lora_ids (torch.Tensor): LoRA ids to process.
        no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates

--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader(
 ) -> LoaderFunction:
    """Create a weight loader for mamba v2. This ensures that the projections 
    are correctly sharded so that they can be split into x, B, C. It also 
-    ensures the the all the groups corresponding to a head shard is placed 
+    ensures that all the groups corresponding to a head shard are placed 
    together with it.
    """


--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -21,7 +21,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only IBM Granite speeech model."""
+"""Inference-only IBM Granite speech model."""
 import math
 from collections.abc import Iterable, Mapping
 from typing import Optional, TypedDict, Union
@@ -626,7 +626,7 @@ class GraniteSpeechForConditionalGeneration(
        audio_embed_sizes: torch.Tensor,
    ) -> torch.Tensor:
        """Calculate the input features mask, which will generally be used
-        to mask the the padded features for all entries in the batch except
+        to mask the padded features for all entries in the batch except
        for those with the most audio features.

        Args:

--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module):
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
-        attention_innner_dim: int, optional
+        attention_inner_dim: int, optional
            if equal to -1, attention dim for linears k/q/v is
-            equal to d_model. otherwise attention_innner_dim is used.
+            equal to d_model. otherwise attention_inner_dim is used.
            default -1.
        attention_glu_type: str, optional
            activation function for glu used in the multihead attention,
@@ -148,7 +148,7 @@ class ConformerEncoderLayer(nn.Module):
        conv_glu_type="sigmoid",
        bias_in_glu=True,
        linear_glu_in_convm=False,
-        attention_innner_dim=-1,
+        attention_inner_dim=-1,
        attention_glu_type="swish",
        activation_checkpointing="",
        export=False,
@@ -169,7 +169,7 @@ class ConformerEncoderLayer(nn.Module):
            n_head,
            d_model,
            dropout_rate,
-            attention_innner_dim,
+            attention_inner_dim,
            attention_glu_type,
            bias_in_glu,
            use_pt_scaled_dot_product_attention=

--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -72,7 +72,7 @@ class Request:
            assert len(self.mm_inputs) == len(self.mm_hashes)

        # Read-only views
-        # Prevent directly appending to the these lists since
+        # Prevent directly appending to these lists since
        # they should also be updated simultaneously.
        self.output_token_ids = ConstantList(self._output_token_ids)
        self.all_token_ids = ConstantList(self._all_token_ids)