Update `pre-commit` hooks (#12475)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Update `pre-commit` hooks (#12475)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
823ab796 · Harry Mellor · GitHub · 6116ca8c
Unverified commit 823ab796, authored by Harry Mellor on Jan 28, 2025 and committed by GitHub on Jan 27, 2025.
20 changed files
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
    def __post_init__(self):
        # sanity checks
-        assert (self.padded_org_vocab_start_index <=
+        assert (self.padded_org_vocab_start_index
-                self.padded_org_vocab_end_index)
+                <= self.padded_org_vocab_end_index)
-        assert (self.padded_added_vocab_start_index <=
+        assert (self.padded_added_vocab_start_index
-                self.padded_added_vocab_end_index)
+                <= self.padded_added_vocab_end_index)
        assert self.org_vocab_start_index <= self.org_vocab_end_index
        assert self.added_vocab_start_index <= self.added_vocab_end_index
        assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-        assert (self.added_vocab_start_index <=
+        assert (self.added_vocab_start_index
-                self.padded_added_vocab_start_index)
+                <= self.padded_added_vocab_start_index)
        assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
        assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    # torch.compile will fuse all of the pointwise ops below
    # into a single kernel, making it very fast
-    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (
-                                                          org_vocab_end_index)
+        input_ < org_vocab_end_index)
    added_vocab_mask = (input_ >= added_vocab_start_index) & (
        input_ < added_vocab_end_index)
    added_offset = added_vocab_start_index - (

--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                # from being incorrectly identified as being present in
                # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
                shard_pos = quant_param_name.find(shard_name)
-                can_correct_rename = (shard_pos > 0) and (
+                can_correct_rename = (shard_pos
-                    quant_param_name[shard_pos - 1] == ".")
+                                      > 0) and (quant_param_name[shard_pos - 1]
+                                                == ".")
                # If the quant_param_name is packed, it won't occur in the
                # param_dict before renaming.
                new_quant_param_name = quant_param_name.replace(

--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -298,8 +298,8 @@ class TensorizerAgent:
        to allow for adapter added tokens."""
        for child in self.model.modules():
            if (isinstance(child, VocabParallelEmbedding)
-                    and child.weight.shape[0] <
+                    and child.weight.shape[0]
-                    child.num_embeddings_per_partition):
+                    < child.num_embeddings_per_partition):
                new_weight = torch.empty(child.num_embeddings_per_partition,
                                         child.embedding_dim,
                                         dtype=child.weight.dtype,

--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
-from functools import lru_cache
+from functools import cache
 from typing import Iterable, List, Optional, Set, Tuple, Union
 import torch
@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
 logger = init_logger(__name__)
-@lru_cache(maxsize=None)
+@cache
 def _get_gemma_act_fn(
    hidden_act: Optional[str],
    hidden_activation: Optional[str],

--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                for e in range(p.size(0)):
                    w1_name = n.replace(
                        '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w1.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w1.weight")
                    w3_name = n.replace(
                        '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w3.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w3.weight")
                    w1_param, w3_param = p[e].chunk(2, dim=0)
                    assert w1_name not in new_weights
                    assert w3_name not in new_weights
@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                for e in range(p.size(0)):
                    w2_name = n.replace(
                        '.block_sparse_moe.output_linear.weight',
-                        ".block_sparse_moe.experts.%d.w2.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w2.weight")
                    w2_param = p[e]
                    assert w2_name not in new_weights
                    new_weights[w2_name] = w2_param

--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
        # For 1) text-only prefill and decode, 2) image-present decode.
        if image_inputs is None:
            full_text_row_masked_out_mask = (
-                attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to(
+                attn_metadata.encoder_seq_lens_tensor
-                    input_ids.device)
+                != 0).reshape(-1, 1).to(input_ids.device)
            skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
        # For image-present prefill.

--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
        if self.tie_weights:
            assert (
-                self.n_predict >
+                self.n_predict > 1
-                1), "You cannot tie weights between stages when only 1 exists"
+            ), "You cannot tie weights between stages when only 1 exists"
            embedding = VocabParallelEmbedding(
                config.vocab_size,
                self.inner_dim,

--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
        # compute mask for sparsity
        mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
        factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = (
+        mask_logits_threshold = ((mask_logits_threshold - scores) /
-            (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+                                 factor) > (2 * jitter_eps)
    # apply mask
    masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
        mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
                                                           keepdim=True)
        factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = (
+        mask_logits_threshold = ((mask_logits_threshold - scores) /
-            (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+                                 factor) > (2 * jitter_eps)
    # apply mask
    masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -462,7 +462,8 @@ class _ModelRegistry:
 ModelRegistry = _ModelRegistry({
-    model_arch: _LazyRegisteredModel(
+    model_arch:
+    _LazyRegisteredModel(
        module_name=f"vllm.model_executor.models.{mod_relname}",
        class_name=cls_name,
    )

--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
        return hidden_states
-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
+@MULTIMODAL_REGISTRY.register_processor(
-                                        info=UltravoxProcessingInfo,
+    UltravoxMultiModalProcessor,
-                                        dummy_inputs=UltravoxDummyInputsBuilder
+    info=UltravoxProcessingInfo,
-                                        )
+    dummy_inputs=UltravoxDummyInputsBuilder)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
    hf_to_vllm_mapper = WeightsMapper(

--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
        device: torch.device,
    ) -> IntermediateTensors:
        return IntermediateTensors({
-            key: torch.zeros((batch_size, hidden_size),
+            key:
-                             dtype=dtype,
+            torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
-                             device=device)
            for key in keys
        })

--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -166,7 +166,8 @@ class SamplingMetadata:
            pin_memory=pin_memory,
        )
        categorized_sample_indices = {
-            t: async_tensor_h2d(
+            t:
+            async_tensor_h2d(
                seq_ids,
                dtype=torch.int,
                target_device=device,
@@ -198,8 +199,12 @@ def _prepare_seq_groups(
    device: str,
    generators: Optional[Dict[str, torch.Generator]] = None,
    cache: Optional[SamplingMetadataCache] = None,
-) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType,
+) -> Tuple[
-                                                        List[int]], int, ]:
+        List[SequenceGroupToSample],
+        List[int],
+        Dict[SamplingType, List[int]],
+        int,
+]:
    """Prepare sequence groups and indices for sampling.
    Args:

--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -38,8 +38,8 @@ class NeuronPlatform(Platform):
        if parallel_config.world_size > 1:
            parallel_config.distributed_executor_backend = "uni"
-        assert (vllm_config.lora_config is
+        assert (vllm_config.lora_config
-                None), "LoRA is not supported for Neuron backend."
+                is None), "LoRA is not supported for Neuron backend."
        assert (not vllm_config.speculative_config
                ), "Speculative decoding not yet supported for Neuron backend."

--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -121,8 +121,8 @@ class ScalarType:
            min_raw = max_raw | sign_bit_double
            return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
        else:
-            assert (not self.is_signed() or
+            assert (not self.is_signed() or self.size_bits
-                    self.size_bits <= 64), "Cannot represent min as a int64_t"
+                    <= 64), "Cannot represent min as a int64_t"
            if self.is_signed():
                return -(1 << (self.size_bits - 1))

--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
            self, execute_model_req: ExecuteModelRequest) -> bool:
        # When the batch size is too large, disable speculative decoding
        # to stop trading off throughput for latency.
-        return (execute_model_req.running_queue_size >=
+        return (execute_model_req.running_queue_size
-                self.disable_by_batch_size)
+                >= self.disable_by_batch_size)
    def _maybe_disable_speculative_tokens(
            self, disable_all_speculation: bool,

--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer):
            sampler_transposed=transposed,
        )
-        proposals = SpeculativeProposals(
+        proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens,
-            proposal_token_ids=proposal_tokens,
+                                         proposal_probs=proposal_probs,
-            proposal_probs=proposal_probs,
+                                         proposal_lens=proposal_lens,
-            proposal_lens=proposal_lens,
+                                         no_proposals=maybe_sampler_output
-            no_proposals=maybe_sampler_output is None)
+                                         is None)
        return proposals
    def _split_by_proposal_len(

--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
    """
    num_steps, batch_size, vocab_size = logprob_tensor.shape
-    selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
+    selected_logprobs = logprob_tensor[
-                                       torch.arange(batch_size),
+        torch.arange(num_steps).unsqueeze(1),
-                                       sampled_token_ids, ]
+        torch.arange(batch_size),
+        sampled_token_ids,
+    ]
    expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
        -1, -1, vocab_size)
-    sampled_token_ids_ranks = (logprob_tensor >
+    sampled_token_ids_ranks = (logprob_tensor
-                               expanded_selected_logprobs).sum(-1).add_(1)
+                               > expanded_selected_logprobs).sum(-1).add_(1)
    return sampled_token_ids_ranks, selected_logprobs

--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig):
        if self.rope_scaling is None:
            return
-        if not isinstance(self.rope_scaling,
+        if not isinstance(self.rope_scaling, dict) or len(
-                          dict) or len(self.rope_scaling) != 2:
+                self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import OrderedDict, UserDict, defaultdict
 from collections.abc import Hashable, Iterable, Mapping
 from dataclasses import dataclass, field
-from functools import lru_cache, partial, wraps
+from functools import cache, lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                    Dict, Generator, Generic, Iterator, List, Literal,
                    NamedTuple, Optional, Tuple, Type, TypeVar, Union,
@@ -352,7 +352,7 @@ class PyObjectCache:
        self._index = 0
-@lru_cache(maxsize=None)
+@cache
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
    from vllm import _custom_ops as ops
@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
    return key_caches, value_caches
-@lru_cache(maxsize=None)
+@cache
 def is_pin_memory_available() -> bool:
    from vllm.platforms import current_platform
    return current_platform.is_pin_memory_available()
@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
    init_hf_modules()
-@lru_cache(maxsize=None)
+@cache
 def find_library(lib_name: str) -> str:
    """
    Find the library file in the system.
@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
    return module
-@lru_cache(maxsize=None)
+@cache
 def get_vllm_optional_dependencies():
    metadata = importlib.metadata.metadata("vllm")
    requirements = metadata.get_all("Requires-Dist", [])

--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -247,8 +247,8 @@ class Scheduler:
                token_budget -= num_new_tokens
                request.status = RequestStatus.RUNNING
                request.num_computed_tokens = num_computed_tokens
-                has_partial_request = (num_computed_tokens + num_new_tokens <
+                has_partial_request = (num_computed_tokens + num_new_tokens
-                                       request.num_tokens)
+                                       < request.num_tokens)
                # Encoder-related.
                if encoder_inputs_to_schedule: