[Minor] Various small code cleanups/simplifications (#31508)

Signed-off-by: njhill <nickhill123@gmail.com>

[Minor] Various small code cleanups/simplifications (#31508)
Signed-off-by: njhill <nickhill123@gmail.com>
3b312fb7 · Nick Hill · GitHub · f84bf7d7 · 3b312fb7 · 3b312fb7
Unverified Commit 3b312fb7 authored Dec 29, 2025 by Nick Hill Committed by GitHub Dec 29, 2025
14 changed files
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1579,14 +1579,14 @@ class ModelConfig:

    @property
    def is_hybrid(self) -> bool:
+        if not self._model_info.is_hybrid:
+            return False
        # Handle granite-4.0-micro case which uses hybrid config but does not
        # actually contain any non-attention layers.
        layer_types = getattr(self.hf_config, "layer_types", None)
-        if layer_types is not None and all(
+        return layer_types is None or not all(
            layer == "attention" for layer in layer_types
-        ):
-            return False
-        return self._model_info.is_hybrid
+        )

    @property
    def has_noops(self) -> bool:

--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -2005,7 +2005,6 @@ class OpenAIServingResponses(OpenAIServing):
            return event

        async with AsyncExitStack() as exit_stack:
-            processer = None
            if self.use_harmony:
                # TODO: in streaming, we noticed this bug:
                # https://github.com/vllm-project/vllm/issues/25697

--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -44,11 +44,8 @@ class RenderConfig:
    def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> int | None:
        """Validate and normalize `truncate_prompt_tokens` parameter."""
        truncate_prompt_tokens = self.truncate_prompt_tokens
-        if truncate_prompt_tokens is None:
-            return None
-
-        if truncate_prompt_tokens == 0:
-            return 0
+        if truncate_prompt_tokens is None or truncate_prompt_tokens == 0:
+            return truncate_prompt_tokens

        if truncate_prompt_tokens < 0:
            truncate_prompt_tokens = model_config.max_model_len

--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -686,11 +686,7 @@ class InputPreprocessor:
        mm_uuids: MultiModalUUIDDict | None = None,
    ) -> ProcessorInputs:
        """Preprocess the input prompt."""
-        res = self._preprocess(
-            prompt,
-            tokenization_kwargs,
-            mm_uuids=mm_uuids,
-        )
+        res = self._preprocess(prompt, tokenization_kwargs, mm_uuids=mm_uuids)

        if self.mm_processor_cache and self.mm_cache_stats is not None:
            delta = self.mm_processor_cache.make_stats(delta=True)

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -171,10 +171,7 @@ class PlaceholderRange:

    @cached_property
    def embeds_cumsum(self) -> torch.Tensor | None:
-        if self.is_embed is None:
-            return None
-
-        return self.is_embed.cumsum(dim=0)
+        return None if self.is_embed is None else self.is_embed.cumsum(dim=0)

    @cached_property
    def get_num_embeds(self) -> int:
@@ -308,13 +305,7 @@ def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> boo
    Equality check between
    [`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
    """
-    for k in a:
-        if k not in b:
-            return False
-        if not nested_tensors_equal(a[k], b[k]):
-            return False
-
-    return True
+    return all(k in b and nested_tensors_equal(a[k], b[k]) for k in a)


 @dataclass

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -487,10 +487,8 @@ class EngineCore:
            request_ids = []
            while not self.aborts_queue.empty():
                ids = self.aborts_queue.get_nowait()
-                if isinstance(ids, str):
                # Should be a list here, but also handle string just in case.
-                    ids = (ids,)
-                request_ids.extend(ids)
+                request_ids.extend((ids,) if isinstance(ids, str) else ids)
            # More efficient to abort all as a single batch.
            self.abort_requests(request_ids)


--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -618,7 +618,7 @@ class InputProcessor:

        tokenizer = self.tokenizer
        if tokenizer is not None:
-            max_input_id = max(prompt_ids or [], default=0)
+            max_input_id = max(prompt_ids or (), default=0)

            # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
            # self.model_config.get_vocab_size() is the model’s vocab size.

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -339,10 +339,7 @@ class RequestState:
            stop_reason=stop_reason if finished else None,
        )

-    def _new_pooling_output(
-        self,
-        pooling_output: torch.Tensor,
-    ) -> PoolingOutput:
+    def _new_pooling_output(self, pooling_output: torch.Tensor) -> PoolingOutput:
        return PoolingOutput(data=pooling_output)


@@ -695,9 +692,7 @@ class OutputProcessor:
        assert req_state.stats is not None
        iteration_stats.update_from_finished_request(
            finish_reason=finish_reason,
-            num_prompt_tokens=length_from_prompt_token_ids_or_embeds(
-                req_state.prompt_token_ids, req_state.prompt_embeds
-            ),
+            num_prompt_tokens=req_state.prompt_len,
            max_tokens_param=req_state.max_tokens_param,
            req_stats=req_state.stats,
            num_cached_tokens=req_state.num_cached_tokens,

--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -695,7 +695,7 @@ class WorkerProc:
        worker = None
        # tuple[Connection, Connection]
        reader, ready_writer = kwargs.pop("ready_pipe")
-        death_pipe = kwargs.pop("death_pipe", None)
+        death_pipe: Connection | None = kwargs.pop("death_pipe", None)
        shutdown_event = threading.Event()
        # Start death monitoring thread if death_pipe is provided
        if death_pipe is not None:

--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -211,8 +211,7 @@ class Request:

    def get_num_encoder_embeds(self, input_id: int) -> int:
        assert input_id < len(self.mm_features)
-        num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
-        return num_embeds
+        return self.mm_features[input_id].mm_position.get_num_embeds

    def record_event(
        self,

--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
 import multiprocessing
+from collections.abc import Iterable
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING

@@ -172,7 +174,7 @@ class StructuredOutputManager:

    def _fill_bitmasks(
        self,
-        batch: list[tuple[StructuredOutputGrammar, int, bool]],
+        batch: Iterable[tuple[StructuredOutputGrammar, int, bool]],
    ) -> None:
        assert self._grammar_bitmask is not None
        for grammar, index, apply_bitmask in batch:
@@ -265,16 +267,16 @@ class StructuredOutputManager:
                apply_bitmask = self.should_fill_bitmask(request)

                state_advancements = 0
-                req_tokens = scheduled_spec_decode_tokens.get(req_id, [])
-                for i, token in enumerate(req_tokens + [None]):
+                req_tokens = scheduled_spec_decode_tokens.get(req_id, ())
+                for token in itertools.chain(req_tokens, (None,)):
                    self._fill_bitmasks(
-                        [
+                        (
                            (
                                structured_output_request.grammar,
                                cumulative_index,
                                apply_bitmask,
+                            ),
                        )
-                        ]
                    )

                    if (

--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -28,12 +28,9 @@ class StructuredOutputRequest:
        if sampling_params is None:
            return None
        params = sampling_params.structured_outputs
-        if params:
-            if params.all_constraints_none():
+        if not params or params.all_constraints_none():
            return None
-            else:
        return StructuredOutputRequest(params=params)
-        return None

    def _check_grammar_completion(self) -> bool:
        # NOTE: We have to lazy import to gate circular imports

--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -829,7 +829,7 @@ class InputBatch:
            presence_penalties=self.presence_penalties[:num_reqs],
            repetition_penalties=self.repetition_penalties[:num_reqs],
            output_token_ids=output_token_ids,
-            spec_token_ids=cast(list[list[int]], self.spec_token_ids),
+            spec_token_ids=self.spec_token_ids,
            no_penalties=self.no_penalties,
            allowed_token_ids_mask=allowed_token_ids_mask,
            bad_words_token_ids=self.bad_words_token_ids,

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1026,7 +1026,7 @@ class GPUModelRunner(
        each sequence, and a shifting is done during the next iteration
        based on the number of accepted tokens.
        """
-        if not self.model_config.is_hybrid or not self.speculative_config:
+        if not self.speculative_config or not self.model_config.is_hybrid:
            return

        # Find the number of accepted tokens for each sequence.