sync v0.18.0

eefa41c1 · zhuwenwen · 82155c76 · eefa41c1 · eefa41c1 · eefa41c1
Commit eefa41c1 authored Mar 24, 2026 by zhuwenwen
13 changed files
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -344,7 +344,7 @@ class VoxtralForConditionalGeneration(
    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/voxtral_realtime.py
+++ b/vllm/model_executor/models/voxtral_realtime.py
@@ -328,7 +328,7 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
@@ -492,4 +492,4 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
            multi_modal_data={
                "audio": (tokenized.audios[0].audio_array, stt_config.sample_rate)
            },
        )
\ No newline at end of file
--- a/vllm/model_executor/models/whisper_causal.py
+++ b/vllm/model_executor/models/whisper_causal.py
@@ -115,6 +115,7 @@ def create_whisper_attention_backend_with_block_pooling(
 ) -> type[AttentionBackend]:
    prefix = "WhisperCausalAttentionWithBlockPooling_"
    underlying_builder = underlying_attn_backend.get_builder_cls()
+    underlying_impl = underlying_attn_backend.get_impl_cls()
    class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder):  # type: ignore
        def __init__(
@@ -243,6 +244,7 @@ def create_whisper_attention_backend_with_block_pooling(
        attention_backend_cls=underlying_attn_backend,
        overrides={
            "get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder,
+            "get_impl_cls": lambda: WhisperCausalAttentionWithBlockPoolingImpl,
            "get_kv_cache_shape": lambda num_blocks,
            block_size,
            num_kv_heads,

--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: Any,

--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
-from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts,
 )
@@ -171,7 +170,6 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
        # modular kernels could invoke deep_gemm_moe_fp8
        return True
-    mk: FusedMoEModularKernel = module.quant_method.fused_experts
    # Further check if the ModularKernel implementation uses the DeepGemmExperts
    return isinstance(
        module.quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts)

--- a/vllm/plugins/lora_resolvers/filesystem_resolver.py
+++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py
@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        lora_path = os.path.join(self.lora_cache_dir, lora_name)
+        maybe_lora_request = await self._get_lora_req_from_path(
+            lora_name, lora_path, base_model_name
+        )
+        return maybe_lora_request
+    async def _get_lora_req_from_path(
+        self, lora_name: str, lora_path: str, base_model_name: str
+    ) -> LoRARequest | None:
+        """Builds a LoraRequest pointing to the lora path if it's a valid
+        LoRA adapter and has a matching base_model_name.
+        """
        if os.path.exists(lora_path):
-            adapter_config_path = os.path.join(
+            adapter_config_path = os.path.join(lora_path, "adapter_config.json")
-                self.lora_cache_dir, lora_name, "adapter_config.json"
-            )
            if os.path.exists(adapter_config_path):
                with open(adapter_config_path) as file:
                    adapter_config = json.load(file)

--- a/vllm/plugins/lora_resolvers/hf_hub_resolver.py
+++ b/vllm/plugins/lora_resolvers/hf_hub_resolver.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import os
+from huggingface_hub import HfApi, snapshot_download
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.lora.resolver import LoRAResolverRegistry
+from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
+logger = init_logger(__name__)
+class HfHubResolver(FilesystemResolver):
+    """LoRA resolver that fetches adapters from an allow-list of HF Hub repos.
+    A LoRA name is matched against the configured repositories; when the
+    matching repo has an adapter config at the requested subpath, that
+    subpath is fetched with snapshot_download and the local copy is checked
+    through the inherited _get_lora_req_from_path.
+    """
+    def __init__(self, repo_list: list[str]):
+        """Store the allowed repos and warn that remote downloads are enabled.
+        Args:
+            repo_list: HF Hub repo ids (<org>/<repo>) adapters may come from.
+        """
+        # NOTE(review): the parent initializer is not invoked here, presumably
+        # because no local cache dir is needed for hub lookups - confirm.
+        logger.warning(
+            "LoRA is allowing resolution from the following repositories on"
+            " HF Hub: %s please note that allowing remote downloads"
+            " is not secure, and that this plugin is not intended for use in"
+            " production environments.",
+            repo_list,
+        )
+        self.repo_list: list[str] = repo_list
+        # Cache of repo id -> directories (relative to the repo root, "." for
+        # the root itself) that contain an adapter_config.json.
+        self.adapter_dirs: dict[str, set[str]] = {}
+    async def resolve_lora(
+        self, base_model_name: str, lora_name: str
+    ) -> LoRARequest | None:
+        """Resolves potential LoRA requests in a remote repo on HF Hub.
+        This is effectively the same behavior as the filesystem resolver, but
+        with a snapshot_download on dirs containing an adapter config prior
+        to inspecting the cached dir to build a potential LoRA
+        request.
+        Args:
+            base_model_name: Name of the base model the adapter must match.
+            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>.
+        Returns:
+            The LoRA request built from the downloaded adapter, or None if
+            the name matches no allowed repo or no adapter directory.
+        """
+        # If a LoRA name begins with the repository name, it's disambiguated
+        maybe_repo = await self._resolve_repo(lora_name)
+        if maybe_repo is None:
+            return None
+        # If we haven't inspected this repo before, save available adapter dirs
+        if maybe_repo not in self.adapter_dirs:
+            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)
+        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
+        if maybe_subpath is None:
+            return None
+        # snapshot_download blocks, so run it off the event loop; only the
+        # adapter's own directory is requested unless it sits at the repo root
+        repo_path = await asyncio.to_thread(
+            snapshot_download,
+            repo_id=maybe_repo,
+            allow_patterns=f"{maybe_subpath}/*" if maybe_subpath != "." else "*",
+        )
+        lora_path = os.path.join(repo_path, maybe_subpath)
+        return await self._get_lora_req_from_path(
+            lora_name, lora_path, base_model_name
+        )
+    async def _resolve_repo(self, lora_name: str) -> str | None:
+        """Given a fully qualified path to a LoRA with respect to its HF Hub
+        repo, match the right repo to potentially download from if one exists.
+        Args:
+            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>,
+                match on <org>/<repo> (if it contains an adapter directly) or
+                <org>/<repo>/ if it may have one in subdirs.
+        Returns:
+            The first configured repo that lora_name equals or extends past a
+            "/" boundary, or None when no configured repo matches.
+        """
+        for potential_repo in self.repo_list:
+            # Require a "/" right after the prefix so that "org/repo" does not
+            # match an unrelated name such as "org/repo-v2"
+            if lora_name.startswith(potential_repo) and (
+                len(lora_name) == len(potential_repo)
+                or lora_name[len(potential_repo)] == "/"
+            ):
+                return potential_repo
+        return None
+    async def _resolve_repo_subpath(
+        self, lora_name: str, maybe_repo: str | None
+    ) -> str | None:
+        """Given the fully qualified path of the LoRA with respect to the HF
+        Repo, get the subpath to download from assuming it's actually got an
+        adapter in it.
+        Args:
+            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
+            maybe_repo: Path to the repo to match against if one exists.
+        Returns:
+            "." for the repo root, the relative subpath otherwise, or None if
+            there is no repo or the directory holds no adapter config.
+        """
+        if maybe_repo is None:
+            return None
+        repo_len = len(maybe_repo)
+        if lora_name == maybe_repo or (
+            len(lora_name) == repo_len + 1 and lora_name[-1] == "/"
+        ):
+            # Resolves to the root of the directory
+            adapter_dir = "."
+        else:
+            # It's a subpath; removing trailing slashes if there are any
+            adapter_dir = lora_name[repo_len + 1 :].rstrip("/")
+        # Only download if the directory actually contains an adapter
+        is_adapter = adapter_dir in self.adapter_dirs[maybe_repo]
+        return adapter_dir if is_adapter else None
+    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
+        """Gets the subpaths within a HF repo that contain an adapter config.
+        Args:
+            repo_name: Name of the HF hub repo to inspect.
+        Returns:
+            Directories relative to the repo root that hold a file named
+            exactly adapter_config.json; the root itself is reported as ".".
+        """
+        repo_files = await asyncio.to_thread(HfApi().list_repo_files, repo_id=repo_name)
+        # Compare the file name exactly so that e.g. "old_adapter_config.json"
+        # is not mistaken for an adapter config, and map the repo root (whose
+        # dirname is "") to "." so the root has a single spelling in the set.
+        return {
+            os.path.dirname(name) or "."
+            for name in repo_files
+            if os.path.basename(name) == "adapter_config.json"
+        }
+def register_hf_hub_resolver() -> None:
+    """Register the Hf hub LoRA Resolver with vLLM
+    Nothing is registered unless VLLM_LORA_RESOLVER_HF_REPO_LIST is set and
+    "lora_hf_hub_resolver" is listed in VLLM_PLUGINS; the explicit opt-in is
+    required because the resolver allows remote downloads.
+    """
+    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
+    if not hf_repo_list:
+        return
+    is_enabled = (
+        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
+    )
+    if not is_enabled:
+        logger.warning(
+            "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
+            "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
+            " enable this resolver directly in VLLM_PLUGINS to use it "
+            " because it allows remote downloads."
+        )
+        return
+    # Tolerate "a, b" and trailing commas: surrounding whitespace would make
+    # a repo id unmatchable, and an empty entry would match any LoRA name
+    # that starts with "/".
+    repos = [repo.strip() for repo in hf_repo_list.split(",") if repo.strip()]
+    if not repos:
+        return
+    hf_hub_resolver = HfHubResolver(repos)
+    LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
\ No newline at end of file
--- a/vllm/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/tool_parsers/kimi_k2_tool_parser.py
@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser):
                if current_tool_call_matches:
                    tool_id, tool_args = current_tool_call_matches.groups()
                    tool_name = tool_id.split(":")[0].split(".")[-1]
-                    current_tool_call["id"] = tool_id
+                    current_tool_call["id"] = tool_id.strip()
                    current_tool_call["name"] = tool_name
                    current_tool_call["arguments"] = tool_args
                else:
@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser):
                    if current_tool_call_name_matches:
                        (tool_id_str,) = current_tool_call_name_matches.groups()
                        tool_name = tool_id_str.split(":")[0].split(".")[-1]
-                        current_tool_call["id"] = tool_id_str
+                        current_tool_call["id"] = tool_id_str.strip()
                        current_tool_call["name"] = tool_name
                        current_tool_call["arguments"] = ""
                    else:
@@ -597,4 +597,4 @@ class KimiK2ToolParser(ToolParser):
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.
\ No newline at end of file
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -377,7 +377,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
    partial_rotary_factor = getattr_iter(config, names, None, warn=True)
    ompe = getattr(config, "original_max_position_embeddings", None)
-    if Version(version("transformers")) < Version("5.0.0.dev0"):
+    if Version(version("transformers")) < Version("5.0.0"):
        # Transformers v4 installed, legacy config fields may be present
        if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
            config.rope_parameters = rope_scaling
@@ -1209,4 +1209,4 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
            exc_info=e,
        )
    return max_position_embeddings
\ No newline at end of file
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -441,7 +441,8 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
    "qwen3_5_mtp": Qwen3_5MTPModelArchConfigConvertor,
    "mimo_mtp": MimoMTPModelArchConfigConvertor,
    "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor,
+    "glm_ocr_mtp": GLM4MoeMTPModelArchConfigConvertor,
    "ernie_mtp": ErnieMTPModelArchConfigConvertor,
    "pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor,
    "longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor,
 }
\ No newline at end of file
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -74,9 +74,6 @@ class StructuredOutputManager:
            self.tokenizer = cached_tokenizer_from_config(
                model_config=self.vllm_config.model_config
            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
            reasoning_parser_plugin = (
                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
            )
@@ -341,4 +338,4 @@ class StructuredOutputManager:
    def clear_backend(self) -> None:
        if self.backend is not None:
            self.backend.destroy()
\ No newline at end of file
--- a/vllm/v1/worker/gpu/mm/encoder_runner.py
+++ b/vllm/v1/worker/gpu/mm/encoder_runner.py
@@ -132,7 +132,7 @@ class EncoderRunner:
                mm_embeds.append(mm_embeds_item)
        # Copy the is_mm_embed tensor to the GPU.
-        is_mm_embed = self.tmp_is_mm_embed.copy_to_gpu(is_mm_embed)
+        is_mm_embed = is_mm_embed.to(device=self.device, non_blocking=True)
        return mm_embeds, is_mm_embed
    @torch.inference_mode()

--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -672,7 +672,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32)
            cu_num_logits_np[0] = 0
            np.cumsum(num_logits, out=cu_num_logits_np[1:])
-            cu_num_logits = self.tmp_cu_num_logits.copy_to_gpu(cu_num_logits_np)
+            cu_num_logits = async_copy_to_gpu(cu_num_logits_np, device=self.device)
            max_expand_len = self.num_speculative_steps + 1
            expanded_idx_mapping, expanded_local_pos = expand_idx_mapping(
@@ -1225,4 +1225,4 @@ class ExecuteModelState(NamedTuple):
    hidden_states: torch.Tensor | IntermediateTensors
    aux_hidden_states: list[torch.Tensor] | None
    kv_connector_output: KVConnectorOutput | None
    num_tokens_across_dp: torch.Tensor | None
\ No newline at end of file