Commit eefa41c1 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.18.0

parent 82155c76
...@@ -344,7 +344,7 @@ class VoxtralForConditionalGeneration( ...@@ -344,7 +344,7 @@ class VoxtralForConditionalGeneration(
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor | None,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
......
...@@ -328,7 +328,7 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim ...@@ -328,7 +328,7 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor | None,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -492,4 +492,4 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim ...@@ -492,4 +492,4 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
multi_modal_data={ multi_modal_data={
"audio": (tokenized.audios[0].audio_array, stt_config.sample_rate) "audio": (tokenized.audios[0].audio_array, stt_config.sample_rate)
}, },
) )
\ No newline at end of file
...@@ -115,6 +115,7 @@ def create_whisper_attention_backend_with_block_pooling( ...@@ -115,6 +115,7 @@ def create_whisper_attention_backend_with_block_pooling(
) -> type[AttentionBackend]: ) -> type[AttentionBackend]:
prefix = "WhisperCausalAttentionWithBlockPooling_" prefix = "WhisperCausalAttentionWithBlockPooling_"
underlying_builder = underlying_attn_backend.get_builder_cls() underlying_builder = underlying_attn_backend.get_builder_cls()
underlying_impl = underlying_attn_backend.get_impl_cls()
class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder): # type: ignore class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder): # type: ignore
def __init__( def __init__(
...@@ -243,6 +244,7 @@ def create_whisper_attention_backend_with_block_pooling( ...@@ -243,6 +244,7 @@ def create_whisper_attention_backend_with_block_pooling(
attention_backend_cls=underlying_attn_backend, attention_backend_cls=underlying_attn_backend,
overrides={ overrides={
"get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder, "get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder,
"get_impl_cls": lambda: WhisperCausalAttentionWithBlockPoolingImpl,
"get_kv_cache_shape": lambda num_blocks, "get_kv_cache_shape": lambda num_blocks,
block_size, block_size,
num_kv_heads, num_kv_heads,
......
...@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module): ...@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor | None,
positions: torch.Tensor, positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors: ) -> torch.Tensor | IntermediateTensors:
...@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC ...@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor | None,
positions: torch.Tensor, positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
**kwargs: Any, **kwargs: Any,
......
...@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank ...@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts, TritonOrDeepGemmExperts,
) )
...@@ -171,7 +170,6 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: ...@@ -171,7 +170,6 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
# modular kernels could invoke deep_gemm_moe_fp8 # modular kernels could invoke deep_gemm_moe_fp8
return True return True
mk: FusedMoEModularKernel = module.quant_method.fused_experts
# Further check if the ModularKernel implementation uses the DeepGemmExperts # Further check if the ModularKernel implementation uses the DeepGemmExperts
return isinstance( return isinstance(
module.quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts) module.quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts)
......
...@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver): ...@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
self, base_model_name: str, lora_name: str self, base_model_name: str, lora_name: str
) -> LoRARequest | None: ) -> LoRARequest | None:
lora_path = os.path.join(self.lora_cache_dir, lora_name) lora_path = os.path.join(self.lora_cache_dir, lora_name)
maybe_lora_request = await self._get_lora_req_from_path(
lora_name, lora_path, base_model_name
)
return maybe_lora_request
async def _get_lora_req_from_path(
self, lora_name: str, lora_path: str, base_model_name: str
) -> LoRARequest | None:
"""Builds a LoraRequest pointing to the lora path if it's a valid
LoRA adapter and has a matching base_model_name.
"""
if os.path.exists(lora_path): if os.path.exists(lora_path):
adapter_config_path = os.path.join( adapter_config_path = os.path.join(lora_path, "adapter_config.json")
self.lora_cache_dir, lora_name, "adapter_config.json"
)
if os.path.exists(adapter_config_path): if os.path.exists(adapter_config_path):
with open(adapter_config_path) as file: with open(adapter_config_path) as file:
adapter_config = json.load(file) adapter_config = json.load(file)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
from huggingface_hub import HfApi, snapshot_download
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolverRegistry
from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
logger = init_logger(__name__)
class HfHubResolver(FilesystemResolver):
    """LoRA resolver that fetches adapters from an allow-list of HF Hub repos.

    A LoRA name is expected to look like ``<repo>`` or ``<repo>/<subpath>``,
    where ``<repo>`` is one of the entries in ``repo_list``. The matching
    directory is downloaded with ``snapshot_download`` and then handed to the
    inherited ``_get_lora_req_from_path`` to build the request.
    """

    def __init__(self, repo_list: list[str]):
        """Store the allowed repositories and start with an empty dir cache.

        Args:
            repo_list: HF Hub repo ids that LoRA names may resolve against.
        """
        # NOTE(review): the parent initializer is not invoked here (same as
        # the original); this presumes `_get_lora_req_from_path` does not rely
        # on state set up by it - confirm against FilesystemResolver.
        logger.warning(
            "LoRA is allowing resolution from the following repositories on"
            " HF Hub: %s please note that allowing remote downloads"
            " is not secure, and that this plugin is not intended for use in"
            " production environments.",
            repo_list,
        )
        self.repo_list: list[str] = repo_list
        # Maps repo id -> set of directories (relative to the repo root, "."
        # for the root itself) that contain an adapter_config.json.
        self.adapter_dirs: dict[str, set[str]] = {}

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolve a LoRA name against the allowed HF Hub repositories.

        The adapter directories of a repo are listed once and cached. Only a
        directory known to contain an adapter config is downloaded, after
        which the local snapshot path is inspected to build the request.

        Args:
            base_model_name: Name of the base model, forwarded to
                ``_get_lora_req_from_path``.
            lora_name: Path to the LoRA on HF Hub, e.g.
                ``<org>/<repo>/<subpath>``.

        Returns:
            Whatever ``_get_lora_req_from_path`` returns for the downloaded
            directory, or ``None`` when the name matches no allowed repo or no
            known adapter directory.
        """
        # A LoRA name that begins with an allowed repo id is disambiguated.
        maybe_repo = await self._resolve_repo(lora_name)
        if maybe_repo is None:
            return None

        # First time we see this repo: remember which dirs hold adapters.
        if maybe_repo not in self.adapter_dirs:
            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)

        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
        if maybe_subpath is None:
            return None

        # snapshot_download is blocking, so run it off the event loop and
        # restrict the download to the adapter directory.
        repo_path = await asyncio.to_thread(
            snapshot_download,
            repo_id=maybe_repo,
            allow_patterns=f"{maybe_subpath}/*" if maybe_subpath != "." else "*",
        )
        lora_path = os.path.join(repo_path, maybe_subpath)
        return await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )

    async def _resolve_repo(self, lora_name: str) -> str | None:
        """Find the allowed repo that a fully qualified LoRA name belongs to.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>.
                It matches a repo when it equals the repo id or continues
                with a "/" right after it.

        Returns:
            The first matching entry of ``repo_list``, or ``None``.
        """
        for potential_repo in self.repo_list:
            # An empty entry can never name a repo; without this guard it
            # would match any lora_name that starts with "/".
            if not potential_repo:
                continue
            if not lora_name.startswith(potential_repo):
                continue
            # Require a path boundary so "org/repo" does not match
            # "org/repository".
            remainder = lora_name[len(potential_repo) :]
            if not remainder or remainder[0] == "/":
                return potential_repo
        return None

    async def _resolve_repo_subpath(
        self, lora_name: str, maybe_repo: str | None
    ) -> str | None:
        """Get the adapter directory inside the repo that the name points to.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
            maybe_repo: Repo id that prefixes ``lora_name``, if one matched.

        Returns:
            The directory relative to the repo root ("." for the root), or
            ``None`` if there is no repo or the directory is not known to
            contain an adapter.
        """
        if maybe_repo is None:
            return None

        # Drop "<repo>/" and any trailing slashes; nothing left means the
        # name refers to the repo root.
        adapter_dir = lora_name[len(maybe_repo) + 1 :].rstrip("/") or "."

        # Only download if the directory actually contains an adapter. A repo
        # that has not been inspected yet has no known adapter directories.
        known_dirs = self.adapter_dirs.get(maybe_repo, set())
        return adapter_dir if adapter_dir in known_dirs else None

    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
        """Gets the subpaths within a HF repo that contain an adapter config.

        Args:
            repo_name: Name of the HF hub repo to inspect.

        Returns:
            Directories, relative to the repo root, that hold a file named
            exactly ``adapter_config.json``; the root is reported as ".".
        """
        # The listing is a blocking call, so keep it off the event loop.
        repo_files = await asyncio.to_thread(HfApi().list_repo_files, repo_id=repo_name)
        return {
            # dirname is "" for a file at the repo root; normalise it to ".".
            os.path.dirname(name) or "."
            for name in repo_files
            # Compare the file name itself so that e.g.
            # "x/my_adapter_config.json" is not mistaken for an adapter config.
            if os.path.basename(name) == "adapter_config.json"
        }
def register_hf_hub_resolver():
    """Register the Hf hub LoRA Resolver with vLLM.

    Nothing is registered unless VLLM_LORA_RESOLVER_HF_REPO_LIST is set and
    "lora_hf_hub_resolver" is listed in VLLM_PLUGINS. The repo list is a
    comma-separated string; whitespace around entries and empty entries are
    discarded.
    """
    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
    if not hf_repo_list:
        return

    is_enabled = (
        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
    )
    if not is_enabled:
        logger.warning(
            "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
            "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
            " enable this resolver directly in VLLM_PLUGINS to use it "
            " because it allows remote downloads."
        )
        return

    # "org/a, org/b," must yield ["org/a", "org/b"]: an entry with stray
    # whitespace could never prefix-match a LoRA name, and an empty entry is
    # not a repo at all.
    stripped_entries = (entry.strip() for entry in hf_repo_list.split(","))
    repos = [entry for entry in stripped_entries if entry]
    if not repos:
        return

    hf_hub_resolver = HfHubResolver(repos)
    LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
\ No newline at end of file
...@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser): ...@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser):
if current_tool_call_matches: if current_tool_call_matches:
tool_id, tool_args = current_tool_call_matches.groups() tool_id, tool_args = current_tool_call_matches.groups()
tool_name = tool_id.split(":")[0].split(".")[-1] tool_name = tool_id.split(":")[0].split(".")[-1]
current_tool_call["id"] = tool_id current_tool_call["id"] = tool_id.strip()
current_tool_call["name"] = tool_name current_tool_call["name"] = tool_name
current_tool_call["arguments"] = tool_args current_tool_call["arguments"] = tool_args
else: else:
...@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser): ...@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser):
if current_tool_call_name_matches: if current_tool_call_name_matches:
(tool_id_str,) = current_tool_call_name_matches.groups() (tool_id_str,) = current_tool_call_name_matches.groups()
tool_name = tool_id_str.split(":")[0].split(".")[-1] tool_name = tool_id_str.split(":")[0].split(".")[-1]
current_tool_call["id"] = tool_id_str current_tool_call["id"] = tool_id_str.strip()
current_tool_call["name"] = tool_name current_tool_call["name"] = tool_name
current_tool_call["arguments"] = "" current_tool_call["arguments"] = ""
else: else:
...@@ -597,4 +597,4 @@ class KimiK2ToolParser(ToolParser): ...@@ -597,4 +597,4 @@ class KimiK2ToolParser(ToolParser):
except Exception: except Exception:
logger.exception("Error trying to handle streaming tool call.") logger.exception("Error trying to handle streaming tool call.")
return None # do not stream a delta. skip this token ID. return None # do not stream a delta. skip this token ID.
\ No newline at end of file
...@@ -377,7 +377,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: ...@@ -377,7 +377,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
partial_rotary_factor = getattr_iter(config, names, None, warn=True) partial_rotary_factor = getattr_iter(config, names, None, warn=True)
ompe = getattr(config, "original_max_position_embeddings", None) ompe = getattr(config, "original_max_position_embeddings", None)
if Version(version("transformers")) < Version("5.0.0.dev0"): if Version(version("transformers")) < Version("5.0.0"):
# Transformers v4 installed, legacy config fields may be present # Transformers v4 installed, legacy config fields may be present
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None: if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
config.rope_parameters = rope_scaling config.rope_parameters = rope_scaling
...@@ -1209,4 +1209,4 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int: ...@@ -1209,4 +1209,4 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
exc_info=e, exc_info=e,
) )
return max_position_embeddings return max_position_embeddings
\ No newline at end of file
...@@ -441,7 +441,8 @@ MODEL_ARCH_CONFIG_CONVERTORS = { ...@@ -441,7 +441,8 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
"qwen3_5_mtp": Qwen3_5MTPModelArchConfigConvertor, "qwen3_5_mtp": Qwen3_5MTPModelArchConfigConvertor,
"mimo_mtp": MimoMTPModelArchConfigConvertor, "mimo_mtp": MimoMTPModelArchConfigConvertor,
"glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor, "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor,
"glm_ocr_mtp": GLM4MoeMTPModelArchConfigConvertor,
"ernie_mtp": ErnieMTPModelArchConfigConvertor, "ernie_mtp": ErnieMTPModelArchConfigConvertor,
"pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor, "pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor,
"longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor, "longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor,
} }
\ No newline at end of file
...@@ -74,9 +74,6 @@ class StructuredOutputManager: ...@@ -74,9 +74,6 @@ class StructuredOutputManager:
self.tokenizer = cached_tokenizer_from_config( self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config model_config=self.vllm_config.model_config
) )
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
reasoning_parser_plugin = ( reasoning_parser_plugin = (
self.vllm_config.structured_outputs_config.reasoning_parser_plugin self.vllm_config.structured_outputs_config.reasoning_parser_plugin
) )
...@@ -341,4 +338,4 @@ class StructuredOutputManager: ...@@ -341,4 +338,4 @@ class StructuredOutputManager:
def clear_backend(self) -> None: def clear_backend(self) -> None:
if self.backend is not None: if self.backend is not None:
self.backend.destroy() self.backend.destroy()
\ No newline at end of file
...@@ -132,7 +132,7 @@ class EncoderRunner: ...@@ -132,7 +132,7 @@ class EncoderRunner:
mm_embeds.append(mm_embeds_item) mm_embeds.append(mm_embeds_item)
# Copy the is_mm_embed tensor to the GPU. # Copy the is_mm_embed tensor to the GPU.
is_mm_embed = self.tmp_is_mm_embed.copy_to_gpu(is_mm_embed) is_mm_embed = is_mm_embed.to(device=self.device, non_blocking=True)
return mm_embeds, is_mm_embed return mm_embeds, is_mm_embed
@torch.inference_mode() @torch.inference_mode()
......
...@@ -672,7 +672,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -672,7 +672,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32) cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32)
cu_num_logits_np[0] = 0 cu_num_logits_np[0] = 0
np.cumsum(num_logits, out=cu_num_logits_np[1:]) np.cumsum(num_logits, out=cu_num_logits_np[1:])
cu_num_logits = self.tmp_cu_num_logits.copy_to_gpu(cu_num_logits_np) cu_num_logits = async_copy_to_gpu(cu_num_logits_np, device=self.device)
max_expand_len = self.num_speculative_steps + 1 max_expand_len = self.num_speculative_steps + 1
expanded_idx_mapping, expanded_local_pos = expand_idx_mapping( expanded_idx_mapping, expanded_local_pos = expand_idx_mapping(
...@@ -1225,4 +1225,4 @@ class ExecuteModelState(NamedTuple): ...@@ -1225,4 +1225,4 @@ class ExecuteModelState(NamedTuple):
hidden_states: torch.Tensor | IntermediateTensors hidden_states: torch.Tensor | IntermediateTensors
aux_hidden_states: list[torch.Tensor] | None aux_hidden_states: list[torch.Tensor] | None
kv_connector_output: KVConnectorOutput | None kv_connector_output: KVConnectorOutput | None
num_tokens_across_dp: torch.Tensor | None num_tokens_across_dp: torch.Tensor | None
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment