Commit c721b814 authored by zhuwenwen
Browse files

sync v0.15.1

parent d53fe7e5
...@@ -1101,7 +1101,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) ...@@ -1101,7 +1101,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -1124,4 +1124,4 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) ...@@ -1124,4 +1124,4 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
loader = AutoWeightsLoader(self) loader = AutoWeightsLoader(self)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
\ No newline at end of file
...@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) ...@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -610,4 +610,4 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) ...@@ -610,4 +610,4 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self) loader = AutoWeightsLoader(self)
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -350,7 +350,7 @@ class Base( ...@@ -350,7 +350,7 @@ class Base(
# vLLM does not support encoder-decoder models, so if any encoder layer is # vLLM does not support encoder-decoder models, so if any encoder layer is
# found in a text only model, we assume the whole model is an encoder model # found in a text only model, we assume the whole model is an encoder model
if has_encoder(self.model) and not is_multimodal(self.config): if has_encoder(self.model) and not is_multimodal(self.config):
self.check_version("5.0.0", "encoder models support") self.check_version("5.0.0.dev0", "encoder models support")
attn_type = AttentionType.ENCODER_ONLY attn_type = AttentionType.ENCODER_ONLY
else: else:
attn_type = AttentionType.DECODER attn_type = AttentionType.DECODER
...@@ -502,7 +502,7 @@ class Base( ...@@ -502,7 +502,7 @@ class Base(
) )
def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
self.check_version("5.0.0", "Eagle3 support") self.check_version("5.0.0.dev0", "Eagle3 support")
from transformers.utils.generic import OutputRecorder from transformers.utils.generic import OutputRecorder
# The default value in PreTrainedModel is None # The default value in PreTrainedModel is None
...@@ -520,4 +520,4 @@ class Base( ...@@ -520,4 +520,4 @@ class Base(
def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
num_layers = self.text_config.num_hidden_layers num_layers = self.text_config.num_hidden_layers
return (2, num_layers // 2, num_layers - 3) return (2, num_layers // 2, num_layers - 3)
\ No newline at end of file
...@@ -118,7 +118,7 @@ direct_register_custom_op( ...@@ -118,7 +118,7 @@ direct_register_custom_op(
class MoEMixin(MixtureOfExperts): class MoEMixin(MixtureOfExperts):
def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
self.check_version("5.0.0", "MoE models support") self.check_version("5.0.0.dev0", "MoE models support")
# Skip MixtureOfExperts.__init__ and call the next class in MRO # Skip MixtureOfExperts.__init__ and call the next class in MRO
super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix) super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
......
...@@ -714,7 +714,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): ...@@ -714,7 +714,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: torch.Tensor | None = None, intermediate_tensors: torch.Tensor | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -784,4 +784,4 @@ def pad_and_concat_to_dim3( ...@@ -784,4 +784,4 @@ def pad_and_concat_to_dim3(
# Pad and concatenate: # Pad and concatenate:
# [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)] # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features] features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features]
return torch.cat(features) return torch.cat(features)
\ No newline at end of file
...@@ -867,6 +867,7 @@ def fast_topk( ...@@ -867,6 +867,7 @@ def fast_topk(
# Use topk for efficiency with larger k values # Use topk for efficiency with larger k values
return torch.topk(values, topk, dim=dim) return torch.topk(values, topk, dim=dim)
# Chunk x along the num_tokens axis for sequence parallelism # Chunk x along the num_tokens axis for sequence parallelism
# NOTE: This is wrapped in a torch custom op to work around the following issue: # NOTE: This is wrapped in a torch custom op to work around the following issue:
# The output tensor can have a sequence length 0 at small input sequence lengths # The output tensor can have a sequence length 0 at small input sequence lengths
...@@ -942,4 +943,4 @@ def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int: ...@@ -942,4 +943,4 @@ def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
""" """
if feature_layer_index < 0: if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1 return num_hidden_layers + feature_layer_index + 1
return feature_layer_index return feature_layer_index
\ No newline at end of file
...@@ -397,7 +397,7 @@ class VoxtralForConditionalGeneration( ...@@ -397,7 +397,7 @@ class VoxtralForConditionalGeneration(
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -899,4 +899,4 @@ class VoxtralEncoderModel(nn.Module): ...@@ -899,4 +899,4 @@ class VoxtralEncoderModel(nn.Module):
weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight) weight_loader(param, loaded_weight)
return name return name
\ No newline at end of file
...@@ -173,7 +173,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration): ...@@ -173,7 +173,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -318,4 +318,4 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration): ...@@ -318,4 +318,4 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
audio = (tokenized.audios[0].audio_array, stt_config.sample_rate) audio = (tokenized.audios[0].audio_array, stt_config.sample_rate)
prompts_dict = {"multi_modal_data": {"audio": audio}} prompts_dict = {"multi_modal_data": {"audio": audio}}
prompts_dict["prompt_token_ids"] = tokenized.tokens prompts_dict["prompt_token_ids"] = tokenized.tokens
return cast(PromptType, prompts_dict) return cast(PromptType, prompts_dict)
\ No newline at end of file
...@@ -105,7 +105,6 @@ def create_whisper_attention_backend_with_block_pooling( ...@@ -105,7 +105,6 @@ def create_whisper_attention_backend_with_block_pooling(
) -> type[AttentionBackend]: ) -> type[AttentionBackend]:
prefix = "WhisperCausalAttentionWithBlockPooling_" prefix = "WhisperCausalAttentionWithBlockPooling_"
underlying_builder = underlying_attn_backend.get_builder_cls() underlying_builder = underlying_attn_backend.get_builder_cls()
underlying_impl = underlying_attn_backend.get_impl_cls()
class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder): # type: ignore class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder): # type: ignore
def __init__( def __init__(
...@@ -152,43 +151,6 @@ def create_whisper_attention_backend_with_block_pooling( ...@@ -152,43 +151,6 @@ def create_whisper_attention_backend_with_block_pooling(
common_prefix_len, new_common_attn_metadata, fast_build common_prefix_len, new_common_attn_metadata, fast_build
) )
# NOTE: We need a custom impl so we can use the transformed slot_mapping
# computed by `WhisperCausalAttentionWithBlockPoolingBuilder` instead of
# the one from `forward_context.slot_mapping` (gpu_model_runner).
# This follows the same pattern as CrossAttentionImpl.
class WhisperCausalAttentionWithBlockPoolingImpl(underlying_impl):  # type: ignore[valid-type,misc]
    """Attention impl that writes the KV cache using the metadata's slot mapping.

    The parent implementation is whatever `underlying_attn_backend` returns
    from `get_impl_cls()`. This subclass only adds an explicit KV-cache update
    step before delegating to the parent's `forward`.
    """

    def forward(
        self,
        layer: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
        output: torch.Tensor | None = None,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Update the KV cache if the backend does not, then run attention.

        The cache update is performed here only when the underlying backend's
        `forward` does not already include it and metadata is available; the
        slot mapping is taken from `attn_metadata.slot_mapping`.
        """
        backend_updates_cache = (
            underlying_attn_backend.forward_includes_kv_cache_update
        )
        if attn_metadata is not None and not backend_updates_cache:
            slots = attn_metadata.slot_mapping
            self.do_kv_cache_update(layer, key, value, kv_cache, slots)

        # Arguments are forwarded positionally, in the parent's order.
        forward_args = (
            layer,
            query,
            key,
            value,
            kv_cache,
            attn_metadata,
            output,
            output_scale,
            output_block_scale,
        )
        return super().forward(*forward_args)
if not issubclass(underlying_attn_backend, FlashAttentionBackend): if not issubclass(underlying_attn_backend, FlashAttentionBackend):
raise NotImplementedError( raise NotImplementedError(
f"{underlying_attn_backend} is not yet supported." f"{underlying_attn_backend} is not yet supported."
...@@ -201,7 +163,6 @@ def create_whisper_attention_backend_with_block_pooling( ...@@ -201,7 +163,6 @@ def create_whisper_attention_backend_with_block_pooling(
attention_backend_cls=underlying_attn_backend, attention_backend_cls=underlying_attn_backend,
overrides={ overrides={
"get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder, "get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder,
"get_impl_cls": lambda: WhisperCausalAttentionWithBlockPoolingImpl,
"get_kv_cache_shape": lambda num_blocks, "get_kv_cache_shape": lambda num_blocks,
block_size, block_size,
num_kv_heads, num_kv_heads,
...@@ -214,7 +175,6 @@ def create_whisper_attention_backend_with_block_pooling( ...@@ -214,7 +175,6 @@ def create_whisper_attention_backend_with_block_pooling(
num_kv_heads // block_pool_size, num_kv_heads // block_pool_size,
head_size, head_size,
), # TODO: generalize to other backends ), # TODO: generalize to other backends
"forward_includes_kv_cache_update": True,
}, },
) )
...@@ -502,4 +462,4 @@ class WhisperCausalEncoder(nn.Module): ...@@ -502,4 +462,4 @@ class WhisperCausalEncoder(nn.Module):
hidden_states = encoder_layer(hidden_states, positions) hidden_states = encoder_layer(hidden_states, positions)
hidden_states = self.layer_norm(hidden_states) hidden_states = self.layer_norm(hidden_states)
return hidden_states return hidden_states
\ No newline at end of file
...@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module): ...@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors: ) -> torch.Tensor | IntermediateTensors:
...@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC ...@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
**kwargs: Any, **kwargs: Any,
...@@ -989,4 +989,4 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC ...@@ -989,4 +989,4 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self) loader = AutoWeightsLoader(self)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
\ No newline at end of file
...@@ -14,6 +14,7 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank ...@@ -14,6 +14,7 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts, TritonOrDeepGemmExperts,
) )
...@@ -168,10 +169,9 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: ...@@ -168,10 +169,9 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
# modular kernels could invoke deep_gemm_moe_fp8 # modular kernels could invoke deep_gemm_moe_fp8
return True return True
mk: FusedMoEModularKernel = module.quant_method.fused_experts
# Further check if the ModularKernel implementation uses the DeepGemmExperts # Further check if the ModularKernel implementation uses the DeepGemmExperts
return isinstance( return isinstance(mk.fused_experts, (DeepGemmExperts, TritonOrDeepGemmExperts))
module.quant_method.moe_mk, (DeepGemmExperts, TritonOrDeepGemmExperts)
)
FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set() FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set()
...@@ -370,4 +370,4 @@ def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int): ...@@ -370,4 +370,4 @@ def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int):
deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, pbar) deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, pbar)
else: else:
deepgemm_fp8_gemm_nt_warmup(model, max_tokens, None) deepgemm_fp8_gemm_nt_warmup(model, max_tokens, None)
deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, None) deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, None)
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logger import init_logger from vllm.logger import init_logger
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -147,10 +147,7 @@ class XPUPlatform(Platform): ...@@ -147,10 +147,7 @@ class XPUPlatform(Platform):
model_config = vllm_config.model_config model_config = vllm_config.model_config
# in V1(or with ipex chunked prefill) block_size is 64 # in V1(or with ipex chunked prefill) block_size is 64
if cache_config and cache_config.block_size is None: if cache_config and cache_config.block_size is None:
if envs.VLLM_USE_V1: cache_config.block_size = 64
cache_config.block_size = 64
else:
cache_config.block_size = 16
# lazy import to avoid circular import # lazy import to avoid circular import
from vllm.config import CompilationMode, CUDAGraphMode from vllm.config import CompilationMode, CUDAGraphMode
...@@ -262,4 +259,4 @@ class XPUPlatform(Platform): ...@@ -262,4 +259,4 @@ class XPUPlatform(Platform):
) -> None: ) -> None:
"""Copy blocks from XPU to host (CPU).""" """Copy blocks from XPU to host (CPU)."""
_src_cache = src_cache[:, src_block_indices] _src_cache = src_cache[:, src_block_indices]
dst_cache[:, dst_block_indices] = _src_cache.cpu() dst_cache[:, dst_block_indices] = _src_cache.cpu()
\ No newline at end of file
...@@ -16,20 +16,10 @@ class FilesystemResolver(LoRAResolver): ...@@ -16,20 +16,10 @@ class FilesystemResolver(LoRAResolver):
self, base_model_name: str, lora_name: str self, base_model_name: str, lora_name: str
) -> LoRARequest | None: ) -> LoRARequest | None:
lora_path = os.path.join(self.lora_cache_dir, lora_name) lora_path = os.path.join(self.lora_cache_dir, lora_name)
maybe_lora_request = await self._get_lora_req_from_path(
lora_name, lora_path, base_model_name
)
return maybe_lora_request
async def _get_lora_req_from_path(
self, lora_name: str, lora_path: str, base_model_name: str
) -> LoRARequest | None:
"""Builds a LoraRequest pointing to the lora path if it's a valid
LoRA adapter and has a matching base_model_name.
"""
if os.path.exists(lora_path): if os.path.exists(lora_path):
adapter_config_path = os.path.join(lora_path, "adapter_config.json") adapter_config_path = os.path.join(
self.lora_cache_dir, lora_name, "adapter_config.json"
)
if os.path.exists(adapter_config_path): if os.path.exists(adapter_config_path):
with open(adapter_config_path) as file: with open(adapter_config_path) as file:
adapter_config = json.load(file) adapter_config = json.load(file)
...@@ -59,4 +49,4 @@ def register_filesystem_resolver(): ...@@ -59,4 +49,4 @@ def register_filesystem_resolver():
fs_resolver = FilesystemResolver(lora_cache_dir) fs_resolver = FilesystemResolver(lora_cache_dir)
LoRAResolverRegistry.register_resolver("Filesystem Resolver", fs_resolver) LoRAResolverRegistry.register_resolver("Filesystem Resolver", fs_resolver)
return return
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
from huggingface_hub import HfApi, snapshot_download
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolverRegistry
from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
logger = init_logger(__name__)
class HfHubResolver(FilesystemResolver):
    """LoRA resolver that fetches adapters from an allow-list of HF Hub repos.

    A LoRA name is expected to look like `<org>/<repo>` (adapter at the repo
    root) or `<org>/<repo>/<subpath>` (adapter in a subdirectory). Only repos
    listed in `repo_list` are ever contacted, and only directories that
    contain an `adapter_config.json` are downloaded.
    """

    def __init__(self, repo_list: list[str]):
        """Store the allowed repos and warn that remote downloads are enabled.

        Args:
            repo_list: HF Hub repo ids (`<org>/<repo>`) that may be resolved.
        """
        logger.warning(
            "LoRA is allowing resolution from the following repositories on"
            " HF Hub: %s please note that allowing remote downloads"
            " is not secure, and that this plugin is not intended for use in"
            " production environments.",
            repo_list,
        )
        self.repo_list: list[str] = repo_list
        # Cache: repo id -> set of subpaths ("." for the root) holding an
        # adapter_config.json, filled lazily on first use of each repo.
        self.adapter_dirs: dict[str, set[str]] = {}

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolves potential LoRA requests in a remote repo on HF Hub.

        This is effectively the same behavior as the filesystem resolver, but
        with a snapshot_download on dirs containing an adapter config prior
        to inspecting the cached dir to build a potential LoRA request.

        Args:
            base_model_name: Name of the base model the adapter must match.
            lora_name: `<org>/<repo>` or `<org>/<repo>/<subpath>`.

        Returns:
            A LoRARequest if the name maps to an adapter directory in one of
            the allowed repos and the downloaded adapter is accepted by
            `_get_lora_req_from_path`; otherwise None.
        """
        # If a LoRA name begins with the repository name, it's disambiguated
        maybe_repo = await self._resolve_repo(lora_name)
        if maybe_repo is None:
            return None

        # If we haven't inspected this repo before, save available adapter dirs
        if maybe_repo not in self.adapter_dirs:
            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)

        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
        if maybe_subpath is None:
            return None

        # Restrict the download to the adapter directory ("." means repo root).
        allow_patterns = "*" if maybe_subpath == "." else f"{maybe_subpath}/*"
        repo_path = await asyncio.to_thread(
            snapshot_download,
            repo_id=maybe_repo,
            allow_patterns=allow_patterns,
        )
        lora_path = os.path.join(repo_path, maybe_subpath)
        return await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )

    async def _resolve_repo(self, lora_name: str) -> str | None:
        """Given a fully qualified path to a LoRA with respect to its HF Hub
        repo, match the right repo to potentially download from if one exists.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>,
                match on <org>/<repo> (if it contains an adapter directly) or
                <org>/<repo>/ if it may have one in subdirs.

        Returns:
            The first entry of `repo_list` that equals `lora_name` or is
            followed by "/" in it; None if no entry matches.
        """
        for potential_repo in self.repo_list:
            # The repo id must be the whole name or end on a path boundary, so
            # that "org/repo" does not match "org/repo-other/...".
            if lora_name.startswith(potential_repo) and (
                len(lora_name) == len(potential_repo)
                or lora_name[len(potential_repo)] == "/"
            ):
                return potential_repo
        return None

    async def _resolve_repo_subpath(
        self, lora_name: str, maybe_repo: str | None
    ) -> str | None:
        """Given the fully qualified path of the LoRA with respect to the HF
        Repo, get the subpath to download from assuming it's actually got an
        adapter in it.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
            maybe_repo: Path to the repo to match against if one exists.

        Returns:
            "." for the repo root, the subpath without trailing slashes for a
            subdirectory, or None if there is no repo or the directory is not
            a known adapter directory.
        """
        if maybe_repo is None:
            return None
        # Drop "<repo>/" and any trailing slashes. An empty remainder (the
        # bare repo name, or the repo name followed only by slashes) means the
        # repo root, which is always spelled "." so it never becomes an empty
        # subpath in the download pattern.
        adapter_dir = lora_name[len(maybe_repo) + 1 :].rstrip("/") or "."
        # Only download if the directory actually contains an adapter
        is_adapter = adapter_dir in self.adapter_dirs[maybe_repo]
        return adapter_dir if is_adapter else None

    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
        """Gets the subpaths within a HF repo that contain an adapter config.

        Args:
            repo_name: Name of the HF hub repo to inspect.

        Returns:
            Directory paths, relative to the repo root, that directly contain
            a file named exactly `adapter_config.json`; the root is ".".
        """
        repo_files = await asyncio.to_thread(HfApi().list_repo_files, repo_id=repo_name)
        # Compare the exact file name: a suffix match would also accept files
        # such as "old_adapter_config.json" and trigger pointless downloads.
        return {
            os.path.dirname(name) or "."
            for name in repo_files
            if os.path.basename(name) == "adapter_config.json"
        }
def register_hf_hub_resolver():
    """Register the Hf hub LoRA Resolver with vLLM.

    Registration happens only when VLLM_LORA_RESOLVER_HF_REPO_LIST is set and
    "lora_hf_hub_resolver" is explicitly listed in VLLM_PLUGINS. If the repo
    list is set but the plugin is not enabled, a warning is logged and nothing
    is registered.
    """
    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
    if not hf_repo_list:
        return

    plugins = envs.VLLM_PLUGINS
    if plugins is None or "lora_hf_hub_resolver" not in plugins:
        logger.warning(
            "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
            "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
            " enable this resolver directly in VLLM_PLUGINS to use it "
            " because it allows remote downloads."
        )
        return

    # Tolerate "a/b, c/d," style values: surrounding whitespace would make a
    # repo id unmatchable, and an empty entry would match any LoRA name that
    # starts with "/".
    stripped = (entry.strip() for entry in hf_repo_list.split(","))
    repos = [repo for repo in stripped if repo]
    if not repos:
        return

    hf_hub_resolver = HfHubResolver(repos)
    LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
...@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser): ...@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser):
if current_tool_call_matches: if current_tool_call_matches:
tool_id, tool_args = current_tool_call_matches.groups() tool_id, tool_args = current_tool_call_matches.groups()
tool_name = tool_id.split(":")[0].split(".")[-1] tool_name = tool_id.split(":")[0].split(".")[-1]
current_tool_call["id"] = tool_id.strip() current_tool_call["id"] = tool_id
current_tool_call["name"] = tool_name current_tool_call["name"] = tool_name
current_tool_call["arguments"] = tool_args current_tool_call["arguments"] = tool_args
else: else:
...@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser): ...@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser):
if current_tool_call_name_matches: if current_tool_call_name_matches:
(tool_id_str,) = current_tool_call_name_matches.groups() (tool_id_str,) = current_tool_call_name_matches.groups()
tool_name = tool_id_str.split(":")[0].split(".")[-1] tool_name = tool_id_str.split(":")[0].split(".")[-1]
current_tool_call["id"] = tool_id_str.strip() current_tool_call["id"] = tool_id_str
current_tool_call["name"] = tool_name current_tool_call["name"] = tool_name
current_tool_call["arguments"] = "" current_tool_call["arguments"] = ""
else: else:
......
...@@ -331,7 +331,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: ...@@ -331,7 +331,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
partial_rotary_factor = getattr_iter(config, names, None, warn=True) partial_rotary_factor = getattr_iter(config, names, None, warn=True)
ompe = getattr(config, "original_max_position_embeddings", None) ompe = getattr(config, "original_max_position_embeddings", None)
if Version(version("transformers")) < Version("5.0.0"): if Version(version("transformers")) < Version("5.0.0.dev0"):
# Transformers v4 installed, legacy config fields may be present # Transformers v4 installed, legacy config fields may be present
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None: if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
config.rope_parameters = rope_scaling config.rope_parameters = rope_scaling
......
...@@ -398,7 +398,6 @@ MODEL_ARCH_CONFIG_CONVERTORS = { ...@@ -398,7 +398,6 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
"qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor, "qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor,
"mimo_mtp": MimoMTPModelArchConfigConvertor, "mimo_mtp": MimoMTPModelArchConfigConvertor,
"glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor, "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor,
"glm_ocr_mtp": GLM4MoeMTPModelArchConfigConvertor,
"ernie_mtp": ErnieMTPModelArchConfigConvertor, "ernie_mtp": ErnieMTPModelArchConfigConvertor,
"pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor, "pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor,
"longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor, "longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor,
......
...@@ -40,7 +40,6 @@ from vllm.v1.attention.ops.flashmla import ( ...@@ -40,7 +40,6 @@ from vllm.v1.attention.ops.flashmla import (
is_flashmla_dense_supported, is_flashmla_dense_supported,
) )
from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.kv_cache_interface import AttentionSpec
from vllm.platforms import current_platform
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -285,7 +284,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): ...@@ -285,7 +284,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
num_splits = torch.zeros((B + 1,), dtype=dtype, device=device) num_splits = torch.zeros((B + 1,), dtype=dtype, device=device)
scheduler_metadata.tile_scheduler_metadata = tile_scheduler_metadata scheduler_metadata.tile_scheduler_metadata = tile_scheduler_metadata
scheduler_metadata.num_splits = num_splits scheduler_metadata.num_splits = num_splits
if self.kv_cache_dtype.startswith("fp8"): if self.kv_cache_dtype.startswith("fp8"):
o, lse = flash_mla_with_kvcache_fp8( o, lse = flash_mla_with_kvcache_fp8(
q=q, q=q,
......
...@@ -330,14 +330,7 @@ class RocmAttentionImpl(AttentionImpl): ...@@ -330,14 +330,7 @@ class RocmAttentionImpl(AttentionImpl):
kv_cache, self.num_kv_heads, self.head_size kv_cache, self.num_kv_heads, self.head_size
) )
# key and value may be None in the case of cross attention. They are if self.kv_sharing_target_layer_name is None:
# calculated once based on the output from the encoder and then cached
# in KV cache.
if (
self.kv_sharing_target_layer_name is None
and key is not None
and value is not None
):
# Reshape the input keys and values and store them in the cache. # Reshape the input keys and values and store them in the cache.
# Skip this if sharing KV cache with an earlier attention layer. # Skip this if sharing KV cache with an earlier attention layer.
...@@ -389,8 +382,8 @@ class RocmAttentionImpl(AttentionImpl): ...@@ -389,8 +382,8 @@ class RocmAttentionImpl(AttentionImpl):
# Compute attention and update output up to `num_actual_tokens`. # Compute attention and update output up to `num_actual_tokens`.
chunked_prefill_paged_decode( chunked_prefill_paged_decode(
query=query[:num_actual_tokens], query=query[:num_actual_tokens],
key=key[:num_actual_tokens] if key is not None else None, key=key[:num_actual_tokens],
value=value[:num_actual_tokens] if value is not None else None, value=value[:num_actual_tokens],
output=output[:num_actual_tokens], output=output[:num_actual_tokens],
kv_cache_dtype=self.kv_cache_dtype, kv_cache_dtype=self.kv_cache_dtype,
key_cache=key_cache, key_cache=key_cache,
...@@ -409,4 +402,4 @@ class RocmAttentionImpl(AttentionImpl): ...@@ -409,4 +402,4 @@ class RocmAttentionImpl(AttentionImpl):
sinks=self.sinks, sinks=self.sinks,
) )
return output return output
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment