Commit d76fc11e authored by zhuwenwen
Browse files

Merge tag 'v0.15.0rc1' into v0.15.0rc1-dev

parents 38166ec4 58996f35
......@@ -252,7 +252,7 @@ class Starcoder2Model(nn.Module):
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None,
inputs_embeds: torch.Tensor | None = None,
......@@ -336,7 +336,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......
......@@ -354,7 +354,7 @@ class Step3TextModel(nn.Module):
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......@@ -419,7 +419,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......
......@@ -1101,7 +1101,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......
......@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......
......@@ -350,7 +350,7 @@ class Base(
# vLLM does not support encoder-decoder models, so if any encoder layer is
# found in a text only model, we assume the whole model is an encoder model
if has_encoder(self.model) and not is_multimodal(self.config):
self.check_version("5.0.0.dev0", "encoder models support")
self.check_version("5.0.0", "encoder models support")
attn_type = AttentionType.ENCODER_ONLY
else:
attn_type = AttentionType.DECODER
......@@ -502,7 +502,7 @@ class Base(
)
def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
self.check_version("5.0.0.dev0", "Eagle3 support")
self.check_version("5.0.0", "Eagle3 support")
from transformers.utils.generic import OutputRecorder
# The default value in PreTrainedModel is None
......
......@@ -118,7 +118,7 @@ direct_register_custom_op(
class MoEMixin(MixtureOfExperts):
def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
self.check_version("5.0.0.dev0", "MoE models support")
self.check_version("5.0.0", "MoE models support")
# Skip MixtureOfExperts.__init__ and call the next class in MRO
super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
......
......@@ -714,7 +714,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: torch.Tensor | None = None,
inputs_embeds: torch.Tensor | None = None,
......
......@@ -397,7 +397,7 @@ class VoxtralForConditionalGeneration(
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......
......@@ -173,7 +173,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......
......@@ -105,6 +105,7 @@ def create_whisper_attention_backend_with_block_pooling(
) -> type[AttentionBackend]:
prefix = "WhisperCausalAttentionWithBlockPooling_"
underlying_builder = underlying_attn_backend.get_builder_cls()
underlying_impl = underlying_attn_backend.get_impl_cls()
class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder): # type: ignore
def __init__(
......@@ -151,6 +152,43 @@ def create_whisper_attention_backend_with_block_pooling(
common_prefix_len, new_common_attn_metadata, fast_build
)
# NOTE: We need a custom impl so we can use the transformed slot_mapping
# computed by `WhisperCausalAttentionWithBlockPoolingBuilder` instead of
# the one from `forward_context.slot_mapping` (gpu_model_runner).
# This follows the same pattern as CrossAttentionImpl.
class WhisperCausalAttentionWithBlockPoolingImpl(underlying_impl):  # type: ignore[valid-type,misc]
    """Attention impl that writes the KV cache with the metadata's slot mapping.

    ``forward`` optionally performs the KV-cache update itself, using
    ``attn_metadata.slot_mapping``, and then defers to the underlying
    implementation for the attention computation.
    """

    def forward(
        self,
        layer: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
        output: torch.Tensor | None = None,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Only write the cache here when the wrapped backend's own forward
        # does not already do so, and only when metadata is available.
        backend_writes_cache = (
            underlying_attn_backend.forward_includes_kv_cache_update
        )
        if not backend_writes_cache and attn_metadata is not None:
            slot_mapping = attn_metadata.slot_mapping
            self.do_kv_cache_update(layer, key, value, kv_cache, slot_mapping)

        # Arguments are forwarded positionally, in the same order as received.
        return super().forward(
            layer,
            query,
            key,
            value,
            kv_cache,
            attn_metadata,
            output,
            output_scale,
            output_block_scale,
        )
if not issubclass(underlying_attn_backend, FlashAttentionBackend):
raise NotImplementedError(
f"{underlying_attn_backend} is not yet supported."
......@@ -163,6 +201,7 @@ def create_whisper_attention_backend_with_block_pooling(
attention_backend_cls=underlying_attn_backend,
overrides={
"get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder,
"get_impl_cls": lambda: WhisperCausalAttentionWithBlockPoolingImpl,
"get_kv_cache_shape": lambda num_blocks,
block_size,
num_kv_heads,
......@@ -175,6 +214,7 @@ def create_whisper_attention_backend_with_block_pooling(
num_kv_heads // block_pool_size,
head_size,
), # TODO: generalize to other backends
"forward_includes_kv_cache_update": True,
},
)
......
......@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor | IntermediateTensors:
......@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def forward(
self,
input_ids: torch.Tensor,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None,
**kwargs: Any,
......
......@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
......@@ -169,9 +168,10 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
# modular kernels could invoke deep_gemm_moe_fp8
return True
mk: FusedMoEModularKernel = module.quant_method.fused_experts
# Further check if the ModularKernel implementation uses the DeepGemmExperts
return isinstance(mk.fused_experts, (DeepGemmExperts, TritonOrDeepGemmExperts))
return isinstance(
module.quant_method.moe_mk, (DeepGemmExperts, TritonOrDeepGemmExperts)
)
FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set()
......
......@@ -20,6 +20,7 @@ from typing import (
)
import numpy as np
from PIL.Image import Image
from typing_extensions import NotRequired, TypeVar
from vllm.utils.collection_utils import full_groupby, is_list_of
......@@ -29,7 +30,6 @@ from vllm.utils.jsontree import json_map_leaves
if TYPE_CHECKING:
import torch
import torch.types
from PIL.Image import Image
from transformers.feature_extraction_utils import BatchFeature
from .media import MediaWithBytes
......@@ -105,6 +105,28 @@ The number of data items allowed per modality is restricted by
"""
class VisionChunkImage(TypedDict):
    """Represents an image wrapped as a vision chunk.

    This is the ``"image"`` variant of the ``VisionChunk`` union; the ``type``
    key is the tag that distinguishes it from the video-chunk variant.
    """

    # Discriminator tag; always the literal string "image" for this variant.
    type: Literal["image"]
    # The image payload, a PIL ``Image``.
    image: Image
    # Identifier attached to this item, or None when absent.
    # NOTE(review): presumably a caller-supplied ID for caching — confirm.
    uuid: str | None
class VisionChunkVideo(TypedDict):
    """Represents a video chunk with metadata.

    This is the ``"video_chunk"`` variant of the ``VisionChunk`` union; the
    ``type`` key is the tag that distinguishes it from the image variant.
    """

    # Discriminator tag; always the literal string "video_chunk".
    type: Literal["video_chunk"]
    # The frames of this chunk, each a PIL ``Image``.
    video_chunk: list[Image]
    # Identifier attached to this item, or None when absent.
    # NOTE(review): presumably a caller-supplied ID for caching — confirm.
    uuid: str | None
    # Text associated with this chunk.
    # NOTE(review): presumably the per-chunk prompt snippet — confirm.
    prompt: str
    # NOTE(review): presumably the index of the source video this chunk was
    # cut from — confirm against the producer of these dicts.
    video_idx: int
VisionChunk = VisionChunkImage | VisionChunkVideo
"""A vision chunk is either an image or a video chunk."""
@final
class MultiModalDataBuiltins(TypedDict, total=False):
"""Type annotations for modality types predefined by vLLM."""
......@@ -118,6 +140,9 @@ class MultiModalDataBuiltins(TypedDict, total=False):
audio: ModalityData[AudioItem]
"""The input audio(s)."""
vision_chunk: ModalityData[VisionChunk]
"""The input visual atom(s) - unified modality for images and video chunks."""
MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
"""
......
......@@ -384,6 +384,13 @@ class VideoEmbeddingItems(EmbeddingItems):
super().__init__(data, "video", expected_hidden_size)
class VisionChunkProcessorItems(ProcessorBatchItems[Any]):
    """Processor items for vision chunks (unified image and video chunks).

    Thin wrapper that fixes the modality name passed to the base class to
    ``"vision_chunk"``.
    """

    def __init__(self, data: Sequence[Any]) -> None:
        """Store ``data`` under the ``"vision_chunk"`` modality.

        Args:
            data: The sequence of vision-chunk items to wrap.
        """
        super().__init__(data, "vision_chunk")
_D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
......@@ -652,11 +659,23 @@ class MultiModalDataParser:
return VideoProcessorItems(new_videos, metadata=metadata_lst)
def _parse_vision_chunk_data(
    self,
    data: ModalityData[Any],
) -> ModalityDataItems[Any, Any] | None:
    """Wrap raw vision-chunk input in ``VisionChunkProcessorItems``.

    Args:
        data: The vision-chunk input (image and video chunks), or ``None``.

    Returns:
        The wrapped items, or ``None`` when ``data`` is ``None`` or empty.

    Raises:
        ValueError: If ``data`` is recognised as embedding data.
    """
    has_data = data is not None and not self._is_empty(data)
    if not has_data:
        return None

    # Embedding inputs are rejected for this modality.
    if self.is_embeddings(data):
        raise ValueError("Do not support embedding data for vision_chunk right now")

    items = VisionChunkProcessorItems(data)
    return items
def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
    """Return the table mapping each modality name to its parser method."""
    return dict(
        audio=self._parse_audio_data,
        image=self._parse_image_data,
        video=self._parse_video_data,
        vision_chunk=self._parse_vision_chunk_data,
    )
def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems:
......
......@@ -235,6 +235,27 @@ class VideoLoader:
VIDEO_LOADER_REGISTRY = ExtensionManager()
@VIDEO_LOADER_REGISTRY.register("identity")
class IdentityVideoLoader(VideoLoader):
    """Video loader that passes the raw bytes through without decoding.

    Decoding is left to the model processor. This is required for models such
    as Kimi-K2.5 that need custom video chunk splitting.

    NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
    to opencv before release if needed.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs: Any,
    ) -> tuple[Any, Any]:
        """Return ``data`` unchanged, paired with ``None`` as the metadata.

        ``num_frames`` and any extra keyword arguments are accepted but
        not used.
        """
        metadata = None
        return data, metadata
@VIDEO_LOADER_REGISTRY.register("opencv")
class OpenCVVideoBackend(VideoLoader):
def get_cv2_video_api(self):
......
......@@ -599,7 +599,8 @@ class RocmPlatform(Platform):
cls, device: torch.types.Device | None = None
) -> float:
torch.cuda.reset_peak_memory_stats(device)
# return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(device)[0]
# free_mem, total_mem = torch.cuda.mem_get_info(device)
# return total_mem - free_mem
return torch.cuda.max_memory_allocated(device)
@classmethod
......
......@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
self, base_model_name: str, lora_name: str
) -> LoRARequest | None:
lora_path = os.path.join(self.lora_cache_dir, lora_name)
maybe_lora_request = await self._get_lora_req_from_path(
lora_name, lora_path, base_model_name
)
return maybe_lora_request
async def _get_lora_req_from_path(
self, lora_name: str, lora_path: str, base_model_name: str
) -> LoRARequest | None:
"""Builds a LoraRequest pointing to the lora path if it's a valid
LoRA adapter and has a matching base_model_name.
"""
if os.path.exists(lora_path):
adapter_config_path = os.path.join(
self.lora_cache_dir, lora_name, "adapter_config.json"
)
adapter_config_path = os.path.join(lora_path, "adapter_config.json")
if os.path.exists(adapter_config_path):
with open(adapter_config_path) as file:
adapter_config = json.load(file)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
from huggingface_hub import HfApi, snapshot_download
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolverRegistry
from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
logger = init_logger(__name__)
class HfHubResolver(FilesystemResolver):
    """LoRA resolver that downloads adapters from an allow-list of HF Hub repos.

    A LoRA name is interpreted as ``<repo>`` or ``<repo>/<subpath>``, where
    ``<repo>`` must be one of the entries in ``repo_list``. The directories in
    each repo that contain an ``adapter_config.json`` are listed once and
    cached in ``adapter_dirs``; only those directories are ever downloaded.
    """

    def __init__(self, repo_list: list[str]):
        """Store the allowed repos and warn that remote downloads are enabled.

        The parent initializer is not called here.
        NOTE(review): presumably because the parent only sets up a local
        cache directory that this resolver does not use — confirm.

        Args:
            repo_list: HF Hub repo ids that LoRA names may resolve against.
        """
        logger.warning(
            "LoRA is allowing resolution from the following repositories on"
            " HF Hub: %s please note that allowing remote downloads"
            " is not secure, and that this plugin is not intended for use in"
            " production environments.",
            repo_list,
        )
        self.repo_list: list[str] = repo_list
        # repo id -> sub-directories ("." for the root) holding an adapter config
        self.adapter_dirs: dict[str, set[str]] = {}

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolves potential LoRA requests in a remote repo on HF Hub.

        This is effectively the same behavior as the filesystem resolver, but
        with a snapshot_download on dirs containing an adapter config prior
        to inspecting the cached dir to build a potential LoRA request.

        Args:
            base_model_name: Name of the base model the adapter must match.
            lora_name: ``<repo>`` or ``<repo>/<subpath>`` of the adapter.

        Returns:
            A LoRA request for the downloaded adapter, or ``None`` if the name
            does not map to an allowed repo / adapter directory.
        """
        # If a LoRA name begins with the repository name, it's disambiguated
        maybe_repo = await self._resolve_repo(lora_name)
        if maybe_repo is None:
            return None

        # If we haven't inspected this repo before, save available adapter dirs
        if maybe_repo not in self.adapter_dirs:
            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)

        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
        if maybe_subpath is None:
            return None

        # Restrict the download to the adapter directory (or the whole repo
        # when the adapter lives at the root).
        repo_path = await asyncio.to_thread(
            snapshot_download,
            repo_id=maybe_repo,
            allow_patterns=f"{maybe_subpath}/*" if maybe_subpath != "." else "*",
        )

        lora_path = os.path.join(repo_path, maybe_subpath)
        return await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )

    async def _resolve_repo(self, lora_name: str) -> str | None:
        """Given a fully qualified path to a LoRA with respect to its HF Hub
        repo, match the right repo to potentially download from if one exists.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>,
                match on <org>/<repo> (if it contains an adapter directly) or
                <org>/<repo>/ if it may have one in subdirs.

        Returns:
            The first entry of ``repo_list`` that equals ``lora_name`` or is
            followed by ``/`` in it; ``None`` if there is no such entry.
        """
        for potential_repo in self.repo_list:
            # Require a full path-component match so that "org/repo" does not
            # match "org/repo-other".
            if lora_name.startswith(potential_repo) and (
                len(lora_name) == len(potential_repo)
                or lora_name[len(potential_repo)] == "/"
            ):
                return potential_repo
        return None

    async def _resolve_repo_subpath(
        self, lora_name: str, maybe_repo: str | None
    ) -> str | None:
        """Given the fully qualified path of the LoRA with respect to the HF
        Repo, get the subpath to download from assuming it's actually got an
        adapter in it.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
            maybe_repo: Path to the repo to match against if one exists.

        Returns:
            ``"."`` for the repo root, the sub-directory otherwise, or ``None``
            if ``maybe_repo`` is ``None`` or the directory holds no adapter.
        """
        if maybe_repo is None:
            return None

        repo_len = len(maybe_repo)
        if lora_name == maybe_repo or (
            len(lora_name) == repo_len + 1 and lora_name[-1] == "/"
        ):
            # Resolves to the root of the directory
            adapter_dir = "."
        else:
            # It's a subpath; removing trailing slashes if there are any
            adapter_dir = lora_name[repo_len + 1 :].rstrip("/")

        # Only download if the directory actually contains an adapter
        is_adapter = adapter_dir in self.adapter_dirs[maybe_repo]
        return adapter_dir if is_adapter else None

    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
        """Gets the subpaths within a HF repo that contain an adapter config.

        Args:
            repo_name: Name of the HF hub repo to inspect.

        Returns:
            The directories containing a file named exactly
            ``adapter_config.json``; the repo root is reported as ``"."``.
        """
        repo_files = await asyncio.to_thread(HfApi().list_repo_files, repo_id=repo_name)
        # Compare the basename exactly: a suffix match would also accept files
        # such as "my_adapter_config.json". The root directory is normalised
        # to "." so that an empty string is never treated as a valid subpath.
        return {
            os.path.dirname(name) or "."
            for name in repo_files
            if os.path.basename(name) == "adapter_config.json"
        }
def register_hf_hub_resolver():
    """Register the Hf hub LoRA Resolver with vLLM

    Nothing is registered unless ``VLLM_LORA_RESOLVER_HF_REPO_LIST`` is set
    *and* ``lora_hf_hub_resolver`` is explicitly listed in ``VLLM_PLUGINS``.
    The repo list is a comma-separated string; whitespace around entries is
    ignored and empty entries are dropped.
    """
    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
    if not hf_repo_list:
        return

    is_enabled = (
        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
    )
    if not is_enabled:
        logger.warning(
            "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
            "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
            " enable this resolver directly in VLLM_PLUGINS to use it "
            " because it allows remote downloads."
        )
        return

    # Tolerate "a/b, c/d" and trailing commas: an empty repo name would match
    # any LoRA name starting with "/", and a padded name would never match.
    repos = [repo.strip() for repo in hf_repo_list.split(",") if repo.strip()]
    if not repos:
        return

    hf_hub_resolver = HfHubResolver(repos)
    LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
......@@ -54,8 +54,8 @@ _REASONING_PARSERS_TO_REGISTER = {
"HunyuanA13BReasoningParser",
),
"kimi_k2": (
"deepseek_r1_reasoning_parser",
"DeepSeekR1ReasoningParser",
"kimi_k2_reasoning_parser",
"KimiK2ReasoningParser",
),
"minimax_m2": (
"minimax_m2_reasoning_parser",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .identity_reasoning_parser import IdentityReasoningParser
if TYPE_CHECKING:
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
else:
ChatCompletionRequest = Any
logger = init_logger(__name__)
class KimiK2ReasoningParser(ReasoningParser):
    """
    Kimi K2 parser that delegates to either DeepSeekR1ReasoningParser or
    IdentityReasoningParser based on the `thinking` chat-template kwarg.

    Unlike DeepSeekV3ReasoningParser which defaults to NOT thinking,
    KimiK2ReasoningParser defaults to thinking mode (uses DeepSeekR1ReasoningParser).
    Every parsing method simply forwards to the selected delegate.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        """Pick the delegate parser from ``chat_template_kwargs["thinking"]``.

        Args:
            tokenizer: Tokenizer handed to the base class and the delegate.
            *args: Forwarded unchanged to the base class and the delegate.
            **kwargs: Forwarded to the base class; ``chat_template_kwargs`` is
                removed before forwarding to the delegate.
        """
        super().__init__(tokenizer, *args, **kwargs)

        chat_kwargs = kwargs.pop("chat_template_kwargs", None) or {}
        # Key difference: default to True instead of False.
        # Read with .get() rather than .pop(): the dict belongs to the caller,
        # and removing the key would make a later parser built from the same
        # dict silently fall back to the default.
        thinking = bool(chat_kwargs.get("thinking", True))

        if thinking:
            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
        else:
            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Forward to the delegate's ``is_reasoning_end``."""
        return self._parser.is_reasoning_end(input_ids)

    def is_reasoning_end_streaming(
        self, input_ids: list[int], delta_ids: list[int]
    ) -> bool:
        """Forward to the delegate's ``is_reasoning_end_streaming``."""
        return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Forward to the delegate's ``extract_content_ids``."""
        return self._parser.extract_content_ids(input_ids)

    def extract_reasoning(
        self, model_output: str, request: "ChatCompletionRequest"
    ) -> tuple[str | None, str | None]:
        """Forward to the delegate's ``extract_reasoning``."""
        return self._parser.extract_reasoning(model_output, request)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Forward to the delegate's ``extract_reasoning_streaming``."""
        return self._parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment