Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -124,8 +124,8 @@ _TEXT_GENERATION_MODELS = {
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
"GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
"GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),
"GritLM": ("gritlm", "GritLM"),
"Grok1ModelForCausalLM": ("grok1", "GrokForCausalLM"),
"Grok1ForCausalLM": ("grok1", "GrokForCausalLM"),
......@@ -143,7 +143,7 @@ _TEXT_GENERATION_MODELS = {
"JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
"Jais2ForCausalLM": ("jais2", "Jais2ForCausalLM"),
"JambaForCausalLM": ("jamba", "JambaForCausalLM"),
"KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"), # noqa: E501
"KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"),
"Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"),
"Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"),
"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
......@@ -249,17 +249,14 @@ _EMBEDDING_MODELS = {
# [Multimodal]
"CLIPModel": ("clip", "CLIPEmbeddingModel"),
"ColPaliForRetrieval": ("colpali", "ColPaliModel"),
"LlamaNemotronVLModel": ("nemotron_vl", "LlamaNemotronVLForEmbedding"),
"LlavaNextForConditionalGeneration": (
"llava_next",
"LlavaNextForConditionalGeneration",
),
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
"SiglipModel": ("siglip", "SiglipEmbeddingModel"),
"LlamaNemotronVLModel": (
"nemotron_vl",
"LlamaNemotronVLForEmbedding",
),
# Technically Terratorch models work on images, both in
# input and output. I am adding it here because it piggy-backs on embedding
# models for the time being.
......@@ -272,10 +269,13 @@ _LATE_INTERACTION_MODELS = {
"HF_ColBERT": ("colbert", "ColBERTModel"),
"ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
"ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
"ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
# [Multimodal]
"ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
"ColPaliForRetrieval": ("colpali", "ColPaliModel"),
"ColQwen3": ("colqwen3", "ColQwen3Model"),
"OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
"ColQwen3_5": ("colqwen3_5", "ColQwen3_5Model"),
"Qwen3VLNemotronEmbedModel": ("colqwen3", "ColQwen3Model"),
}
......@@ -302,7 +302,7 @@ _SEQUENCE_CLASSIFICATION_MODELS = {
"bert_with_rope",
"GteNewForSequenceClassification",
),
"JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501
"JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),
"LlamaBidirectionalForSequenceClassification": (
"llama",
"LlamaBidirectionalForSequenceClassification",
......@@ -366,13 +366,13 @@ _MULTIMODAL_MODELS = {
"fireredasr2",
"FireRedASR2ForConditionalGeneration",
),
"FunASRForConditionalGeneration": ("funasr", "FunASRForConditionalGeneration"), # noqa: E501
"FunASRForConditionalGeneration": ("funasr", "FunASRForConditionalGeneration"),
"FunAudioChatForConditionalGeneration": (
"funaudiochat",
"FunAudioChatForConditionalGeneration",
),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),
"Gemma3nForConditionalGeneration": (
"gemma3n_mm",
"Gemma3nForConditionalGeneration",
......@@ -381,7 +381,7 @@ _MULTIMODAL_MODELS = {
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
"Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),
"Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),
"GlmOcrForConditionalGeneration": ("glm_ocr", "GlmOcrForConditionalGeneration"), # noqa: E501
"GlmOcrForConditionalGeneration": ("glm_ocr", "GlmOcrForConditionalGeneration"),
"GraniteSpeechForConditionalGeneration": (
"granite_speech",
"GraniteSpeechForConditionalGeneration",
......@@ -391,13 +391,7 @@ _MULTIMODAL_MODELS = {
"hunyuan_vision",
"HunYuanVLForConditionalGeneration",
),
"StepVLForConditionalGeneration": ("step_vl", "StepVLForConditionalGeneration"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
"OpenCUAForConditionalGeneration": (
"opencua",
"OpenCUAForConditionalGeneration",
),
"InternS1ForConditionalGeneration": (
"interns1",
"InternS1ForConditionalGeneration",
......@@ -415,24 +409,22 @@ _MULTIMODAL_MODELS = {
"Idefics3ForConditionalGeneration",
),
"IsaacForConditionalGeneration": ("isaac", "IsaacForConditionalGeneration"),
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501
"KananaVForConditionalGeneration": ("kanana_v", "KananaVForConditionalGeneration"),
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
"KeyeVL1_5ForConditionalGeneration": (
"keye_vl1_5",
"KeyeVL1_5ForConditionalGeneration",
),
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
"KimiK25ForConditionalGeneration": ("kimi_k25", "KimiK25ForConditionalGeneration"), # noqa: E501
"MoonshotKimiaForCausalLM": ("kimi_audio", "KimiAudioForConditionalGeneration"), # noqa: E501
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),
"KimiK25ForConditionalGeneration": ("kimi_k25", "KimiK25ForConditionalGeneration"),
"MoonshotKimiaForCausalLM": ("kimi_audio", "KimiAudioForConditionalGeneration"),
"LightOnOCRForConditionalGeneration": (
"lightonocr",
"LightOnOCRForConditionalGeneration",
),
"Lfm2VlForConditionalGeneration": ("lfm2_vl", "Lfm2VLForConditionalGeneration"),
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration": (
"llava_next",
......@@ -446,7 +438,7 @@ _MULTIMODAL_MODELS = {
"llava_onevision",
"LlavaOnevisionForConditionalGeneration",
),
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),
"MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
"MiniMaxVL01ForConditionalGeneration": (
"minimax_vl_01",
......@@ -460,7 +452,9 @@ _MULTIMODAL_MODELS = {
),
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
"Molmo2ForConditionalGeneration": ("molmo2", "Molmo2ForConditionalGeneration"),
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
"OpenCUAForConditionalGeneration": ("opencua", "OpenCUAForConditionalGeneration"),
"OpenPanguVLForConditionalGeneration": (
"openpangu_vl",
"OpenPanguVLForConditionalGeneration",
......@@ -479,9 +473,9 @@ _MULTIMODAL_MODELS = {
),
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
"Qwen2_5_VLForConditionalGeneration": (
"qwen2_5_vl",
"Qwen2_5_VLForConditionalGeneration",
......@@ -506,39 +500,40 @@ _MULTIMODAL_MODELS = {
"qwen3_asr",
"Qwen3ASRForConditionalGeneration",
),
"Qwen3ASRRealtimeGeneration": (
"qwen3_asr_realtime",
"Qwen3ASRRealtimeGeneration",
),
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501
"Qwen3ASRRealtimeGeneration": ("qwen3_asr_realtime", "Qwen3ASRRealtimeGeneration"),
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"),
"Qwen3VLMoeForConditionalGeneration": (
"qwen3_vl_moe",
"Qwen3VLMoeForConditionalGeneration",
),
"Qwen3_5ForConditionalGeneration": (
"qwen3_5",
"Qwen3_5ForConditionalGeneration",
),
"Qwen3_5ForConditionalGeneration": ("qwen3_5", "Qwen3_5ForConditionalGeneration"),
"Qwen3_5MoeForConditionalGeneration": (
"qwen3_5",
"Qwen3_5MoeForConditionalGeneration",
),
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"),
"StepVLForConditionalGeneration": ("step_vl", "StepVLForConditionalGeneration"),
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),
"Tarsier2ForConditionalGeneration": (
"qwen2_vl",
"Tarsier2ForConditionalGeneration",
),
"UltravoxModel": ("ultravox", "UltravoxModel"),
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
"VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"), # noqa: E501
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),
"VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"),
# [Encoder-decoder]
"CohereASRForConditionalGeneration": (
"cohere_asr",
"CohereASRForConditionalGeneration",
),
"NemotronParseForConditionalGeneration": (
"nemotron_parse",
"NemotronParseForConditionalGeneration",
),
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),
}
_SPECULATIVE_DECODING_MODELS = {
......@@ -648,14 +643,17 @@ _PREVIOUSLY_SUPPORTED_MODELS = {
"Phi4MultimodalForCausalLM": "0.12.0",
# encoder-decoder models except whisper
# have been removed for V0 deprecation.
"BartModel": "0.10.2",
"BartForConditionalGeneration": "0.10.2",
"DonutForConditionalGeneration": "0.10.2",
"Florence2ForConditionalGeneration": "0.10.2",
"MBartForConditionalGeneration": "0.10.2",
"MllamaForConditionalGeneration": "0.10.2",
}
# Architectures that are no longer shipped in-tree, mapped to the URL of the
# out-of-tree plugin that the "not supported in-tree anymore" error tells
# users to install.
_OOT_SUPPORTED_MODELS = {
    "BartModel": "https://github.com/vllm-project/bart-plugin",
    "BartForConditionalGeneration": "https://github.com/vllm-project/bart-plugin",
    "Florence2ForConditionalGeneration": "https://github.com/vllm-project/bart-plugin",
    "MBartForConditionalGeneration": "https://github.com/vllm-project/bart-plugin",
}
@dataclass(frozen=True)
class _ModelInfo:
......@@ -952,6 +950,14 @@ class _ModelRegistry:
"Please use an older version of vLLM if you want to "
"use this model architecture."
)
if arch in _OOT_SUPPORTED_MODELS:
plugin_url = _OOT_SUPPORTED_MODELS[arch]
raise ValueError(
f"Model architecture {arch} is not supported in-tree anymore. "
f"Please install the plugin at {plugin_url} if you want to "
"use this model architecture."
)
raise ValueError(
f"Model architectures {architectures} are not supported for now. "
......
......@@ -10,6 +10,7 @@ from transformers import RobertaConfig
from vllm.config import ModelConfig, PoolerConfig, VllmConfig
from vllm.model_executor.layers.pooler import (
BgeM3Pooler,
BOSEOSFilter,
DispatchPooler,
Pooler,
......@@ -216,24 +217,29 @@ class BgeM3EmbeddingModel(RobertaEmbeddingModel):
self.colbert_linear = nn.Linear(
self.hidden_size, self.hidden_size, dtype=self.head_dtype
)
embed_pooler = pooler_for_embed(pooler_config)
token_classify_pooler = BOSEOSFilter(
pooler_for_token_classify(
pooler_config,
pooling=AllPool(),
classifier=self.sparse_linear,
act_fn=torch.relu,
),
self.bos_token_id,
self.eos_token_id,
)
return DispatchPooler(
{
"embed": pooler_for_embed(pooler_config),
"embed": embed_pooler,
"token_embed": BOSEOSFilter(
pooler_for_token_embed(pooler_config, self.colbert_linear),
self.bos_token_id,
# for some reason m3 only filters the bos for colbert vectors
),
"token_classify": BOSEOSFilter(
pooler_for_token_classify(
pooler_config,
pooling=AllPool(),
classifier=self.sparse_linear,
act_fn=torch.relu,
),
self.bos_token_id,
self.eos_token_id,
"token_classify": token_classify_pooler,
"embed&token_classify": BgeM3Pooler(
token_classify_pooler, embed_pooler
),
}
)
......
......@@ -7,14 +7,12 @@
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from collections.abc import Iterable, Mapping, Sequence
from collections.abc import Iterable, Mapping
from typing import Annotated, Literal, TypeAlias
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from transformers import PretrainedConfig
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
......@@ -26,40 +24,23 @@ from vllm.model_executor.models.intern_vit import (
InternVisionPatchModel,
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
ImageEmbeddingItems,
ImageProcessorItems,
ImageSize,
MultiModalDataItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
)
from vllm.multimodal.inputs import MultiModalDataDict
from vllm.multimodal.processing import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .internvl import (
BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo,
)
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class SkyworkR1VImagePixelInputs(TensorSchema):
"""
......@@ -106,418 +87,36 @@ SkyworkR1VImageInputs: TypeAlias = (
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Build the per-tile preprocessing pipeline.

    The returned transform converts the image to RGB, resizes it to a
    square of ``input_size`` pixels with bicubic interpolation, converts it
    to a tensor and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """

    def _to_rgb(img):
        return convert_image_mode(img, "RGB")

    steps = [
        T.Lambda(_to_rgb),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Return the entry of ``target_ratios`` closest to ``aspect_ratio``.

    Each candidate ``(a, b)`` is compared through ``a / b``. A strictly
    closer candidate always wins. On an exact tie the later candidate
    replaces the current one only when the image area exceeds half the
    area of that candidate's tile grid. ``(1, 1)`` is returned when
    ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        if gap != smallest_gap:
            continue
        # Tie: prefer the later grid only for sufficiently large images.
        if image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective ``(min, max)`` number of patches.

    Both bounds collapse to 1 when dynamic image sizing is disabled. When
    a thumbnail is requested and the upper bound is not 1, the upper bound
    is raised by one.
    """
    if dynamic_image_size:
        lower, upper = min_dynamic_patch, max_dynamic_patch
    else:
        lower = upper = 1

    if use_thumbnail and upper != 1:
        upper += 1

    return lower, upper
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every ``(i, j)`` grid with ``min_num <= i * j <= max_num``.

    The grids are returned sorted by their area ``i * j``.
    """
    grids: set[tuple[int, int]] = set()
    # Insertion order is kept the same as the original comprehension so the
    # relative order of equal-area grids after the stable sort is unchanged.
    for limit in range(min_num, max_num + 1):
        for cols in range(1, limit + 1):
            for rows in range(1, limit + 1):
                if min_num <= cols * rows <= max_num:
                    grids.add((cols, rows))

    return sorted(grids, key=lambda grid: grid[0] * grid[1])
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the block count and resized dimensions for an image.

    Returns ``(blocks, target_width, target_height)``. The grid is the one
    picked by ``find_closest_aspect_ratio`` for the image's width/height
    ratio, and the target dimensions are that grid scaled by
    ``image_size``. One extra block is counted for the thumbnail when
    ``use_thumbnail`` is set and the grid has more than one block.
    """
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    num_blocks = grid[0] * grid[1]
    if use_thumbnail and num_blocks != 1:
        # The thumbnail counts as one additional block.
        num_blocks += 1

    return num_blocks, image_size * grid[0], image_size * grid[1]
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split ``image`` into square tiles of side ``image_size``.

    The image is resized to the target dimensions computed by
    ``calculate_skyworkr1v_targets`` (without the thumbnail) and cropped
    row by row. When ``use_thumbnail`` is set and more than one tile was
    produced, a copy of the whole image resized to one tile is appended.
    """
    src_width, src_height = image.size

    # The thumbnail is handled below, so it is excluded from the tile count.
    tile_count, canvas_width, canvas_height = calculate_skyworkr1v_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    canvas = image.resize((canvas_width, canvas_height))
    tiles_per_row = canvas_width // image_size

    tiles = []
    for index in range(tile_count):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile ``image`` and stack the transformed tiles into one tensor.

    The candidate grids come from ``get_skyworkr1v_target_ratios``, tiling
    from ``dynamic_preprocess_skyworkr1v`` and the per-tile transform from
    ``build_transform``. The result stacks one entry per tile along a new
    leading dimension.
    """
    candidate_grids = get_skyworkr1v_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
class SkyworkR1VProcessor:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
"""
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> None:
    """Store the config/tokenizer and resolve the tiling settings.

    Any of ``min_dynamic_patch``, ``max_dynamic_patch`` and
    ``dynamic_image_size`` left as ``None`` is read from ``config``.
    """
    super().__init__()

    self.config = config
    self.tokenizer = tokenizer

    image_size: int = config.vision_config.image_size
    patch_size: int = config.vision_config.patch_size

    # Explicit arguments take precedence over the model config.
    min_dynamic_patch = (
        config.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
    )
    assert isinstance(min_dynamic_patch, int)

    max_dynamic_patch = (
        config.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
    )
    assert isinstance(max_dynamic_patch, int)

    dynamic_image_size = (
        config.dynamic_image_size
        if dynamic_image_size is None
        else dynamic_image_size
    )
    assert isinstance(dynamic_image_size, bool)

    # Number of image tokens contributed by a single tile.
    patches_per_side = image_size // patch_size
    self.num_image_token = int(patches_per_side**2 * (config.downsample_ratio**2))

    self.image_size = image_size
    self.min_dynamic_patch = min_dynamic_patch
    self.max_dynamic_patch = max_dynamic_patch
    self.dynamic_image_size = dynamic_image_size
    self.use_thumbnail: bool = config.use_thumbnail
@property
def image_token_id(self) -> int:
    """Vocabulary ID of the ``IMG_CONTEXT`` placeholder token."""
    vocab = self.tokenizer.get_vocab()
    return vocab[IMG_CONTEXT]
def get_image_repl(
    self,
    feature_size: int,
    num_patches: int | None,
) -> PromptUpdateDetails[str]:
    """Build the prompt text that replaces one image placeholder.

    The text is ``IMG_CONTEXT`` repeated ``feature_size`` times, wrapped in
    ``IMG_START`` / ``IMG_END``; only the ``IMG_CONTEXT`` tokens are
    selected. ``num_patches`` is accepted but not used here.
    """
    placeholder = "".join((IMG_START, IMG_CONTEXT * feature_size, IMG_END))
    return PromptUpdateDetails.select_text(placeholder, IMG_CONTEXT)
def resolve_min_max_num(
    self,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_thumbnail: bool | None = None,
) -> tuple[int, int]:
    """Resolve the ``(min, max)`` patch bounds for this processor.

    Arguments left as ``None`` fall back to the values stored on the
    instance before delegating to ``resolve_skyworkr1v_min_max_num``.
    """
    if min_dynamic_patch is None:
        min_dynamic_patch = self.min_dynamic_patch
    if max_dynamic_patch is None:
        max_dynamic_patch = self.max_dynamic_patch
    if dynamic_image_size is None:
        dynamic_image_size = self.dynamic_image_size
    if use_thumbnail is None:
        use_thumbnail = self.use_thumbnail

    return resolve_skyworkr1v_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=use_thumbnail,
    )
def resolve_target_ratios(
    self,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
    """Return the candidate tile grids for the resolved patch bounds.

    The bounds come from ``resolve_min_max_num`` and are expanded with
    ``get_skyworkr1v_target_ratios``.
    """
    lower, upper = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=use_thumbnail,
    )
    return get_skyworkr1v_target_ratios(lower, upper)
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    """Return the number of image tokens for an image of the given size.

    This is the block count from ``calculate_skyworkr1v_targets`` multiplied
    by the per-tile token count ``self.num_image_token``.
    """
    # The thumbnail is accounted for by calculate_skyworkr1v_targets, so it
    # is left out when resolving the candidate grids.
    candidate_grids = self.resolve_target_ratios(use_thumbnail=False)

    tile_count, _, _ = calculate_skyworkr1v_targets(
        orig_width=image_width,
        orig_height=image_height,
        image_size=self.image_size,
        target_ratios=candidate_grids,
        use_thumbnail=self.use_thumbnail,
    )

    return tile_count * self.num_image_token
def _images_to_pixel_values_lst(
    self,
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
    """Convert each image to its stacked tile tensor.

    Returns one tensor per input image, produced by
    ``image_to_pixel_values_skyworkr1v`` with this processor's settings.
    """
    # The thumbnail is added inside image_to_pixel_values_skyworkr1v, so it
    # is excluded when resolving the patch bounds.
    lower, upper = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=False,
    )

    pixel_values_per_image = []
    for picture in images:
        pixel_values_per_image.append(
            image_to_pixel_values_skyworkr1v(
                picture,
                input_size=self.image_size,
                min_num=lower,
                max_num=upper,
                use_thumbnail=self.use_thumbnail,
            )
        )
    return pixel_values_per_image
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
return_tensors: str | TensorType | None = None,
) -> BatchFeature:
if text is None:
text = []
if not isinstance(text, list):
text = [text]
if images is None:
images = []
if not isinstance(images, list):
images = [images]
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
class SkyworkR1VProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
text = [t.replace("<image>", image_repl.full, 1) for t in text]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
text_inputs = self.tokenizer(text)
return InternVLImageProcessor(**kwargs)
combined_outputs = {**text_inputs, **image_inputs}
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
return BatchFeature(combined_outputs, tensor_type=return_tensors)
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
return self.ctx.init_processor(
SkyworkR1VProcessor,
config=self.get_hf_config(),
return InternVLProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None}
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
processor: SkyworkR1VProcessor,
) -> int:
return processor.get_num_image_tokens(
image_width=image_width,
image_height=image_height,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
def get_image_size_with_most_features(self) -> ImageSize:
    """Return the image size that yields the most image tokens.

    Every candidate grid from the processor is scaled by the tile size and
    scored with ``get_num_image_tokens``; the first size with the highest
    score wins.

    Raises:
        ValueError: If no candidate produces a positive token count.
    """
    processor = self.get_hf_processor()
    tile_side = processor.image_size

    best_tokens = 0
    best_size = None
    for cols, rows in processor.resolve_target_ratios():
        candidate_width = tile_side * cols
        candidate_height = tile_side * rows
        tokens = self.get_num_image_tokens(
            image_width=candidate_width,
            image_height=candidate_height,
            processor=processor,
        )
        if tokens <= best_tokens:
            continue
        best_tokens = tokens
        best_size = ImageSize(width=candidate_width, height=candidate_height)

    if best_size is None or best_tokens == 0:
        raise ValueError("Cannot have a largest feature size of 0!")

    return best_size
class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
......@@ -546,102 +145,10 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn
}
class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessingInfo]):
    """Multimodal processor that expands ``<image>`` placeholders in prompts."""

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        """Run the parent processor call and attach the image token ID."""
        outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
            tok_kwargs=tok_kwargs,
        )

        context_token_id = self.info.get_hf_processor(**mm_kwargs).image_token_id

        # Since there may be extra tokens in the feature placeholders,
        # we need to pass the image token ID to the model to select the
        # tokens to merge from the vision encoder outputs
        outputs["image_token_id"] = torch.tensor(context_token_id)
        return outputs

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Describe how each processor output field maps onto images."""
        patches_per_image = hf_inputs.get("image_num_patches", torch.empty(0))
        image_count = len(patches_per_image)

        return {
            "pixel_values_flat": MultiModalFieldConfig.flat_from_sizes(
                "image", patches_per_image
            ),
            "image_num_patches": MultiModalFieldConfig.batched("image"),
            "image_embeds": MultiModalFieldConfig.batched("image"),
            "image_token_id": MultiModalFieldConfig.shared("image", image_count),
        }

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        """Create the replacement rule for every ``<image>`` placeholder."""
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        processed = out_mm_kwargs.get_data()

        if "image_num_patches" in processed:
            patch_counts = processed["image_num_patches"]
            assert isinstance(patch_counts, torch.Tensor)
            patch_counts = patch_counts.tolist()
        elif "image_embeds" in processed:
            # TODO: Use image size information in dictionary embedding inputs
            # to compute num_patches (similar to Qwen2-VL)
            patch_counts = [None] * len(processed["image_embeds"])
        else:
            patch_counts = []

        def get_replacement_skyworkr1v(item_idx: int):
            items = mm_items.get_items(
                "image", (ImageEmbeddingItems, ImageProcessorItems)
            )

            if isinstance(items, ImageEmbeddingItems):
                token_count = items.get_feature_size(item_idx)
            else:
                size = items.get_image_size(item_idx)
                token_count = self.info.get_num_image_tokens(
                    image_width=size.width,
                    image_height=size.height,
                    processor=hf_processor,
                )

            patch_count = patch_counts[item_idx]
            if patch_count is not None:
                assert isinstance(patch_count, int)

            return hf_processor.get_image_repl(token_count, patch_count)

        image_rule = PromptReplacement(
            modality="image",
            target="<image>",
            replacement=get_replacement_skyworkr1v,
        )
        return [image_rule]
@MULTIMODAL_REGISTRY.register_processor(
SkyworkR1VMultiModalProcessor,
BaseInternVLMultiModalProcessor,
info=SkyworkR1VProcessingInfo,
dummy_inputs=SkyworkR1VDummyInputsBuilder,
dummy_inputs=BaseInternVLDummyInputsBuilder,
)
class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
@classmethod
......
......@@ -2,18 +2,13 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from collections.abc import Iterable, Mapping, Sequence
from itertools import product
from math import ceil, sqrt
from math import sqrt
from typing import Annotated, Any, Literal, TypeAlias
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BatchFeature, PretrainedConfig, TensorType
from transformers import BatchFeature
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
......@@ -43,8 +38,12 @@ from vllm.multimodal.processing import (
PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.configs import Step3VisionEncoderConfig
from vllm.transformers_utils.configs.step3_vl import Step3VisionEncoderConfig
from vllm.transformers_utils.processors.step3_vl import (
MAX_IMAGE_SIZE,
Step3VLImageProcessor,
Step3VLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
......@@ -89,447 +88,32 @@ class Step3VLImageEmbeddingInputs(TensorSchema):
Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs
ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]
MAX_IMAGE_SIZE: int = 3024
class Step3VisionProcessor:
    """Turn images and image patches into normalized, resized tensors."""

    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
        """Build the full-image and patch pipelines.

        ``patch_size`` falls back to ``size`` when omitted. Any
        ``interpolation_mode`` other than ``"bicubic"`` selects bilinear
        resizing.
        """
        mean = [0.48145466, 0.4578275, 0.40821073]
        std = [0.26862954, 0.26130258, 0.27577711]
        patch_size = patch_size if patch_size is not None else size

        if interpolation_mode == "bicubic":
            interpolation = InterpolationMode.BICUBIC
        else:
            interpolation = InterpolationMode.BILINEAR

        def _pipeline(side):
            # Normalization is applied before resizing, as in the original.
            return transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std),
                    transforms.Resize(
                        (side, side),
                        interpolation=interpolation,
                        antialias=True,
                    ),
                ]
            )

        self.transform = _pipeline(size)
        # patch_size is only still None here when size itself is None.
        self.patch_transform = _pipeline(patch_size) if patch_size is not None else None

    def __call__(self, image, is_patch=False):
        """Apply the patch or full-image pipeline and add a batch dimension."""
        pipeline = self.patch_transform if is_patch else self.transform
        return {"pixel_values": pipeline(image).unsqueeze(0)}
class ImagePatcher:
    """Plan and cut square sliding-window patches out of an image.

    Calling an instance returns the (possibly padded and downscaled) image,
    the list of cropped patches, and a per-patch mask marking the patches
    that end a row (the final patch is never marked).
    """

    def __init__(self, enable_patch: bool = True) -> None:
        """Store whether patch extraction is enabled.

        With ``enable_patch=False``, ``__call__`` returns no patches and
        ``get_num_patches`` returns ``(0, 0)``.
        """
        self.enable_patch = enable_patch

    def determine_window_size(self, long: int, short: int) -> int:
        """Choose the square window edge; ``0`` means "do not patch".

        Args:
            long: Longer image edge in pixels.
            short: Shorter image edge in pixels.
        """
        if long < 728:
            # Small images are only patched when clearly elongated, and
            # then the short edge itself is the window.
            return short if long / short > 1.5 else 0
        # Otherwise use a 504px window, capped at the short edge for very
        # elongated (> 4:1) images.
        return min(short, 504) if long / short > 4 else 504

    def slide_window(
        self,
        width: int,
        height: int,
        sizes: list[tuple[int, int]],
        steps: list[tuple[int, int]],
        img_rate_thr: float = 0.6,
    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
        """Enumerate sliding windows over a ``width`` x ``height`` area.

        Args:
            width: Area width in pixels.
            height: Area height in pixels.
            sizes: ``(window_w, window_h)`` per window configuration.
            steps: ``(step_w, step_h)`` paired with each entry of ``sizes``.
            img_rate_thr: Only range-checked; not otherwise used here.

        Returns:
            ``(boxes, (x_num, y_num))`` where each box is ``(x, y, w, h)``
            in row-major order and ``(x_num, y_num)`` is the grid shape of
            the last ``sizes``/``steps`` pair processed.
        """
        assert 1 >= img_rate_thr >= 0, "The `img_rate_thr` should lie in 0~1"
        windows = []
        # Sliding windows.
        for size, step in zip(sizes, steps):
            size_w, size_h = size
            step_w, step_h = step

            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + 1)
            x_start = [step_w * i for i in range(x_num)]
            # Pull the last column back inside the area if it overshoots.
            if len(x_start) > 1 and x_start[-1] + size_w > width:
                x_start[-1] = width - size_w

            y_num = 1 if height <= size_h else ceil((height - size_h) / step_h + 1)
            y_start = [step_h * i for i in range(y_num)]
            # Same clamp for the last row.
            if len(y_start) > 1 and y_start[-1] + size_h > height:
                y_start[-1] = height - size_h

            # product() yields (y, x) pairs row by row; swap the columns to
            # (x, y) so each box reads [x0, y0, x1, y1].
            start = np.array(list(product(y_start, x_start)), dtype=int)
            start[:, [0, 1]] = start[:, [1, 0]]
            windows.append(np.concatenate([start, start + size], axis=1))
        windows = np.concatenate(windows, axis=0)

        return [
            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
            for box in windows
        ], (x_num, y_num)

    def square_pad(self, img: Image.Image) -> Image.Image:
        """Pad ``img`` to a square, anchoring it at the top-left corner.

        Already-square images are returned unchanged; the added area is
        filled with ``0``.
        """
        w, h = img.size
        if w == h:
            return img
        size = max(w, h)
        padded = Image.new(img.mode, (size, size), 0)
        padded.paste(img, (0, 0))
        return padded

    def get_image_size_for_padding(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Return the size after optional square padding.

        Only images whose short edge is under 32px and whose aspect ratio
        exceeds 4:1 (either way) are padded to a square.
        """
        ratio = img_width / img_height
        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
            new_size = max(img_height, img_width)
            return new_size, new_size
        return img_width, img_height

    def get_image_size_for_preprocess(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Scale the size down so the long edge fits ``MAX_IMAGE_SIZE``.

        Sizes already within the limit are returned unchanged; scaled edges
        are truncated to integers.
        """
        if max(img_height, img_width) > MAX_IMAGE_SIZE:
            scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width)
            img_width = int(img_width * scale_factor)
            img_height = int(img_height * scale_factor)
        return img_width, img_height

    @staticmethod
    def _snap_edge_to_window(length: int, window_size: int) -> int:
        """Snap one edge to a multiple of ``window_size``.

        Edges shorter than the window are kept. Otherwise the edge is
        rounded up when its fractional window count exceeds 0.2, and
        rounded down if not.
        """
        ratio = length / window_size
        if ratio < 1:
            return int(length)
        fraction = ratio - length // window_size
        windows = int(ratio) + 1 if fraction > 0.2 else int(ratio)
        return int(window_size * windows)

    def get_image_size_for_crop(
        self, img_width: int, img_height: int, window_size: int
    ):
        """Return the ``(width, height)`` the image is resized to for cropping."""
        return (
            self._snap_edge_to_window(img_width, window_size),
            self._snap_edge_to_window(img_height, window_size),
        )

    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
        """Crop a ``tw`` x ``th`` region whose top-left is row ``i``, column ``j``."""
        return img.crop((j, i, j + tw, i + th))

    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
        """Predict the patch layout for an image size without touching pixels.

        Returns:
            ``(num_patches, num_row_breaks)``; ``(0, 0)`` when patching is
            disabled or the image does not qualify for patching.
        """
        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
        img_width, img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        window_size = self.determine_window_size(
            max(img_height, img_width), min(img_height, img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return 0, 0

        img_width, img_height = self.get_image_size_for_crop(
            img_width, img_height, window_size
        )
        center_list, (x_num, _y_num) = self.slide_window(
            img_width,
            img_height,
            [(window_size, window_size)],
            [(window_size, window_size)],
        )
        num_windows = len(center_list)
        full_rows = (num_windows - 1) // x_num + 1
        # A row break after the very last patch is not counted.
        if num_windows > 0 and num_windows % x_num == 0:
            full_rows -= 1
        return num_windows, full_rows

    def __call__(
        self, img: Image.Image
    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
        """Prepare ``img`` and cut it into patches.

        Returns:
            ``(image, patches, newline_mask)``. ``image`` is the padded and
            downscaled input. ``patches`` is empty and ``newline_mask`` is
            ``None`` when no patching takes place; otherwise
            ``newline_mask[i]`` is true when patch ``i`` ends a row, except
            for the final patch.
        """
        img_width, img_height = img.size
        if self.get_image_size_for_padding(img_width, img_height) != (
            img_width,
            img_height,
        ):
            img = self.square_pad(img)
            img_width, img_height = img.size

        new_img_width, new_img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)

        window_size = self.determine_window_size(
            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return img, [], None

        crop_width, crop_height = self.get_image_size_for_crop(
            new_img_width, new_img_height, window_size
        )
        # Compare with the image as it is now. The earlier code compared
        # with the dimensions from before the preprocess resize, which could
        # skip a needed resize if those stale values matched the crop size.
        if (crop_width, crop_height) != img.size:
            img_for_crop = img.resize(
                (crop_width, crop_height), Image.Resampling.BILINEAR
            )
        else:
            img_for_crop = img

        center_list, (x_num, _y_num) = self.slide_window(
            crop_width,
            crop_height,
            [(window_size, window_size)],
            [(window_size, window_size)],
        )
        patches = [
            self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
            for x, y, patch_w, patch_h in center_list
        ]
        last = len(patches) - 1
        # Row-ending patches get a newline, but never the final patch.
        newline_mask = [
            (idx + 1) % x_num == 0 and idx != last for idx in range(len(patches))
        ]
        return img, patches, newline_mask if patches else None
class Step3VLProcessor:
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
) -> None:
    """Set up the tokenizer, image preprocessor, placeholders and patcher.

    Args:
        config: Model configuration; ``config.vision_config.enable_patch``
            (default ``True`` when absent) toggles patch extraction.
        tokenizer: Tokenizer used to resolve special-token ids.
    """
    super().__init__()

    self.config = config
    self.tokenizer = tokenizer

    # Edge lengths for the whole image and for each patch.
    self.image_size = 728
    self.patch_size = 504
    self.image_preprocessor = Step3VisionProcessor(
        self.image_size, "bilinear", self.patch_size
    )

    # Number of placeholder tokens emitted per whole image / per patch.
    self.num_image_feature_size = 169
    self.num_patch_feature_size = 81

    self.image_token = "<im_patch>"
    self.image_feature_placeholder = self.image_token * self.num_image_feature_size
    self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size

    # Respect vision config switch to enable/disable patch extraction.
    # For video understanding, it's preferable to disable patch.
    patching_enabled = getattr(self.config.vision_config, "enable_patch", True)
    self.patcher = ImagePatcher(enable_patch=patching_enabled)
@property
def image_token_id(self) -> int:
    """Vocabulary id of the image placeholder token (``self.image_token``)."""
    vocab = self.tokenizer.get_vocab()
    return vocab[self.image_token]
def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
    """Count the prompt tokens one image of the given size expands to.

    Every patch costs its feature tokens plus two delimiters, the whole
    image costs its feature tokens plus two delimiters, and each row break
    between patches costs one more token.
    """
    num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
    patch_tokens = num_patches * (self.num_patch_feature_size + 2)
    image_tokens = self.num_image_feature_size + 2
    return patch_tokens + image_tokens + num_newlines
def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
    """Run the patcher on every image, preserving input order."""
    return [self.patcher(image) for image in images]
def _convert_images_to_pixel_values(
    self,
    images: list[Image.Image],
    is_patch: bool = False,
) -> list[torch.Tensor]:
    """Preprocess each image and collect its ``"pixel_values"`` tensor.

    Args:
        images: Images (or patches) to preprocess.
        is_patch: Forwarded to the image preprocessor to select the patch
            pipeline.
    """
    pixel_values = []
    for image in images:
        processed = self.image_preprocessor(image, is_patch=is_patch)
        pixel_values.append(processed["pixel_values"])
    return pixel_values
def _get_patch_repl(
self,
num_patches: int,
patch_newline_mask: list[bool] | None,
) -> tuple[str, list[int]]:
text = ""
token_ids = []
for i in range(num_patches):
assert len(patch_newline_mask) == num_patches
text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
token_ids.extend(
[self.tokenizer.convert_tokens_to_ids("<patch_start>")]
+ [self.image_token_id] * self.num_patch_feature_size
+ [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
)
if patch_newline_mask and patch_newline_mask[i]:
text += "<patch_newline>"
token_ids.append(
self.tokenizer.convert_tokens_to_ids("<patch_newline>")
)
return text, token_ids
def _get_image_repl(
self,
num_images: int,
) -> tuple[str, list[int]]:
text = f"<im_start>{self.image_feature_placeholder}<im_end>"
token_ids = (
[self.tokenizer.convert_tokens_to_ids("<im_start>")]
+ [self.image_token_id] * self.num_image_feature_size
+ [self.tokenizer.convert_tokens_to_ids("<im_end>")]
)
return text * num_images, token_ids * num_images
def _get_image_repl_features(
self,
num_images: int,
num_patches: int,
patch_new_line_idx: list[bool] | None,
) -> tuple[str, list[int]]:
if num_patches > 0:
patch_repl, patch_repl_ids = self._get_patch_repl(
num_patches, patch_new_line_idx
)
else:
patch_repl = ""
patch_repl_ids = []
image_repl, image_repl_ids = self._get_image_repl(num_images)
return patch_repl + image_repl, patch_repl_ids + image_repl_ids
def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
    """Substitute the i-th occurrence of ``placeholder`` with ``repls[i]``.

    Raises:
        ValueError: If the number of occurrences differs from ``len(repls)``.
    """
    segments = text.split(placeholder)
    if len(segments) - 1 != len(repls):
        raise ValueError(
            "The number of placeholders does not match the number of replacements."
        )

    # Interleave: segment, replacement, segment, ..., segment.
    pieces = [segments[0]]
    for repl, following in zip(repls, segments[1:]):
        pieces.append(repl)
        pieces.append(following)
    return "".join(pieces)
class Step3VLProcessingInfo(BaseProcessingInfo):
def get_image_processor(self, **kwargs):
config = self.get_hf_config()
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
return_tensors: str | TensorType | None = None,
) -> BatchFeature:
if text is None:
text = []
if not isinstance(text, list):
text = [text]
if images is None:
images = []
if not isinstance(images, list):
images = [images]
if len(images) == 0:
image_inputs = {}
text_inputs = self.tokenizer(text)
else:
split_images_data = self._split_images(images)
pixel_values_lst = []
patch_pixel_values_lst = []
patch_newline_mask_lst = []
image_repl_str_lst = []
image_repl_ids_lst = []
num_patches = []
for raw_img, img_patches, patch_newline_mask in split_images_data:
pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
if len(img_patches) > 0:
patch_pixel_values_lst.extend(
self._convert_images_to_pixel_values(img_patches, is_patch=True)
)
num_patches.append(len(img_patches))
image_repl_str, image_repl_ids = self._get_image_repl_features(
1, len(img_patches), patch_newline_mask
)
image_repl_str_lst.append(image_repl_str)
image_repl_ids_lst.extend(image_repl_ids)
if patch_newline_mask is not None:
patch_newline_mask_lst.extend(patch_newline_mask)
pixel_values = torch.cat(pixel_values_lst)
patch_size = self.patch_size
image_inputs = {
"pixel_values": pixel_values,
"num_patches": num_patches,
"patch_pixel_values": (
torch.cat(patch_pixel_values_lst)
if patch_pixel_values_lst
else pixel_values.new_empty((0, 3, patch_size, patch_size))
),
"patch_newline_mask": torch.tensor(
patch_newline_mask_lst, dtype=torch.bool
),
}
text = [
self.replace_placeholder(t, self.image_token, image_repl_str_lst)
for t in text
]
text_inputs = self.tokenizer(text)
return BatchFeature(
{
**text_inputs,
**image_inputs,
},
tensor_type=return_tensors,
kwargs.setdefault(
"enable_patch",
getattr(config.vision_config, "enable_patch", True),
)
return Step3VLImageProcessor(**kwargs)
class Step3VLProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self) -> Step3VLProcessor:
return Step3VLProcessor(
self.get_hf_config(),
self.get_tokenizer(),
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(),
)
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None}
def get_max_image_tokens(self) -> int:
hf_processor = self.get_hf_processor()
return hf_processor.get_num_image_tokens(
self.get_image_size_with_most_features().width,
self.get_image_size_with_most_features().height,
)
image_processor = self.get_image_processor()
target_width, target_height = self.get_image_size_with_most_features()
return image_processor.get_num_image_tokens(target_width, target_height)
def get_mm_max_tokens_per_item(
self,
......@@ -539,20 +123,7 @@ class Step3VLProcessingInfo(BaseProcessingInfo):
return {"image": self.get_max_image_tokens()}
def get_image_size_with_most_features(self) -> ImageSize:
return ImageSize(3024, 3024)
def get_num_mm_tokens(self, mm_data: MultiModalDataDict) -> int:
    """Return the total number of prompt tokens used by the images in ``mm_data``.

    Args:
        mm_data: Multimodal inputs. Must hold exactly one key, ``"image"``,
            mapped to a single image or a list/tuple of images; each image
            must expose ``width`` and ``height``.

    Raises:
        ValueError: If ``mm_data`` has any key other than ``"image"`` or
            lacks it.
    """
    if len(mm_data) != 1 or "image" not in mm_data:
        raise ValueError("mm_data could only contain one key 'image' for step1o")

    image_data = mm_data["image"]
    if not isinstance(image_data, (list, tuple)):
        image_data = [image_data]
    if not image_data:
        return 0

    # Build the processor once rather than once per image.
    hf_processor = self.get_hf_processor()
    return sum(
        hf_processor.get_num_image_tokens(img.width, img.height)
        for img in image_data
    )
return ImageSize(MAX_IMAGE_SIZE, MAX_IMAGE_SIZE)
class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
......@@ -594,13 +165,11 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo])
def get_replacement_step1o(item_idx: int):
out_item = out_mm_kwargs["image"][item_idx]
num_patches = int(out_item["num_patches"].data)
if num_patches > 0:
patch_newline_mask = out_item["patch_newline_mask"].data
image_repl_ids = hf_processor._get_image_repl_features(
1, num_patches, patch_newline_mask.tolist()
)[1]
else:
image_repl_ids = hf_processor._get_image_repl_features(1, 0, None)[1]
patch_newline_mask = out_item["patch_newline_mask"].data
image_repl_ids = hf_processor.get_image_repl_feature_ids(
1, num_patches, patch_newline_mask.tolist()
)
return PromptUpdateDetails.select_token_id(
seq=image_repl_ids,
embed_token_id=image_placeholder_token_id,
......
......@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.parse import (
ImageEmbeddingItems,
......@@ -34,10 +33,8 @@ from vllm.multimodal.parse import (
MultiModalDataItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
InputProcessingContext,
PromptReplacement,
PromptUpdate,
)
......@@ -329,25 +326,6 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
]
def _build_tarsier_hf_info(ctx: InputProcessingContext) -> TarsierProcessingInfo:
    """Create the Tarsier processing info bound to ``ctx``."""
    processing_info = TarsierProcessingInfo(ctx)
    return processing_info
def _build_tarsier_hf_processor(
    info: _I_Tarsier,
    dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
) -> BaseMultiModalProcessor:
    """Create the multimodal processor matching ``info``.

    Raises:
        NotImplementedError: If ``info`` is not a ``TarsierProcessingInfo``;
            the offending type is passed as the exception argument.
    """
    if not isinstance(info, TarsierProcessingInfo):
        raise NotImplementedError(type(info))

    return TarsierMultiModalProcessor(
        info,
        dummy_inputs,
        cache=cache,
    )
def init_vision_tower_for_tarsier(
hf_config: TarsierHfConfig, # Use the Tarsier specific config protocol
quant_config: QuantizationConfig | None,
......@@ -395,8 +373,8 @@ def init_vision_tower_for_tarsier(
@MULTIMODAL_REGISTRY.register_processor(
_build_tarsier_hf_processor,
info=_build_tarsier_hf_info,
TarsierMultiModalProcessor,
info=TarsierProcessingInfo,
dummy_inputs=TarsierDummyInputsBuilder,
)
class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
......
......@@ -404,12 +404,14 @@ class UltravoxTransformerProjector(nn.Module, ModuleUtilsMixin):
kwargs["layer_head_mask"] = None
for layer in self.layers:
layer_outputs = layer(
hidden_states = layer(
hidden_states,
attention_mask=extended_attention_mask,
**kwargs,
)
hidden_states = layer_outputs[0]
# BC version that allows for the old tupled output
if isinstance(hidden_states, tuple):
hidden_states = hidden_states[0]
hidden_states = self.ln_post(hidden_states)
hidden_states = self.linear_out(hidden_states)
......@@ -509,13 +511,14 @@ class ModifiedWhisperEncoder(WhisperEncoder):
kwargs["layer_head_mask"] = None
for encoder_layer in self.layers:
layer_outputs = encoder_layer(
hidden_states = encoder_layer(
hidden_states,
attention_mask,
**kwargs,
)
hidden_states = layer_outputs[0]
# BC version that allows for the old tupled output
if isinstance(hidden_states, tuple):
hidden_states = hidden_states[0]
hidden_states = self.layer_norm(hidden_states)
return hidden_states
......
......@@ -150,8 +150,10 @@ def create_whisper_attention_backend_with_block_pooling(
new_common_attn_metadata.query_start_loc *= block_pool_size
new_common_attn_metadata.query_start_loc_cpu *= block_pool_size
new_common_attn_metadata.seq_lens *= block_pool_size
new_common_attn_metadata._seq_lens_cpu *= block_pool_size
new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
if new_common_attn_metadata._seq_lens_cpu is not None:
new_common_attn_metadata._seq_lens_cpu *= block_pool_size
if new_common_attn_metadata._num_computed_tokens_cpu is not None:
new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
new_common_attn_metadata.num_actual_tokens *= block_pool_size
new_common_attn_metadata.max_query_len *= block_pool_size
new_common_attn_metadata.max_seq_len *= block_pool_size
......
......@@ -431,10 +431,32 @@ class _ModuleOffloader:
Called after process_weights_after_loading to ensure _cpu_storage
contains the final processed weights, not stale pre-loading data.
Parameters whose underlying nn.Parameter was deleted by
process_weights_after_loading (e.g. transient KV-cache scale params)
are pruned from self._param_offloaders so they do not participate in
buffer-pool allocation or prefetching.
"""
for param_offloader in self._param_offloaders.values():
param_offloader.sync_cpu_storage()
# Remove offloaders whose parameter was deleted during
# process_weights_after_loading (e.g. k_scale / v_scale).
deleted = [
name
for name, offloader in self._param_offloaders.items()
if getattr(offloader, "_param_deleted", False)
]
if deleted:
logger.debug(
"Pruning %d transient offloaded param(s) that were deleted "
"by process_weights_after_loading: %s",
len(deleted),
deleted,
)
for name in deleted:
del self._param_offloaders[name]
def get_param_infos(self) -> list[ParamInfo]:
"""Get parameter metadata for buffer pool allocation.
......@@ -590,6 +612,11 @@ class _CpuParamOffloader(_BaseParamOffloader):
super().__init__(module, param_name)
self._cpu_storage: torch.Tensor | None = None
self._gpu_buffer: torch.Tensor | None = None # Store reference to GPU buffer
# Set to True if the underlying nn.Parameter was deleted by
# process_weights_after_loading (e.g. transient KV-cache scale params
# such as k_scale/v_scale created by BaseKVCacheMethod.create_weights
# and deleted after copying into permanent _k_scale buffers).
self._param_deleted: bool = False
# Offload to CPU immediately to free GPU memory during model loading
self._offload_to_cpu_internal()
......@@ -696,8 +723,22 @@ class _CpuParamOffloader(_BaseParamOffloader):
1. process_weights_after_loading may transform weights (quantization)
2. device_loading_context creates NEW CPU tensors when moving back
3. Our old _cpu_storage would have pre-processed or stale data
If the parameter no longer exists on the module (e.g. transient
KV-cache scale parameters such as k_scale/v_scale that are created
by BaseKVCacheMethod.create_weights() and then deleted by
process_weights_after_loading() after copying their values into
permanent _k_scale buffers), the offloader marks itself as deleted
and skips the sync. The caller (_ModuleOffloader.sync_cpu_storage)
is responsible for removing these stale entries.
"""
self._update_cpu_storage_from_param()
try:
self._update_cpu_storage_from_param()
except AttributeError:
# The parameter was deleted by process_weights_after_loading.
# Drop the now-stale CPU storage so this offloader can be pruned.
self._param_deleted = True
self._cpu_storage = None
def post_init(self):
"""No-op: offloading done in offload_to_cpu/assign_static_buffer."""
......
......@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
)
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
from vllm.model_executor.layers.quantization.mxfp8 import Mxfp8OnlineLinearMethod
from vllm.tracing import instrument
from vllm.utils.deep_gemm import (
fp8_gemm_nt,
......@@ -136,8 +137,9 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:
if not (
isinstance(module, LinearBase)
and isinstance(module.quant_method, Fp8LinearMethod)
and module.quant_method.block_quant
and not module.quant_method.use_marlin
and not isinstance(module.quant_method, Mxfp8OnlineLinearMethod)
and getattr(module.quant_method, "block_quant", False)
and not getattr(module.quant_method, "use_marlin", True)
):
return False
......
......@@ -12,17 +12,35 @@ import torch
from vllm.utils.import_utils import PlaceholderModule
try:
import librosa
import av as av
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
av = PlaceholderModule("av") # type: ignore[assignment]
try:
import resampy
except ImportError:
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
try:
import scipy.signal as scipy_signal
except ImportError:
scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment]
# ============================================================
# Aligned with `librosa.get_duration` function
def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float:
    """Compute how long an audio array lasts, in seconds.

    Args:
        y: Audio samples; the last axis is the sample axis, so both
            ``(samples,)`` and ``(channels, samples)`` layouts work.
        sr: Sample rate in Hz.

    Returns:
        Number of samples on the last axis divided by ``sr``.
    """
    return float(y.shape[-1]) / sr
class ChannelReduction(str, Enum):
......@@ -153,13 +171,71 @@ def normalize_audio(
# ============================================================
def resample_audio_librosa(
def resample_audio_pyav(
audio: npt.NDArray[np.floating],
*,
orig_sr: float,
target_sr: float,
) -> npt.NDArray[np.floating]:
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
"""Resample audio using PyAV (libswresample via FFmpeg).
Args:
audio: Input audio. Can be:
- 1D array ``(samples,)``: mono audio
- 2D array ``(channels, samples)``: stereo audio
orig_sr: Original sample rate in Hz.
target_sr: Target sample rate in Hz.
Returns:
Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
"""
orig_sr_int = int(round(orig_sr))
target_sr_int = int(round(target_sr))
if orig_sr_int == target_sr_int:
return audio
if audio.ndim == 2:
# Resample each channel independently and re-stack.
return np.stack(
[
resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
for ch in audio
],
axis=0,
)
expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int))
# from_ndarray expects shape (channels, samples) for planar formats.
# libswresample requires a minimum number of input samples to produce
# output frames; pad short inputs with zeros so we always get output,
# then trim to the expected output length.
_MIN_SAMPLES = 1024
audio_f32 = np.asarray(audio, dtype=np.float32)
if len(audio_f32) < _MIN_SAMPLES:
audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32)))
audio_f32 = audio_f32.reshape(1, -1)
resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int)
frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono")
frame.sample_rate = orig_sr_int
out_frames = resampler.resample(frame)
out_frames.extend(resampler.resample(None)) # flush buffered samples
result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
return result[:expected_len]
def resample_audio_resampy(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` via ``resampy``.

    Args:
        audio: Input audio samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        Whatever ``resampy.resample`` produces for these arguments.
    """
    resampled = resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)
    return resampled
def resample_audio_scipy(
......@@ -167,7 +243,7 @@ def resample_audio_scipy(
*,
orig_sr: float,
target_sr: float,
):
) -> npt.NDArray[np.floating]:
if orig_sr > target_sr:
return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
elif orig_sr < target_sr:
......@@ -181,7 +257,7 @@ class AudioResampler:
def __init__(
self,
target_sr: float | None = None,
method: Literal["librosa", "scipy"] = "librosa",
method: Literal["pyav", "resampy", "scipy"] = "resampy",
):
self.target_sr = target_sr
self.method = method
......@@ -203,8 +279,10 @@ class AudioResampler:
abs_tol=1e-6,
):
return audio
if self.method == "librosa":
return resample_audio_librosa(
if self.method == "pyav":
return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
if self.method == "resampy":
return resample_audio_resampy(
audio, orig_sr=orig_sr, target_sr=self.target_sr
)
elif self.method == "scipy":
......@@ -214,7 +292,7 @@ class AudioResampler:
else:
raise ValueError(
f"Invalid resampling method: {self.method}. "
"Supported methods are 'librosa' and 'scipy'."
"Supported methods are 'pyav' and 'scipy'."
)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import math
from io import BytesIO
from pathlib import Path
......@@ -15,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
from .base import MediaIO
try:
import librosa
import av
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
av = PlaceholderModule("av") # type: ignore[assignment]
try:
import soundfile
except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
try:
import av
import resampy
except ImportError:
av = PlaceholderModule("av") # type: ignore[assignment]
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
def extract_audio_from_video_bytes(
data: bytes,
) -> tuple[npt.NDArray, float]:
"""Extract the audio track from raw video bytes using PyAV.
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
PyAV wraps FFmpeg's C libraries in-process — no subprocess is
spawned, which is critical to avoid crashing CUDA-active vLLM
worker processes.
The returned waveform is at the native sample rate of the video's
audio stream. Resampling to a model-specific rate is left to the
downstream :class:`AudioResampler` in the parsing pipeline.
def load_audio_pyav(
path: BytesIO | Path | str,
*,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[npt.NDArray, float]:
"""Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
Decodes the audio stream at its native sample rate. Channel reduction to
mono is performed by averaging across channels. Resampling to a
model-specific rate is left to the downstream :class:`AudioResampler`.
Args:
data: Raw video file bytes (e.g. from an mp4 file).
path: A :class:`~io.BytesIO` buffer, a filesystem
:class:`~pathlib.Path`, or a string path.
Returns:
A tuple of ``(waveform, sample_rate)`` suitable for use as an
:class:`AudioItem`.
``(waveform, sample_rate)`` where *waveform* is a 1-D float32
NumPy array and *sample_rate* is the native sample rate in Hz.
"""
if data is None or len(data) == 0:
raise ValueError(
"Cannot extract audio: video bytes are missing or empty. "
"Ensure video was loaded with keep_video_bytes=True for "
"audio-in-video extraction."
)
native_sr = None
try:
with av.open(BytesIO(data)) as container:
with av.open(path) as container:
if not container.streams.audio:
raise ValueError("No audio stream found in the video.")
raise ValueError("No audio stream found.")
stream = container.streams.audio[0]
stream.thread_type = "AUTO"
native_sr = stream.rate
sr = sr or native_sr
chunks: list[npt.NDArray] = []
for frame in container.decode(audio=0):
arr = frame.to_ndarray()
chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr)
needs_resampling = not math.isclose(
float(sr),
float(native_sr),
rel_tol=0.0,
abs_tol=1e-6,
)
resampler = (
av.AudioResampler(format="fltp", layout="mono", rate=sr)
if needs_resampling
else None
)
for frame in container.decode(stream):
if needs_resampling:
assert resampler is not None
for out_frame in resampler.resample(frame):
chunks.append(out_frame.to_ndarray())
else:
chunks.append(frame.to_ndarray())
except ValueError:
raise
except Exception as e:
......@@ -78,37 +100,54 @@ def extract_audio_from_video_bytes(
if not chunks:
raise ValueError("No audio found in the video.")
audio = np.concatenate(chunks).astype(np.float32)
return audio, float(native_sr)
audio = np.concatenate(chunks, axis=-1).astype(np.float32)
if mono and audio.ndim > 1:
audio = np.mean(audio, axis=0)
return audio, sr
def is_video(data: bytes) -> bool:
"""Check if the fetched bytes are video"""
if len(data) < 12:
return False
box_type = data[4:8]
major_brand = data[8:12]
def load_audio_soundfile(
path: BytesIO | Path | str,
*,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[np.ndarray, int]:
"""Load audio via soundfile"""
with soundfile.SoundFile(path) as f:
native_sr = f.samplerate
y = f.read(dtype="float32", always_2d=False).T
MP4_BRANDS = {
b"mp41",
b"mp42", # MP4
b"isom", # ISO Base Media
b"iso2",
b"iso4",
b"iso5",
b"iso6",
b"M4V ",
b"M4A ", # Apple
b"avc1", # H.264
b"dash", # DASH
b"mmp4",
b"MSNV",
}
if mono and y.ndim > 1:
y = np.mean(y, axis=tuple(range(y.ndim - 1)))
is_avi = data[:4] == b"RIFF" and major_brand == b"AVI "
is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS
return is_mp4 or is_avi
if sr is not None and sr != native_sr:
y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
return y, int(sr)
return y, native_sr
def load_audio(
    path: BytesIO | Path | str,
    *,
    sr: float | None = 22050,
    mono: bool = True,
):
    """Load audio with soundfile, falling back to PyAV for unreadable formats.

    Args:
        path: In-memory buffer or filesystem path of the audio.
        sr: Sample rate forwarded to the loaders.
        mono: Mono flag forwarded to the loaders.

    Returns:
        The result of ``load_audio_soundfile`` or, on fallback, of
        ``load_audio_pyav``.

    Raises:
        soundfile.LibsndfileError: When soundfile fails with a code outside
            ``_BAD_SF_CODES``.
        ValueError: When the PyAV fallback fails for any reason.
    """
    try:
        return load_audio_soundfile(path, sr=sr, mono=mono)
    except soundfile.LibsndfileError as sf_error:
        # Only fall back for known format-detection failures.
        # Re-raise anything else (e.g. corrupt but recognised format).
        fallback_allowed = sf_error.code in _BAD_SF_CODES
        if not fallback_allowed:
            raise

        # soundfile may have advanced the BytesIO seek position before failing;
        # reset it so PyAV can read from the beginning.
        if isinstance(path, BytesIO):
            path.seek(0)

        try:
            return load_audio_pyav(path, sr=sr, mono=mono)
        except Exception as fallback_error:
            raise ValueError("Invalid or unsupported audio file.") from fallback_error
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
......@@ -129,19 +168,17 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
self.kwargs = kwargs
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
if is_video(data):
return extract_audio_from_video_bytes(data)
return librosa.load(BytesIO(data), sr=None)
return load_audio(BytesIO(data), sr=None)
def load_base64(
self,
media_type: str,
data: str,
) -> tuple[npt.NDArray, float]:
return self.load_bytes(base64.b64decode(data))
return self.load_bytes(pybase64.b64decode(data))
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
return librosa.load(filepath, sr=None)
return load_audio(filepath, sr=None)
def encode_base64(
self,
......@@ -155,7 +192,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
soundfile.write(buffer, audio, sr, format=audio_format)
data = buffer.getvalue()
return base64.b64encode(data).decode("utf-8")
return pybase64.b64encode(data).decode("utf-8")
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
from functools import partial
from pathlib import Path
from typing import Any
import numpy as np
import numpy.typing as npt
import pybase64
from PIL import Image
from vllm import envs
......@@ -80,11 +80,23 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
"image/jpeg",
)
return np.stack(
frames = np.stack(
[np.asarray(load_frame(frame_data)) for frame_data in data.split(",")]
), {}
return self.load_bytes(base64.b64decode(data))
)
total = int(frames.shape[0])
fps = float(self.kwargs.get("fps", 1))
duration = total / fps if fps > 0 else 0.0
metadata = {
"total_num_frames": total,
"fps": fps,
"duration": duration,
"video_backend": "jpeg_sequence",
"frames_indices": list(range(total)),
"do_sample_frames": False,
}
return frames, metadata
return self.load_bytes(pybase64.b64decode(data))
def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
with filepath.open("rb") as f:
......
......@@ -497,7 +497,7 @@ class MultiModalDataParser:
*,
target_sr: float | None = None,
target_channels: int | None = None,
audio_resample_method: Literal["librosa", "scipy"] = "librosa",
audio_resample_method: Literal["pyav", "scipy"] = "pyav",
video_needs_metadata: bool = False,
expected_hidden_size: int | None = None,
) -> None:
......
......@@ -1682,6 +1682,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
skip_decoder_start_token: bool = False
@abstractmethod
def create_encoder_prompt(
self,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import json
from abc import abstractmethod
from collections.abc import Sequence
......@@ -18,7 +19,7 @@ from openai.types.responses.response_output_text import Logprob
from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent,
)
from pydantic import TypeAdapter
from pydantic import TypeAdapter, ValidationError
from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.chat_completion.protocol import (
......@@ -154,7 +155,9 @@ class Parser:
@abstractmethod
def extract_response_outputs(
self,
*,
model_output: str,
model_output_token_ids: Sequence[int],
request: ResponsesRequest,
enable_auto_tools: bool = False,
tool_call_id_type: str = "random",
......@@ -169,6 +172,7 @@ class Parser:
Args:
model_output: The complete model-generated string.
model_output_token_ids: The token IDs of the model output.
request: The request object used to generate the output.
enable_auto_tools: Whether to enable automatic tool call parsing.
tool_call_id_type: Type of tool call ID generation ("random", etc).
......@@ -195,7 +199,7 @@ class Parser:
request: The request object used to generate the output.
Returns:
A tuple of (reasoning_content, response_content).
A tuple of (reasoning, response_content).
"""
@abstractmethod
......@@ -312,7 +316,9 @@ class DelegatingParser(Parser):
def extract_response_outputs(
self,
*,
model_output: str,
model_output_token_ids: Sequence[int],
request: ResponsesRequest,
enable_auto_tools: bool = False,
tool_call_id_type: str = "random",
......@@ -422,15 +428,19 @@ class DelegatingParser(Parser):
if request.tool_choice == "required":
# Required tool calls - parse JSON
assert content is not None
tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content)
function_calls.extend(
FunctionCall(
name=tool_call.name,
arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
tool_calls = []
with contextlib.suppress(ValidationError):
content = content or ""
tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(
content
)
for tool_call in tool_calls:
function_calls.append(
FunctionCall(
name=tool_call.name,
arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
)
)
for tool_call in tool_calls
)
return function_calls, None # Clear content since tool is called.
if (
......
......@@ -199,7 +199,7 @@ class ParserManager:
parser: type[ToolParser] | None = None
if not enable_auto_tools or tool_parser_name is None:
return parser
logger.info('"auto" tool choice has been enabled.')
logger.info_once('"auto" tool choice has been enabled.')
try:
if (
......
......@@ -281,6 +281,9 @@ class CpuPlatform(Platform):
# Disable multi-stream for shared experts as no Stream on CPU
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"
# Avoid inductor generates num_thread() and breaks the thread binding
os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1"
# Intel OpenMP setting
ld_preload_str = os.getenv("LD_PRELOAD", "")
if "libiomp5.so" in ld_preload_str:
......
......@@ -4,6 +4,8 @@
pynvml. However, it should not initialize cuda context.
"""
from __future__ import annotations
import os
from collections.abc import Callable
from datetime import timedelta
......@@ -17,6 +19,7 @@ from typing_extensions import ParamSpec
# import custom ops, trigger op registration
import vllm._C # noqa
import vllm._C_stable_libtorch # noqa
from vllm.logger import init_logger
from vllm.utils.import_utils import import_pynvml
from vllm.utils.torch_utils import cuda_device_count_stateless
......@@ -49,21 +52,34 @@ def _get_backend_priorities(
use_mla: bool,
device_capability: DeviceCapability,
num_heads: int | None = None,
kv_cache_dtype: CacheDType | None = None,
) -> list[AttentionBackendEnum]:
"""Get backend priorities with lazy import to avoid circular dependency."""
if use_mla:
if device_capability.major == 10:
# Prefer FlashInfer at low head counts (FlashMLA uses padding)
if num_heads is not None and num_heads <= 16:
# Sparse MLA backend priorities
# See https://github.com/vllm-project/vllm/issues/35807 for
# benchmark results
if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
# Prefer FlashInfer for fp8 kv cache
sparse_backends = [
AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
AttentionBackendEnum.FLASHMLA_SPARSE,
]
else:
sparse_backends = [
AttentionBackendEnum.FLASHMLA_SPARSE,
AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
]
# BF16 KV Cache
# Prefer FlashInfer at low head counts (FlashMLA uses padding)
if num_heads is not None and num_heads <= 16:
sparse_backends = [
AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
AttentionBackendEnum.FLASHMLA_SPARSE,
]
else:
sparse_backends = [
AttentionBackendEnum.FLASHMLA_SPARSE,
AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
]
return [
AttentionBackendEnum.FLASHINFER_MLA,
AttentionBackendEnum.CUTLASS_MLA,
......@@ -165,7 +181,7 @@ class CudaPlatformBase(Platform):
pass
@classmethod
def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
parallel_config = vllm_config.parallel_config
model_config = vllm_config.model_config
......@@ -198,11 +214,11 @@ class CudaPlatformBase(Platform):
def get_valid_backends(
cls,
device_capability: DeviceCapability,
attn_selector_config: "AttentionSelectorConfig",
attn_selector_config: AttentionSelectorConfig,
num_heads: int | None = None,
) -> tuple[
list[tuple["AttentionBackendEnum", int]],
dict["AttentionBackendEnum", tuple[int, list[str]]],
list[tuple[AttentionBackendEnum, int]],
dict[AttentionBackendEnum, tuple[int, list[str]]],
]:
valid_backends_priorities = []
invalid_reasons: dict[AttentionBackendEnum, tuple[int, list[str]]] = {}
......@@ -211,6 +227,7 @@ class CudaPlatformBase(Platform):
attn_selector_config.use_mla,
device_capability,
num_heads,
attn_selector_config.kv_cache_dtype,
)
for priority, backend in enumerate(backend_priorities):
try:
......@@ -231,8 +248,8 @@ class CudaPlatformBase(Platform):
@classmethod
def get_attn_backend_cls(
cls,
selected_backend: "AttentionBackendEnum | None",
attn_selector_config: "AttentionSelectorConfig",
selected_backend: AttentionBackendEnum | None,
attn_selector_config: AttentionSelectorConfig,
num_heads: int | None = None,
) -> str:
device_capability = cls.get_device_capability()
......@@ -324,7 +341,7 @@ class CudaPlatformBase(Platform):
return selected_backend.get_path()
@classmethod
def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]:
def get_supported_vit_attn_backends(cls) -> list[AttentionBackendEnum]:
if cls.has_device_capability(80):
return [
AttentionBackendEnum.FLASH_ATTN,
......@@ -345,8 +362,8 @@ class CudaPlatformBase(Platform):
cls,
head_size: int,
dtype: torch.dtype,
backend: "AttentionBackendEnum | None" = None,
) -> "AttentionBackendEnum":
backend: AttentionBackendEnum | None = None,
) -> AttentionBackendEnum:
if backend is not None:
assert backend in cls.get_supported_vit_attn_backends(), (
f"Backend {backend} is not supported for vit attention. "
......@@ -371,7 +388,8 @@ class CudaPlatformBase(Platform):
)
if is_backend_supported:
logger.info_once(
f"Using backend {vit_attn_backend} for vit attention"
f"Using backend {vit_attn_backend} for vit attention",
scope="local",
)
return vit_attn_backend
except ImportError:
......@@ -493,6 +511,11 @@ class CudaPlatformBase(Platform):
def support_static_graph_mode(cls) -> bool:
return True
@classmethod
def support_deep_gemm(cls) -> bool:
"""Currently, only Hopper and Blackwell GPUs are supported."""
return cls.is_device_capability(90) or cls.is_device_capability_family(100)
@classmethod
def num_compute_units(cls, device_id: int = 0) -> int:
return torch.cuda.get_device_properties(device_id).multi_processor_count
......
......@@ -712,6 +712,13 @@ class Platform:
"""
return False
@classmethod
def support_deep_gemm(cls) -> bool:
"""
Returns if DeepGEMM is supported by the current platform.
"""
return False
@classmethod
def use_custom_op_collectives(cls) -> bool:
"""
......
......@@ -28,6 +28,7 @@ try:
from amdsmi import (
AmdSmiException,
amdsmi_get_gpu_asic_info,
amdsmi_get_gpu_device_uuid,
amdsmi_get_processor_handles,
amdsmi_init,
amdsmi_shut_down,
......@@ -439,8 +440,6 @@ class RocmPlatform(Platform):
device_capability = cls.get_device_capability()
assert device_capability is not None
attn_selector_config = attn_selector_config._replace(block_size=None)
# First try checking just the selected backend, if there is one.
if selected_backend is not None:
try:
......@@ -611,6 +610,20 @@ class RocmPlatform(Platform):
return _ROCM_DEVICE_ID_NAME_MAP[device_name]
return asic_info["market_name"]
@classmethod
@with_amdsmi_context
def get_device_uuid(cls, device_id: int = 0) -> str:
    """Return the UUID amdsmi reports for the GPU at index ``device_id``.

    Args:
        device_id: Index into the list returned by
            ``amdsmi_get_processor_handles()``.

    Returns:
        The value returned by ``amdsmi_get_gpu_device_uuid`` for the
        selected processor handle, or an empty string if either amdsmi
        query raises ``AmdSmiException`` (the failure is logged).
    """
    try:
        device = amdsmi_get_processor_handles()[device_id]
    except AmdSmiException as error:
        logger.error("amdsmi device query failed ", exc_info=error)
        return ""
    try:
        return amdsmi_get_gpu_device_uuid(device)
    except AmdSmiException as error:
        logger.error("amdsmi device uuid query failed ", exc_info=error)
        # Mirror the handle-lookup failure path above. Previously this
        # handler fell through to `return device_uuid` with the local
        # never assigned, raising UnboundLocalError instead of degrading
        # gracefully.
        return ""
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.cuda.get_device_properties(device_id)
......@@ -668,7 +681,6 @@ class RocmPlatform(Platform):
def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
from vllm.config.compilation import CUDAGraphMode
cache_config = vllm_config.cache_config
compilation_config = vllm_config.compilation_config
parallel_config = vllm_config.parallel_config
......@@ -690,32 +702,9 @@ class RocmPlatform(Platform):
)
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
if cache_config and not cache_config.user_specified_block_size:
if (
envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
# NOTE: This block has been deprecated
# or get_env_variable_attn_backend()
# == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
# TODO: monitor https://github.com/vllm-project/vllm/pull/30396
# to see how we can transition to the new way of selecting
# attention backends
):
cache_config.block_size = 64
logger.warning(
"[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
)
else:
cache_config.block_size = 16
if parallel_config.worker_cls == "auto":
parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
@classmethod
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
# TODO: ROCm still sets block_size in check_and_update_config.
# Move that logic here so block_size is chosen by the backend.
pass
@classmethod
def verify_model_arch(cls, model_arch: str) -> None:
if model_arch in _ROCM_UNSUPPORTED_MODELS:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment