Unverified Commit 759ef49b authored by Woosuk Kwon's avatar Woosuk Kwon Committed by GitHub
Browse files

Remove V0 Encoder-Decoder Support (#24907)


Signed-off-by: default avatarWoosuk Kwon <woosuk@thinkingmachines.ai>
parent 5206ab20
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
from PIL import Image
from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
from ...utils import check_logprobs_close
MODELS = ["microsoft/Florence-2-base"]
# Florence-2 model repo's tokenizer config is missing some special tokens.
# Therefore, we use a converted tokenizer from a forked repo
TOKENIZER = "Isotr0py/Florence-2-tokenizer"
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<OD>", # special task token which will output special tokens
"cherry_blossom":
"Describe in detail what is shown in the image.",
})
def get_hf_images_prompts(
prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]],
) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]:
prompts, images = [], []
for prompt in prompts_:
encoder_prompt = prompt["encoder_prompt"]
prompts.append(
ExplicitEncoderDecoderPrompt(
encoder_prompt=encoder_prompt["prompt"],
decoder_prompt=None,
))
images.append(encoder_prompt["multi_modal_data"]["image"])
return prompts, images
def hf_to_vllm_output(hf_output: tuple[list[int], str,
Optional[SampleLogprobs]]):
"""Sanitize hf output to be comparable with vllm output."""
output_ids, output_str, out_logprobs = hf_output
output_str = output_str.replace("</s>", "").replace("<s>", "")
return output_ids, output_str, out_logprobs
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: list[list[ExplicitEncoderDecoderPrompt]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
) -> None:
with vllm_runner(model,
max_num_seqs=8,
tokenizer_name=TOKENIZER,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_encoder_decoder_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
skip_special_tokens=False,
) for prompts in inputs
]
hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]
with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.lm_head
hf_outputs_per_case = [
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
prompts, max_tokens, num_logprobs=num_logprobs, images=images)
for prompts, images in hf_inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs],
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
num_outputs_0_skip_tokens=1,
)
# FIXME: https://github.com/huggingface/transformers/issues/38358
@pytest.mark.skip("Model initialization fails")
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets, model: str,
size_factors: list[int], dtype: str, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [[
ExplicitEncoderDecoderPrompt(
encoder_prompt=TextPrompt(
prompt=prompt,
multi_modal_data={"image": rescale_image_size(image, factor)}),
decoder_prompt=None,
) for factor in size_factors
] for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
run_test(
hf_runner,
vllm_runner,
inputs_per_image,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
This diff is collapsed.
...@@ -167,8 +167,6 @@ def _test_processing_correctness( ...@@ -167,8 +167,6 @@ def _test_processing_correctness(
# incorrect token ids. So we need use `add_special_tokens=False` here # incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor. # to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = { _ADD_SPECIAL_TOKENS_OVERRIDES = {
"donut": False,
"mllama": False,
"ovis": False, "ovis": False,
"ovis2_5": False, "ovis2_5": False,
"paligemma": False, "paligemma": False,
...@@ -278,9 +276,7 @@ def _test_processing_correctness_one( ...@@ -278,9 +276,7 @@ def _test_processing_correctness_one(
"facebook/chameleon-7b", "facebook/chameleon-7b",
"CohereLabs/command-a-vision-07-2025", "CohereLabs/command-a-vision-07-2025",
"deepseek-ai/deepseek-vl2-tiny", "deepseek-ai/deepseek-vl2-tiny",
"naver-clova-ix/donut-base-finetuned-docvqa",
"baidu/ERNIE-4.5-VL-28B-A3B-PT", "baidu/ERNIE-4.5-VL-28B-A3B-PT",
"microsoft/Florence-2-base",
"adept/fuyu-8b", "adept/fuyu-8b",
"google/gemma-3-4b-it", "google/gemma-3-4b-it",
"google/gemma-3n-E2B-it", "google/gemma-3n-E2B-it",
...@@ -305,7 +301,6 @@ def _test_processing_correctness_one( ...@@ -305,7 +301,6 @@ def _test_processing_correctness_one(
"llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"TIGER-Lab/Mantis-8B-siglip-llama3", "TIGER-Lab/Mantis-8B-siglip-llama3",
"mispeech/midashenglm-7b", "mispeech/midashenglm-7b",
"openbmb/MiniCPM-Llama3-V-2_5", "openbmb/MiniCPM-Llama3-V-2_5",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for mllama's multimodal preprocessing and profiling."""
import pytest
from transformers import MllamaConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.profiling import MultiModalProfiler
from ...utils import build_model_context
@pytest.mark.parametrize("model_id",
["meta-llama/Llama-3.2-11B-Vision-Instruct"])
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
@pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
def test_profiling(
model_id: str,
max_model_len: int,
max_num_seqs: int,
):
# regression test for https://github.com/vllm-project/vllm/issues/13929
from vllm.model_executor.models.mllama import calc_token_per_chunk
model_config_kwargs = {
"max_model_len": max_model_len,
}
ctx = build_model_context(
model_id,
model_config_kwargs=model_config_kwargs,
limit_mm_per_prompt={"image": 1},
)
mm_config = ctx.get_mm_config()
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
profiler = MultiModalProfiler(processor)
dummy_encoder_data = profiler.get_encoder_dummy_data(
max_model_len,
mm_counts=mm_config.limit_per_prompt,
)
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
max_model_len,
mm_counts=mm_config.limit_per_prompt,
)
hf_config = ctx.get_hf_config(MllamaConfig)
image_size = hf_config.vision_config.image_size
encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
] * max_num_seqs
mm_data = processor.apply(
prompt=dummy_mm_data.prompt,
mm_data=dummy_mm_data.mm_data,
hf_processor_mm_kwargs=dict(),
)["mm_kwargs"].get_data()
# Get the actual number of encoder tokens for each sample.
# Because attn_metadata.encoder_seq_lens only counts the last
# group of images for each sample, which is used to cheat the
# block manager to allocate blocks for those images only.
# See MllamaMultiModalProcessor for more details.
num_tiles = [[t] for t in mm_data.pop("num_tiles")]
num_tokens_per_tile = calc_token_per_chunk(image_size)
actual_encoder_seq_lens = [
sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
]
# simulate mllama image-present prefill.
for actual_len, last_group_len in zip(actual_encoder_seq_lens,
encoder_seq_lens):
assert actual_len >= last_group_len
...@@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides ...@@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides
ARCH_TO_SKIP = { ARCH_TO_SKIP = {
"MolmoForCausalLM": "incompatible requirements", "MolmoForCausalLM": "incompatible requirements",
"Florence2ForConditionalGeneration": "not supported in V1",
} }
ARCH_NEEDS_EXTRAS = [ ARCH_NEEDS_EXTRAS = [
"InternVLChatModel", "InternVLChatModel",
......
...@@ -354,11 +354,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -354,11 +354,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
trust_remote_code=True), trust_remote_code=True),
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"), "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
# [Encoder-decoder]
"BartModel": _HfExamplesInfo("facebook/bart-base"),
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
"MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501
hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501
} }
_EMBEDDING_EXAMPLE_MODELS = { _EMBEDDING_EXAMPLE_MODELS = {
...@@ -583,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -583,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False, is_available_online=False,
), ),
# [Encoder-decoder] # [Encoder-decoder]
"DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa", # noqa: E501
hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"}, # noqa: E501
extras={"dolphin": "ByteDance/Dolphin"}), # noqa: E501
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
trust_remote_code=True), # noqa: E501
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
# [Cross-encoder] # [Cross-encoder]
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501 "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501
......
...@@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, ...@@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3. # L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
if model_arch == "Florence2ForConditionalGeneration":
# An encoder-decoder model that's V0-only. Just skip it
# since V0 is about to be removed.
pytest.skip("Skipping Florence2ForConditionalGeneration")
if model_arch == "WhisperForConditionalGeneration": if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
LLM( LLM(
......
...@@ -50,7 +50,6 @@ def test_registry_imports(model_arch): ...@@ -50,7 +50,6 @@ def test_registry_imports(model_arch):
@create_new_process_for_each_test() @create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [ @pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
("LlamaForCausalLM", False, False, False), ("LlamaForCausalLM", False, False, False),
("MllamaForConditionalGeneration", True, False, False),
("LlavaForConditionalGeneration", True, True, False), ("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True), ("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True), ("RobertaForSequenceClassification", False, False, True),
......
...@@ -299,9 +299,8 @@ def test_rope_customization(): ...@@ -299,9 +299,8 @@ def test_rope_customization():
reason="Encoder Decoder models not supported on ROCm.") reason="Encoder Decoder models not supported on ROCm.")
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
("facebook/opt-125m", False), ("facebook/opt-125m", False),
("facebook/bart-base", True), ("openai/whisper-tiny", True),
("meta-llama/Llama-3.2-1B-Instruct", False), ("meta-llama/Llama-3.2-1B-Instruct", False),
("meta-llama/Llama-3.2-11B-Vision", True),
]) ])
def test_is_encoder_decoder(model_id, is_encoder_decoder): def test_is_encoder_decoder(model_id, is_encoder_decoder):
config = ModelConfig(model_id) config = ModelConfig(model_id)
......
...@@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention(): ...@@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType
# example from bart
ctx = {
'encoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
'decoder.layers.0.encoder_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
'decoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
}
kv_cache = [
torch.zeros((1, )),
]
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
bind_kv_cache(ctx, [kv_cache])
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
def test_bind_kv_cache_pp(): def test_bind_kv_cache_pp():
with patch("vllm.utils.cuda_device_count_stateless", lambda: 2): with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
# this test runs with 1 GPU, but we simulate 2 GPUs # this test runs with 1 GPU, but we simulate 2 GPUs
......
...@@ -9,24 +9,9 @@ from vllm import LLM ...@@ -9,24 +9,9 @@ from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
UNSUPPORTED_MODELS_V1 = [
"facebook/bart-large-cnn", # encoder decoder
]
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
def test_reject_unsupported_models(monkeypatch, model):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
args = AsyncEngineArgs(model=model)
with pytest.raises(NotImplementedError):
_ = args.create_engine_config()
m.delenv("VLLM_USE_V1")
def test_reject_bad_config(monkeypatch): def test_reject_bad_config(monkeypatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0") m.setenv("VLLM_USE_V1", "0")
...@@ -77,12 +62,6 @@ def test_enable_by_default_fallback(monkeypatch): ...@@ -77,12 +62,6 @@ def test_enable_by_default_fallback(monkeypatch):
assert envs.VLLM_USE_V1 assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1") m.delenv("VLLM_USE_V1")
# Should fall back to V0 for supported model.
_ = AsyncEngineArgs(
model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
assert not envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
def test_v1_llm_by_default(monkeypatch): def test_v1_llm_by_default(monkeypatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
......
This diff is collapsed.
...@@ -1201,11 +1201,8 @@ class ModelConfig: ...@@ -1201,11 +1201,8 @@ class ModelConfig:
getattr(self.hf_config, "max_source_positions", 0)) getattr(self.hf_config, "max_source_positions", 0))
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
effective_max_seq_len) effective_max_seq_len)
# CUDAGraph capture not supported for enc-dec models and mllama on ROCm # CUDAGraph capture not supported for encoder-decoder models on ROCm
ROCM_UNSUPPORTED_MODELS = ['mllama'] unsupported_rocm = self.is_encoder_decoder
unsupported_rocm = (self.hf_config.model_type
in ROCM_UNSUPPORTED_MODELS
or self.is_encoder_decoder)
if (unsupported_rocm and not self.enforce_eager if (unsupported_rocm and not self.enforce_eager
and current_platform.is_rocm()): and current_platform.is_rocm()):
...@@ -1671,10 +1668,6 @@ class ModelConfig: ...@@ -1671,10 +1668,6 @@ class ModelConfig:
@property @property
def is_encoder_decoder(self) -> bool: def is_encoder_decoder(self) -> bool:
"""Extract the HF encoder/decoder model flag.""" """Extract the HF encoder/decoder model flag."""
"""
For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
True to enable cross-attention
"""
return is_encoder_decoder(self.hf_config) return is_encoder_decoder(self.hf_config)
@property @property
......
...@@ -1789,7 +1789,7 @@ class LLMEngine: ...@@ -1789,7 +1789,7 @@ class LLMEngine:
assert isinstance(mm_processor, EncDecMultiModalProcessor) assert isinstance(mm_processor, EncDecMultiModalProcessor)
if mm_processor.pad_dummy_encoder_prompt: if mm_processor.pad_dummy_encoder_prompt:
return # Skip encoder length check for Whisper and Donut return # Skip encoder length check for Whisper
if model_config.is_multimodal_model: if model_config.is_multimodal_model:
suggestion = ( suggestion = (
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -147,10 +147,6 @@ _TEXT_GENERATION_MODELS = { ...@@ -147,10 +147,6 @@ _TEXT_GENERATION_MODELS = {
"TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"), "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
"XverseForCausalLM": ("llama", "LlamaForCausalLM"), "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
"Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"), "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
# [Encoder-decoder]
"BartModel": ("bart", "BartForConditionalGeneration"),
"BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
"MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"),
} }
_EMBEDDING_MODELS = { _EMBEDDING_MODELS = {
...@@ -237,6 +233,7 @@ _MULTIMODAL_MODELS = { ...@@ -237,6 +233,7 @@ _MULTIMODAL_MODELS = {
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
...@@ -263,16 +260,12 @@ _MULTIMODAL_MODELS = { ...@@ -263,16 +260,12 @@ _MULTIMODAL_MODELS = {
"Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
"Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
"UltravoxModel": ("ultravox", "UltravoxModel"), "UltravoxModel": ("ultravox", "UltravoxModel"),
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501 "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501 "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
# [Encoder-decoder] # [Encoder-decoder]
"DonutForConditionalGeneration": ("donut", "DonutForConditionalGeneration"),
"Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
} }
......
...@@ -209,7 +209,7 @@ class MultiModalProfiler(Generic[_I]): ...@@ -209,7 +209,7 @@ class MultiModalProfiler(Generic[_I]):
if processor.pad_dummy_encoder_prompt: if processor.pad_dummy_encoder_prompt:
num_tokens_to_pad = max(total_len, seq_len) - total_len num_tokens_to_pad = max(total_len, seq_len) - total_len
encoder_prompt_token_ids.extend([0] * num_tokens_to_pad) encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
# NOTE: Whisper and Donut allows total_len > seq_len. # NOTE: Whisper allows total_len > seq_len.
elif total_len > seq_len and not envs.VLLM_USE_V1: elif total_len > seq_len and not envs.VLLM_USE_V1:
# `max_num_batched_tokens` is defined by `SchedulerConfig` # `max_num_batched_tokens` is defined by `SchedulerConfig`
logger.warning_once( logger.warning_once(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment