Remove V0 Encoder-Decoder Support (#24907)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>

Remove V0 Encoder-Decoder Support (#24907)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
759ef49b · Woosuk Kwon · GitHub · 5206ab20 · 5206ab20 · 5206ab20
Unverified commit 759ef49b · authored Sep 15, 2025 by Woosuk Kwon · committed by GitHub Sep 15, 2025
20 changed files
--- a/tests/models/multimodal/generation/test_florence2.py
+++ b/tests/models/multimodal/generation/test_florence2.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
-import pytest
-from PIL import Image
-from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
-from vllm.multimodal.image import rescale_image_size
-from vllm.sequence import SampleLogprobs
-from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
-from ...utils import check_logprobs_close
-MODELS = ["microsoft/Florence-2-base"]
-# Florence-2 model repo's tokenizer config is missing some special tokens.
-# Therefore, we use a converted tokenizer from a forked repo
-TOKENIZER = "Isotr0py/Florence-2-tokenizer"
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<OD>",  # special task token which will output special tokens
-    "cherry_blossom":
-    "Describe in detail what is shown in the image.",
-})
-def get_hf_images_prompts(
-    prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]],
-) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]:
-    prompts, images = [], []
-    for prompt in prompts_:
-        encoder_prompt = prompt["encoder_prompt"]
-        prompts.append(
-            ExplicitEncoderDecoderPrompt(
-                encoder_prompt=encoder_prompt["prompt"],
-                decoder_prompt=None,
-            ))
-        images.append(encoder_prompt["multi_modal_data"]["image"])
-    return prompts, images
-def hf_to_vllm_output(hf_output: tuple[list[int], str,
-                                       Optional[SampleLogprobs]]):
-    """Sanitize hf output to be comparable with vllm output."""
-    output_ids, output_str, out_logprobs = hf_output
-    output_str = output_str.replace("</s>", "").replace("<s>", "")
-    return output_ids, output_str, out_logprobs
-def run_test(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
-    inputs: list[list[ExplicitEncoderDecoderPrompt]],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-) -> None:
-    with vllm_runner(model,
-                     max_num_seqs=8,
-                     tokenizer_name=TOKENIZER,
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        vllm_outputs_per_case = [
-            vllm_model.generate_encoder_decoder_greedy_logprobs(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                skip_special_tokens=False,
-            ) for prompts in inputs
-        ]
-    hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]
-    with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
-        hf_model.model.get_output_embeddings = lambda: \
-            hf_model.model.language_model.lm_head
-        hf_outputs_per_case = [
-            hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-                prompts, max_tokens, num_logprobs=num_logprobs, images=images)
-            for prompts, images in hf_inputs
-        ]
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs],
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-            num_outputs_0_skip_tokens=1,
-        )
-# FIXME: https://github.com/huggingface/transformers/issues/38358
-@pytest.mark.skip("Model initialization fails")
-@pytest.mark.core_model
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [64])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
-                image_assets: ImageTestAssets, model: str,
-                size_factors: list[int], dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
-    images = [asset.pil_image for asset in image_assets]
-    inputs_per_image = [[
-        ExplicitEncoderDecoderPrompt(
-            encoder_prompt=TextPrompt(
-                prompt=prompt,
-                multi_modal_data={"image": rescale_image_size(image, factor)}),
-            decoder_prompt=None,
-        ) for factor in size_factors
-    ] for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_image,
-        model,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
-    )
--- a/tests/models/multimodal/generation/test_mllama.py
+++ b/tests/models/multimodal/generation/test_mllama.py
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -167,8 +167,6 @@ def _test_processing_correctness(
 # incorrect token ids. So we need use `add_special_tokens=False` here
 # to leave bos_token to be added by the processor.
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
-    "donut": False,
-    "mllama": False,
    "ovis": False,
    "ovis2_5": False,
    "paligemma": False,
@@ -278,9 +276,7 @@ def _test_processing_correctness_one(
    "facebook/chameleon-7b",
    "CohereLabs/command-a-vision-07-2025",
    "deepseek-ai/deepseek-vl2-tiny",
-    "naver-clova-ix/donut-base-finetuned-docvqa",
    "baidu/ERNIE-4.5-VL-28B-A3B-PT",
-    "microsoft/Florence-2-base",
    "adept/fuyu-8b",
    "google/gemma-3-4b-it",
    "google/gemma-3n-E2B-it",
@@ -305,7 +301,6 @@ def _test_processing_correctness_one(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "TIGER-Lab/Mantis-8B-siglip-llama3",
    "mispeech/midashenglm-7b",
    "openbmb/MiniCPM-Llama3-V-2_5",

--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for mllama's multimodal preprocessing and profiling."""
-import pytest
-from transformers import MllamaConfig
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.profiling import MultiModalProfiler
-from ...utils import build_model_context
-@pytest.mark.parametrize("model_id",
-                         ["meta-llama/Llama-3.2-11B-Vision-Instruct"])
-@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
-@pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
-def test_profiling(
-    model_id: str,
-    max_model_len: int,
-    max_num_seqs: int,
-):
-    # regression test for https://github.com/vllm-project/vllm/issues/13929
-    from vllm.model_executor.models.mllama import calc_token_per_chunk
-    model_config_kwargs = {
-        "max_model_len": max_model_len,
-    }
-    ctx = build_model_context(
-        model_id,
-        model_config_kwargs=model_config_kwargs,
-        limit_mm_per_prompt={"image": 1},
-    )
-    mm_config = ctx.get_mm_config()
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
-    profiler = MultiModalProfiler(processor)
-    dummy_encoder_data = profiler.get_encoder_dummy_data(
-        max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
-    )
-    dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
-        max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
-    )
-    hf_config = ctx.get_hf_config(MllamaConfig)
-    image_size = hf_config.vision_config.image_size
-    encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
-                        ] * max_num_seqs
-    mm_data = processor.apply(
-        prompt=dummy_mm_data.prompt,
-        mm_data=dummy_mm_data.mm_data,
-        hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"].get_data()
-    # Get the actual number of encoder tokens for each sample.
-    # Because attn_metadata.encoder_seq_lens only counts the last
-    # group of images for each sample, which is used to cheat the
-    # block manager to allocate blocks for those images only.
-    # See MllamaMultiModalProcessor for more details.
-    num_tiles = [[t] for t in mm_data.pop("num_tiles")]
-    num_tokens_per_tile = calc_token_per_chunk(image_size)
-    actual_encoder_seq_lens = [
-        sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
-    ]
-    # simulate mllama image-present prefill.
-    for actual_len, last_group_len in zip(actual_encoder_seq_lens,
-                                          encoder_seq_lens):
-        assert actual_len >= last_group_len
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides
 ARCH_TO_SKIP = {
    "MolmoForCausalLM": "incompatible requirements",
-    "Florence2ForConditionalGeneration": "not supported in V1",
 }
 ARCH_NEEDS_EXTRAS = [
    "InternVLChatModel",

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -354,11 +354,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                        trust_remote_code=True),
    "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
-    # [Encoder-decoder]
-    "BartModel": _HfExamplesInfo("facebook/bart-base"),
-    "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
-    "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro",  # noqa: E501
-                                                    hf_overrides={"architectures": ["MBartForConditionalGeneration"]}),  # noqa: E501
 }
 _EMBEDDING_EXAMPLE_MODELS = {
@@ -583,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        is_available_online=False,
    ),
    # [Encoder-decoder]
-    "DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa",  # noqa: E501
-                                                    hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"},  # noqa: E501
-                                                    extras={"dolphin": "ByteDance/Dolphin"}),  # noqa: E501
-    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
-    # Therefore, we borrow the BartTokenizer from the original Bart model
-    "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
-                                                         tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
-                                                         trust_remote_code=True),  # noqa: E501
-    "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
    # [Cross-encoder]
    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),   # noqa: E501

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
            # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
            # L4 supports FA3.
            m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
-        if model_arch == "Florence2ForConditionalGeneration":
-            # An encoder-decoder model that's V0-only. Just skip it
-            # since V0 is about to be removed.
-            pytest.skip("Skipping Florence2ForConditionalGeneration")
        if model_arch == "WhisperForConditionalGeneration":
            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
        LLM(

--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -50,7 +50,6 @@ def test_registry_imports(model_arch):
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
    ("LlamaForCausalLM", False, False, False),
-    ("MllamaForConditionalGeneration", True, False, False),
    ("LlavaForConditionalGeneration", True, True, False),
    ("BertForSequenceClassification", False, False, True),
    ("RobertaForSequenceClassification", False, False, True),

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -299,9 +299,8 @@ def test_rope_customization():
                    reason="Encoder Decoder models not supported on ROCm.")
 @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
    ("facebook/opt-125m", False),
-    ("facebook/bart-base", True),
+    ("openai/whisper-tiny", True),
    ("meta-llama/Llama-3.2-1B-Instruct", False),
-    ("meta-llama/Llama-3.2-11B-Vision", True),
 ])
 def test_is_encoder_decoder(model_id, is_encoder_decoder):
    config = ModelConfig(model_id)

--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention():
    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
-def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
-    # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        from vllm.attention import Attention, AttentionType
-        # example from bart
-        ctx = {
-            'encoder.layers.0.self_attn.attn':
-                Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
-            'decoder.layers.0.encoder_attn.attn':
-                Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
-            'decoder.layers.0.self_attn.attn':
-                Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
-        }
-        kv_cache = [
-            torch.zeros((1, )),
-        ]
-        encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
-        bind_kv_cache(ctx, [kv_cache])
-        assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
-        assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
-        assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
 def test_bind_kv_cache_pp():
    with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
        # this test runs with 1 GPU, but we simulate 2 GPUs

--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -9,24 +9,9 @@ from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-UNSUPPORTED_MODELS_V1 = [
-    "facebook/bart-large-cnn",  # encoder decoder
-]
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
-@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
-def test_reject_unsupported_models(monkeypatch, model):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        args = AsyncEngineArgs(model=model)
-        with pytest.raises(NotImplementedError):
-            _ = args.create_engine_config()
-        m.delenv("VLLM_USE_V1")
 def test_reject_bad_config(monkeypatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
@@ -77,12 +62,6 @@ def test_enable_by_default_fallback(monkeypatch):
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")
-        # Should fall back to V0 for supported model.
-        _ = AsyncEngineArgs(
-            model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
-        assert not envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")
 def test_v1_llm_by_default(monkeypatch):
    with monkeypatch.context() as m:

--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1201,11 +1201,8 @@ class ModelConfig:
                getattr(self.hf_config, "max_source_positions", 0))
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          effective_max_seq_len)
-        # CUDAGraph capture not supported for enc-dec models and mllama on ROCm
+        # CUDAGraph capture not supported for encoder-decoder models on ROCm
-        ROCM_UNSUPPORTED_MODELS = ['mllama']
+        unsupported_rocm = self.is_encoder_decoder
-        unsupported_rocm = (self.hf_config.model_type
-                            in ROCM_UNSUPPORTED_MODELS
-                            or self.is_encoder_decoder)
        if (unsupported_rocm and not self.enforce_eager
                and current_platform.is_rocm()):
@@ -1671,10 +1668,6 @@ class ModelConfig:
    @property
    def is_encoder_decoder(self) -> bool:
        """Extract the HF encoder/decoder model flag."""
-        """
-        For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
-        True to enable cross-attention
-        """
        return is_encoder_decoder(self.hf_config)
    @property

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1789,7 +1789,7 @@ class LLMEngine:
                assert isinstance(mm_processor, EncDecMultiModalProcessor)
                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper and Donut
+                    return  # Skip encoder length check for Whisper
            if model_config.is_multimodal_model:
                suggestion = (

--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
--- a/vllm/model_executor/models/donut.py
+++ b/vllm/model_executor/models/donut.py
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -147,10 +147,6 @@ _TEXT_GENERATION_MODELS = {
    "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
    "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
    "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
-    # [Encoder-decoder]
-    "BartModel": ("bart", "BartForConditionalGeneration"),
-    "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
-    "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"),
 }
 _EMBEDDING_MODELS = {
@@ -237,6 +233,7 @@ _MULTIMODAL_MODELS = {
    "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
    "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
    "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
+    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
    "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
    "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),  # noqa: E501
@@ -263,16 +260,12 @@ _MULTIMODAL_MODELS = {
    "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
    "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
    "UltravoxModel": ("ultravox", "UltravoxModel"),
+    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
    "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),  # noqa: E501
    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
    "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"),  # noqa: E501
    "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
    # [Encoder-decoder]
-    "DonutForConditionalGeneration": ("donut", "DonutForConditionalGeneration"),
-    "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
-    "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
-    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
-    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
 }

--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -209,7 +209,7 @@ class MultiModalProfiler(Generic[_I]):
        if processor.pad_dummy_encoder_prompt:
            num_tokens_to_pad = max(total_len, seq_len) - total_len
            encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
-        # NOTE: Whisper and Donut allows total_len > seq_len.
+        # NOTE: Whisper allows total_len > seq_len.
        elif total_len > seq_len and not envs.VLLM_USE_V1:
            # `max_num_batched_tokens` is defined by `SchedulerConfig`
            logger.warning_once(