Unverified Commit 759ef49b authored by Woosuk Kwon's avatar Woosuk Kwon Committed by GitHub
Browse files

Remove V0 Encoder-Decoder Support (#24907)


Signed-off-by: default avatarWoosuk Kwon <woosuk@thinkingmachines.ai>
parent 5206ab20
......@@ -36,7 +36,6 @@ MODELS_ON_S3 = [
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
# "meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Meta-Llama-3-8B",
......
......@@ -35,7 +35,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
"chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
"deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
"florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja",
"fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
"minicpmv": _get_minicpmv_chat_template_fallback,
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
......
......@@ -90,11 +90,6 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
"internvl_chat": {
"has_no_defaults_at_init": True
},
# transformers regards mllama as is_encoder_decoder=False
# vllm needs is_encoder_decoder=True to enable cross-attention
"mllama": {
"is_encoder_decoder": True
},
"NVLM_D": {
"has_no_defaults_at_init": True
},
......
......@@ -498,7 +498,7 @@ class Processor:
assert isinstance(mm_processor, EncDecMultiModalProcessor)
if mm_processor.pad_dummy_encoder_prompt:
return # Skip encoder length check for Whisper and Donut
return # Skip encoder length check for Whisper
if model_config.is_multimodal_model:
suggestion = (
......
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''
Worker-related helper functions.
'''
from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS
from vllm.worker.model_runner import GPUModelRunnerBase
def assert_enc_dec_mr_supported_scenario(
enc_dec_mr: GPUModelRunnerBase) -> None:
'''
Asserted that the provided encoder/decoder model runner instance reflects
a supported scenario.
'''
# Reminder: Please update docs/features/compatibility_matrix.md
# If the feature combo become valid
if enc_dec_mr.cache_config.enable_prefix_caching:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])
if enc_dec_mr.sliding_window is not None:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SWA'])
if enc_dec_mr.scheduler_config.chunked_prefill_enabled:
raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[
'STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL'])
if getattr(enc_dec_mr.model_config.hf_config, 'attn_logit_softcapping',
None) is not None:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP']
)
if enc_dec_mr.lora_config is not None:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LORA'])
if enc_dec_mr.parallel_config.pipeline_parallel_size > 1:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP'])
if enc_dec_mr.scheduler_config.num_lookahead_slots > 0:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC'])
......@@ -28,7 +28,6 @@ from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
memory_profiling)
from vllm.worker.cache_engine import CacheEngine
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
WorkerInput)
......@@ -82,10 +81,7 @@ class Worker(LocalOrDistributedWorkerBase):
"qwen3_next_mtp")) \
else {"return_hidden_states": True}
ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
if self.model_config.is_encoder_decoder:
ModelRunnerClass = EncoderDecoderModelRunner
self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
self.model_runner: GPUModelRunnerBase = ModelRunner(
vllm_config=self.vllm_config,
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=is_driver_worker,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment