Unverified Commit e83b7e37 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)

parent 27f4c2fd
...@@ -22,7 +22,7 @@ Declare supported languages and capabilities: ...@@ -22,7 +22,7 @@ Declare supported languages and capabilities:
import torch import torch
from torch import nn from torch import nn
from vllm.config import RendererConfig, SpeechToTextConfig from vllm.config import ModelConfig, SpeechToTextConfig
from vllm.inputs.data import PromptType from vllm.inputs.data import PromptType
from vllm.model_executor.models.interfaces import SupportsTranscription from vllm.model_executor.models.interfaces import SupportsTranscription
...@@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model: ...@@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model:
@classmethod @classmethod
def get_speech_to_text_config( def get_speech_to_text_config(
cls, cls,
renderer_config: RendererConfig, model_config: ModelConfig,
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
) -> SpeechToTextConfig: ) -> SpeechToTextConfig:
return SpeechToTextConfig( return SpeechToTextConfig(
...@@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt ...@@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
cls, cls,
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
renderer_config: RendererConfig, model_config: ModelConfig,
language: str | None, language: str | None,
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
...@@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: ...@@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
cls, cls,
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
renderer_config: RendererConfig, model_config: ModelConfig,
language: str | None, language: str | None,
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
...@@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: ...@@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
cls, cls,
audio_duration_s: float, audio_duration_s: float,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
renderer_config: RendererConfig, model_config: ModelConfig,
) -> int | None: ) -> int | None:
# Return None if unknown; otherwise return an estimate. # Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example return int(audio_duration_s * stt_config.sample_rate // 320) # example
...@@ -216,7 +216,7 @@ Relevant server logic: ...@@ -216,7 +216,7 @@ Relevant server logic:
prompt = self.model_cls.get_generation_prompt( prompt = self.model_cls.get_generation_prompt(
audio=chunk, audio=chunk,
stt_config=self.asr_config, stt_config=self.asr_config,
renderer_config=self.renderer_config, model_config=self.model_config,
language=language, language=language,
task_type=self.task_type, task_type=self.task_type,
request_prompt=request.prompt, request_prompt=request.prompt,
......
...@@ -17,7 +17,6 @@ from vllm.config import ( ...@@ -17,7 +17,6 @@ from vllm.config import (
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
RendererConfig,
VllmConfig, VllmConfig,
get_current_vllm_config, get_current_vllm_config,
set_current_vllm_config, set_current_vllm_config,
...@@ -277,7 +276,6 @@ def sequence_parallelism_pass_on_test_model( ...@@ -277,7 +276,6 @@ def sequence_parallelism_pass_on_test_model(
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
device_config=device_config, device_config=device_config,
compilation_config=compilation_config, compilation_config=compilation_config,
) )
......
...@@ -15,7 +15,6 @@ from vllm.config import ( ...@@ -15,7 +15,6 @@ from vllm.config import (
CompilationConfig, CompilationConfig,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
RendererConfig,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
...@@ -220,11 +219,8 @@ def test_fix_functionalization( ...@@ -220,11 +219,8 @@ def test_fix_functionalization(
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=ModelConfig(dtype=dtype),
renderer_config=RendererConfig(model_config=model_config),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
custom_ops=["all"], custom_ops=["all"],
pass_config=PassConfig( pass_config=PassConfig(
......
...@@ -15,7 +15,6 @@ from vllm.config import ( ...@@ -15,7 +15,6 @@ from vllm.config import (
CompilationMode, CompilationMode,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
RendererConfig,
VllmConfig, VllmConfig,
) )
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
...@@ -155,11 +154,8 @@ def test_fusion_rmsnorm_quant( ...@@ -155,11 +154,8 @@ def test_fusion_rmsnorm_quant(
custom_ops.append("+rms_norm") custom_ops.append("+rms_norm")
if enable_quant_fp8_custom_op: if enable_quant_fp8_custom_op:
custom_ops.append("+quant_fp8") custom_ops.append("+quant_fp8")
model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=ModelConfig(dtype=dtype),
renderer_config=RendererConfig(model_config=model_config),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops, custom_ops=custom_ops,
......
...@@ -24,7 +24,6 @@ from vllm.config import ( ...@@ -24,7 +24,6 @@ from vllm.config import (
CompilationMode, CompilationMode,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
...@@ -326,7 +325,6 @@ def test_attention_quant_pattern( ...@@ -326,7 +325,6 @@ def test_attention_quant_pattern(
) )
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=SchedulerConfig( scheduler_config=SchedulerConfig(
max_num_seqs=1024, max_num_seqs=1024,
max_model_len=model_config.max_model_len, max_model_len=model_config.max_model_len,
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.pass_manager import PostGradPassManager from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
# dummy custom pass that doesn't inherit # dummy custom pass that doesn't inherit
...@@ -43,11 +43,7 @@ class ProperPass(InductorPass): ...@@ -43,11 +43,7 @@ class ProperPass(InductorPass):
) )
def test_pass_manager_uuid(callable): def test_pass_manager_uuid(callable):
# Some passes need dtype to be set # Some passes need dtype to be set
model_config = ModelConfig(dtype=torch.bfloat16) config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
pass_manager = PostGradPassManager() pass_manager = PostGradPassManager()
pass_manager.configure(config) pass_manager.configure(config)
......
...@@ -19,7 +19,6 @@ from vllm.config import ( ...@@ -19,7 +19,6 @@ from vllm.config import (
CompilationMode, CompilationMode,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
RendererConfig,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
...@@ -134,10 +133,8 @@ def test_qk_norm_rope_fusion( ...@@ -134,10 +133,8 @@ def test_qk_norm_rope_fusion(
if enable_rope_custom_op: if enable_rope_custom_op:
custom_ops.append("+rotary_embedding") custom_ops.append("+rotary_embedding")
model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=ModelConfig(dtype=dtype),
renderer_config=RendererConfig(model_config=model_config),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops, custom_ops=custom_ops,
......
...@@ -5,7 +5,6 @@ from vllm.config import ( ...@@ -5,7 +5,6 @@ from vllm.config import (
DeviceConfig, DeviceConfig,
KVTransferConfig, KVTransferConfig,
ModelConfig, ModelConfig,
RendererConfig,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
...@@ -48,7 +47,6 @@ def test_get_kv_connector_cache_layout_with_nixl_connector(): ...@@ -48,7 +47,6 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
vllm_config = VllmConfig( vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"), device_config=DeviceConfig("cpu"),
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
kv_transfer_config=kv_transfer_config, kv_transfer_config=kv_transfer_config,
) )
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
...@@ -72,7 +70,6 @@ def test_get_kv_connector_cache_layout_with_multi_connector(): ...@@ -72,7 +70,6 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
vllm_config = VllmConfig( vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"), device_config=DeviceConfig("cpu"),
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
kv_transfer_config=kv_transfer_config, kv_transfer_config=kv_transfer_config,
) )
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import pytest import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
...@@ -106,11 +107,24 @@ def test_get_gen_prompt( ...@@ -106,11 +107,24 @@ def test_get_gen_prompt(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model) model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
revision=model_info.revision,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
# Initialize the tokenizer
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
renderer_config.tokenizer, tokenizer_name=model_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
) )
template_content = load_chat_template(chat_template=template) template_content = load_chat_template(chat_template=template)
...@@ -129,7 +143,7 @@ def test_get_gen_prompt( ...@@ -129,7 +143,7 @@ def test_get_gen_prompt(
tokenizer=tokenizer, tokenizer=tokenizer,
conversation=mock_request.messages, conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content, chat_template=mock_request.chat_template or template_content,
renderer_config=renderer_config, model_config=model_config,
tools=None, tools=None,
add_generation_prompt=mock_request.add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message, continue_final_message=mock_request.continue_final_message,
......
...@@ -33,34 +33,26 @@ class MockModelConfig: ...@@ -33,34 +33,26 @@ class MockModelConfig:
"""Minimal mock ModelConfig for testing.""" """Minimal mock ModelConfig for testing."""
model: str = MODEL_NAME model: str = MODEL_NAME
tokenizer: str = MODEL_NAME
trust_remote_code: bool = False trust_remote_code: bool = False
tokenizer_mode: str = "auto"
max_model_len: int = 100 max_model_len: int = 100
tokenizer_revision: str | None = None
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
hf_config: MockHFConfig = field(default_factory=MockHFConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig)
logits_processors: list[str] | None = None logits_processors: list[str] | None = None
logits_processor_pattern: str | None = None logits_processor_pattern: str | None = None
diff_sampling_param: dict | None = None diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None encoder_config = None
generation_config: str = "auto" generation_config: str = "auto"
skip_tokenizer_init: bool = False
def get_diff_sampling_param(self): def get_diff_sampling_param(self):
return self.diff_sampling_param or {} return self.diff_sampling_param or {}
@dataclass
class MockRendererConfig:
"""Minimal mock RendererConfig for testing."""
model_config: MockModelConfig
tokenizer: str = MODEL_NAME
tokenizer_mode: str = "auto"
tokenizer_revision: str | None = None
skip_tokenizer_init: bool = False
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
class MockLoRAResolver(LoRAResolver): class MockLoRAResolver(LoRAResolver):
async def resolve_lora( async def resolve_lora(
self, base_model_name: str, lora_name: str self, base_model_name: str, lora_name: str
...@@ -122,7 +114,6 @@ def mock_serving_setup(): ...@@ -122,7 +114,6 @@ def mock_serving_setup():
mock_engine.add_lora.reset_mock() mock_engine.add_lora.reset_mock()
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
......
...@@ -346,33 +346,27 @@ class MockHFConfig: ...@@ -346,33 +346,27 @@ class MockHFConfig:
class MockModelConfig: class MockModelConfig:
task = "generate" task = "generate"
runner_type = "generate" runner_type = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False trust_remote_code = False
tokenizer_mode = "auto"
max_model_len = 100 max_model_len = 100
tokenizer_revision = None
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig() hf_config = MockHFConfig()
logits_processors: list[str] | None = None logits_processors: list[str] | None = None
logits_processor_pattern = None logits_processor_pattern = None
diff_sampling_param: dict | None = None diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None encoder_config = None
generation_config: str = "auto" generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False
def get_diff_sampling_param(self): def get_diff_sampling_param(self):
return self.diff_sampling_param or {} return self.diff_sampling_param or {}
@dataclass
class MockRendererConfig:
model_config: MockModelConfig = field(default_factory=MockModelConfig)
tokenizer = MODEL_NAME
tokenizer_mode = "auto"
tokenizer_revision = None
skip_tokenizer_init = False
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels( models = OpenAIServingModels(
engine_client=engine, engine_client=engine,
...@@ -405,7 +399,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: ...@@ -405,7 +399,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
@dataclass @dataclass
class MockEngine: class MockEngine:
model_config: MockModelConfig = field(default_factory=MockModelConfig) model_config: MockModelConfig = field(default_factory=MockModelConfig)
renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig)
input_processor: MagicMock = field(default_factory=MagicMock) input_processor: MagicMock = field(default_factory=MagicMock)
io_processor: MagicMock = field(default_factory=MagicMock) io_processor: MagicMock = field(default_factory=MagicMock)
...@@ -436,7 +429,6 @@ async def test_serving_chat_returns_correct_model_name(): ...@@ -436,7 +429,6 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
...@@ -467,7 +459,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -467,7 +459,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
...@@ -501,7 +492,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -501,7 +492,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
...@@ -547,7 +537,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -547,7 +537,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
...@@ -594,7 +583,6 @@ async def test_serving_chat_could_load_correct_generation_config(): ...@@ -594,7 +583,6 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
...@@ -641,7 +629,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): ...@@ -641,7 +629,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
...@@ -675,7 +662,6 @@ async def test_serving_chat_data_parallel_rank_extraction(): ...@@ -675,7 +662,6 @@ async def test_serving_chat_data_parallel_rank_extraction():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
......
...@@ -7,7 +7,7 @@ from unittest.mock import Mock ...@@ -7,7 +7,7 @@ from unittest.mock import Mock
import pytest import pytest
from vllm.config import ModelConfig, RendererConfig from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers import MistralTokenizer from vllm.tokenizers import MistralTokenizer
...@@ -19,16 +19,10 @@ def serving() -> OpenAIServing: ...@@ -19,16 +19,10 @@ def serving() -> OpenAIServing:
# Create minimal mocks # Create minimal mocks
engine_client = Mock() engine_client = Mock()
model_config = Mock(spec=ModelConfig) model_config = Mock(spec=ModelConfig)
model_config.max_model_len = 32768 model_config.max_model_len = 32768
renderer_config = Mock(spec=RendererConfig)
renderer_config.model_config = model_config
models = Mock(spec=OpenAIServingModels) models = Mock(spec=OpenAIServingModels)
models.model_config = model_config models.model_config = model_config
models.renderer_config = renderer_config
models.input_processor = Mock() models.input_processor = Mock()
models.io_processor = Mock() models.io_processor = Mock()
......
...@@ -6,7 +6,7 @@ from unittest.mock import MagicMock ...@@ -6,7 +6,7 @@ from unittest.mock import MagicMock
import pytest import pytest
from vllm.config import ModelConfig, RendererConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
ErrorResponse, ErrorResponse,
...@@ -27,15 +27,9 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( ...@@ -27,15 +27,9 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
async def _async_serving_models_init() -> OpenAIServingModels: async def _async_serving_models_init() -> OpenAIServingModels:
mock_engine_client = MagicMock(spec=EngineClient) mock_engine_client = MagicMock(spec=EngineClient)
# Set the max_model_len attribute to avoid missing attribute # Set the max_model_len attribute to avoid missing attribute
mock_model_config = MagicMock(spec=ModelConfig) mock_model_config = MagicMock(spec=ModelConfig)
mock_model_config.max_model_len = 2048 mock_model_config.max_model_len = 2048
mock_renderer_config = MagicMock(spec=RendererConfig)
mock_renderer_config.model_config = mock_model_config
mock_engine_client.model_config = mock_model_config mock_engine_client.model_config = mock_model_config
mock_engine_client.renderer_config = mock_renderer_config
mock_engine_client.input_processor = MagicMock() mock_engine_client.input_processor = MagicMock()
mock_engine_client.io_processor = MagicMock() mock_engine_client.io_processor = MagicMock()
......
...@@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy ...@@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import ModelConfig, RendererConfig from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ( from vllm.entrypoints.chat_utils import (
_try_extract_ast, _try_extract_ast,
apply_mistral_chat_template, apply_mistral_chat_template,
...@@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image( ...@@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid( ...@@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid( ...@@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format( ...@@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids( ...@@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids( ...@@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids( ...@@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async( ...@@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async( ...@@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( ...@@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( ...@@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( ...@@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system( ...@@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system(
"content": [{"type": "text", "text": "Who are you?"}], "content": [{"type": "text", "text": "Who are you?"}],
}, },
], ],
RendererConfig(model_config=mistral_model_config), mistral_model_config,
content_format="string", content_format="string",
) )
assert conversation == [ assert conversation == [
...@@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system( ...@@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system(
"content": [{"type": "text", "text": "Who are you?"}], "content": [{"type": "text", "text": "Who are you?"}],
}, },
], ],
RendererConfig(model_config=mistral_model_config), mistral_model_config,
content_format="openai", content_format="openai",
) )
assert conversation == [ assert conversation == [
...@@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async( ...@@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images( ...@@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid( ...@@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( ...@@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config_image_embeds), phi3v_model_config_image_embeds,
content_format="string", content_format="string",
) )
...@@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( ...@@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
], ],
} }
], ],
RendererConfig(model_config=audio_embeds_model_config), audio_embeds_model_config,
content_format="string", content_format="string",
) )
...@@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string( ...@@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
], ],
} }
], ],
RendererConfig(model_config=audio_embeds_model_config), audio_embeds_model_config,
content_format="string", content_format="string",
) )
...@@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async( ...@@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async(
], ],
} }
], ],
RendererConfig(model_config=audio_embeds_model_config), audio_embeds_model_config,
content_format="string", content_format="string",
) )
...@@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( ...@@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config_image_embeds), phi3v_model_config_image_embeds,
content_format="string", content_format="string",
) )
...@@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async( ...@@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( ...@@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
assert conversation == [ assert conversation == [
...@@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( ...@@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages( ...@@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
], ],
}, },
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages( ...@@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
], ],
}, },
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format( ...@@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format(
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{"role": "user", "content": "What about this one?"}, {"role": "user", "content": "What about this one?"},
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="openai", content_format="openai",
) )
...@@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ...@@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ...@@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
], ],
}, },
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( ...@@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config), phi3v_model_config,
content_format="string", content_format="string",
) )
...@@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave( ...@@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config_mm_interleaved), phi3v_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( ...@@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config_mm_interleaved), phi3v_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( ...@@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config_mm_interleaved), phi3v_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( ...@@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
], ],
}, },
], ],
RendererConfig(model_config=phi3v_model_config_mm_interleaved), phi3v_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl ...@@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
], ],
}, },
], ],
RendererConfig(model_config=phi3v_model_config_mm_interleaved), phi3v_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ...@@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
], ],
}, },
], ],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), qwen25omni_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl ...@@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
], ],
}, },
], ],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), qwen25omni_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes ...@@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
], ],
}, },
], ],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), qwen25omni_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message ...@@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
], ],
}, },
], ],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), qwen25omni_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( ...@@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
], ],
} }
], ],
RendererConfig(model_config=phi3v_model_config_mm_interleaved), phi3v_model_config_mm_interleaved,
content_format="string", content_format="string",
) )
...@@ -1945,11 +1945,24 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): ...@@ -1945,11 +1945,24 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model) model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
# Build the tokenizer
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
renderer_config.tokenizer, model,
trust_remote_code=renderer_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
) )
tools = ( tools = (
...@@ -1972,7 +1985,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): ...@@ -1972,7 +1985,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
tokenizer, tokenizer,
chat_template=None, chat_template=None,
tools=tools, tools=tools,
model_config=renderer_config.model_config, model_config=model_config,
) )
assert isinstance(chat_template, str) assert isinstance(chat_template, str)
...@@ -2034,11 +2047,24 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa ...@@ -2034,11 +2047,24 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
"enable_thinking": True, "enable_thinking": True,
} }
renderer_config = model_info.build_renderer_config(model) model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
# Build the tokenizer
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
renderer_config.tokenizer, model,
trust_remote_code=renderer_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
) )
# Test detecting the tokenizer's chat_template # Test detecting the tokenizer's chat_template
...@@ -2046,7 +2072,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa ...@@ -2046,7 +2072,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
tokenizer, tokenizer,
chat_template=None, chat_template=None,
tools=tools, tools=tools,
model_config=renderer_config.model_config, model_config=model_config,
) )
with pytest.raises( with pytest.raises(
ValueError, match="Found unexpected chat template kwargs from request" ValueError, match="Found unexpected chat template kwargs from request"
...@@ -2117,11 +2143,23 @@ def test_resolve_content_format_hf_defined(model, expected_format): ...@@ -2117,11 +2143,23 @@ def test_resolve_content_format_hf_defined(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model) model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
renderer_config.tokenizer, model,
trust_remote_code=renderer_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
) )
# Test detecting the tokenizer's chat_template # Test detecting the tokenizer's chat_template
...@@ -2129,7 +2167,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): ...@@ -2129,7 +2167,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
tokenizer, tokenizer,
chat_template=None, chat_template=None,
tools=None, tools=None,
model_config=renderer_config.model_config, model_config=model_config,
) )
assert isinstance(chat_template, str) assert isinstance(chat_template, str)
...@@ -2143,7 +2181,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): ...@@ -2143,7 +2181,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
None, None,
"auto", "auto",
tokenizer, tokenizer,
renderer_config=renderer_config, model_config=model_config,
) )
assert resolved_format == expected_format assert resolved_format == expected_format
...@@ -2165,11 +2203,23 @@ def test_resolve_content_format_fallbacks(model, expected_format): ...@@ -2165,11 +2203,23 @@ def test_resolve_content_format_fallbacks(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model) model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
renderer_config.tokenizer, model_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
) )
# Test detecting the tokenizer's chat_template # Test detecting the tokenizer's chat_template
...@@ -2177,7 +2227,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): ...@@ -2177,7 +2227,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
tokenizer, tokenizer,
chat_template=None, chat_template=None,
tools=None, tools=None,
model_config=renderer_config.model_config, model_config=model_config,
) )
assert isinstance(chat_template, str) assert isinstance(chat_template, str)
...@@ -2191,7 +2241,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): ...@@ -2191,7 +2241,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
None, None,
"auto", "auto",
tokenizer, tokenizer,
renderer_config=renderer_config, model_config=model_config,
) )
assert resolved_format == expected_format assert resolved_format == expected_format
...@@ -2222,13 +2272,15 @@ def test_resolve_content_format_fallbacks(model, expected_format): ...@@ -2222,13 +2272,15 @@ def test_resolve_content_format_fallbacks(model, expected_format):
], ],
) )
def test_resolve_content_format_examples(template_path, expected_format): def test_resolve_content_format_examples(template_path, expected_format):
model = PHI3V_MODEL_ID # Dummy model_config = ModelConfig(
model_config = ModelConfig(model, trust_remote_code=True) PHI3V_MODEL_ID, # Dummy
renderer_config = RendererConfig(model_config=model_config, tokenizer=model) tokenizer=PHI3V_MODEL_ID, # Dummy
trust_remote_code=True,
)
dummy_tokenizer = get_tokenizer( dummy_tokenizer = get_tokenizer(
renderer_config.tokenizer, PHI3V_MODEL_ID, # Dummy
trust_remote_code=renderer_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
) )
dummy_tokenizer.chat_template = None dummy_tokenizer.chat_template = None
...@@ -2245,7 +2297,7 @@ def test_resolve_content_format_examples(template_path, expected_format): ...@@ -2245,7 +2297,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
None, None,
"auto", "auto",
dummy_tokenizer, dummy_tokenizer,
renderer_config=renderer_config, model_config=model_config,
) )
assert resolved_format == expected_format assert resolved_format == expected_format
...@@ -2280,7 +2332,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config): ...@@ -2280,7 +2332,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
conversation_with_thinking, _, _ = parse_chat_messages( conversation_with_thinking, _, _ = parse_chat_messages(
messages, messages,
RendererConfig(model_config=mistral_model_config), mistral_model_config,
content_format="openai", content_format="openai",
) )
...@@ -2380,7 +2432,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( ...@@ -2380,7 +2432,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
], ],
} }
], ],
RendererConfig(model_config=qwen2_audio_model_config), qwen2_audio_model_config,
content_format="string", content_format="string",
) )
...@@ -2414,7 +2466,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ...@@ -2414,7 +2466,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
], ],
} }
], ],
RendererConfig(model_config=qwen2_audio_model_config), qwen2_audio_model_config,
content_format="string", content_format="string",
) )
......
...@@ -8,7 +8,7 @@ import torch ...@@ -8,7 +8,7 @@ import torch
from safetensors.torch import load_file from safetensors.torch import load_file
from torch import nn from torch import nn
from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
from vllm.config.lora import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.lora.layers import ( from vllm.lora.layers import (
ColumnParallelLinearWithLoRA, ColumnParallelLinearWithLoRA,
...@@ -422,11 +422,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa ...@@ -422,11 +422,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
) )
model_config = ModelConfig(max_model_len=16) model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig( vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
lora_config=lora_config,
)
vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2 vllm_config.scheduler_config.max_num_batched_tokens = 2
...@@ -529,11 +525,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path ...@@ -529,11 +525,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
) )
model_config = ModelConfig(max_model_len=16) model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig( vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
lora_config=lora_config,
)
vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2 vllm_config.scheduler_config.max_num_batched_tokens = 2
......
...@@ -11,7 +11,6 @@ from vllm.config import ( ...@@ -11,7 +11,6 @@ from vllm.config import (
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
ParallelConfig, ParallelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
) )
...@@ -44,7 +43,6 @@ def test_worker_apply_lora(qwen3_lora_files): ...@@ -44,7 +43,6 @@ def test_worker_apply_lora(qwen3_lora_files):
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
load_config=LoadConfig( load_config=LoadConfig(
download_dir=None, download_dir=None,
load_format="dummy", load_format="dummy",
......
...@@ -42,10 +42,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -42,10 +42,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
"Write a short story about a robot that dreams for the first time.\n" "Write a short story about a robot that dreams for the first time.\n"
) )
llm_engine = vllm_model.llm.llm_engine model_config = vllm_model.llm.llm_engine.model_config
model_config = llm_engine.model_config model_tokenizer = vllm_model.llm.llm_engine.tokenizer
renderer_config = llm_engine.renderer_config
tokenizer = llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -56,8 +54,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -56,8 +54,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
assert model_config.pooler_config.normalize assert model_config.pooler_config.normalize
# asserts on the tokenizer loaded # asserts on the tokenizer loaded
assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5" assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
assert tokenizer.model_max_length == 512 assert model_tokenizer.model_max_length == 512
def check_model(model): def check_model(model):
assert isinstance(model, BertEmbeddingModel) assert isinstance(model, BertEmbeddingModel)
...@@ -88,10 +86,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -88,10 +86,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
"Write a short story about a robot that dreams for the first time.\n" "Write a short story about a robot that dreams for the first time.\n"
) )
llm_engine = vllm_model.llm.llm_engine model_config = vllm_model.llm.llm_engine.model_config
model_config = llm_engine.model_config model_tokenizer = vllm_model.llm.llm_engine.tokenizer
renderer_config = llm_engine.renderer_config
tokenizer = llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -102,8 +98,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -102,8 +98,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
assert model_config.pooler_config.normalize assert model_config.pooler_config.normalize
# asserts on the tokenizer loaded # asserts on the tokenizer loaded
assert renderer_config.tokenizer == "intfloat/multilingual-e5-base" assert model_config.tokenizer == "intfloat/multilingual-e5-base"
assert tokenizer.model_max_length == 512 assert model_tokenizer.model_max_length == 512
def check_model(model): def check_model(model):
assert isinstance(model, RobertaEmbeddingModel) assert isinstance(model, RobertaEmbeddingModel)
...@@ -132,7 +128,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -132,7 +128,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
"Write a short story about a robot that dreams for the first time.\n" "Write a short story about a robot that dreams for the first time.\n"
) )
assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name
def check_model(model): def check_model(model):
assert isinstance(model, RobertaEmbeddingModel) assert isinstance(model, RobertaEmbeddingModel)
......
...@@ -6,7 +6,7 @@ import pytest ...@@ -6,7 +6,7 @@ import pytest
from scipy.spatial.distance import cosine from scipy.spatial.distance import cosine
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import ModelConfig, RendererConfig from vllm.config import ModelConfig
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
...@@ -31,8 +31,7 @@ def test_find_array(): ...@@ -31,8 +31,7 @@ def test_find_array():
dtype="bfloat16", dtype="bfloat16",
seed=0, seed=0,
) )
renderer_config = RendererConfig(model_config=model_config) pooling = GritLMMeanPool(model_config=model_config)
pooling = GritLMMeanPool(renderer_config=renderer_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
......
...@@ -25,6 +25,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC ...@@ -25,6 +25,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC
from vllm.tokenizers import ( from vllm.tokenizers import (
MistralTokenizer, MistralTokenizer,
TokenizerLike, TokenizerLike,
cached_tokenizer_from_config,
) )
from ....multimodal.utils import random_audio, random_image, random_video from ....multimodal.utils import random_audio, random_image, random_video
...@@ -211,20 +212,31 @@ def _test_processing_correctness( ...@@ -211,20 +212,31 @@ def _test_processing_correctness(
else: else:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch) model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
model_id = model_id_or_arch model_id = model_id_or_arch
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip") model_info.check_transformers_version(on_fail="skip")
renderer_config = model_info.build_renderer_config( model_config = ModelConfig(
model=model_id, model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
# Ensure that the cache can fit all of the data # Ensure that the cache can fit all of the data
mm_processor_cache_gb=2048, mm_processor_cache_gb=2048,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
) )
model_config = renderer_config.model_config
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = model_cls._processor_factory factories = model_cls._processor_factory
ctx = InputProcessingContext.from_config(renderer_config) ctx = InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
cache = MultiModalProcessorOnlyCache(model_config) cache = MultiModalProcessorOnlyCache(model_config)
processing_info = factories.info(ctx) processing_info = factories.info(ctx)
......
...@@ -40,7 +40,7 @@ def test_processor_override( ...@@ -40,7 +40,7 @@ def test_processor_override(
mm_processor_kwargs=None, mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1}, limit_mm_per_prompt={"video": 1},
) )
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer() tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {"fps": fps} hf_processor_mm_kwargs = {"fps": fps}
...@@ -79,7 +79,7 @@ def test_video_loader_consistency( ...@@ -79,7 +79,7 @@ def test_video_loader_consistency(
mm_processor_kwargs=None, mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1}, limit_mm_per_prompt={"video": 1},
) )
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {"fps": fps} hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass # Build the image str / prompt based on the number of images we pass
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment