Unverified commit e83b7e37, authored by Cyrus Leung and committed by GitHub
Browse files

Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)

parent 27f4c2fd
......@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
model_id,
limit_mm_per_prompt=limit_mm_per_prompt,
)
assert (
MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected
)
assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected
......@@ -13,7 +13,6 @@ from vllm.config import (
CompilationConfig,
ModelConfig,
PoolerConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
update_config,
......@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
],
)
def test_recalculate_max_model_len(
def test_get_and_verify_max_len(
model_id, max_model_len, expected_max_len, should_raise
):
"""Test recalculate_max_model_len with different configurations."""
"""Test get_and_verify_max_len with different configurations."""
model_config = ModelConfig(model_id)
if should_raise:
with pytest.raises(ValueError):
model_config.recalculate_max_model_len(
max_model_len,
tokenizer=model_id,
tokenizer_revision=None,
)
model_config.get_and_verify_max_len(max_model_len)
else:
model_config.recalculate_max_model_len(
max_model_len,
tokenizer=model_id,
tokenizer_revision=None,
)
assert model_config.max_model_len == expected_max_len
actual_max_len = model_config.get_and_verify_max_len(max_model_len)
assert actual_max_len == expected_max_len
class MockModelConfig:
"""Simple mock object for testing maybe_pull_model_for_runai"""
class MockConfig:
"""Simple mock object for testing maybe_pull_model_tokenizer_for_runai"""
def __init__(self, model: str):
def __init__(self, model: str, tokenizer: str):
self.model = model
class MockRendererConfig:
"""Simple mock object for testing maybe_pull_tokenizer_for_runai"""
def __init__(self, model_config: MockModelConfig):
self.model_config = model_config
self.tokenizer = model_config.model
self.tokenizer = tokenizer
self.model_weights = None
@pytest.mark.parametrize(
......@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
mock_pull_files.return_value = None
# Create first mock and run the method
model_config1 = MockModelConfig(model=s3_url)
renderer_config1 = MockRendererConfig(model_config=model_config1)
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url)
config1 = MockConfig(model=s3_url, tokenizer=s3_url)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
# Check that model and tokenizer point to existing directories
assert os.path.exists(model_config1.model), (
f"Model directory does not exist: {model_config1.model}"
assert os.path.exists(config1.model), (
f"Model directory does not exist: {config1.model}"
)
assert os.path.isdir(model_config1.model), (
f"Model path is not a directory: {model_config1.model}"
assert os.path.isdir(config1.model), (
f"Model path is not a directory: {config1.model}"
)
assert os.path.exists(renderer_config1.tokenizer), (
f"Tokenizer directory does not exist: {renderer_config1.tokenizer}"
assert os.path.exists(config1.tokenizer), (
f"Tokenizer directory does not exist: {config1.tokenizer}"
)
assert os.path.isdir(renderer_config1.tokenizer), (
f"Tokenizer path is not a directory: {renderer_config1.tokenizer}"
assert os.path.isdir(config1.tokenizer), (
f"Tokenizer path is not a directory: {config1.tokenizer}"
)
# Verify that the paths are different from the original S3 URL
assert model_config1.model != s3_url, (
"Model path should be converted to local directory"
)
assert renderer_config1.tokenizer != s3_url, (
assert config1.model != s3_url, "Model path should be converted to local directory"
assert config1.tokenizer != s3_url, (
"Tokenizer path should be converted to local directory"
)
# Store the original paths
created_model_dir = model_config1.model
create_tokenizer_dir = renderer_config1.tokenizer
created_model_dir = config1.model
create_tokenizer_dir = config1.tokenizer
# Create a new mock and run the method with the same S3 URL
model_config2 = MockModelConfig(model=s3_url)
renderer_config2 = MockRendererConfig(model_config=model_config2)
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url)
config2 = MockConfig(model=s3_url, tokenizer=s3_url)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
# Check that the new directories exist
assert os.path.exists(model_config2.model), (
f"Model directory does not exist: {model_config2.model}"
assert os.path.exists(config2.model), (
f"Model directory does not exist: {config2.model}"
)
assert os.path.isdir(model_config2.model), (
f"Model path is not a directory: {model_config2.model}"
assert os.path.isdir(config2.model), (
f"Model path is not a directory: {config2.model}"
)
assert os.path.exists(renderer_config2.tokenizer), (
f"Tokenizer directory does not exist: {renderer_config2.tokenizer}"
assert os.path.exists(config2.tokenizer), (
f"Tokenizer directory does not exist: {config2.tokenizer}"
)
assert os.path.isdir(renderer_config2.tokenizer), (
f"Tokenizer path is not a directory: {renderer_config2.tokenizer}"
assert os.path.isdir(config2.tokenizer), (
f"Tokenizer path is not a directory: {config2.tokenizer}"
)
# Verify that the paths are deterministic (same as before)
assert model_config2.model == created_model_dir, (
assert config2.model == created_model_dir, (
f"Model paths are not deterministic. "
f"Original: {created_model_dir}, New: {model_config2.model}"
f"Original: {created_model_dir}, New: {config2.model}"
)
assert renderer_config2.tokenizer == create_tokenizer_dir, (
assert config2.tokenizer == create_tokenizer_dir, (
f"Tokenizer paths are not deterministic. "
f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}"
f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
)
......@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
s3_url2 = "s3://example-bucket-2/model/"
# Create mocks with different S3 URLs and run the method
model_config1 = MockModelConfig(model=s3_url1)
renderer_config1 = MockRendererConfig(model_config=model_config1)
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1)
config1 = MockConfig(model=s3_url1, tokenizer=s3_url1)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1)
model_config2 = MockModelConfig(model=s3_url2)
renderer_config2 = MockRendererConfig(model_config=model_config2)
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2)
config2 = MockConfig(model=s3_url2, tokenizer=s3_url2)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2)
# Verify that different URLs produce different directories
assert model_config1.model != model_config2.model, (
assert config1.model != config2.model, (
f"Different S3 URLs should create different model directories. "
f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}"
f"URL1 model: {config1.model}, URL2 model: {config2.model}"
)
assert renderer_config1.tokenizer != renderer_config2.tokenizer, (
assert config1.tokenizer != config2.tokenizer, (
f"Different S3 URLs should create different tokenizer directories. "
f"URL1 tokenizer: {renderer_config1.tokenizer}, "
f"URL2 tokenizer: {renderer_config2.tokenizer}"
f"URL1 tokenizer: {config1.tokenizer}, "
f"URL2 tokenizer: {config2.tokenizer}"
)
# Verify that both sets of directories exist
assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model)
assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir(
renderer_config1.tokenizer
)
assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model)
assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir(
renderer_config2.tokenizer
)
assert os.path.exists(config1.model) and os.path.isdir(config1.model)
assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
assert os.path.exists(config2.model) and os.path.isdir(config2.model)
assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
@pytest.mark.parametrize(
......
......@@ -3,7 +3,7 @@
import pytest
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
......@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
renderer_config = RendererConfig(model_config=model_config)
tokenizer = init_tokenizer_from_config(renderer_config)
input_preprocessor = InputPreprocessor(renderer_config, tokenizer)
tokenizer = init_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)
# HF processor adds sep token
sep_token_id = tokenizer.vocab[tokenizer.sep_token]
......
......@@ -16,7 +16,6 @@ from vllm.config import (
LoadConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
)
......@@ -217,7 +216,6 @@ def create_vllm_config(
return VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
......
......@@ -8,7 +8,7 @@ import pytest
import torch
import vllm.v1.core.kv_cache_utils as kv_cache_utils
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
......@@ -667,10 +667,7 @@ def test_metrics_empty_stats():
def test_get_kv_cache_configs_multiple_workers():
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
vllm_config = VllmConfig(model_config=model_config)
ref_kv_cache_spec = new_kv_cache_spec()
same_kv_cache_specs = [
......@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=scheduler_config,
)
......@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=scheduler_config,
)
......@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
def test_get_kv_cache_config_one_worker():
# pass max_model_len to pass check_enough_kv_cache_memory
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
vllm_config = VllmConfig(model_config=model_config)
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
# all layers are full attention -> single group
......@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
def test_get_kv_cache_configs_attention_free():
kv_cache_specs: dict[str, KVCacheSpec] = {}
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
assert kv_cache_configs == [
KVCacheConfig(
......
......@@ -11,7 +11,6 @@ from vllm.config import (
ECTransferConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
......@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
vllm_config = VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
kv_transfer_config=kv_transfer_config,
speculative_config=speculative_config,
......
......@@ -9,7 +9,6 @@ from vllm.config import (
ECTransferConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
......@@ -133,7 +132,6 @@ def create_scheduler(
vllm_config = VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
kv_transfer_config=kv_transfer_config,
speculative_config=speculative_config,
......
......@@ -15,7 +15,6 @@ from vllm.config import (
ECTransferConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
)
......@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
kv_transfer_config=kv_transfer_config,
......
......@@ -5,14 +5,7 @@ import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import (
CacheConfig,
DeviceConfig,
ModelConfig,
MultiModalConfig,
RendererConfig,
VllmConfig,
)
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor
......@@ -51,21 +44,22 @@ def _mock_input_processor(
monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
model_config = ModelConfig(
skip_tokenizer_init=True,
max_model_len=128,
mm_processor_cache_gb=mm_cache_gb,
generation_config="vllm",
)
model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
renderer_config = RendererConfig(
model_config=model_config,
tokenizer="dummy",
skip_tokenizer_init=True,
)
# Minimal multimodal_config to satisfy references in
# Processor.process_inputs.
class _MockMMConfig:
def __init__(self, gb: float):
self.mm_processor_cache_gb = gb
model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined]
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=renderer_config,
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
device_config=DeviceConfig(device="cpu"),
)
......
......@@ -15,7 +15,6 @@ from vllm.config import (
DeviceConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
)
......@@ -128,7 +127,6 @@ def create_vllm_config(
return VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
kv_transfer_config=kv_transfer_config,
device_config=DeviceConfig("cpu"),
......
......@@ -19,7 +19,6 @@ from vllm.config import (
DeviceConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
......@@ -62,7 +61,6 @@ def _create_proposer(
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),
......
......@@ -18,7 +18,6 @@ from vllm.config import (
DeviceConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
......@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),
......
......@@ -4,7 +4,6 @@ import numpy as np
from vllm.config import (
ModelConfig,
RendererConfig,
SpeculativeConfig,
VllmConfig,
)
......@@ -70,7 +69,6 @@ def test_ngram_proposer():
return NgramProposer(
vllm_config=VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
speculative_config=SpeculativeConfig(
prompt_lookup_min=min_n,
prompt_lookup_max=max_n,
......
......@@ -6,7 +6,7 @@ from concurrent.futures import Future
import pytest
from transformers import AutoTokenizer
from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig
from vllm.config import StructuredOutputsConfig, VllmConfig
from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig
......@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
def test_grammar_bitmask_with_specdec():
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
prompt = tokenizer.encode('{"a": "b"}')
model_config = ModelConfig(tokenizer=TOKENIZER)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
model_config=ModelConfig(tokenizer=TOKENIZER),
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
)
......@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
# Use "external_launcher" for sync mode, None for async mode
executor_backend = None if async_grammar else "external_launcher"
model_config = ModelConfig(tokenizer=TOKENIZER)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
model_config=ModelConfig(tokenizer=TOKENIZER),
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
)
......
......@@ -7,7 +7,7 @@ from unittest.mock import Mock
import pytest
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.reasoning import ReasoningParser
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
......@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
"""Test reasoning-aware structured output functionality."""
@pytest.fixture
def mock_renderer_config(self):
"""Create a mock RendererConfig."""
renderer_config = Mock(spec=RendererConfig)
renderer_config.skip_tokenizer_init = (
True # Skip tokenizer init to avoid network calls
)
model_config = Mock(spec=ModelConfig)
model_config.get_vocab_size = Mock(return_value=50000)
model_config.trust_remote_code = False
def mock_model_config(self):
"""Create a mock ModelConfig."""
config = Mock(spec=ModelConfig)
config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls
config.get_vocab_size = Mock(return_value=50000)
# Add missing runner_type attribute that tokenizer initialization expects
model_config.runner_type = "generate"
renderer_config.model_config = model_config
config.runner_type = "generate"
# Add other attributes that tokenizer initialization might need
renderer_config.tokenizer = "test-tokenizer"
renderer_config.tokenizer_mode = "auto"
renderer_config.tokenizer_revision = None
return renderer_config
config.tokenizer = "test-tokenizer"
config.tokenizer_mode = "auto"
config.trust_remote_code = False
config.tokenizer_revision = None
return config
@pytest.fixture
def mock_scheduler_config(self):
......@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
return config
@pytest.fixture
def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config):
def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
"""Create a mock VllmConfig."""
config = Mock(spec=VllmConfig)
config.renderer_config = mock_renderer_config
config.model_config = mock_model_config
config.scheduler_config = mock_scheduler_config
config.structured_outputs_config = Mock()
config.structured_outputs_config.reasoning_parser = None
......
......@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
from vllm.config import (
CacheConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
......@@ -46,7 +45,6 @@ def get_vllm_config():
)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
)
......
......@@ -13,7 +13,6 @@ from vllm.config import (
CacheConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
......@@ -102,7 +101,6 @@ def get_vllm_config():
parallel_config = ParallelConfig()
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
parallel_config=parallel_config,
......@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
parallel_config=parallel_config,
......
......@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
from vllm.config.observability import ObservabilityConfig
from vllm.config.parallel import EPLBConfig, ParallelConfig
from vllm.config.pooler import PoolerConfig
from vllm.config.renderer import RendererConfig
from vllm.config.scheduler import SchedulerConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig
......@@ -82,8 +81,6 @@ __all__ = [
"ParallelConfig",
# From vllm.config.pooler
"PoolerConfig",
# From vllm.config.renderer
"RendererConfig",
# From vllm.config.scheduler
"SchedulerConfig",
# From vllm.config.speculative
......
......@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
uses_xdrope_dim,
)
from vllm.transformers_utils.gguf_utils import (
is_gguf,
is_remote_gguf,
maybe_patch_hf_config_from_gguf,
split_remote_gguf,
......@@ -82,6 +83,7 @@ TaskOption = Literal[
"transcription",
"draft",
]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
......@@ -129,6 +131,18 @@ class ModelConfig:
Note that the model may support other tasks using the same model runner.
"""
tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.\n
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
......@@ -154,6 +168,13 @@ class ModelConfig:
hf_config_path: str | None = None
"""Name or path of the Hugging Face config to use. If unspecified, model
name or path will be used."""
allowed_local_media_path: str = ""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains: list[str] | None = None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
revision: str | None = None
"""The specific model version to use. It can be a branch name, a tag name,
or a commit id. If unspecified, will use the default version."""
......@@ -161,6 +182,10 @@ class ModelConfig:
"""The specific revision to use for the model code on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
tokenizer_revision: str | None = None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len: SkipValidation[int] = None # type: ignore
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.
......@@ -205,6 +230,10 @@ class ModelConfig:
preventing potential numerical issues. Note that even if this is set to
False, cascade attention will be only used when the heuristic tells that
it's beneficial."""
skip_tokenizer_init: bool = False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
enable_prompt_embeds: bool = False
"""If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key.
......@@ -265,6 +294,8 @@ class ModelConfig:
logits_processors: list[str | type[LogitsProcessor]] | None = None
"""One or more logits processors' fully-qualified class names or class
definitions"""
io_processor_plugin: str | None = None
"""IOProcessor plugin name to load at model startup"""
# Pooler config
pooler_config: PoolerConfig | None = None
......@@ -277,6 +308,7 @@ class ModelConfig:
from the architecture of `self.model`."""
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
enable_mm_embeds: InitVar[bool | None] = None
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
mm_processor_cache_gb: InitVar[float | None] = None
mm_processor_cache_type: InitVar[MMCacheType | None] = None
......@@ -303,12 +335,18 @@ class ModelConfig:
"runner",
"convert",
"task",
"tokenizer",
"tokenizer_mode",
"seed",
"hf_config_path",
"allowed_local_media_path",
"allowed_media_domains",
"tokenizer_revision",
"spec_target_max_model_len",
"enforce_eager",
"logprobs_mode",
"disable_cascade_attn",
"skip_tokenizer_init",
"served_model_name",
"config_format",
"hf_token",
......@@ -316,9 +354,11 @@ class ModelConfig:
"logits_processor_pattern",
"override_attention_dtype",
"logits_processors",
"io_processor_plugin",
"pooler_config",
"multimodal_config",
"limit_mm_per_prompt",
"media_io_kwargs",
"mm_processor_kwargs",
"mm_processor_cache_gb",
"mm_processor_cache_type",
......@@ -383,6 +423,7 @@ class ModelConfig:
# Multimodal config init vars
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
enable_mm_embeds: bool | None,
media_io_kwargs: dict[str, dict[str, Any]] | None,
mm_processor_kwargs: dict[str, Any] | None,
mm_processor_cache_gb: float | None,
mm_processor_cache_type: MMCacheType | None,
......@@ -397,8 +438,13 @@ class ModelConfig:
self.served_model_name = get_served_model_name(
self.model, self.served_model_name
)
self.original_model = self.model
self.model = maybe_model_redirect(self.original_model)
self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default.
if self.tokenizer is None:
self.tokenizer = self.model
if self.tokenizer_revision is None:
self.tokenizer_revision = self.revision
self.tokenizer = maybe_model_redirect(self.tokenizer)
if isinstance(self.hf_config_path, str):
self.hf_config_path = maybe_model_redirect(self.hf_config_path)
......@@ -419,7 +465,7 @@ class ModelConfig:
hf_overrides_kw[key] = value
hf_overrides_fn = None
self.maybe_pull_model_for_runai(self.model)
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
from vllm.platforms import current_platform
......@@ -602,8 +648,7 @@ class ModelConfig:
)
self.original_max_model_len = self.max_model_len
self.recalculate_max_model_len(self.original_max_model_len)
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
# Init multimodal config if needed
if self._model_info.supports_multimodal:
if (
......@@ -619,6 +664,7 @@ class ModelConfig:
mm_config_kwargs = dict(
limit_per_prompt=limit_mm_per_prompt,
enable_mm_embeds=enable_mm_embeds,
media_io_kwargs=media_io_kwargs,
mm_processor_kwargs=mm_processor_kwargs,
mm_processor_cache_gb=mm_processor_cache_gb,
mm_processor_cache_type=mm_processor_cache_type,
......@@ -636,8 +682,16 @@ class ModelConfig:
self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
# Multimodal GGUF models must use original repo for mm processing
if is_gguf(self.tokenizer) and self.is_multimodal_model:
raise ValueError(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
if self.disable_sliding_window:
# Set after recalculate_max_model_len to ensure that max_model_len
# Set after get_and_verify_max_len to ensure that max_model_len
# can be correctly capped to sliding window size
self.hf_text_config.sliding_window = None
......@@ -661,9 +715,10 @@ class ModelConfig:
@model_validator(mode="after")
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
if not isinstance(self.tokenizer, str):
raise ValueError("tokenizer must be a string after __post_init__.")
if not isinstance(self.max_model_len, int):
raise ValueError("max_model_len must be an integer after __post_init__.")
return self
def _get_transformers_backend_cls(self) -> str:
......@@ -712,17 +767,49 @@ class ModelConfig:
"""The architecture vllm actually used."""
return self._architecture
def maybe_pull_model_for_runai(self, model: str) -> None:
"""Pull model from Object Storage to temporary directory when needed."""
if not is_runai_obj_uri(model):
def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
"""Pull model/tokenizer from Object Storage to temporary
directory when needed.
Args:
model: Model name or path
tokenizer: Tokenizer name or path
"""
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
return
object_storage_model = ObjectStorageModel(url=model)
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"]
)
self.model_weights = model
self.model = object_storage_model.dir
if is_runai_obj_uri(model):
object_storage_model = ObjectStorageModel(url=model)
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"]
)
self.model_weights = model
self.model = object_storage_model.dir
# If tokenizer is same as model, download to same directory
if model == tokenizer:
object_storage_model.pull_files(
model,
ignore_pattern=[
"*.pt",
"*.safetensors",
"*.bin",
"*.tensors",
"*.pth",
],
)
self.tokenizer = object_storage_model.dir
return
# Only download tokenizer if needed and not already handled
if is_runai_obj_uri(tokenizer):
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
object_storage_tokenizer.pull_files(
model,
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
)
self.tokenizer = object_storage_tokenizer.dir
def _get_encoder_config(self):
model = self.model
......@@ -1625,38 +1712,30 @@ class ModelConfig:
return dense_modules[-1]["out_features"]
return self.get_hidden_size()
def recalculate_max_model_len(
self,
original_max_model_len: int | None,
*,
tokenizer: str | None = None,
tokenizer_revision: str | None = None,
) -> None:
def get_and_verify_max_len(self, max_model_len: int):
# Consider max_model_len in tokenizer_config only when
# pooling models use absolute position_embedding.
# NOTE: For simplicity we assume `args.model == args.tokenizer`
# since this is
tokenizer_config = None
if (
self.runner_type == "pooling"
and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
):
tokenizer_config = try_get_tokenizer_config(
tokenizer or self.model,
self.tokenizer,
trust_remote_code=self.trust_remote_code,
revision=tokenizer_revision or self.revision,
revision=self.tokenizer_revision,
)
self.max_model_len = _get_and_verify_max_len(
max_model_len = _get_and_verify_max_len(
hf_config=self.hf_text_config,
tokenizer_config=tokenizer_config,
max_model_len=original_max_model_len,
max_model_len=max_model_len,
disable_sliding_window=self.disable_sliding_window,
sliding_window=self.get_sliding_window(),
spec_target_max_model_len=self.spec_target_max_model_len,
encoder_config=self.encoder_config,
)
logger.info("Using max model len %s", self.max_model_len)
logger.info("Using max model len %s", max_model_len)
return max_model_len
@property
def attn_type(self) -> AttnTypeStr:
......
......@@ -79,6 +79,10 @@ class MultiModalConfig:
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
mm_processor_kwargs: dict[str, object] | None = None
"""Arguments to be forwarded to the model's processor for multi-modal data,
e.g., image processor. Overrides for the multi-modal processor obtained
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment