Unverified Commit e83b7e37 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)

parent 27f4c2fd
...@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected): ...@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
model_id, model_id,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) )
assert ( assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected
MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected
)
...@@ -13,7 +13,6 @@ from vllm.config import ( ...@@ -13,7 +13,6 @@ from vllm.config import (
CompilationConfig, CompilationConfig,
ModelConfig, ModelConfig,
PoolerConfig, PoolerConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
update_config, update_config,
...@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location): ...@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True), ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
], ],
) )
def test_recalculate_max_model_len( def test_get_and_verify_max_len(
model_id, max_model_len, expected_max_len, should_raise model_id, max_model_len, expected_max_len, should_raise
): ):
"""Test recalculate_max_model_len with different configurations.""" """Test get_and_verify_max_len with different configurations."""
model_config = ModelConfig(model_id) model_config = ModelConfig(model_id)
if should_raise: if should_raise:
with pytest.raises(ValueError): with pytest.raises(ValueError):
model_config.recalculate_max_model_len( model_config.get_and_verify_max_len(max_model_len)
max_model_len,
tokenizer=model_id,
tokenizer_revision=None,
)
else: else:
model_config.recalculate_max_model_len( actual_max_len = model_config.get_and_verify_max_len(max_model_len)
max_model_len, assert actual_max_len == expected_max_len
tokenizer=model_id,
tokenizer_revision=None,
)
assert model_config.max_model_len == expected_max_len
class MockModelConfig: class MockConfig:
"""Simple mock object for testing maybe_pull_model_for_runai""" """Simple mock object for testing maybe_pull_model_tokenizer_for_runai"""
def __init__(self, model: str): def __init__(self, model: str, tokenizer: str):
self.model = model self.model = model
self.tokenizer = tokenizer
self.model_weights = None
class MockRendererConfig:
"""Simple mock object for testing maybe_pull_tokenizer_for_runai"""
def __init__(self, model_config: MockModelConfig):
self.model_config = model_config
self.tokenizer = model_config.model
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url): ...@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
mock_pull_files.return_value = None mock_pull_files.return_value = None
# Create first mock and run the method # Create first mock and run the method
model_config1 = MockModelConfig(model=s3_url) config1 = MockConfig(model=s3_url, tokenizer=s3_url)
renderer_config1 = MockRendererConfig(model_config=model_config1) ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url)
# Check that model and tokenizer point to existing directories # Check that model and tokenizer point to existing directories
assert os.path.exists(model_config1.model), ( assert os.path.exists(config1.model), (
f"Model directory does not exist: {model_config1.model}" f"Model directory does not exist: {config1.model}"
) )
assert os.path.isdir(model_config1.model), ( assert os.path.isdir(config1.model), (
f"Model path is not a directory: {model_config1.model}" f"Model path is not a directory: {config1.model}"
) )
assert os.path.exists(renderer_config1.tokenizer), ( assert os.path.exists(config1.tokenizer), (
f"Tokenizer directory does not exist: {renderer_config1.tokenizer}" f"Tokenizer directory does not exist: {config1.tokenizer}"
) )
assert os.path.isdir(renderer_config1.tokenizer), ( assert os.path.isdir(config1.tokenizer), (
f"Tokenizer path is not a directory: {renderer_config1.tokenizer}" f"Tokenizer path is not a directory: {config1.tokenizer}"
) )
# Verify that the paths are different from the original S3 URL # Verify that the paths are different from the original S3 URL
assert model_config1.model != s3_url, ( assert config1.model != s3_url, "Model path should be converted to local directory"
"Model path should be converted to local directory" assert config1.tokenizer != s3_url, (
)
assert renderer_config1.tokenizer != s3_url, (
"Tokenizer path should be converted to local directory" "Tokenizer path should be converted to local directory"
) )
# Store the original paths # Store the original paths
created_model_dir = model_config1.model created_model_dir = config1.model
create_tokenizer_dir = renderer_config1.tokenizer create_tokenizer_dir = config1.tokenizer
# Create a new mock and run the method with the same S3 URL # Create a new mock and run the method with the same S3 URL
model_config2 = MockModelConfig(model=s3_url) config2 = MockConfig(model=s3_url, tokenizer=s3_url)
renderer_config2 = MockRendererConfig(model_config=model_config2) ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url)
# Check that the new directories exist # Check that the new directories exist
assert os.path.exists(model_config2.model), ( assert os.path.exists(config2.model), (
f"Model directory does not exist: {model_config2.model}" f"Model directory does not exist: {config2.model}"
) )
assert os.path.isdir(model_config2.model), ( assert os.path.isdir(config2.model), (
f"Model path is not a directory: {model_config2.model}" f"Model path is not a directory: {config2.model}"
) )
assert os.path.exists(renderer_config2.tokenizer), ( assert os.path.exists(config2.tokenizer), (
f"Tokenizer directory does not exist: {renderer_config2.tokenizer}" f"Tokenizer directory does not exist: {config2.tokenizer}"
) )
assert os.path.isdir(renderer_config2.tokenizer), ( assert os.path.isdir(config2.tokenizer), (
f"Tokenizer path is not a directory: {renderer_config2.tokenizer}" f"Tokenizer path is not a directory: {config2.tokenizer}"
) )
# Verify that the paths are deterministic (same as before) # Verify that the paths are deterministic (same as before)
assert model_config2.model == created_model_dir, ( assert config2.model == created_model_dir, (
f"Model paths are not deterministic. " f"Model paths are not deterministic. "
f"Original: {created_model_dir}, New: {model_config2.model}" f"Original: {created_model_dir}, New: {config2.model}"
) )
assert renderer_config2.tokenizer == create_tokenizer_dir, ( assert config2.tokenizer == create_tokenizer_dir, (
f"Tokenizer paths are not deterministic. " f"Tokenizer paths are not deterministic. "
f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}" f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
) )
...@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): ...@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
s3_url2 = "s3://example-bucket-2/model/" s3_url2 = "s3://example-bucket-2/model/"
# Create mocks with different S3 URLs and run the method # Create mocks with different S3 URLs and run the method
model_config1 = MockModelConfig(model=s3_url1) config1 = MockConfig(model=s3_url1, tokenizer=s3_url1)
renderer_config1 = MockRendererConfig(model_config=model_config1) ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1)
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1)
model_config2 = MockModelConfig(model=s3_url2) config2 = MockConfig(model=s3_url2, tokenizer=s3_url2)
renderer_config2 = MockRendererConfig(model_config=model_config2) ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2)
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2)
# Verify that different URLs produce different directories # Verify that different URLs produce different directories
assert model_config1.model != model_config2.model, ( assert config1.model != config2.model, (
f"Different S3 URLs should create different model directories. " f"Different S3 URLs should create different model directories. "
f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}" f"URL1 model: {config1.model}, URL2 model: {config2.model}"
) )
assert renderer_config1.tokenizer != renderer_config2.tokenizer, ( assert config1.tokenizer != config2.tokenizer, (
f"Different S3 URLs should create different tokenizer directories. " f"Different S3 URLs should create different tokenizer directories. "
f"URL1 tokenizer: {renderer_config1.tokenizer}, " f"URL1 tokenizer: {config1.tokenizer}, "
f"URL2 tokenizer: {renderer_config2.tokenizer}" f"URL2 tokenizer: {config2.tokenizer}"
) )
# Verify that both sets of directories exist # Verify that both sets of directories exist
assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model) assert os.path.exists(config1.model) and os.path.isdir(config1.model)
assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir( assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
renderer_config1.tokenizer assert os.path.exists(config2.model) and os.path.isdir(config2.model)
) assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model)
assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir(
renderer_config2.tokenizer
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from vllm.config import ModelConfig, RendererConfig from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor from vllm.inputs.preprocess import InputPreprocessor
...@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ...@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
) )
def test_preprocessor_always_mm_code_path(model_id, prompt): def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id) model_config = ModelConfig(model=model_id)
renderer_config = RendererConfig(model_config=model_config) tokenizer = init_tokenizer_from_config(model_config)
tokenizer = init_tokenizer_from_config(renderer_config) input_preprocessor = InputPreprocessor(model_config, tokenizer)
input_preprocessor = InputPreprocessor(renderer_config, tokenizer)
# HF processor adds sep token # HF processor adds sep token
sep_token_id = tokenizer.vocab[tokenizer.sep_token] sep_token_id = tokenizer.vocab[tokenizer.sep_token]
......
...@@ -16,7 +16,6 @@ from vllm.config import ( ...@@ -16,7 +16,6 @@ from vllm.config import (
LoadConfig, LoadConfig,
ModelConfig, ModelConfig,
ParallelConfig, ParallelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
) )
...@@ -217,7 +216,6 @@ def create_vllm_config( ...@@ -217,7 +216,6 @@ def create_vllm_config(
return VllmConfig( return VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
parallel_config=parallel_config, parallel_config=parallel_config,
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
......
...@@ -8,7 +8,7 @@ import pytest ...@@ -8,7 +8,7 @@ import pytest
import torch import torch
import vllm.v1.core.kv_cache_utils as kv_cache_utils import vllm.v1.core.kv_cache_utils as kv_cache_utils
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalFeatureSpec, MultiModalFeatureSpec,
...@@ -667,10 +667,7 @@ def test_metrics_empty_stats(): ...@@ -667,10 +667,7 @@ def test_metrics_empty_stats():
def test_get_kv_cache_configs_multiple_workers(): def test_get_kv_cache_configs_multiple_workers():
model_config = ModelConfig(max_model_len=16) model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig( vllm_config = VllmConfig(model_config=model_config)
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
ref_kv_cache_spec = new_kv_cache_spec() ref_kv_cache_spec = new_kv_cache_spec()
same_kv_cache_specs = [ same_kv_cache_specs = [
...@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len) ...@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
) )
...@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config(): ...@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
) )
...@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead(): ...@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
def test_get_kv_cache_config_one_worker(): def test_get_kv_cache_config_one_worker():
# pass max_model_len to pass check_enough_kv_cache_memory # pass max_model_len to pass check_enough_kv_cache_memory
model_config = ModelConfig(max_model_len=16) model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig( vllm_config = VllmConfig(model_config=model_config)
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
# all layers are full attention -> single group # all layers are full attention -> single group
...@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker(): ...@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
def test_get_kv_cache_configs_attention_free(): def test_get_kv_cache_configs_attention_free():
kv_cache_specs: dict[str, KVCacheSpec] = {} kv_cache_specs: dict[str, KVCacheSpec] = {}
model_config = ModelConfig(max_model_len=16) vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0]) kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
assert kv_cache_configs == [ assert kv_cache_configs == [
KVCacheConfig( KVCacheConfig(
......
...@@ -11,7 +11,6 @@ from vllm.config import ( ...@@ -11,7 +11,6 @@ from vllm.config import (
ECTransferConfig, ECTransferConfig,
KVTransferConfig, KVTransferConfig,
ModelConfig, ModelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
SpeculativeConfig, SpeculativeConfig,
VllmConfig, VllmConfig,
...@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority( ...@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
vllm_config = VllmConfig( vllm_config = VllmConfig(
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
kv_transfer_config=kv_transfer_config, kv_transfer_config=kv_transfer_config,
speculative_config=speculative_config, speculative_config=speculative_config,
......
...@@ -9,7 +9,6 @@ from vllm.config import ( ...@@ -9,7 +9,6 @@ from vllm.config import (
ECTransferConfig, ECTransferConfig,
KVTransferConfig, KVTransferConfig,
ModelConfig, ModelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
SpeculativeConfig, SpeculativeConfig,
VllmConfig, VllmConfig,
...@@ -133,7 +132,6 @@ def create_scheduler( ...@@ -133,7 +132,6 @@ def create_scheduler(
vllm_config = VllmConfig( vllm_config = VllmConfig(
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
kv_transfer_config=kv_transfer_config, kv_transfer_config=kv_transfer_config,
speculative_config=speculative_config, speculative_config=speculative_config,
......
...@@ -15,7 +15,6 @@ from vllm.config import ( ...@@ -15,7 +15,6 @@ from vllm.config import (
ECTransferConfig, ECTransferConfig,
KVTransferConfig, KVTransferConfig,
ModelConfig, ModelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
) )
...@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache( ...@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
kv_transfer_config=kv_transfer_config, kv_transfer_config=kv_transfer_config,
......
...@@ -5,14 +5,7 @@ import pytest ...@@ -5,14 +5,7 @@ import pytest
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import ( from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
CacheConfig,
DeviceConfig,
ModelConfig,
MultiModalConfig,
RendererConfig,
VllmConfig,
)
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor from vllm.v1.engine.input_processor import InputProcessor
...@@ -51,21 +44,22 @@ def _mock_input_processor( ...@@ -51,21 +44,22 @@ def _mock_input_processor(
monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
model_config = ModelConfig( model_config = ModelConfig(
skip_tokenizer_init=True,
max_model_len=128, max_model_len=128,
mm_processor_cache_gb=mm_cache_gb, mm_processor_cache_gb=mm_cache_gb,
generation_config="vllm", generation_config="vllm",
)
model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
renderer_config = RendererConfig(
model_config=model_config,
tokenizer="dummy", tokenizer="dummy",
skip_tokenizer_init=True,
) )
# Minimal multimodal_config to satisfy references in
# Processor.process_inputs.
class _MockMMConfig:
def __init__(self, gb: float):
self.mm_processor_cache_gb = gb
model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined]
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=renderer_config,
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
device_config=DeviceConfig(device="cpu"), device_config=DeviceConfig(device="cpu"),
) )
......
...@@ -15,7 +15,6 @@ from vllm.config import ( ...@@ -15,7 +15,6 @@ from vllm.config import (
DeviceConfig, DeviceConfig,
KVTransferConfig, KVTransferConfig,
ModelConfig, ModelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
) )
...@@ -128,7 +127,6 @@ def create_vllm_config( ...@@ -128,7 +127,6 @@ def create_vllm_config(
return VllmConfig( return VllmConfig(
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
kv_transfer_config=kv_transfer_config, kv_transfer_config=kv_transfer_config,
device_config=DeviceConfig("cpu"), device_config=DeviceConfig("cpu"),
......
...@@ -19,7 +19,6 @@ from vllm.config import ( ...@@ -19,7 +19,6 @@ from vllm.config import (
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
ParallelConfig, ParallelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
SpeculativeConfig, SpeculativeConfig,
VllmConfig, VllmConfig,
...@@ -62,7 +61,6 @@ def _create_proposer( ...@@ -62,7 +61,6 @@ def _create_proposer(
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=CacheConfig(), cache_config=CacheConfig(),
speculative_config=speculative_config, speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type), device_config=DeviceConfig(device=current_platform.device_type),
......
...@@ -18,7 +18,6 @@ from vllm.config import ( ...@@ -18,7 +18,6 @@ from vllm.config import (
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
ParallelConfig, ParallelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
SpeculativeConfig, SpeculativeConfig,
VllmConfig, VllmConfig,
...@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: ...@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=CacheConfig(), cache_config=CacheConfig(),
speculative_config=speculative_config, speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type), device_config=DeviceConfig(device=current_platform.device_type),
......
...@@ -4,7 +4,6 @@ import numpy as np ...@@ -4,7 +4,6 @@ import numpy as np
from vllm.config import ( from vllm.config import (
ModelConfig, ModelConfig,
RendererConfig,
SpeculativeConfig, SpeculativeConfig,
VllmConfig, VllmConfig,
) )
...@@ -70,7 +69,6 @@ def test_ngram_proposer(): ...@@ -70,7 +69,6 @@ def test_ngram_proposer():
return NgramProposer( return NgramProposer(
vllm_config=VllmConfig( vllm_config=VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
speculative_config=SpeculativeConfig( speculative_config=SpeculativeConfig(
prompt_lookup_min=min_n, prompt_lookup_min=min_n,
prompt_lookup_max=max_n, prompt_lookup_max=max_n,
......
...@@ -6,7 +6,7 @@ from concurrent.futures import Future ...@@ -6,7 +6,7 @@ from concurrent.futures import Future
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig from vllm.config import StructuredOutputsConfig, VllmConfig
from vllm.config.model import ModelConfig from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig from vllm.config.speculative import SpeculativeConfig
...@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated(): ...@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
def test_grammar_bitmask_with_specdec(): def test_grammar_bitmask_with_specdec():
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
prompt = tokenizer.encode('{"a": "b"}') prompt = tokenizer.encode('{"a": "b"}')
model_config = ModelConfig(tokenizer=TOKENIZER)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=ModelConfig(tokenizer=TOKENIZER),
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
structured_outputs_config=StructuredOutputsConfig(backend="guidance"), structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3), speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
) )
...@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar): ...@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
# Use "external_launcher" for sync mode, None for async mode # Use "external_launcher" for sync mode, None for async mode
executor_backend = None if async_grammar else "external_launcher" executor_backend = None if async_grammar else "external_launcher"
model_config = ModelConfig(tokenizer=TOKENIZER)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=ModelConfig(tokenizer=TOKENIZER),
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
structured_outputs_config=StructuredOutputsConfig(backend="guidance"), structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend), parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
) )
......
...@@ -7,7 +7,7 @@ from unittest.mock import Mock ...@@ -7,7 +7,7 @@ from unittest.mock import Mock
import pytest import pytest
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.reasoning import ReasoningParser from vllm.reasoning import ReasoningParser
from vllm.v1.request import Request from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
...@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput: ...@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
"""Test reasoning-aware structured output functionality.""" """Test reasoning-aware structured output functionality."""
@pytest.fixture @pytest.fixture
def mock_renderer_config(self): def mock_model_config(self):
"""Create a mock RendererConfig.""" """Create a mock ModelConfig."""
renderer_config = Mock(spec=RendererConfig) config = Mock(spec=ModelConfig)
renderer_config.skip_tokenizer_init = ( config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls
True # Skip tokenizer init to avoid network calls config.get_vocab_size = Mock(return_value=50000)
)
model_config = Mock(spec=ModelConfig)
model_config.get_vocab_size = Mock(return_value=50000)
model_config.trust_remote_code = False
# Add missing runner_type attribute that tokenizer initialization expects # Add missing runner_type attribute that tokenizer initialization expects
model_config.runner_type = "generate" config.runner_type = "generate"
renderer_config.model_config = model_config
# Add other attributes that tokenizer initialization might need # Add other attributes that tokenizer initialization might need
renderer_config.tokenizer = "test-tokenizer" config.tokenizer = "test-tokenizer"
renderer_config.tokenizer_mode = "auto" config.tokenizer_mode = "auto"
renderer_config.tokenizer_revision = None config.trust_remote_code = False
config.tokenizer_revision = None
return renderer_config return config
@pytest.fixture @pytest.fixture
def mock_scheduler_config(self): def mock_scheduler_config(self):
...@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput: ...@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
return config return config
@pytest.fixture @pytest.fixture
def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config): def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
"""Create a mock VllmConfig.""" """Create a mock VllmConfig."""
config = Mock(spec=VllmConfig) config = Mock(spec=VllmConfig)
config.renderer_config = mock_renderer_config config.model_config = mock_model_config
config.scheduler_config = mock_scheduler_config config.scheduler_config = mock_scheduler_config
config.structured_outputs_config = Mock() config.structured_outputs_config = Mock()
config.structured_outputs_config.reasoning_parser = None config.structured_outputs_config.reasoning_parser = None
......
...@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention ...@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
from vllm.config import ( from vllm.config import (
CacheConfig, CacheConfig,
ModelConfig, ModelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
...@@ -46,7 +45,6 @@ def get_vllm_config(): ...@@ -46,7 +45,6 @@ def get_vllm_config():
) )
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
) )
......
...@@ -13,7 +13,6 @@ from vllm.config import ( ...@@ -13,7 +13,6 @@ from vllm.config import (
CacheConfig, CacheConfig,
ModelConfig, ModelConfig,
ParallelConfig, ParallelConfig,
RendererConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
...@@ -102,7 +101,6 @@ def get_vllm_config(): ...@@ -102,7 +101,6 @@ def get_vllm_config():
parallel_config = ParallelConfig() parallel_config = ParallelConfig()
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
parallel_config=parallel_config, parallel_config=parallel_config,
...@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes(): ...@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER) attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config, cache_config=cache_config,
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
parallel_config=parallel_config, parallel_config=parallel_config,
......
...@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig ...@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
from vllm.config.observability import ObservabilityConfig from vllm.config.observability import ObservabilityConfig
from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.parallel import EPLBConfig, ParallelConfig
from vllm.config.pooler import PoolerConfig from vllm.config.pooler import PoolerConfig
from vllm.config.renderer import RendererConfig
from vllm.config.scheduler import SchedulerConfig from vllm.config.scheduler import SchedulerConfig
from vllm.config.speculative import SpeculativeConfig from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig from vllm.config.speech_to_text import SpeechToTextConfig
...@@ -82,8 +81,6 @@ __all__ = [ ...@@ -82,8 +81,6 @@ __all__ = [
"ParallelConfig", "ParallelConfig",
# From vllm.config.pooler # From vllm.config.pooler
"PoolerConfig", "PoolerConfig",
# From vllm.config.renderer
"RendererConfig",
# From vllm.config.scheduler # From vllm.config.scheduler
"SchedulerConfig", "SchedulerConfig",
# From vllm.config.speculative # From vllm.config.speculative
......
...@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import ( ...@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
uses_xdrope_dim, uses_xdrope_dim,
) )
from vllm.transformers_utils.gguf_utils import ( from vllm.transformers_utils.gguf_utils import (
is_gguf,
is_remote_gguf, is_remote_gguf,
maybe_patch_hf_config_from_gguf, maybe_patch_hf_config_from_gguf,
split_remote_gguf, split_remote_gguf,
...@@ -82,6 +83,7 @@ TaskOption = Literal[ ...@@ -82,6 +83,7 @@ TaskOption = Literal[
"transcription", "transcription",
"draft", "draft",
] ]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[ LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
...@@ -129,6 +131,18 @@ class ModelConfig: ...@@ -129,6 +131,18 @@ class ModelConfig:
Note that the model may support other tasks using the same model runner. Note that the model may support other tasks using the same model runner.
""" """
tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.\n
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model """Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer.""" and tokenizer."""
...@@ -154,6 +168,13 @@ class ModelConfig: ...@@ -154,6 +168,13 @@ class ModelConfig:
hf_config_path: str | None = None hf_config_path: str | None = None
"""Name or path of the Hugging Face config to use. If unspecified, model """Name or path of the Hugging Face config to use. If unspecified, model
name or path will be used.""" name or path will be used."""
allowed_local_media_path: str = ""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains: list[str] | None = None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
revision: str | None = None revision: str | None = None
"""The specific model version to use. It can be a branch name, a tag name, """The specific model version to use. It can be a branch name, a tag name,
or a commit id. If unspecified, will use the default version.""" or a commit id. If unspecified, will use the default version."""
...@@ -161,6 +182,10 @@ class ModelConfig: ...@@ -161,6 +182,10 @@ class ModelConfig:
"""The specific revision to use for the model code on the Hugging Face Hub. """The specific revision to use for the model code on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version.""" use the default version."""
tokenizer_revision: str | None = None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len: SkipValidation[int] = None # type: ignore max_model_len: SkipValidation[int] = None # type: ignore
"""Model context length (prompt and output). If unspecified, will be """Model context length (prompt and output). If unspecified, will be
automatically derived from the model config. automatically derived from the model config.
...@@ -205,6 +230,10 @@ class ModelConfig: ...@@ -205,6 +230,10 @@ class ModelConfig:
preventing potential numerical issues. Note that even if this is set to preventing potential numerical issues. Note that even if this is set to
False, cascade attention will be only used when the heuristic tells that False, cascade attention will be only used when the heuristic tells that
it's beneficial.""" it's beneficial."""
skip_tokenizer_init: bool = False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
enable_prompt_embeds: bool = False enable_prompt_embeds: bool = False
"""If `True`, enables passing text embeddings as inputs via the """If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key. `prompt_embeds` key.
...@@ -265,6 +294,8 @@ class ModelConfig: ...@@ -265,6 +294,8 @@ class ModelConfig:
logits_processors: list[str | type[LogitsProcessor]] | None = None logits_processors: list[str | type[LogitsProcessor]] | None = None
"""One or more logits processors' fully-qualified class names or class """One or more logits processors' fully-qualified class names or class
definitions""" definitions"""
io_processor_plugin: str | None = None
"""IOProcessor plugin name to load at model startup"""
# Pooler config # Pooler config
pooler_config: PoolerConfig | None = None pooler_config: PoolerConfig | None = None
...@@ -277,6 +308,7 @@ class ModelConfig: ...@@ -277,6 +308,7 @@ class ModelConfig:
from the architecture of `self.model`.""" from the architecture of `self.model`."""
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
enable_mm_embeds: InitVar[bool | None] = None enable_mm_embeds: InitVar[bool | None] = None
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
mm_processor_cache_gb: InitVar[float | None] = None mm_processor_cache_gb: InitVar[float | None] = None
mm_processor_cache_type: InitVar[MMCacheType | None] = None mm_processor_cache_type: InitVar[MMCacheType | None] = None
...@@ -303,12 +335,18 @@ class ModelConfig: ...@@ -303,12 +335,18 @@ class ModelConfig:
"runner", "runner",
"convert", "convert",
"task", "task",
"tokenizer",
"tokenizer_mode",
"seed", "seed",
"hf_config_path", "hf_config_path",
"allowed_local_media_path",
"allowed_media_domains",
"tokenizer_revision",
"spec_target_max_model_len", "spec_target_max_model_len",
"enforce_eager", "enforce_eager",
"logprobs_mode", "logprobs_mode",
"disable_cascade_attn", "disable_cascade_attn",
"skip_tokenizer_init",
"served_model_name", "served_model_name",
"config_format", "config_format",
"hf_token", "hf_token",
...@@ -316,9 +354,11 @@ class ModelConfig: ...@@ -316,9 +354,11 @@ class ModelConfig:
"logits_processor_pattern", "logits_processor_pattern",
"override_attention_dtype", "override_attention_dtype",
"logits_processors", "logits_processors",
"io_processor_plugin",
"pooler_config", "pooler_config",
"multimodal_config", "multimodal_config",
"limit_mm_per_prompt", "limit_mm_per_prompt",
"media_io_kwargs",
"mm_processor_kwargs", "mm_processor_kwargs",
"mm_processor_cache_gb", "mm_processor_cache_gb",
"mm_processor_cache_type", "mm_processor_cache_type",
...@@ -383,6 +423,7 @@ class ModelConfig: ...@@ -383,6 +423,7 @@ class ModelConfig:
# Multimodal config init vars # Multimodal config init vars
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
enable_mm_embeds: bool | None, enable_mm_embeds: bool | None,
media_io_kwargs: dict[str, dict[str, Any]] | None,
mm_processor_kwargs: dict[str, Any] | None, mm_processor_kwargs: dict[str, Any] | None,
mm_processor_cache_gb: float | None, mm_processor_cache_gb: float | None,
mm_processor_cache_type: MMCacheType | None, mm_processor_cache_type: MMCacheType | None,
...@@ -397,8 +438,13 @@ class ModelConfig: ...@@ -397,8 +438,13 @@ class ModelConfig:
self.served_model_name = get_served_model_name( self.served_model_name = get_served_model_name(
self.model, self.served_model_name self.model, self.served_model_name
) )
self.original_model = self.model self.model = maybe_model_redirect(self.model)
self.model = maybe_model_redirect(self.original_model) # The tokenizer is consistent with the model by default.
if self.tokenizer is None:
self.tokenizer = self.model
if self.tokenizer_revision is None:
self.tokenizer_revision = self.revision
self.tokenizer = maybe_model_redirect(self.tokenizer)
if isinstance(self.hf_config_path, str): if isinstance(self.hf_config_path, str):
self.hf_config_path = maybe_model_redirect(self.hf_config_path) self.hf_config_path = maybe_model_redirect(self.hf_config_path)
...@@ -419,7 +465,7 @@ class ModelConfig: ...@@ -419,7 +465,7 @@ class ModelConfig:
hf_overrides_kw[key] = value hf_overrides_kw[key] = value
hf_overrides_fn = None hf_overrides_fn = None
self.maybe_pull_model_for_runai(self.model) self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -602,8 +648,7 @@ class ModelConfig: ...@@ -602,8 +648,7 @@ class ModelConfig:
) )
self.original_max_model_len = self.max_model_len self.original_max_model_len = self.max_model_len
self.recalculate_max_model_len(self.original_max_model_len) self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
# Init multimodal config if needed # Init multimodal config if needed
if self._model_info.supports_multimodal: if self._model_info.supports_multimodal:
if ( if (
...@@ -619,6 +664,7 @@ class ModelConfig: ...@@ -619,6 +664,7 @@ class ModelConfig:
mm_config_kwargs = dict( mm_config_kwargs = dict(
limit_per_prompt=limit_mm_per_prompt, limit_per_prompt=limit_mm_per_prompt,
enable_mm_embeds=enable_mm_embeds, enable_mm_embeds=enable_mm_embeds,
media_io_kwargs=media_io_kwargs,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
mm_processor_cache_gb=mm_processor_cache_gb, mm_processor_cache_gb=mm_processor_cache_gb,
mm_processor_cache_type=mm_processor_cache_type, mm_processor_cache_type=mm_processor_cache_type,
...@@ -636,8 +682,16 @@ class ModelConfig: ...@@ -636,8 +682,16 @@ class ModelConfig:
self.multimodal_config = MultiModalConfig(**mm_config_kwargs) self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
# Multimodal GGUF models must use original repo for mm processing
if is_gguf(self.tokenizer) and self.is_multimodal_model:
raise ValueError(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
if self.disable_sliding_window: if self.disable_sliding_window:
# Set after recalculate_max_model_len to ensure that max_model_len # Set after get_and_verify_max_len to ensure that max_model_len
# can be correctly capped to sliding window size # can be correctly capped to sliding window size
self.hf_text_config.sliding_window = None self.hf_text_config.sliding_window = None
...@@ -661,9 +715,10 @@ class ModelConfig: ...@@ -661,9 +715,10 @@ class ModelConfig:
@model_validator(mode="after") @model_validator(mode="after")
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
if not isinstance(self.tokenizer, str):
raise ValueError("tokenizer must be a string after __post_init__.")
if not isinstance(self.max_model_len, int): if not isinstance(self.max_model_len, int):
raise ValueError("max_model_len must be an integer after __post_init__.") raise ValueError("max_model_len must be an integer after __post_init__.")
return self return self
def _get_transformers_backend_cls(self) -> str: def _get_transformers_backend_cls(self) -> str:
...@@ -712,11 +767,19 @@ class ModelConfig: ...@@ -712,11 +767,19 @@ class ModelConfig:
"""The architecture vllm actually used.""" """The architecture vllm actually used."""
return self._architecture return self._architecture
def maybe_pull_model_for_runai(self, model: str) -> None: def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
"""Pull model from Object Storage to temporary directory when needed.""" """Pull model/tokenizer from Object Storage to temporary
if not is_runai_obj_uri(model): directory when needed.
Args:
model: Model name or path
tokenizer: Tokenizer name or path
"""
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
return return
if is_runai_obj_uri(model):
object_storage_model = ObjectStorageModel(url=model) object_storage_model = ObjectStorageModel(url=model)
object_storage_model.pull_files( object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"] model, allow_pattern=["*.model", "*.py", "*.json"]
...@@ -724,6 +787,30 @@ class ModelConfig: ...@@ -724,6 +787,30 @@ class ModelConfig:
self.model_weights = model self.model_weights = model
self.model = object_storage_model.dir self.model = object_storage_model.dir
# If tokenizer is same as model, download to same directory
if model == tokenizer:
object_storage_model.pull_files(
model,
ignore_pattern=[
"*.pt",
"*.safetensors",
"*.bin",
"*.tensors",
"*.pth",
],
)
self.tokenizer = object_storage_model.dir
return
# Only download tokenizer if needed and not already handled
if is_runai_obj_uri(tokenizer):
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
object_storage_tokenizer.pull_files(
model,
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
)
self.tokenizer = object_storage_tokenizer.dir
def _get_encoder_config(self): def _get_encoder_config(self):
model = self.model model = self.model
if is_remote_gguf(model): if is_remote_gguf(model):
...@@ -1625,38 +1712,30 @@ class ModelConfig: ...@@ -1625,38 +1712,30 @@ class ModelConfig:
return dense_modules[-1]["out_features"] return dense_modules[-1]["out_features"]
return self.get_hidden_size() return self.get_hidden_size()
def recalculate_max_model_len( def get_and_verify_max_len(self, max_model_len: int):
self,
original_max_model_len: int | None,
*,
tokenizer: str | None = None,
tokenizer_revision: str | None = None,
) -> None:
# Consider max_model_len in tokenizer_config only when # Consider max_model_len in tokenizer_config only when
# pooling models use absolute position_embedding. # pooling models use absolute position_embedding.
# NOTE: For simplicity we assume `args.model == args.tokenizer`
# since this is
tokenizer_config = None tokenizer_config = None
if ( if (
self.runner_type == "pooling" self.runner_type == "pooling"
and getattr(self.hf_config, "position_embedding_type", "") == "absolute" and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
): ):
tokenizer_config = try_get_tokenizer_config( tokenizer_config = try_get_tokenizer_config(
tokenizer or self.model, self.tokenizer,
trust_remote_code=self.trust_remote_code, trust_remote_code=self.trust_remote_code,
revision=tokenizer_revision or self.revision, revision=self.tokenizer_revision,
) )
max_model_len = _get_and_verify_max_len(
self.max_model_len = _get_and_verify_max_len(
hf_config=self.hf_text_config, hf_config=self.hf_text_config,
tokenizer_config=tokenizer_config, tokenizer_config=tokenizer_config,
max_model_len=original_max_model_len, max_model_len=max_model_len,
disable_sliding_window=self.disable_sliding_window, disable_sliding_window=self.disable_sliding_window,
sliding_window=self.get_sliding_window(), sliding_window=self.get_sliding_window(),
spec_target_max_model_len=self.spec_target_max_model_len, spec_target_max_model_len=self.spec_target_max_model_len,
encoder_config=self.encoder_config, encoder_config=self.encoder_config,
) )
logger.info("Using max model len %s", self.max_model_len) logger.info("Using max model len %s", max_model_len)
return max_model_len
@property @property
def attn_type(self) -> AttnTypeStr: def attn_type(self) -> AttnTypeStr:
......
...@@ -79,6 +79,10 @@ class MultiModalConfig: ...@@ -79,6 +79,10 @@ class MultiModalConfig:
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!""" Only enable this flag for trusted users!"""
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
mm_processor_kwargs: dict[str, object] | None = None mm_processor_kwargs: dict[str, object] | None = None
"""Arguments to be forwarded to the model's processor for multi-modal data, """Arguments to be forwarded to the model's processor for multi-modal data,
e.g., image processor. Overrides for the multi-modal processor obtained e.g., image processor. Overrides for the multi-modal processor obtained
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment