Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e83b7e37
Unverified
Commit
e83b7e37
authored
Dec 07, 2025
by
Cyrus Leung
Committed by
GitHub
Dec 07, 2025
Browse files
Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)
parent
27f4c2fd
Changes
105
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
199 additions
and
203 deletions
+199
-203
tests/multimodal/test_registry.py
tests/multimodal/test_registry.py
+1
-3
tests/test_config.py
tests/test_config.py
+51
-80
tests/test_inputs.py
tests/test_inputs.py
+3
-4
tests/v1/attention/utils.py
tests/v1/attention/utils.py
+0
-2
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+4
-16
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+0
-2
tests/v1/core/utils.py
tests/v1/core/utils.py
+0
-2
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+0
-2
tests/v1/engine/test_process_multi_modal_uuids.py
tests/v1/engine/test_process_multi_modal_uuids.py
+9
-15
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+0
-2
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+0
-2
tests/v1/spec_decode/test_mtp.py
tests/v1/spec_decode/test_mtp.py
+0
-2
tests/v1/spec_decode/test_ngram.py
tests/v1/spec_decode/test_ngram.py
+0
-2
tests/v1/structured_output/test_backend_guidance.py
tests/v1/structured_output/test_backend_guidance.py
+3
-9
tests/v1/structured_output/test_reasoning_structured_output.py
.../v1/structured_output/test_reasoning_structured_output.py
+14
-21
tests/v1/tpu/worker/test_tpu_model_runner.py
tests/v1/tpu/worker/test_tpu_model_runner.py
+0
-2
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+0
-3
vllm/config/__init__.py
vllm/config/__init__.py
+0
-3
vllm/config/model.py
vllm/config/model.py
+110
-31
vllm/config/multimodal.py
vllm/config/multimodal.py
+4
-0
No files found.
tests/multimodal/test_registry.py
View file @
e83b7e37
...
...
@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
model_id
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
assert
(
MULTIMODAL_REGISTRY
.
supports_multimodal_inputs
(
ctx
.
renderer_config
)
is
expected
)
assert
MULTIMODAL_REGISTRY
.
supports_multimodal_inputs
(
ctx
.
model_config
)
is
expected
tests/test_config.py
View file @
e83b7e37
...
...
@@ -13,7 +13,6 @@ from vllm.config import (
CompilationConfig
,
ModelConfig
,
PoolerConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
,
update_config
,
...
...
@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
,
131073
,
131072
,
True
),
],
)
def
test_
recalculate_max_model
_len
(
def
test_
get_and_verify_max
_len
(
model_id
,
max_model_len
,
expected_max_len
,
should_raise
):
"""Test
recalculate_max_model
_len with different configurations."""
"""Test
get_and_verify_max
_len with different configurations."""
model_config
=
ModelConfig
(
model_id
)
if
should_raise
:
with
pytest
.
raises
(
ValueError
):
model_config
.
recalculate_max_model_len
(
max_model_len
,
tokenizer
=
model_id
,
tokenizer_revision
=
None
,
)
model_config
.
get_and_verify_max_len
(
max_model_len
)
else
:
model_config
.
recalculate_max_model_len
(
max_model_len
,
tokenizer
=
model_id
,
tokenizer_revision
=
None
,
)
assert
model_config
.
max_model_len
==
expected_max_len
actual_max_len
=
model_config
.
get_and_verify_max_len
(
max_model_len
)
assert
actual_max_len
==
expected_max_len
class
Mock
Model
Config
:
"""Simple mock object for testing maybe_pull_model_for_runai"""
class
MockConfig
:
"""Simple mock object for testing maybe_pull_model_
tokenizer_
for_runai"""
def
__init__
(
self
,
model
:
str
):
def
__init__
(
self
,
model
:
str
,
tokenizer
:
str
):
self
.
model
=
model
class
MockRendererConfig
:
"""Simple mock object for testing maybe_pull_tokenizer_for_runai"""
def
__init__
(
self
,
model_config
:
MockModelConfig
):
self
.
model_config
=
model_config
self
.
tokenizer
=
model_config
.
model
self
.
tokenizer
=
tokenizer
self
.
model_weights
=
None
@
pytest
.
mark
.
parametrize
(
...
...
@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
mock_pull_files
.
return_value
=
None
# Create first mock and run the method
model_config1
=
MockModelConfig
(
model
=
s3_url
)
renderer_config1
=
MockRendererConfig
(
model_config
=
model_config1
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config1
,
s3_url
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config1
,
s3_url
)
config1
=
MockConfig
(
model
=
s3_url
,
tokenizer
=
s3_url
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config1
,
s3_url
,
s3_url
)
# Check that model and tokenizer point to existing directories
assert
os
.
path
.
exists
(
model_
config1
.
model
),
(
f
"Model directory does not exist:
{
model_
config1
.
model
}
"
assert
os
.
path
.
exists
(
config1
.
model
),
(
f
"Model directory does not exist:
{
config1
.
model
}
"
)
assert
os
.
path
.
isdir
(
model_
config1
.
model
),
(
f
"Model path is not a directory:
{
model_
config1
.
model
}
"
assert
os
.
path
.
isdir
(
config1
.
model
),
(
f
"Model path is not a directory:
{
config1
.
model
}
"
)
assert
os
.
path
.
exists
(
renderer_
config1
.
tokenizer
),
(
f
"Tokenizer directory does not exist:
{
renderer_
config1
.
tokenizer
}
"
assert
os
.
path
.
exists
(
config1
.
tokenizer
),
(
f
"Tokenizer directory does not exist:
{
config1
.
tokenizer
}
"
)
assert
os
.
path
.
isdir
(
renderer_
config1
.
tokenizer
),
(
f
"Tokenizer path is not a directory:
{
renderer_
config1
.
tokenizer
}
"
assert
os
.
path
.
isdir
(
config1
.
tokenizer
),
(
f
"Tokenizer path is not a directory:
{
config1
.
tokenizer
}
"
)
# Verify that the paths are different from the original S3 URL
assert
model_config1
.
model
!=
s3_url
,
(
"Model path should be converted to local directory"
)
assert
renderer_config1
.
tokenizer
!=
s3_url
,
(
assert
config1
.
model
!=
s3_url
,
"Model path should be converted to local directory"
assert
config1
.
tokenizer
!=
s3_url
,
(
"Tokenizer path should be converted to local directory"
)
# Store the original paths
created_model_dir
=
model_
config1
.
model
create_tokenizer_dir
=
renderer_
config1
.
tokenizer
created_model_dir
=
config1
.
model
create_tokenizer_dir
=
config1
.
tokenizer
# Create a new mock and run the method with the same S3 URL
model_config2
=
MockModelConfig
(
model
=
s3_url
)
renderer_config2
=
MockRendererConfig
(
model_config
=
model_config2
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config2
,
s3_url
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config2
,
s3_url
)
config2
=
MockConfig
(
model
=
s3_url
,
tokenizer
=
s3_url
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config2
,
s3_url
,
s3_url
)
# Check that the new directories exist
assert
os
.
path
.
exists
(
model_
config2
.
model
),
(
f
"Model directory does not exist:
{
model_
config2
.
model
}
"
assert
os
.
path
.
exists
(
config2
.
model
),
(
f
"Model directory does not exist:
{
config2
.
model
}
"
)
assert
os
.
path
.
isdir
(
model_
config2
.
model
),
(
f
"Model path is not a directory:
{
model_
config2
.
model
}
"
assert
os
.
path
.
isdir
(
config2
.
model
),
(
f
"Model path is not a directory:
{
config2
.
model
}
"
)
assert
os
.
path
.
exists
(
renderer_
config2
.
tokenizer
),
(
f
"Tokenizer directory does not exist:
{
renderer_
config2
.
tokenizer
}
"
assert
os
.
path
.
exists
(
config2
.
tokenizer
),
(
f
"Tokenizer directory does not exist:
{
config2
.
tokenizer
}
"
)
assert
os
.
path
.
isdir
(
renderer_
config2
.
tokenizer
),
(
f
"Tokenizer path is not a directory:
{
renderer_
config2
.
tokenizer
}
"
assert
os
.
path
.
isdir
(
config2
.
tokenizer
),
(
f
"Tokenizer path is not a directory:
{
config2
.
tokenizer
}
"
)
# Verify that the paths are deterministic (same as before)
assert
model_
config2
.
model
==
created_model_dir
,
(
assert
config2
.
model
==
created_model_dir
,
(
f
"Model paths are not deterministic. "
f
"Original:
{
created_model_dir
}
, New:
{
model_
config2
.
model
}
"
f
"Original:
{
created_model_dir
}
, New:
{
config2
.
model
}
"
)
assert
renderer_
config2
.
tokenizer
==
create_tokenizer_dir
,
(
assert
config2
.
tokenizer
==
create_tokenizer_dir
,
(
f
"Tokenizer paths are not deterministic. "
f
"Original:
{
create_tokenizer_dir
}
, New:
{
renderer_
config2
.
tokenizer
}
"
f
"Original:
{
create_tokenizer_dir
}
, New:
{
config2
.
tokenizer
}
"
)
...
...
@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
s3_url2
=
"s3://example-bucket-2/model/"
# Create mocks with different S3 URLs and run the method
model_config1
=
MockModelConfig
(
model
=
s3_url1
)
renderer_config1
=
MockRendererConfig
(
model_config
=
model_config1
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config1
,
s3_url1
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config1
,
s3_url1
)
config1
=
MockConfig
(
model
=
s3_url1
,
tokenizer
=
s3_url1
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config1
,
s3_url1
,
s3_url1
)
model_config2
=
MockModelConfig
(
model
=
s3_url2
)
renderer_config2
=
MockRendererConfig
(
model_config
=
model_config2
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config2
,
s3_url2
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config2
,
s3_url2
)
config2
=
MockConfig
(
model
=
s3_url2
,
tokenizer
=
s3_url2
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config2
,
s3_url2
,
s3_url2
)
# Verify that different URLs produce different directories
assert
model_
config1
.
model
!=
model_
config2
.
model
,
(
assert
config1
.
model
!=
config2
.
model
,
(
f
"Different S3 URLs should create different model directories. "
f
"URL1 model:
{
model_
config1
.
model
}
, URL2 model:
{
model_
config2
.
model
}
"
f
"URL1 model:
{
config1
.
model
}
, URL2 model:
{
config2
.
model
}
"
)
assert
renderer_
config1
.
tokenizer
!=
renderer_
config2
.
tokenizer
,
(
assert
config1
.
tokenizer
!=
config2
.
tokenizer
,
(
f
"Different S3 URLs should create different tokenizer directories. "
f
"URL1 tokenizer:
{
renderer_
config1
.
tokenizer
}
, "
f
"URL2 tokenizer:
{
renderer_
config2
.
tokenizer
}
"
f
"URL1 tokenizer:
{
config1
.
tokenizer
}
, "
f
"URL2 tokenizer:
{
config2
.
tokenizer
}
"
)
# Verify that both sets of directories exist
assert
os
.
path
.
exists
(
model_config1
.
model
)
and
os
.
path
.
isdir
(
model_config1
.
model
)
assert
os
.
path
.
exists
(
renderer_config1
.
tokenizer
)
and
os
.
path
.
isdir
(
renderer_config1
.
tokenizer
)
assert
os
.
path
.
exists
(
model_config2
.
model
)
and
os
.
path
.
isdir
(
model_config2
.
model
)
assert
os
.
path
.
exists
(
renderer_config2
.
tokenizer
)
and
os
.
path
.
isdir
(
renderer_config2
.
tokenizer
)
assert
os
.
path
.
exists
(
config1
.
model
)
and
os
.
path
.
isdir
(
config1
.
model
)
assert
os
.
path
.
exists
(
config1
.
tokenizer
)
and
os
.
path
.
isdir
(
config1
.
tokenizer
)
assert
os
.
path
.
exists
(
config2
.
model
)
and
os
.
path
.
isdir
(
config2
.
model
)
assert
os
.
path
.
exists
(
config2
.
tokenizer
)
and
os
.
path
.
isdir
(
config2
.
tokenizer
)
@
pytest
.
mark
.
parametrize
(
...
...
tests/test_inputs.py
View file @
e83b7e37
...
...
@@ -3,7 +3,7 @@
import
pytest
from
vllm.config
import
ModelConfig
,
RendererConfig
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
zip_enc_dec_prompts
from
vllm.inputs.parse
import
parse_raw_prompts
from
vllm.inputs.preprocess
import
InputPreprocessor
...
...
@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def
test_preprocessor_always_mm_code_path
(
model_id
,
prompt
):
model_config
=
ModelConfig
(
model
=
model_id
)
renderer_config
=
RendererConfig
(
model_config
=
model_config
)
tokenizer
=
init_tokenizer_from_config
(
renderer_config
)
input_preprocessor
=
InputPreprocessor
(
renderer_config
,
tokenizer
)
tokenizer
=
init_tokenizer_from_config
(
model_config
)
input_preprocessor
=
InputPreprocessor
(
model_config
,
tokenizer
)
# HF processor adds sep token
sep_token_id
=
tokenizer
.
vocab
[
tokenizer
.
sep_token
]
...
...
tests/v1/attention/utils.py
View file @
e83b7e37
...
...
@@ -16,7 +16,6 @@ from vllm.config import (
LoadConfig
,
ModelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
,
)
...
...
@@ -217,7 +216,6 @@ def create_vllm_config(
return
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
parallel_config
=
parallel_config
,
scheduler_config
=
scheduler_config
,
...
...
tests/v1/core/test_kv_cache_utils.py
View file @
e83b7e37
...
...
@@ -8,7 +8,7 @@ import pytest
import
torch
import
vllm.v1.core.kv_cache_utils
as
kv_cache_utils
from
vllm.config
import
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.inputs
import
(
MultiModalFeatureSpec
,
...
...
@@ -667,10 +667,7 @@ def test_metrics_empty_stats():
def
test_get_kv_cache_configs_multiple_workers
():
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
ref_kv_cache_spec
=
new_kv_cache_spec
()
same_kv_cache_specs
=
[
...
...
@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
scheduler_config
=
scheduler_config
,
)
...
...
@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
scheduler_config
=
scheduler_config
,
)
...
...
@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
def
test_get_kv_cache_config_one_worker
():
# pass max_model_len to pass check_enough_kv_cache_memory
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
mem_per_block_per_layer
=
16
*
2
*
64
*
4
*
2
# all layers are full attention -> single group
...
...
@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
def
test_get_kv_cache_configs_attention_free
():
kv_cache_specs
:
dict
[
str
,
KVCacheSpec
]
=
{}
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
)
vllm_config
=
VllmConfig
(
model_config
=
ModelConfig
(
max_model_len
=
16
))
kv_cache_configs
=
get_kv_cache_configs
(
vllm_config
,
[
kv_cache_specs
],
[
0
])
assert
kv_cache_configs
==
[
KVCacheConfig
(
...
...
tests/v1/core/test_scheduler.py
View file @
e83b7e37
...
...
@@ -11,7 +11,6 @@ from vllm.config import (
ECTransferConfig
,
KVTransferConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
SpeculativeConfig
,
VllmConfig
,
...
...
@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
vllm_config
=
VllmConfig
(
scheduler_config
=
scheduler_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
kv_transfer_config
=
kv_transfer_config
,
speculative_config
=
speculative_config
,
...
...
tests/v1/core/utils.py
View file @
e83b7e37
...
...
@@ -9,7 +9,6 @@ from vllm.config import (
ECTransferConfig
,
KVTransferConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
SpeculativeConfig
,
VllmConfig
,
...
...
@@ -133,7 +132,6 @@ def create_scheduler(
vllm_config
=
VllmConfig
(
scheduler_config
=
scheduler_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
kv_transfer_config
=
kv_transfer_config
,
speculative_config
=
speculative_config
,
...
...
tests/v1/engine/test_engine_core.py
View file @
e83b7e37
...
...
@@ -15,7 +15,6 @@ from vllm.config import (
ECTransferConfig
,
KVTransferConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
,
)
...
...
@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
kv_transfer_config
=
kv_transfer_config
,
...
...
tests/v1/engine/test_process_multi_modal_uuids.py
View file @
e83b7e37
...
...
@@ -5,14 +5,7 @@ import pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
ModelConfig
,
MultiModalConfig
,
RendererConfig
,
VllmConfig
,
)
from
vllm.config
import
CacheConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine
import
input_processor
as
input_processor_mod
from
vllm.v1.engine.input_processor
import
InputProcessor
...
...
@@ -51,21 +44,22 @@ def _mock_input_processor(
monkeypatch
.
setattr
(
VllmConfig
,
"__post_init__"
,
lambda
self
:
None
,
raising
=
True
)
model_config
=
ModelConfig
(
skip_tokenizer_init
=
True
,
max_model_len
=
128
,
mm_processor_cache_gb
=
mm_cache_gb
,
generation_config
=
"vllm"
,
)
model_config
.
multimodal_config
=
MultiModalConfig
(
mm_processor_cache_gb
=
mm_cache_gb
)
renderer_config
=
RendererConfig
(
model_config
=
model_config
,
tokenizer
=
"dummy"
,
skip_tokenizer_init
=
True
,
)
# Minimal multimodal_config to satisfy references in
# Processor.process_inputs.
class
_MockMMConfig
:
def
__init__
(
self
,
gb
:
float
):
self
.
mm_processor_cache_gb
=
gb
model_config
.
multimodal_config
=
_MockMMConfig
(
mm_cache_gb
)
# type: ignore[attr-defined]
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
renderer_config
,
cache_config
=
CacheConfig
(
enable_prefix_caching
=
enable_prefix_caching
),
device_config
=
DeviceConfig
(
device
=
"cpu"
),
)
...
...
tests/v1/kv_connector/unit/utils.py
View file @
e83b7e37
...
...
@@ -15,7 +15,6 @@ from vllm.config import (
DeviceConfig
,
KVTransferConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
,
)
...
...
@@ -128,7 +127,6 @@ def create_vllm_config(
return
VllmConfig
(
scheduler_config
=
scheduler_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
kv_transfer_config
=
kv_transfer_config
,
device_config
=
DeviceConfig
(
"cpu"
),
...
...
tests/v1/spec_decode/test_eagle.py
View file @
e83b7e37
...
...
@@ -19,7 +19,6 @@ from vllm.config import (
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
SpeculativeConfig
,
VllmConfig
,
...
...
@@ -62,7 +61,6 @@ def _create_proposer(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
CacheConfig
(),
speculative_config
=
speculative_config
,
device_config
=
DeviceConfig
(
device
=
current_platform
.
device_type
),
...
...
tests/v1/spec_decode/test_mtp.py
View file @
e83b7e37
...
...
@@ -18,7 +18,6 @@ from vllm.config import (
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
SpeculativeConfig
,
VllmConfig
,
...
...
@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
CacheConfig
(),
speculative_config
=
speculative_config
,
device_config
=
DeviceConfig
(
device
=
current_platform
.
device_type
),
...
...
tests/v1/spec_decode/test_ngram.py
View file @
e83b7e37
...
...
@@ -4,7 +4,6 @@ import numpy as np
from
vllm.config
import
(
ModelConfig
,
RendererConfig
,
SpeculativeConfig
,
VllmConfig
,
)
...
...
@@ -70,7 +69,6 @@ def test_ngram_proposer():
return
NgramProposer
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
speculative_config
=
SpeculativeConfig
(
prompt_lookup_min
=
min_n
,
prompt_lookup_max
=
max_n
,
...
...
tests/v1/structured_output/test_backend_guidance.py
View file @
e83b7e37
...
...
@@ -6,7 +6,7 @@ from concurrent.futures import Future
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.config
import
RendererConfig
,
StructuredOutputsConfig
,
VllmConfig
from
vllm.config
import
StructuredOutputsConfig
,
VllmConfig
from
vllm.config.model
import
ModelConfig
from
vllm.config.parallel
import
ParallelConfig
from
vllm.config.speculative
import
SpeculativeConfig
...
...
@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
def
test_grammar_bitmask_with_specdec
():
tokenizer
=
AutoTokenizer
.
from_pretrained
(
TOKENIZER
)
prompt
=
tokenizer
.
encode
(
'{"a": "b"}'
)
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
,
tokenizer
=
TOKENIZER
),
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
),
structured_outputs_config
=
StructuredOutputsConfig
(
backend
=
"guidance"
),
speculative_config
=
SpeculativeConfig
(
model
=
"[ngram]"
,
num_speculative_tokens
=
3
),
)
...
...
@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
# Use "external_launcher" for sync mode, None for async mode
executor_backend
=
None
if
async_grammar
else
"external_launcher"
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
,
tokenizer
=
TOKENIZER
),
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
),
structured_outputs_config
=
StructuredOutputsConfig
(
backend
=
"guidance"
),
parallel_config
=
ParallelConfig
(
distributed_executor_backend
=
executor_backend
),
)
...
...
tests/v1/structured_output/test_reasoning_structured_output.py
View file @
e83b7e37
...
...
@@ -7,7 +7,7 @@ from unittest.mock import Mock
import
pytest
from
vllm.config
import
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.reasoning
import
ReasoningParser
from
vllm.v1.request
import
Request
from
vllm.v1.structured_output
import
StructuredOutputManager
...
...
@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
"""Test reasoning-aware structured output functionality."""
@
pytest
.
fixture
def
mock_renderer_config
(
self
):
"""Create a mock RendererConfig."""
renderer_config
=
Mock
(
spec
=
RendererConfig
)
renderer_config
.
skip_tokenizer_init
=
(
True
# Skip tokenizer init to avoid network calls
)
model_config
=
Mock
(
spec
=
ModelConfig
)
model_config
.
get_vocab_size
=
Mock
(
return_value
=
50000
)
model_config
.
trust_remote_code
=
False
def
mock_model_config
(
self
):
"""Create a mock ModelConfig."""
config
=
Mock
(
spec
=
ModelConfig
)
config
.
skip_tokenizer_init
=
True
# Skip tokenizer init to avoid network calls
config
.
get_vocab_size
=
Mock
(
return_value
=
50000
)
# Add missing runner_type attribute that tokenizer initialization expects
model_config
.
runner_type
=
"generate"
renderer_config
.
model_config
=
model_config
config
.
runner_type
=
"generate"
# Add other attributes that tokenizer initialization might need
renderer_
config
.
tokenizer
=
"test-tokenizer"
renderer_
config
.
tokenizer_mode
=
"auto"
renderer_config
.
tokenizer_revision
=
Non
e
return
renderer_
config
config
.
tokenizer
=
"test-tokenizer"
config
.
tokenizer_mode
=
"auto"
config
.
trust_remote_code
=
Fals
e
config
.
tokenizer_revision
=
None
return
config
@
pytest
.
fixture
def
mock_scheduler_config
(
self
):
...
...
@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
return
config
@
pytest
.
fixture
def
mock_vllm_config
(
self
,
mock_
renderer
_config
,
mock_scheduler_config
):
def
mock_vllm_config
(
self
,
mock_
model
_config
,
mock_scheduler_config
):
"""Create a mock VllmConfig."""
config
=
Mock
(
spec
=
VllmConfig
)
config
.
renderer
_config
=
mock_
renderer
_config
config
.
model
_config
=
mock_
model
_config
config
.
scheduler_config
=
mock_scheduler_config
config
.
structured_outputs_config
=
Mock
()
config
.
structured_outputs_config
.
reasoning_parser
=
None
...
...
tests/v1/tpu/worker/test_tpu_model_runner.py
View file @
e83b7e37
...
...
@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
from
vllm.config
import
(
CacheConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
,
set_current_vllm_config
,
...
...
@@ -46,7 +45,6 @@ def get_vllm_config():
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
)
...
...
tests/v1/worker/test_gpu_model_runner.py
View file @
e83b7e37
...
...
@@ -13,7 +13,6 @@ from vllm.config import (
CacheConfig
,
ModelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
,
set_current_vllm_config
,
...
...
@@ -102,7 +101,6 @@ def get_vllm_config():
parallel_config
=
ParallelConfig
()
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
parallel_config
=
parallel_config
,
...
...
@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
FLASHINFER
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
parallel_config
=
parallel_config
,
...
...
vllm/config/__init__.py
View file @
e83b7e37
...
...
@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
from
vllm.config.observability
import
ObservabilityConfig
from
vllm.config.parallel
import
EPLBConfig
,
ParallelConfig
from
vllm.config.pooler
import
PoolerConfig
from
vllm.config.renderer
import
RendererConfig
from
vllm.config.scheduler
import
SchedulerConfig
from
vllm.config.speculative
import
SpeculativeConfig
from
vllm.config.speech_to_text
import
SpeechToTextConfig
...
...
@@ -82,8 +81,6 @@ __all__ = [
"ParallelConfig"
,
# From vllm.config.pooler
"PoolerConfig"
,
# From vllm.config.renderer
"RendererConfig"
,
# From vllm.config.scheduler
"SchedulerConfig"
,
# From vllm.config.speculative
...
...
vllm/config/model.py
View file @
e83b7e37
...
...
@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
uses_xdrope_dim
,
)
from
vllm.transformers_utils.gguf_utils
import
(
is_gguf
,
is_remote_gguf
,
maybe_patch_hf_config_from_gguf
,
split_remote_gguf
,
...
...
@@ -82,6 +83,7 @@ TaskOption = Literal[
"transcription"
,
"draft"
,
]
TokenizerMode
=
Literal
[
"auto"
,
"hf"
,
"slow"
,
"mistral"
,
"deepseek_v32"
]
ModelDType
=
Literal
[
"auto"
,
"half"
,
"float16"
,
"bfloat16"
,
"float"
,
"float32"
]
LogprobsMode
=
Literal
[
"raw_logits"
,
"raw_logprobs"
,
"processed_logits"
,
"processed_logprobs"
...
...
@@ -129,6 +131,18 @@ class ModelConfig:
Note that the model may support other tasks using the same model runner.
"""
tokenizer
:
SkipValidation
[
str
]
=
None
# type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode
:
TokenizerMode
|
str
=
"auto"
"""Tokenizer mode:
\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.
\n
- "hf" will use the fast tokenizer if available.
\n
- "slow" will always use the slow tokenizer.
\n
- "mistral" will always use the tokenizer from `mistral_common`.
\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
\n
- Other custom values can be supported via plugins."""
trust_remote_code
:
bool
=
False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
...
...
@@ -154,6 +168,13 @@ class ModelConfig:
hf_config_path
:
str
|
None
=
None
"""Name or path of the Hugging Face config to use. If unspecified, model
name or path will be used."""
allowed_local_media_path
:
str
=
""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains
:
list
[
str
]
|
None
=
None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
revision
:
str
|
None
=
None
"""The specific model version to use. It can be a branch name, a tag name,
or a commit id. If unspecified, will use the default version."""
...
...
@@ -161,6 +182,10 @@ class ModelConfig:
"""The specific revision to use for the model code on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
tokenizer_revision
:
str
|
None
=
None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len
:
SkipValidation
[
int
]
=
None
# type: ignore
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.
...
...
@@ -205,6 +230,10 @@ class ModelConfig:
preventing potential numerical issues. Note that even if this is set to
False, cascade attention will be only used when the heuristic tells that
it's beneficial."""
skip_tokenizer_init
:
bool
=
False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
enable_prompt_embeds
:
bool
=
False
"""If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key.
...
...
@@ -265,6 +294,8 @@ class ModelConfig:
logits_processors
:
list
[
str
|
type
[
LogitsProcessor
]]
|
None
=
None
"""One or more logits processors' fully-qualified class names or class
definitions"""
io_processor_plugin
:
str
|
None
=
None
"""IOProcessor plugin name to load at model startup"""
# Pooler config
pooler_config
:
PoolerConfig
|
None
=
None
...
...
@@ -277,6 +308,7 @@ class ModelConfig:
from the architecture of `self.model`."""
limit_mm_per_prompt
:
InitVar
[
dict
[
str
,
int
|
dict
[
str
,
int
]]
|
None
]
=
None
enable_mm_embeds
:
InitVar
[
bool
|
None
]
=
None
media_io_kwargs
:
InitVar
[
dict
[
str
,
dict
[
str
,
Any
]]
|
None
]
=
None
mm_processor_kwargs
:
InitVar
[
dict
[
str
,
Any
]
|
None
]
=
None
mm_processor_cache_gb
:
InitVar
[
float
|
None
]
=
None
mm_processor_cache_type
:
InitVar
[
MMCacheType
|
None
]
=
None
...
...
@@ -303,12 +335,18 @@ class ModelConfig:
"runner"
,
"convert"
,
"task"
,
"tokenizer"
,
"tokenizer_mode"
,
"seed"
,
"hf_config_path"
,
"allowed_local_media_path"
,
"allowed_media_domains"
,
"tokenizer_revision"
,
"spec_target_max_model_len"
,
"enforce_eager"
,
"logprobs_mode"
,
"disable_cascade_attn"
,
"skip_tokenizer_init"
,
"served_model_name"
,
"config_format"
,
"hf_token"
,
...
...
@@ -316,9 +354,11 @@ class ModelConfig:
"logits_processor_pattern"
,
"override_attention_dtype"
,
"logits_processors"
,
"io_processor_plugin"
,
"pooler_config"
,
"multimodal_config"
,
"limit_mm_per_prompt"
,
"media_io_kwargs"
,
"mm_processor_kwargs"
,
"mm_processor_cache_gb"
,
"mm_processor_cache_type"
,
...
...
@@ -383,6 +423,7 @@ class ModelConfig:
# Multimodal config init vars
limit_mm_per_prompt
:
dict
[
str
,
int
|
dict
[
str
,
int
]]
|
None
,
enable_mm_embeds
:
bool
|
None
,
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
|
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
,
mm_processor_cache_gb
:
float
|
None
,
mm_processor_cache_type
:
MMCacheType
|
None
,
...
...
@@ -397,8 +438,13 @@ class ModelConfig:
self
.
served_model_name
=
get_served_model_name
(
self
.
model
,
self
.
served_model_name
)
self
.
original_model
=
self
.
model
self
.
model
=
maybe_model_redirect
(
self
.
original_model
)
self
.
model
=
maybe_model_redirect
(
self
.
model
)
# The tokenizer is consistent with the model by default.
if
self
.
tokenizer
is
None
:
self
.
tokenizer
=
self
.
model
if
self
.
tokenizer_revision
is
None
:
self
.
tokenizer_revision
=
self
.
revision
self
.
tokenizer
=
maybe_model_redirect
(
self
.
tokenizer
)
if
isinstance
(
self
.
hf_config_path
,
str
):
self
.
hf_config_path
=
maybe_model_redirect
(
self
.
hf_config_path
)
...
...
@@ -419,7 +465,7 @@ class ModelConfig:
hf_overrides_kw
[
key
]
=
value
hf_overrides_fn
=
None
self
.
maybe_pull_model_for_runai
(
self
.
model
)
self
.
maybe_pull_model_
tokenizer_
for_runai
(
self
.
model
,
self
.
tokenizer
)
from
vllm.platforms
import
current_platform
...
...
@@ -602,8 +648,7 @@ class ModelConfig:
)
self
.
original_max_model_len
=
self
.
max_model_len
self
.
recalculate_max_model_len
(
self
.
original_max_model_len
)
self
.
max_model_len
=
self
.
get_and_verify_max_len
(
self
.
max_model_len
)
# Init multimodal config if needed
if
self
.
_model_info
.
supports_multimodal
:
if
(
...
...
@@ -619,6 +664,7 @@ class ModelConfig:
mm_config_kwargs
=
dict
(
limit_per_prompt
=
limit_mm_per_prompt
,
enable_mm_embeds
=
enable_mm_embeds
,
media_io_kwargs
=
media_io_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_cache_gb
=
mm_processor_cache_gb
,
mm_processor_cache_type
=
mm_processor_cache_type
,
...
...
@@ -636,8 +682,16 @@ class ModelConfig:
self
.
multimodal_config
=
MultiModalConfig
(
**
mm_config_kwargs
)
# Multimodal GGUF models must use original repo for mm processing
if
is_gguf
(
self
.
tokenizer
)
and
self
.
is_multimodal_model
:
raise
ValueError
(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
if
self
.
disable_sliding_window
:
# Set after
recalculate_max_model
_len to ensure that max_model_len
# Set after
get_and_verify_max
_len to ensure that max_model_len
# can be correctly capped to sliding window size
self
.
hf_text_config
.
sliding_window
=
None
...
...
@@ -661,9 +715,10 @@ class ModelConfig:
@model_validator(mode="after")
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
    """Check that fields resolved during `__post_init__` have concrete types.

    Returns:
        The validated config instance itself.

    Raises:
        ValueError: If `tokenizer` is not a `str` or `max_model_len` is not
            an `int` once initialization has finished.
    """
    # (attribute name, required type, error message), checked in order.
    required_types = (
        ("tokenizer", str, "tokenizer must be a string after __post_init__."),
        (
            "max_model_len",
            int,
            "max_model_len must be an integer after __post_init__.",
        ),
    )
    for attr_name, required_type, message in required_types:
        if not isinstance(getattr(self, attr_name), required_type):
            raise ValueError(message)
    return self
def
_get_transformers_backend_cls
(
self
)
->
str
:
...
...
@@ -712,17 +767,49 @@ class ModelConfig:
"""The architecture vllm actually used."""
return
self
.
_architecture
def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
    """Pull model/tokenizer from Object Storage to temporary
    directory when needed.

    When `model` is an object-storage URI, its non-weight files are pulled,
    the original URI is kept in `self.model_weights`, and `self.model` is
    redirected to the download directory. When `tokenizer` is an
    object-storage URI, its files (excluding weight files) are pulled and
    `self.tokenizer` is redirected likewise. Nothing happens if neither
    argument is an object-storage URI.

    Args:
        model: Model name or path
        tokenizer: Tokenizer name or path
    """
    if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
        return

    if is_runai_obj_uri(model):
        object_storage_model = ObjectStorageModel(url=model)
        object_storage_model.pull_files(
            model, allow_pattern=["*.model", "*.py", "*.json"]
        )
        self.model_weights = model
        self.model = object_storage_model.dir

        # If tokenizer is same as model, download to same directory
        if model == tokenizer:
            object_storage_model.pull_files(
                model,
                ignore_pattern=[
                    "*.pt",
                    "*.safetensors",
                    "*.bin",
                    "*.tensors",
                    "*.pth",
                ],
            )
            self.tokenizer = object_storage_model.dir
            return

    # Only download tokenizer if needed and not already handled
    if is_runai_obj_uri(tokenizer):
        object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
        # Pull from the tokenizer's own URI. The previous code passed `model`
        # here, which fetched the model's files (or failed outright when
        # `model` was not an object-storage URI) instead of the tokenizer's.
        object_storage_tokenizer.pull_files(
            tokenizer,
            ignore_pattern=[
                "*.pt",
                "*.safetensors",
                "*.bin",
                "*.tensors",
                "*.pth",
            ],
        )
        self.tokenizer = object_storage_tokenizer.dir
def
_get_encoder_config
(
self
):
model
=
self
.
model
...
...
@@ -1625,38 +1712,30 @@ class ModelConfig:
return
dense_modules
[
-
1
][
"out_features"
]
return
self
.
get_hidden_size
()
def get_and_verify_max_len(self, max_model_len: int):
    """Resolve and validate the maximum model length for this config.

    The tokenizer config is looked up only for pooling runners whose HF
    config reports `position_embedding_type == "absolute"`; otherwise
    `None` is forwarded. The actual derivation and validation are delegated
    to `_get_and_verify_max_len`.

    Args:
        max_model_len: The requested maximum model length.

    Returns:
        The value produced by `_get_and_verify_max_len`.
    """
    # Consider max_model_len in tokenizer_config only when
    # pooling models use absolute position_embedding.
    consult_tokenizer_config = (
        self.runner_type == "pooling"
        and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
    )
    if consult_tokenizer_config:
        tokenizer_config = try_get_tokenizer_config(
            self.tokenizer,
            trust_remote_code=self.trust_remote_code,
            revision=self.tokenizer_revision,
        )
    else:
        tokenizer_config = None

    verified_max_len = _get_and_verify_max_len(
        hf_config=self.hf_text_config,
        tokenizer_config=tokenizer_config,
        max_model_len=max_model_len,
        disable_sliding_window=self.disable_sliding_window,
        sliding_window=self.get_sliding_window(),
        spec_target_max_model_len=self.spec_target_max_model_len,
        encoder_config=self.encoder_config,
    )
    logger.info("Using max model len %s", verified_max_len)
    return verified_max_len
@
property
def
attn_type
(
self
)
->
AttnTypeStr
:
...
...
vllm/config/multimodal.py
View file @
e83b7e37
...
...
@@ -79,6 +79,10 @@ class MultiModalConfig:
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
Field
(
default_factory
=
dict
)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
mm_processor_kwargs
:
dict
[
str
,
object
]
|
None
=
None
"""Arguments to be forwarded to the model's processor for multi-modal data,
e.g., image processor. Overrides for the multi-modal processor obtained
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment