Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e83b7e37
Unverified
Commit
e83b7e37
authored
Dec 07, 2025
by
Cyrus Leung
Committed by
GitHub
Dec 07, 2025
Browse files
Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)
parent
27f4c2fd
Changes
105
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
199 additions
and
203 deletions
+199
-203
tests/multimodal/test_registry.py
tests/multimodal/test_registry.py
+1
-3
tests/test_config.py
tests/test_config.py
+51
-80
tests/test_inputs.py
tests/test_inputs.py
+3
-4
tests/v1/attention/utils.py
tests/v1/attention/utils.py
+0
-2
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+4
-16
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+0
-2
tests/v1/core/utils.py
tests/v1/core/utils.py
+0
-2
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+0
-2
tests/v1/engine/test_process_multi_modal_uuids.py
tests/v1/engine/test_process_multi_modal_uuids.py
+9
-15
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+0
-2
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+0
-2
tests/v1/spec_decode/test_mtp.py
tests/v1/spec_decode/test_mtp.py
+0
-2
tests/v1/spec_decode/test_ngram.py
tests/v1/spec_decode/test_ngram.py
+0
-2
tests/v1/structured_output/test_backend_guidance.py
tests/v1/structured_output/test_backend_guidance.py
+3
-9
tests/v1/structured_output/test_reasoning_structured_output.py
.../v1/structured_output/test_reasoning_structured_output.py
+14
-21
tests/v1/tpu/worker/test_tpu_model_runner.py
tests/v1/tpu/worker/test_tpu_model_runner.py
+0
-2
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+0
-3
vllm/config/__init__.py
vllm/config/__init__.py
+0
-3
vllm/config/model.py
vllm/config/model.py
+110
-31
vllm/config/multimodal.py
vllm/config/multimodal.py
+4
-0
No files found.
tests/multimodal/test_registry.py
View file @
e83b7e37
...
@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
...
@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
model_id
,
model_id
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
)
assert
(
assert
MULTIMODAL_REGISTRY
.
supports_multimodal_inputs
(
ctx
.
model_config
)
is
expected
MULTIMODAL_REGISTRY
.
supports_multimodal_inputs
(
ctx
.
renderer_config
)
is
expected
)
tests/test_config.py
View file @
e83b7e37
...
@@ -13,7 +13,6 @@ from vllm.config import (
...
@@ -13,7 +13,6 @@ from vllm.config import (
CompilationConfig
,
CompilationConfig
,
ModelConfig
,
ModelConfig
,
PoolerConfig
,
PoolerConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
VllmConfig
,
VllmConfig
,
update_config
,
update_config
,
...
@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
...
@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
,
131073
,
131072
,
True
),
(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
,
131073
,
131072
,
True
),
],
],
)
)
def
test_
recalculate_max_model
_len
(
def
test_
get_and_verify_max
_len
(
model_id
,
max_model_len
,
expected_max_len
,
should_raise
model_id
,
max_model_len
,
expected_max_len
,
should_raise
):
):
"""Test
recalculate_max_model
_len with different configurations."""
"""Test
get_and_verify_max
_len with different configurations."""
model_config
=
ModelConfig
(
model_id
)
model_config
=
ModelConfig
(
model_id
)
if
should_raise
:
if
should_raise
:
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
model_config
.
recalculate_max_model_len
(
model_config
.
get_and_verify_max_len
(
max_model_len
)
max_model_len
,
tokenizer
=
model_id
,
tokenizer_revision
=
None
,
)
else
:
else
:
model_config
.
recalculate_max_model_len
(
actual_max_len
=
model_config
.
get_and_verify_max_len
(
max_model_len
)
max_model_len
,
assert
actual_max_len
==
expected_max_len
tokenizer
=
model_id
,
tokenizer_revision
=
None
,
)
assert
model_config
.
max_model_len
==
expected_max_len
class
Mock
Model
Config
:
class
MockConfig
:
"""Simple mock object for testing maybe_pull_model_for_runai"""
"""Simple mock object for testing maybe_pull_model_
tokenizer_
for_runai"""
def
__init__
(
self
,
model
:
str
):
def
__init__
(
self
,
model
:
str
,
tokenizer
:
str
):
self
.
model
=
model
self
.
model
=
model
self
.
tokenizer
=
tokenizer
self
.
model_weights
=
None
class
MockRendererConfig
:
"""Simple mock object for testing maybe_pull_tokenizer_for_runai"""
def
__init__
(
self
,
model_config
:
MockModelConfig
):
self
.
model_config
=
model_config
self
.
tokenizer
=
model_config
.
model
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
...
@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
mock_pull_files
.
return_value
=
None
mock_pull_files
.
return_value
=
None
# Create first mock and run the method
# Create first mock and run the method
model_config1
=
MockModelConfig
(
model
=
s3_url
)
config1
=
MockConfig
(
model
=
s3_url
,
tokenizer
=
s3_url
)
renderer_config1
=
MockRendererConfig
(
model_config
=
model_config1
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config1
,
s3_url
,
s3_url
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config1
,
s3_url
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config1
,
s3_url
)
# Check that model and tokenizer point to existing directories
# Check that model and tokenizer point to existing directories
assert
os
.
path
.
exists
(
model_
config1
.
model
),
(
assert
os
.
path
.
exists
(
config1
.
model
),
(
f
"Model directory does not exist:
{
model_
config1
.
model
}
"
f
"Model directory does not exist:
{
config1
.
model
}
"
)
)
assert
os
.
path
.
isdir
(
model_
config1
.
model
),
(
assert
os
.
path
.
isdir
(
config1
.
model
),
(
f
"Model path is not a directory:
{
model_
config1
.
model
}
"
f
"Model path is not a directory:
{
config1
.
model
}
"
)
)
assert
os
.
path
.
exists
(
renderer_
config1
.
tokenizer
),
(
assert
os
.
path
.
exists
(
config1
.
tokenizer
),
(
f
"Tokenizer directory does not exist:
{
renderer_
config1
.
tokenizer
}
"
f
"Tokenizer directory does not exist:
{
config1
.
tokenizer
}
"
)
)
assert
os
.
path
.
isdir
(
renderer_
config1
.
tokenizer
),
(
assert
os
.
path
.
isdir
(
config1
.
tokenizer
),
(
f
"Tokenizer path is not a directory:
{
renderer_
config1
.
tokenizer
}
"
f
"Tokenizer path is not a directory:
{
config1
.
tokenizer
}
"
)
)
# Verify that the paths are different from the original S3 URL
# Verify that the paths are different from the original S3 URL
assert
model_config1
.
model
!=
s3_url
,
(
assert
config1
.
model
!=
s3_url
,
"Model path should be converted to local directory"
"Model path should be converted to local directory"
assert
config1
.
tokenizer
!=
s3_url
,
(
)
assert
renderer_config1
.
tokenizer
!=
s3_url
,
(
"Tokenizer path should be converted to local directory"
"Tokenizer path should be converted to local directory"
)
)
# Store the original paths
# Store the original paths
created_model_dir
=
model_
config1
.
model
created_model_dir
=
config1
.
model
create_tokenizer_dir
=
renderer_
config1
.
tokenizer
create_tokenizer_dir
=
config1
.
tokenizer
# Create a new mock and run the method with the same S3 URL
# Create a new mock and run the method with the same S3 URL
model_config2
=
MockModelConfig
(
model
=
s3_url
)
config2
=
MockConfig
(
model
=
s3_url
,
tokenizer
=
s3_url
)
renderer_config2
=
MockRendererConfig
(
model_config
=
model_config2
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config2
,
s3_url
,
s3_url
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config2
,
s3_url
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config2
,
s3_url
)
# Check that the new directories exist
# Check that the new directories exist
assert
os
.
path
.
exists
(
model_
config2
.
model
),
(
assert
os
.
path
.
exists
(
config2
.
model
),
(
f
"Model directory does not exist:
{
model_
config2
.
model
}
"
f
"Model directory does not exist:
{
config2
.
model
}
"
)
)
assert
os
.
path
.
isdir
(
model_
config2
.
model
),
(
assert
os
.
path
.
isdir
(
config2
.
model
),
(
f
"Model path is not a directory:
{
model_
config2
.
model
}
"
f
"Model path is not a directory:
{
config2
.
model
}
"
)
)
assert
os
.
path
.
exists
(
renderer_
config2
.
tokenizer
),
(
assert
os
.
path
.
exists
(
config2
.
tokenizer
),
(
f
"Tokenizer directory does not exist:
{
renderer_
config2
.
tokenizer
}
"
f
"Tokenizer directory does not exist:
{
config2
.
tokenizer
}
"
)
)
assert
os
.
path
.
isdir
(
renderer_
config2
.
tokenizer
),
(
assert
os
.
path
.
isdir
(
config2
.
tokenizer
),
(
f
"Tokenizer path is not a directory:
{
renderer_
config2
.
tokenizer
}
"
f
"Tokenizer path is not a directory:
{
config2
.
tokenizer
}
"
)
)
# Verify that the paths are deterministic (same as before)
# Verify that the paths are deterministic (same as before)
assert
model_
config2
.
model
==
created_model_dir
,
(
assert
config2
.
model
==
created_model_dir
,
(
f
"Model paths are not deterministic. "
f
"Model paths are not deterministic. "
f
"Original:
{
created_model_dir
}
, New:
{
model_
config2
.
model
}
"
f
"Original:
{
created_model_dir
}
, New:
{
config2
.
model
}
"
)
)
assert
renderer_
config2
.
tokenizer
==
create_tokenizer_dir
,
(
assert
config2
.
tokenizer
==
create_tokenizer_dir
,
(
f
"Tokenizer paths are not deterministic. "
f
"Tokenizer paths are not deterministic. "
f
"Original:
{
create_tokenizer_dir
}
, New:
{
renderer_
config2
.
tokenizer
}
"
f
"Original:
{
create_tokenizer_dir
}
, New:
{
config2
.
tokenizer
}
"
)
)
...
@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
...
@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
s3_url2
=
"s3://example-bucket-2/model/"
s3_url2
=
"s3://example-bucket-2/model/"
# Create mocks with different S3 URLs and run the method
# Create mocks with different S3 URLs and run the method
model_config1
=
MockModelConfig
(
model
=
s3_url1
)
config1
=
MockConfig
(
model
=
s3_url1
,
tokenizer
=
s3_url1
)
renderer_config1
=
MockRendererConfig
(
model_config
=
model_config1
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config1
,
s3_url1
,
s3_url1
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config1
,
s3_url1
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config1
,
s3_url1
)
model_config2
=
MockModelConfig
(
model
=
s3_url2
)
config2
=
MockConfig
(
model
=
s3_url2
,
tokenizer
=
s3_url2
)
renderer_config2
=
MockRendererConfig
(
model_config
=
model_config2
)
ModelConfig
.
maybe_pull_model_tokenizer_for_runai
(
config2
,
s3_url2
,
s3_url2
)
ModelConfig
.
maybe_pull_model_for_runai
(
model_config2
,
s3_url2
)
RendererConfig
.
maybe_pull_tokenizer_for_runai
(
renderer_config2
,
s3_url2
)
# Verify that different URLs produce different directories
# Verify that different URLs produce different directories
assert
model_
config1
.
model
!=
model_
config2
.
model
,
(
assert
config1
.
model
!=
config2
.
model
,
(
f
"Different S3 URLs should create different model directories. "
f
"Different S3 URLs should create different model directories. "
f
"URL1 model:
{
model_
config1
.
model
}
, URL2 model:
{
model_
config2
.
model
}
"
f
"URL1 model:
{
config1
.
model
}
, URL2 model:
{
config2
.
model
}
"
)
)
assert
renderer_
config1
.
tokenizer
!=
renderer_
config2
.
tokenizer
,
(
assert
config1
.
tokenizer
!=
config2
.
tokenizer
,
(
f
"Different S3 URLs should create different tokenizer directories. "
f
"Different S3 URLs should create different tokenizer directories. "
f
"URL1 tokenizer:
{
renderer_
config1
.
tokenizer
}
, "
f
"URL1 tokenizer:
{
config1
.
tokenizer
}
, "
f
"URL2 tokenizer:
{
renderer_
config2
.
tokenizer
}
"
f
"URL2 tokenizer:
{
config2
.
tokenizer
}
"
)
)
# Verify that both sets of directories exist
# Verify that both sets of directories exist
assert
os
.
path
.
exists
(
model_config1
.
model
)
and
os
.
path
.
isdir
(
model_config1
.
model
)
assert
os
.
path
.
exists
(
config1
.
model
)
and
os
.
path
.
isdir
(
config1
.
model
)
assert
os
.
path
.
exists
(
renderer_config1
.
tokenizer
)
and
os
.
path
.
isdir
(
assert
os
.
path
.
exists
(
config1
.
tokenizer
)
and
os
.
path
.
isdir
(
config1
.
tokenizer
)
renderer_config1
.
tokenizer
assert
os
.
path
.
exists
(
config2
.
model
)
and
os
.
path
.
isdir
(
config2
.
model
)
)
assert
os
.
path
.
exists
(
config2
.
tokenizer
)
and
os
.
path
.
isdir
(
config2
.
tokenizer
)
assert
os
.
path
.
exists
(
model_config2
.
model
)
and
os
.
path
.
isdir
(
model_config2
.
model
)
assert
os
.
path
.
exists
(
renderer_config2
.
tokenizer
)
and
os
.
path
.
isdir
(
renderer_config2
.
tokenizer
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
...
tests/test_inputs.py
View file @
e83b7e37
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
pytest
import
pytest
from
vllm.config
import
ModelConfig
,
RendererConfig
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
zip_enc_dec_prompts
from
vllm.inputs
import
zip_enc_dec_prompts
from
vllm.inputs.parse
import
parse_raw_prompts
from
vllm.inputs.parse
import
parse_raw_prompts
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.inputs.preprocess
import
InputPreprocessor
...
@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
...
@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
)
def
test_preprocessor_always_mm_code_path
(
model_id
,
prompt
):
def
test_preprocessor_always_mm_code_path
(
model_id
,
prompt
):
model_config
=
ModelConfig
(
model
=
model_id
)
model_config
=
ModelConfig
(
model
=
model_id
)
renderer_config
=
RendererConfig
(
model_config
=
model_config
)
tokenizer
=
init_tokenizer_from_config
(
model_config
)
tokenizer
=
init_tokenizer_from_config
(
renderer_config
)
input_preprocessor
=
InputPreprocessor
(
model_config
,
tokenizer
)
input_preprocessor
=
InputPreprocessor
(
renderer_config
,
tokenizer
)
# HF processor adds sep token
# HF processor adds sep token
sep_token_id
=
tokenizer
.
vocab
[
tokenizer
.
sep_token
]
sep_token_id
=
tokenizer
.
vocab
[
tokenizer
.
sep_token
]
...
...
tests/v1/attention/utils.py
View file @
e83b7e37
...
@@ -16,7 +16,6 @@ from vllm.config import (
...
@@ -16,7 +16,6 @@ from vllm.config import (
LoadConfig
,
LoadConfig
,
ModelConfig
,
ModelConfig
,
ParallelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
VllmConfig
,
VllmConfig
,
)
)
...
@@ -217,7 +216,6 @@ def create_vllm_config(
...
@@ -217,7 +216,6 @@ def create_vllm_config(
return
VllmConfig
(
return
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
parallel_config
=
parallel_config
,
parallel_config
=
parallel_config
,
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
...
...
tests/v1/core/test_kv_cache_utils.py
View file @
e83b7e37
...
@@ -8,7 +8,7 @@ import pytest
...
@@ -8,7 +8,7 @@ import pytest
import
torch
import
torch
import
vllm.v1.core.kv_cache_utils
as
kv_cache_utils
import
vllm.v1.core.kv_cache_utils
as
kv_cache_utils
from
vllm.config
import
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.inputs
import
(
from
vllm.multimodal.inputs
import
(
MultiModalFeatureSpec
,
MultiModalFeatureSpec
,
...
@@ -667,10 +667,7 @@ def test_metrics_empty_stats():
...
@@ -667,10 +667,7 @@ def test_metrics_empty_stats():
def
test_get_kv_cache_configs_multiple_workers
():
def
test_get_kv_cache_configs_multiple_workers
():
model_config
=
ModelConfig
(
max_model_len
=
16
)
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
)
ref_kv_cache_spec
=
new_kv_cache_spec
()
ref_kv_cache_spec
=
new_kv_cache_spec
()
same_kv_cache_specs
=
[
same_kv_cache_specs
=
[
...
@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
...
@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
)
)
...
@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
...
@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
)
)
...
@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
...
@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
def
test_get_kv_cache_config_one_worker
():
def
test_get_kv_cache_config_one_worker
():
# pass max_model_len to pass check_enough_kv_cache_memory
# pass max_model_len to pass check_enough_kv_cache_memory
model_config
=
ModelConfig
(
max_model_len
=
16
)
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
)
mem_per_block_per_layer
=
16
*
2
*
64
*
4
*
2
mem_per_block_per_layer
=
16
*
2
*
64
*
4
*
2
# all layers are full attention -> single group
# all layers are full attention -> single group
...
@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
...
@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
def
test_get_kv_cache_configs_attention_free
():
def
test_get_kv_cache_configs_attention_free
():
kv_cache_specs
:
dict
[
str
,
KVCacheSpec
]
=
{}
kv_cache_specs
:
dict
[
str
,
KVCacheSpec
]
=
{}
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
ModelConfig
(
max_model_len
=
16
))
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
)
kv_cache_configs
=
get_kv_cache_configs
(
vllm_config
,
[
kv_cache_specs
],
[
0
])
kv_cache_configs
=
get_kv_cache_configs
(
vllm_config
,
[
kv_cache_specs
],
[
0
])
assert
kv_cache_configs
==
[
assert
kv_cache_configs
==
[
KVCacheConfig
(
KVCacheConfig
(
...
...
tests/v1/core/test_scheduler.py
View file @
e83b7e37
...
@@ -11,7 +11,6 @@ from vllm.config import (
...
@@ -11,7 +11,6 @@ from vllm.config import (
ECTransferConfig
,
ECTransferConfig
,
KVTransferConfig
,
KVTransferConfig
,
ModelConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
SpeculativeConfig
,
SpeculativeConfig
,
VllmConfig
,
VllmConfig
,
...
@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
...
@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
kv_transfer_config
=
kv_transfer_config
,
kv_transfer_config
=
kv_transfer_config
,
speculative_config
=
speculative_config
,
speculative_config
=
speculative_config
,
...
...
tests/v1/core/utils.py
View file @
e83b7e37
...
@@ -9,7 +9,6 @@ from vllm.config import (
...
@@ -9,7 +9,6 @@ from vllm.config import (
ECTransferConfig
,
ECTransferConfig
,
KVTransferConfig
,
KVTransferConfig
,
ModelConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
SpeculativeConfig
,
SpeculativeConfig
,
VllmConfig
,
VllmConfig
,
...
@@ -133,7 +132,6 @@ def create_scheduler(
...
@@ -133,7 +132,6 @@ def create_scheduler(
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
kv_transfer_config
=
kv_transfer_config
,
kv_transfer_config
=
kv_transfer_config
,
speculative_config
=
speculative_config
,
speculative_config
=
speculative_config
,
...
...
tests/v1/engine/test_engine_core.py
View file @
e83b7e37
...
@@ -15,7 +15,6 @@ from vllm.config import (
...
@@ -15,7 +15,6 @@ from vllm.config import (
ECTransferConfig
,
ECTransferConfig
,
KVTransferConfig
,
KVTransferConfig
,
ModelConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
VllmConfig
,
VllmConfig
,
)
)
...
@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
...
@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
kv_transfer_config
=
kv_transfer_config
,
kv_transfer_config
=
kv_transfer_config
,
...
...
tests/v1/engine/test_process_multi_modal_uuids.py
View file @
e83b7e37
...
@@ -5,14 +5,7 @@ import pytest
...
@@ -5,14 +5,7 @@ import pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
(
from
vllm.config
import
CacheConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
CacheConfig
,
DeviceConfig
,
ModelConfig
,
MultiModalConfig
,
RendererConfig
,
VllmConfig
,
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine
import
input_processor
as
input_processor_mod
from
vllm.v1.engine
import
input_processor
as
input_processor_mod
from
vllm.v1.engine.input_processor
import
InputProcessor
from
vllm.v1.engine.input_processor
import
InputProcessor
...
@@ -51,21 +44,22 @@ def _mock_input_processor(
...
@@ -51,21 +44,22 @@ def _mock_input_processor(
monkeypatch
.
setattr
(
VllmConfig
,
"__post_init__"
,
lambda
self
:
None
,
raising
=
True
)
monkeypatch
.
setattr
(
VllmConfig
,
"__post_init__"
,
lambda
self
:
None
,
raising
=
True
)
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
skip_tokenizer_init
=
True
,
max_model_len
=
128
,
max_model_len
=
128
,
mm_processor_cache_gb
=
mm_cache_gb
,
mm_processor_cache_gb
=
mm_cache_gb
,
generation_config
=
"vllm"
,
generation_config
=
"vllm"
,
)
model_config
.
multimodal_config
=
MultiModalConfig
(
mm_processor_cache_gb
=
mm_cache_gb
)
renderer_config
=
RendererConfig
(
model_config
=
model_config
,
tokenizer
=
"dummy"
,
tokenizer
=
"dummy"
,
skip_tokenizer_init
=
True
,
)
)
# Minimal multimodal_config to satisfy references in
# Processor.process_inputs.
class
_MockMMConfig
:
def
__init__
(
self
,
gb
:
float
):
self
.
mm_processor_cache_gb
=
gb
model_config
.
multimodal_config
=
_MockMMConfig
(
mm_cache_gb
)
# type: ignore[attr-defined]
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
renderer_config
,
cache_config
=
CacheConfig
(
enable_prefix_caching
=
enable_prefix_caching
),
cache_config
=
CacheConfig
(
enable_prefix_caching
=
enable_prefix_caching
),
device_config
=
DeviceConfig
(
device
=
"cpu"
),
device_config
=
DeviceConfig
(
device
=
"cpu"
),
)
)
...
...
tests/v1/kv_connector/unit/utils.py
View file @
e83b7e37
...
@@ -15,7 +15,6 @@ from vllm.config import (
...
@@ -15,7 +15,6 @@ from vllm.config import (
DeviceConfig
,
DeviceConfig
,
KVTransferConfig
,
KVTransferConfig
,
ModelConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
VllmConfig
,
VllmConfig
,
)
)
...
@@ -128,7 +127,6 @@ def create_vllm_config(
...
@@ -128,7 +127,6 @@ def create_vllm_config(
return
VllmConfig
(
return
VllmConfig
(
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
kv_transfer_config
=
kv_transfer_config
,
kv_transfer_config
=
kv_transfer_config
,
device_config
=
DeviceConfig
(
"cpu"
),
device_config
=
DeviceConfig
(
"cpu"
),
...
...
tests/v1/spec_decode/test_eagle.py
View file @
e83b7e37
...
@@ -19,7 +19,6 @@ from vllm.config import (
...
@@ -19,7 +19,6 @@ from vllm.config import (
DeviceConfig
,
DeviceConfig
,
ModelConfig
,
ModelConfig
,
ParallelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
SpeculativeConfig
,
SpeculativeConfig
,
VllmConfig
,
VllmConfig
,
...
@@ -62,7 +61,6 @@ def _create_proposer(
...
@@ -62,7 +61,6 @@ def _create_proposer(
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
CacheConfig
(),
cache_config
=
CacheConfig
(),
speculative_config
=
speculative_config
,
speculative_config
=
speculative_config
,
device_config
=
DeviceConfig
(
device
=
current_platform
.
device_type
),
device_config
=
DeviceConfig
(
device
=
current_platform
.
device_type
),
...
...
tests/v1/spec_decode/test_mtp.py
View file @
e83b7e37
...
@@ -18,7 +18,6 @@ from vllm.config import (
...
@@ -18,7 +18,6 @@ from vllm.config import (
DeviceConfig
,
DeviceConfig
,
ModelConfig
,
ModelConfig
,
ParallelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
SpeculativeConfig
,
SpeculativeConfig
,
VllmConfig
,
VllmConfig
,
...
@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
...
@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
CacheConfig
(),
cache_config
=
CacheConfig
(),
speculative_config
=
speculative_config
,
speculative_config
=
speculative_config
,
device_config
=
DeviceConfig
(
device
=
current_platform
.
device_type
),
device_config
=
DeviceConfig
(
device
=
current_platform
.
device_type
),
...
...
tests/v1/spec_decode/test_ngram.py
View file @
e83b7e37
...
@@ -4,7 +4,6 @@ import numpy as np
...
@@ -4,7 +4,6 @@ import numpy as np
from
vllm.config
import
(
from
vllm.config
import
(
ModelConfig
,
ModelConfig
,
RendererConfig
,
SpeculativeConfig
,
SpeculativeConfig
,
VllmConfig
,
VllmConfig
,
)
)
...
@@ -70,7 +69,6 @@ def test_ngram_proposer():
...
@@ -70,7 +69,6 @@ def test_ngram_proposer():
return
NgramProposer
(
return
NgramProposer
(
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
speculative_config
=
SpeculativeConfig
(
speculative_config
=
SpeculativeConfig
(
prompt_lookup_min
=
min_n
,
prompt_lookup_min
=
min_n
,
prompt_lookup_max
=
max_n
,
prompt_lookup_max
=
max_n
,
...
...
tests/v1/structured_output/test_backend_guidance.py
View file @
e83b7e37
...
@@ -6,7 +6,7 @@ from concurrent.futures import Future
...
@@ -6,7 +6,7 @@ from concurrent.futures import Future
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.config
import
RendererConfig
,
StructuredOutputsConfig
,
VllmConfig
from
vllm.config
import
StructuredOutputsConfig
,
VllmConfig
from
vllm.config.model
import
ModelConfig
from
vllm.config.model
import
ModelConfig
from
vllm.config.parallel
import
ParallelConfig
from
vllm.config.parallel
import
ParallelConfig
from
vllm.config.speculative
import
SpeculativeConfig
from
vllm.config.speculative
import
SpeculativeConfig
...
@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
...
@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
def
test_grammar_bitmask_with_specdec
():
def
test_grammar_bitmask_with_specdec
():
tokenizer
=
AutoTokenizer
.
from_pretrained
(
TOKENIZER
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
TOKENIZER
)
prompt
=
tokenizer
.
encode
(
'{"a": "b"}'
)
prompt
=
tokenizer
.
encode
(
'{"a": "b"}'
)
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
)
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
),
renderer_config
=
RendererConfig
(
model_config
=
model_config
,
tokenizer
=
TOKENIZER
),
structured_outputs_config
=
StructuredOutputsConfig
(
backend
=
"guidance"
),
structured_outputs_config
=
StructuredOutputsConfig
(
backend
=
"guidance"
),
speculative_config
=
SpeculativeConfig
(
model
=
"[ngram]"
,
num_speculative_tokens
=
3
),
speculative_config
=
SpeculativeConfig
(
model
=
"[ngram]"
,
num_speculative_tokens
=
3
),
)
)
...
@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
...
@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
# Use "external_launcher" for sync mode, None for async mode
# Use "external_launcher" for sync mode, None for async mode
executor_backend
=
None
if
async_grammar
else
"external_launcher"
executor_backend
=
None
if
async_grammar
else
"external_launcher"
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
)
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
ModelConfig
(
tokenizer
=
TOKENIZER
),
renderer_config
=
RendererConfig
(
model_config
=
model_config
,
tokenizer
=
TOKENIZER
),
structured_outputs_config
=
StructuredOutputsConfig
(
backend
=
"guidance"
),
structured_outputs_config
=
StructuredOutputsConfig
(
backend
=
"guidance"
),
parallel_config
=
ParallelConfig
(
distributed_executor_backend
=
executor_backend
),
parallel_config
=
ParallelConfig
(
distributed_executor_backend
=
executor_backend
),
)
)
...
...
tests/v1/structured_output/test_reasoning_structured_output.py
View file @
e83b7e37
...
@@ -7,7 +7,7 @@ from unittest.mock import Mock
...
@@ -7,7 +7,7 @@ from unittest.mock import Mock
import
pytest
import
pytest
from
vllm.config
import
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.reasoning
import
ReasoningParser
from
vllm.reasoning
import
ReasoningParser
from
vllm.v1.request
import
Request
from
vllm.v1.request
import
Request
from
vllm.v1.structured_output
import
StructuredOutputManager
from
vllm.v1.structured_output
import
StructuredOutputManager
...
@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
...
@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
"""Test reasoning-aware structured output functionality."""
"""Test reasoning-aware structured output functionality."""
@
pytest
.
fixture
@
pytest
.
fixture
def
mock_renderer_config
(
self
):
def
mock_model_config
(
self
):
"""Create a mock RendererConfig."""
"""Create a mock ModelConfig."""
renderer_config
=
Mock
(
spec
=
RendererConfig
)
config
=
Mock
(
spec
=
ModelConfig
)
renderer_config
.
skip_tokenizer_init
=
(
config
.
skip_tokenizer_init
=
True
# Skip tokenizer init to avoid network calls
True
# Skip tokenizer init to avoid network calls
config
.
get_vocab_size
=
Mock
(
return_value
=
50000
)
)
model_config
=
Mock
(
spec
=
ModelConfig
)
model_config
.
get_vocab_size
=
Mock
(
return_value
=
50000
)
model_config
.
trust_remote_code
=
False
# Add missing runner_type attribute that tokenizer initialization expects
# Add missing runner_type attribute that tokenizer initialization expects
model_config
.
runner_type
=
"generate"
config
.
runner_type
=
"generate"
renderer_config
.
model_config
=
model_config
# Add other attributes that tokenizer initialization might need
# Add other attributes that tokenizer initialization might need
renderer_
config
.
tokenizer
=
"test-tokenizer"
config
.
tokenizer
=
"test-tokenizer"
renderer_
config
.
tokenizer_mode
=
"auto"
config
.
tokenizer_mode
=
"auto"
renderer_config
.
tokenizer_revision
=
Non
e
config
.
trust_remote_code
=
Fals
e
config
.
tokenizer_revision
=
None
return
renderer_
config
return
config
@
pytest
.
fixture
@
pytest
.
fixture
def
mock_scheduler_config
(
self
):
def
mock_scheduler_config
(
self
):
...
@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
...
@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
return
config
return
config
@
pytest
.
fixture
@
pytest
.
fixture
def
mock_vllm_config
(
self
,
mock_
renderer
_config
,
mock_scheduler_config
):
def
mock_vllm_config
(
self
,
mock_
model
_config
,
mock_scheduler_config
):
"""Create a mock VllmConfig."""
"""Create a mock VllmConfig."""
config
=
Mock
(
spec
=
VllmConfig
)
config
=
Mock
(
spec
=
VllmConfig
)
config
.
renderer
_config
=
mock_
renderer
_config
config
.
model
_config
=
mock_
model
_config
config
.
scheduler_config
=
mock_scheduler_config
config
.
scheduler_config
=
mock_scheduler_config
config
.
structured_outputs_config
=
Mock
()
config
.
structured_outputs_config
=
Mock
()
config
.
structured_outputs_config
.
reasoning_parser
=
None
config
.
structured_outputs_config
.
reasoning_parser
=
None
...
...
tests/v1/tpu/worker/test_tpu_model_runner.py
View file @
e83b7e37
...
@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
...
@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
from
vllm.config
import
(
from
vllm.config
import
(
CacheConfig
,
CacheConfig
,
ModelConfig
,
ModelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
VllmConfig
,
VllmConfig
,
set_current_vllm_config
,
set_current_vllm_config
,
...
@@ -46,7 +45,6 @@ def get_vllm_config():
...
@@ -46,7 +45,6 @@ def get_vllm_config():
)
)
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
)
)
...
...
tests/v1/worker/test_gpu_model_runner.py
View file @
e83b7e37
...
@@ -13,7 +13,6 @@ from vllm.config import (
...
@@ -13,7 +13,6 @@ from vllm.config import (
CacheConfig
,
CacheConfig
,
ModelConfig
,
ModelConfig
,
ParallelConfig
,
ParallelConfig
,
RendererConfig
,
SchedulerConfig
,
SchedulerConfig
,
VllmConfig
,
VllmConfig
,
set_current_vllm_config
,
set_current_vllm_config
,
...
@@ -102,7 +101,6 @@ def get_vllm_config():
...
@@ -102,7 +101,6 @@ def get_vllm_config():
parallel_config
=
ParallelConfig
()
parallel_config
=
ParallelConfig
()
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
parallel_config
=
parallel_config
,
parallel_config
=
parallel_config
,
...
@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
...
@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
FLASHINFER
)
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
FLASHINFER
)
vllm_config
=
VllmConfig
(
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
model_config
=
model_config
,
renderer_config
=
RendererConfig
(
model_config
=
model_config
),
cache_config
=
cache_config
,
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
parallel_config
=
parallel_config
,
parallel_config
=
parallel_config
,
...
...
vllm/config/__init__.py
View file @
e83b7e37
...
@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
...
@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
from
vllm.config.observability
import
ObservabilityConfig
from
vllm.config.observability
import
ObservabilityConfig
from
vllm.config.parallel
import
EPLBConfig
,
ParallelConfig
from
vllm.config.parallel
import
EPLBConfig
,
ParallelConfig
from
vllm.config.pooler
import
PoolerConfig
from
vllm.config.pooler
import
PoolerConfig
from
vllm.config.renderer
import
RendererConfig
from
vllm.config.scheduler
import
SchedulerConfig
from
vllm.config.scheduler
import
SchedulerConfig
from
vllm.config.speculative
import
SpeculativeConfig
from
vllm.config.speculative
import
SpeculativeConfig
from
vllm.config.speech_to_text
import
SpeechToTextConfig
from
vllm.config.speech_to_text
import
SpeechToTextConfig
...
@@ -82,8 +81,6 @@ __all__ = [
...
@@ -82,8 +81,6 @@ __all__ = [
"ParallelConfig"
,
"ParallelConfig"
,
# From vllm.config.pooler
# From vllm.config.pooler
"PoolerConfig"
,
"PoolerConfig"
,
# From vllm.config.renderer
"RendererConfig"
,
# From vllm.config.scheduler
# From vllm.config.scheduler
"SchedulerConfig"
,
"SchedulerConfig"
,
# From vllm.config.speculative
# From vllm.config.speculative
...
...
vllm/config/model.py
View file @
e83b7e37
...
@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
...
@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
uses_xdrope_dim
,
uses_xdrope_dim
,
)
)
from
vllm.transformers_utils.gguf_utils
import
(
from
vllm.transformers_utils.gguf_utils
import
(
is_gguf
,
is_remote_gguf
,
is_remote_gguf
,
maybe_patch_hf_config_from_gguf
,
maybe_patch_hf_config_from_gguf
,
split_remote_gguf
,
split_remote_gguf
,
...
@@ -82,6 +83,7 @@ TaskOption = Literal[
...
@@ -82,6 +83,7 @@ TaskOption = Literal[
"transcription"
,
"transcription"
,
"draft"
,
"draft"
,
]
]
TokenizerMode
=
Literal
[
"auto"
,
"hf"
,
"slow"
,
"mistral"
,
"deepseek_v32"
]
ModelDType
=
Literal
[
"auto"
,
"half"
,
"float16"
,
"bfloat16"
,
"float"
,
"float32"
]
ModelDType
=
Literal
[
"auto"
,
"half"
,
"float16"
,
"bfloat16"
,
"float"
,
"float32"
]
LogprobsMode
=
Literal
[
LogprobsMode
=
Literal
[
"raw_logits"
,
"raw_logprobs"
,
"processed_logits"
,
"processed_logprobs"
"raw_logits"
,
"raw_logprobs"
,
"processed_logits"
,
"processed_logprobs"
...
@@ -129,6 +131,18 @@ class ModelConfig:
...
@@ -129,6 +131,18 @@ class ModelConfig:
Note that the model may support other tasks using the same model runner.
Note that the model may support other tasks using the same model runner.
"""
"""
tokenizer
:
SkipValidation
[
str
]
=
None
# type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode
:
TokenizerMode
|
str
=
"auto"
"""Tokenizer mode:
\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.
\n
- "hf" will use the fast tokenizer if available.
\n
- "slow" will always use the slow tokenizer.
\n
- "mistral" will always use the tokenizer from `mistral_common`.
\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
\n
- Other custom values can be supported via plugins."""
trust_remote_code
:
bool
=
False
trust_remote_code
:
bool
=
False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
and tokenizer."""
...
@@ -154,6 +168,13 @@ class ModelConfig:
...
@@ -154,6 +168,13 @@ class ModelConfig:
hf_config_path
:
str
|
None
=
None
hf_config_path
:
str
|
None
=
None
"""Name or path of the Hugging Face config to use. If unspecified, model
"""Name or path of the Hugging Face config to use. If unspecified, model
name or path will be used."""
name or path will be used."""
allowed_local_media_path
:
str
=
""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains
:
list
[
str
]
|
None
=
None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
revision
:
str
|
None
=
None
revision
:
str
|
None
=
None
"""The specific model version to use. It can be a branch name, a tag name,
"""The specific model version to use. It can be a branch name, a tag name,
or a commit id. If unspecified, will use the default version."""
or a commit id. If unspecified, will use the default version."""
...
@@ -161,6 +182,10 @@ class ModelConfig:
...
@@ -161,6 +182,10 @@ class ModelConfig:
"""The specific revision to use for the model code on the Hugging Face Hub.
"""The specific revision to use for the model code on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
use the default version."""
tokenizer_revision
:
str
|
None
=
None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len
:
SkipValidation
[
int
]
=
None
# type: ignore
max_model_len
:
SkipValidation
[
int
]
=
None
# type: ignore
"""Model context length (prompt and output). If unspecified, will be
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.
automatically derived from the model config.
...
@@ -205,6 +230,10 @@ class ModelConfig:
...
@@ -205,6 +230,10 @@ class ModelConfig:
preventing potential numerical issues. Note that even if this is set to
preventing potential numerical issues. Note that even if this is set to
False, cascade attention will be only used when the heuristic tells that
False, cascade attention will be only used when the heuristic tells that
it's beneficial."""
it's beneficial."""
skip_tokenizer_init
:
bool
=
False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
enable_prompt_embeds
:
bool
=
False
enable_prompt_embeds
:
bool
=
False
"""If `True`, enables passing text embeddings as inputs via the
"""If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key.
`prompt_embeds` key.
...
@@ -265,6 +294,8 @@ class ModelConfig:
...
@@ -265,6 +294,8 @@ class ModelConfig:
logits_processors
:
list
[
str
|
type
[
LogitsProcessor
]]
|
None
=
None
logits_processors
:
list
[
str
|
type
[
LogitsProcessor
]]
|
None
=
None
"""One or more logits processors' fully-qualified class names or class
"""One or more logits processors' fully-qualified class names or class
definitions"""
definitions"""
io_processor_plugin
:
str
|
None
=
None
"""IOProcessor plugin name to load at model startup"""
# Pooler config
# Pooler config
pooler_config
:
PoolerConfig
|
None
=
None
pooler_config
:
PoolerConfig
|
None
=
None
...
@@ -277,6 +308,7 @@ class ModelConfig:
...
@@ -277,6 +308,7 @@ class ModelConfig:
from the architecture of `self.model`."""
from the architecture of `self.model`."""
limit_mm_per_prompt
:
InitVar
[
dict
[
str
,
int
|
dict
[
str
,
int
]]
|
None
]
=
None
limit_mm_per_prompt
:
InitVar
[
dict
[
str
,
int
|
dict
[
str
,
int
]]
|
None
]
=
None
enable_mm_embeds
:
InitVar
[
bool
|
None
]
=
None
enable_mm_embeds
:
InitVar
[
bool
|
None
]
=
None
media_io_kwargs
:
InitVar
[
dict
[
str
,
dict
[
str
,
Any
]]
|
None
]
=
None
mm_processor_kwargs
:
InitVar
[
dict
[
str
,
Any
]
|
None
]
=
None
mm_processor_kwargs
:
InitVar
[
dict
[
str
,
Any
]
|
None
]
=
None
mm_processor_cache_gb
:
InitVar
[
float
|
None
]
=
None
mm_processor_cache_gb
:
InitVar
[
float
|
None
]
=
None
mm_processor_cache_type
:
InitVar
[
MMCacheType
|
None
]
=
None
mm_processor_cache_type
:
InitVar
[
MMCacheType
|
None
]
=
None
...
@@ -303,12 +335,18 @@ class ModelConfig:
...
@@ -303,12 +335,18 @@ class ModelConfig:
"runner"
,
"runner"
,
"convert"
,
"convert"
,
"task"
,
"task"
,
"tokenizer"
,
"tokenizer_mode"
,
"seed"
,
"seed"
,
"hf_config_path"
,
"hf_config_path"
,
"allowed_local_media_path"
,
"allowed_media_domains"
,
"tokenizer_revision"
,
"spec_target_max_model_len"
,
"spec_target_max_model_len"
,
"enforce_eager"
,
"enforce_eager"
,
"logprobs_mode"
,
"logprobs_mode"
,
"disable_cascade_attn"
,
"disable_cascade_attn"
,
"skip_tokenizer_init"
,
"served_model_name"
,
"served_model_name"
,
"config_format"
,
"config_format"
,
"hf_token"
,
"hf_token"
,
...
@@ -316,9 +354,11 @@ class ModelConfig:
...
@@ -316,9 +354,11 @@ class ModelConfig:
"logits_processor_pattern"
,
"logits_processor_pattern"
,
"override_attention_dtype"
,
"override_attention_dtype"
,
"logits_processors"
,
"logits_processors"
,
"io_processor_plugin"
,
"pooler_config"
,
"pooler_config"
,
"multimodal_config"
,
"multimodal_config"
,
"limit_mm_per_prompt"
,
"limit_mm_per_prompt"
,
"media_io_kwargs"
,
"mm_processor_kwargs"
,
"mm_processor_kwargs"
,
"mm_processor_cache_gb"
,
"mm_processor_cache_gb"
,
"mm_processor_cache_type"
,
"mm_processor_cache_type"
,
...
@@ -383,6 +423,7 @@ class ModelConfig:
...
@@ -383,6 +423,7 @@ class ModelConfig:
# Multimodal config init vars
# Multimodal config init vars
limit_mm_per_prompt
:
dict
[
str
,
int
|
dict
[
str
,
int
]]
|
None
,
limit_mm_per_prompt
:
dict
[
str
,
int
|
dict
[
str
,
int
]]
|
None
,
enable_mm_embeds
:
bool
|
None
,
enable_mm_embeds
:
bool
|
None
,
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
|
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
,
mm_processor_cache_gb
:
float
|
None
,
mm_processor_cache_gb
:
float
|
None
,
mm_processor_cache_type
:
MMCacheType
|
None
,
mm_processor_cache_type
:
MMCacheType
|
None
,
...
@@ -397,8 +438,13 @@ class ModelConfig:
...
@@ -397,8 +438,13 @@ class ModelConfig:
self
.
served_model_name
=
get_served_model_name
(
self
.
served_model_name
=
get_served_model_name
(
self
.
model
,
self
.
served_model_name
self
.
model
,
self
.
served_model_name
)
)
self
.
original_model
=
self
.
model
self
.
model
=
maybe_model_redirect
(
self
.
model
)
self
.
model
=
maybe_model_redirect
(
self
.
original_model
)
# The tokenizer is consistent with the model by default.
if
self
.
tokenizer
is
None
:
self
.
tokenizer
=
self
.
model
if
self
.
tokenizer_revision
is
None
:
self
.
tokenizer_revision
=
self
.
revision
self
.
tokenizer
=
maybe_model_redirect
(
self
.
tokenizer
)
if
isinstance
(
self
.
hf_config_path
,
str
):
if
isinstance
(
self
.
hf_config_path
,
str
):
self
.
hf_config_path
=
maybe_model_redirect
(
self
.
hf_config_path
)
self
.
hf_config_path
=
maybe_model_redirect
(
self
.
hf_config_path
)
...
@@ -419,7 +465,7 @@ class ModelConfig:
...
@@ -419,7 +465,7 @@ class ModelConfig:
hf_overrides_kw
[
key
]
=
value
hf_overrides_kw
[
key
]
=
value
hf_overrides_fn
=
None
hf_overrides_fn
=
None
self
.
maybe_pull_model_for_runai
(
self
.
model
)
self
.
maybe_pull_model_
tokenizer_
for_runai
(
self
.
model
,
self
.
tokenizer
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -602,8 +648,7 @@ class ModelConfig:
...
@@ -602,8 +648,7 @@ class ModelConfig:
)
)
self
.
original_max_model_len
=
self
.
max_model_len
self
.
original_max_model_len
=
self
.
max_model_len
self
.
recalculate_max_model_len
(
self
.
original_max_model_len
)
self
.
max_model_len
=
self
.
get_and_verify_max_len
(
self
.
max_model_len
)
# Init multimodal config if needed
# Init multimodal config if needed
if
self
.
_model_info
.
supports_multimodal
:
if
self
.
_model_info
.
supports_multimodal
:
if
(
if
(
...
@@ -619,6 +664,7 @@ class ModelConfig:
...
@@ -619,6 +664,7 @@ class ModelConfig:
mm_config_kwargs
=
dict
(
mm_config_kwargs
=
dict
(
limit_per_prompt
=
limit_mm_per_prompt
,
limit_per_prompt
=
limit_mm_per_prompt
,
enable_mm_embeds
=
enable_mm_embeds
,
enable_mm_embeds
=
enable_mm_embeds
,
media_io_kwargs
=
media_io_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_cache_gb
=
mm_processor_cache_gb
,
mm_processor_cache_gb
=
mm_processor_cache_gb
,
mm_processor_cache_type
=
mm_processor_cache_type
,
mm_processor_cache_type
=
mm_processor_cache_type
,
...
@@ -636,8 +682,16 @@ class ModelConfig:
...
@@ -636,8 +682,16 @@ class ModelConfig:
self
.
multimodal_config
=
MultiModalConfig
(
**
mm_config_kwargs
)
self
.
multimodal_config
=
MultiModalConfig
(
**
mm_config_kwargs
)
# Multimodal GGUF models must use original repo for mm processing
if
is_gguf
(
self
.
tokenizer
)
and
self
.
is_multimodal_model
:
raise
ValueError
(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
if
self
.
disable_sliding_window
:
if
self
.
disable_sliding_window
:
# Set after
recalculate_max_model
_len to ensure that max_model_len
# Set after
get_and_verify_max
_len to ensure that max_model_len
# can be correctly capped to sliding window size
# can be correctly capped to sliding window size
self
.
hf_text_config
.
sliding_window
=
None
self
.
hf_text_config
.
sliding_window
=
None
...
@@ -661,9 +715,10 @@ class ModelConfig:
...
@@ -661,9 +715,10 @@ class ModelConfig:
@
model_validator
(
mode
=
"after"
)
@
model_validator
(
mode
=
"after"
)
def
validate_model_config_after
(
self
:
"ModelConfig"
)
->
"ModelConfig"
:
def
validate_model_config_after
(
self
:
"ModelConfig"
)
->
"ModelConfig"
:
if
not
isinstance
(
self
.
tokenizer
,
str
):
raise
ValueError
(
"tokenizer must be a string after __post_init__."
)
if
not
isinstance
(
self
.
max_model_len
,
int
):
if
not
isinstance
(
self
.
max_model_len
,
int
):
raise
ValueError
(
"max_model_len must be an integer after __post_init__."
)
raise
ValueError
(
"max_model_len must be an integer after __post_init__."
)
return
self
return
self
def
_get_transformers_backend_cls
(
self
)
->
str
:
def
_get_transformers_backend_cls
(
self
)
->
str
:
...
@@ -712,17 +767,49 @@ class ModelConfig:
...
@@ -712,17 +767,49 @@ class ModelConfig:
"""The architecture vllm actually used."""
"""The architecture vllm actually used."""
return
self
.
_architecture
return
self
.
_architecture
def
maybe_pull_model_for_runai
(
self
,
model
:
str
)
->
None
:
def
maybe_pull_model_tokenizer_for_runai
(
self
,
model
:
str
,
tokenizer
:
str
)
->
None
:
"""Pull model from Object Storage to temporary directory when needed."""
"""Pull model/tokenizer from Object Storage to temporary
if
not
is_runai_obj_uri
(
model
):
directory when needed.
Args:
model: Model name or path
tokenizer: Tokenizer name or path
"""
if
not
(
is_runai_obj_uri
(
model
)
or
is_runai_obj_uri
(
tokenizer
)):
return
return
object_storage_model
=
ObjectStorageModel
(
url
=
model
)
if
is_runai_obj_uri
(
model
):
object_storage_model
.
pull_files
(
object_storage_model
=
ObjectStorageModel
(
url
=
model
)
model
,
allow_pattern
=
[
"*.model"
,
"*.py"
,
"*.json"
]
object_storage_model
.
pull_files
(
)
model
,
allow_pattern
=
[
"*.model"
,
"*.py"
,
"*.json"
]
self
.
model_weights
=
model
)
self
.
model
=
object_storage_model
.
dir
self
.
model_weights
=
model
self
.
model
=
object_storage_model
.
dir
# If tokenizer is same as model, download to same directory
if
model
==
tokenizer
:
object_storage_model
.
pull_files
(
model
,
ignore_pattern
=
[
"*.pt"
,
"*.safetensors"
,
"*.bin"
,
"*.tensors"
,
"*.pth"
,
],
)
self
.
tokenizer
=
object_storage_model
.
dir
return
# Only download tokenizer if needed and not already handled
if
is_runai_obj_uri
(
tokenizer
):
object_storage_tokenizer
=
ObjectStorageModel
(
url
=
tokenizer
)
object_storage_tokenizer
.
pull_files
(
model
,
ignore_pattern
=
[
"*.pt"
,
"*.safetensors"
,
"*.bin"
,
"*.tensors"
,
"*.pth"
],
)
self
.
tokenizer
=
object_storage_tokenizer
.
dir
def
_get_encoder_config
(
self
):
def
_get_encoder_config
(
self
):
model
=
self
.
model
model
=
self
.
model
...
@@ -1625,38 +1712,30 @@ class ModelConfig:
...
@@ -1625,38 +1712,30 @@ class ModelConfig:
return
dense_modules
[
-
1
][
"out_features"
]
return
dense_modules
[
-
1
][
"out_features"
]
return
self
.
get_hidden_size
()
return
self
.
get_hidden_size
()
def
recalculate_max_model_len
(
def
get_and_verify_max_len
(
self
,
max_model_len
:
int
):
self
,
original_max_model_len
:
int
|
None
,
*
,
tokenizer
:
str
|
None
=
None
,
tokenizer_revision
:
str
|
None
=
None
,
)
->
None
:
# Consider max_model_len in tokenizer_config only when
# Consider max_model_len in tokenizer_config only when
# pooling models use absolute position_embedding.
# pooling models use absolute position_embedding.
# NOTE: For simplicity we assume `args.model == args.tokenizer`
# since this is
tokenizer_config
=
None
tokenizer_config
=
None
if
(
if
(
self
.
runner_type
==
"pooling"
self
.
runner_type
==
"pooling"
and
getattr
(
self
.
hf_config
,
"position_embedding_type"
,
""
)
==
"absolute"
and
getattr
(
self
.
hf_config
,
"position_embedding_type"
,
""
)
==
"absolute"
):
):
tokenizer_config
=
try_get_tokenizer_config
(
tokenizer_config
=
try_get_tokenizer_config
(
tokenizer
or
self
.
model
,
self
.
tokenizer
,
trust_remote_code
=
self
.
trust_remote_code
,
trust_remote_code
=
self
.
trust_remote_code
,
revision
=
tokenizer_revision
or
self
.
revision
,
revision
=
self
.
tokenizer_revision
,
)
)
max_model_len
=
_get_and_verify_max_len
(
self
.
max_model_len
=
_get_and_verify_max_len
(
hf_config
=
self
.
hf_text_config
,
hf_config
=
self
.
hf_text_config
,
tokenizer_config
=
tokenizer_config
,
tokenizer_config
=
tokenizer_config
,
max_model_len
=
original_
max_model_len
,
max_model_len
=
max_model_len
,
disable_sliding_window
=
self
.
disable_sliding_window
,
disable_sliding_window
=
self
.
disable_sliding_window
,
sliding_window
=
self
.
get_sliding_window
(),
sliding_window
=
self
.
get_sliding_window
(),
spec_target_max_model_len
=
self
.
spec_target_max_model_len
,
spec_target_max_model_len
=
self
.
spec_target_max_model_len
,
encoder_config
=
self
.
encoder_config
,
encoder_config
=
self
.
encoder_config
,
)
)
logger
.
info
(
"Using max model len %s"
,
self
.
max_model_len
)
logger
.
info
(
"Using max model len %s"
,
max_model_len
)
return
max_model_len
@
property
@
property
def
attn_type
(
self
)
->
AttnTypeStr
:
def
attn_type
(
self
)
->
AttnTypeStr
:
...
...
vllm/config/multimodal.py
View file @
e83b7e37
...
@@ -79,6 +79,10 @@ class MultiModalConfig:
...
@@ -79,6 +79,10 @@ class MultiModalConfig:
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
Only enable this flag for trusted users!"""
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
Field
(
default_factory
=
dict
)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
mm_processor_kwargs
:
dict
[
str
,
object
]
|
None
=
None
mm_processor_kwargs
:
dict
[
str
,
object
]
|
None
=
None
"""Arguments to be forwarded to the model's processor for multi-modal data,
"""Arguments to be forwarded to the model's processor for multi-modal data,
e.g., image processor. Overrides for the multi-modal processor obtained
e.g., image processor. Overrides for the multi-modal processor obtained
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment