Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e83b7e37
Unverified
Commit
e83b7e37
authored
Dec 07, 2025
by
Cyrus Leung
Committed by
GitHub
Dec 07, 2025
Browse files
Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)
parent
27f4c2fd
Changes
105
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
26 additions
and
27 deletions
+26
-27
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+1
-1
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+12
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+3
-4
vllm/v1/worker/tpu_model_runner.py
vllm/v1/worker/tpu_model_runner.py
+3
-4
vllm/v1/worker/utils.py
vllm/v1/worker/utils.py
+7
-12
No files found.
vllm/v1/spec_decode/eagle.py
View file @
e83b7e37
...
@@ -85,7 +85,7 @@ class EagleProposer:
...
@@ -85,7 +85,7 @@ class EagleProposer:
# Multi-modal data support
# Multi-modal data support
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
supports_mm_inputs
=
self
.
mm_registry
.
supports_multimodal_inputs
(
self
.
supports_mm_inputs
=
self
.
mm_registry
.
supports_multimodal_inputs
(
vllm_config
.
renderer
_config
vllm_config
.
model
_config
)
)
self
.
attn_metadata_builder
:
AttentionMetadataBuilder
|
None
=
None
self
.
attn_metadata_builder
:
AttentionMetadataBuilder
|
None
=
None
...
...
vllm/v1/structured_output/__init__.py
View file @
e83b7e37
...
@@ -63,7 +63,7 @@ class StructuredOutputManager:
...
@@ -63,7 +63,7 @@ class StructuredOutputManager:
max_workers
=
max
(
1
,
min
(
multiprocessing
.
cpu_count
()
//
2
,
8
))
max_workers
=
max
(
1
,
min
(
multiprocessing
.
cpu_count
()
//
2
,
8
))
self
.
executor_for_fillmask
=
ThreadPoolExecutor
(
max_workers
=
max_workers
)
self
.
executor_for_fillmask
=
ThreadPoolExecutor
(
max_workers
=
max_workers
)
if
not
vllm_config
.
renderer
_config
.
skip_tokenizer_init
:
if
not
self
.
vllm_config
.
model
_config
.
skip_tokenizer_init
:
# The default max_workers if not specified is the number of
# The default max_workers if not specified is the number of
# CPUs * 5, which is way too high since these tasks are CPU-bound,
# CPUs * 5, which is way too high since these tasks are CPU-bound,
# not I/O bound. We also know we would never dominate CPU usage
# not I/O bound. We also know we would never dominate CPU usage
...
@@ -71,15 +71,21 @@ class StructuredOutputManager:
...
@@ -71,15 +71,21 @@ class StructuredOutputManager:
# of CPUs.
# of CPUs.
max_workers
=
max
(
1
,
(
multiprocessing
.
cpu_count
()
+
1
)
//
2
)
max_workers
=
max
(
1
,
(
multiprocessing
.
cpu_count
()
+
1
)
//
2
)
self
.
executor
=
ThreadPoolExecutor
(
max_workers
=
max_workers
)
self
.
executor
=
ThreadPoolExecutor
(
max_workers
=
max_workers
)
self
.
tokenizer
=
init_tokenizer_from_config
(
vllm_config
.
renderer_config
)
self
.
tokenizer
=
init_tokenizer_from_config
(
reasoning_parser
=
vllm_config
.
structured_outputs_config
.
reasoning_parser
model_config
=
self
.
vllm_config
.
model_config
)
reasoning_parser
=
(
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser
)
reasoning_parser_plugin
=
(
reasoning_parser_plugin
=
(
vllm_config
.
structured_outputs_config
.
reasoning_parser_plugin
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser_plugin
)
)
if
reasoning_parser_plugin
and
len
(
reasoning_parser_plugin
)
>
3
:
if
reasoning_parser_plugin
and
len
(
reasoning_parser_plugin
)
>
3
:
ReasoningParserManager
.
import_reasoning_parser
(
reasoning_parser_plugin
)
ReasoningParserManager
.
import_reasoning_parser
(
reasoning_parser_plugin
)
reasoning_parser
=
vllm_config
.
structured_outputs_config
.
reasoning_parser
reasoning_parser
=
(
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser
)
if
reasoning_parser
:
if
reasoning_parser
:
reasoner_cls
=
ReasoningParserManager
.
get_reasoning_parser
(
reasoner_cls
=
ReasoningParserManager
.
get_reasoning_parser
(
reasoning_parser
reasoning_parser
...
@@ -87,7 +93,7 @@ class StructuredOutputManager:
...
@@ -87,7 +93,7 @@ class StructuredOutputManager:
self
.
reasoner
=
reasoner_cls
(
tokenizer
=
self
.
tokenizer
)
self
.
reasoner
=
reasoner_cls
(
tokenizer
=
self
.
tokenizer
)
self
.
enable_in_reasoning
=
(
self
.
enable_in_reasoning
=
(
vllm_config
.
structured_outputs_config
.
enable_in_reasoning
self
.
vllm_config
.
structured_outputs_config
.
enable_in_reasoning
)
)
def
grammar_init
(
self
,
request
:
Request
)
->
None
:
def
grammar_init
(
self
,
request
:
Request
)
->
None
:
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
e83b7e37
...
@@ -271,7 +271,6 @@ class GPUModelRunner(
...
@@ -271,7 +271,6 @@ class GPUModelRunner(
device
:
torch
.
device
,
device
:
torch
.
device
,
):
):
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
self
.
renderer_config
=
vllm_config
.
renderer_config
self
.
model_config
=
vllm_config
.
model_config
self
.
model_config
=
vllm_config
.
model_config
self
.
cache_config
=
vllm_config
.
cache_config
self
.
cache_config
=
vllm_config
.
cache_config
self
.
compilation_config
=
vllm_config
.
compilation_config
self
.
compilation_config
=
vllm_config
.
compilation_config
...
@@ -336,7 +335,7 @@ class GPUModelRunner(
...
@@ -336,7 +335,7 @@ class GPUModelRunner(
self
.
uses_mrope
=
model_config
.
uses_mrope
self
.
uses_mrope
=
model_config
.
uses_mrope
self
.
uses_xdrope_dim
=
model_config
.
uses_xdrope_dim
self
.
uses_xdrope_dim
=
model_config
.
uses_xdrope_dim
self
.
supports_mm_inputs
=
self
.
mm_registry
.
supports_multimodal_inputs
(
self
.
supports_mm_inputs
=
self
.
mm_registry
.
supports_multimodal_inputs
(
self
.
renderer
_config
model
_config
)
)
if
self
.
model_config
.
is_encoder_decoder
:
if
self
.
model_config
.
is_encoder_decoder
:
...
@@ -559,7 +558,7 @@ class GPUModelRunner(
...
@@ -559,7 +558,7 @@ class GPUModelRunner(
self
.
mm_budget
=
(
self
.
mm_budget
=
(
MultiModalBudget
(
MultiModalBudget
(
self
.
renderer
_config
,
self
.
model
_config
,
self
.
scheduler_config
,
self
.
scheduler_config
,
self
.
mm_registry
,
self
.
mm_registry
,
)
)
...
@@ -3874,7 +3873,7 @@ class GPUModelRunner(
...
@@ -3874,7 +3873,7 @@ class GPUModelRunner(
assert
self
.
mm_budget
is
not
None
assert
self
.
mm_budget
is
not
None
dummy_decoder_data
=
self
.
mm_registry
.
get_decoder_dummy_data
(
dummy_decoder_data
=
self
.
mm_registry
.
get_decoder_dummy_data
(
renderer
_config
=
self
.
renderer
_config
,
model
_config
=
self
.
model
_config
,
seq_len
=
self
.
max_model_len
,
seq_len
=
self
.
max_model_len
,
mm_counts
=
{
modality
:
1
},
mm_counts
=
{
modality
:
1
},
cache
=
self
.
mm_budget
.
cache
,
cache
=
self
.
mm_budget
.
cache
,
...
...
vllm/v1/worker/tpu_model_runner.py
View file @
e83b7e37
...
@@ -143,7 +143,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -143,7 +143,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
original_parallel_config
:
ParallelConfig
|
None
=
None
,
original_parallel_config
:
ParallelConfig
|
None
=
None
,
):
):
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
self
.
renderer_config
=
vllm_config
.
renderer_config
self
.
model_config
=
vllm_config
.
model_config
self
.
model_config
=
vllm_config
.
model_config
self
.
cache_config
=
vllm_config
.
cache_config
self
.
cache_config
=
vllm_config
.
cache_config
self
.
lora_config
=
vllm_config
.
lora_config
self
.
lora_config
=
vllm_config
.
lora_config
...
@@ -223,7 +222,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -223,7 +222,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
uses_mrope
=
model_config
.
uses_mrope
self
.
uses_mrope
=
model_config
.
uses_mrope
self
.
supports_mm_inputs
=
self
.
mm_registry
.
supports_multimodal_inputs
(
self
.
supports_mm_inputs
=
self
.
mm_registry
.
supports_multimodal_inputs
(
self
.
renderer
_config
model
_config
)
)
# TODO: Support M-RoPE (e.g, Qwen2-VL)
# TODO: Support M-RoPE (e.g, Qwen2-VL)
assert
not
self
.
uses_mrope
,
"TPU does not support M-RoPE yet."
assert
not
self
.
uses_mrope
,
"TPU does not support M-RoPE yet."
...
@@ -354,7 +353,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -354,7 +353,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self
.
mm_budget
=
(
self
.
mm_budget
=
(
MultiModalBudget
(
MultiModalBudget
(
self
.
renderer
_config
,
self
.
model
_config
,
self
.
scheduler_config
,
self
.
scheduler_config
,
self
.
mm_registry
,
self
.
mm_registry
,
)
)
...
@@ -2039,7 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2039,7 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
assert
self
.
mm_budget
is
not
None
assert
self
.
mm_budget
is
not
None
dummy_decoder_data
=
self
.
mm_registry
.
get_decoder_dummy_data
(
dummy_decoder_data
=
self
.
mm_registry
.
get_decoder_dummy_data
(
renderer
_config
=
self
.
renderer
_config
,
model
_config
=
self
.
model
_config
,
seq_len
=
self
.
max_model_len
,
seq_len
=
self
.
max_model_len
,
mm_counts
=
{
modality
:
1
},
mm_counts
=
{
modality
:
1
},
cache
=
self
.
mm_budget
.
cache
,
cache
=
self
.
mm_budget
.
cache
,
...
...
vllm/v1/worker/utils.py
View file @
e83b7e37
...
@@ -7,7 +7,7 @@ import torch
...
@@ -7,7 +7,7 @@ import torch
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.config
import
Renderer
Config
,
SchedulerConfig
,
VllmConfig
from
vllm.config
import
Model
Config
,
SchedulerConfig
,
VllmConfig
from
vllm.model_executor.models.interfaces
import
MultiModalEmbeddings
from
vllm.model_executor.models.interfaces
import
MultiModalEmbeddings
from
vllm.model_executor.models.utils
import
extract_layer_index
from
vllm.model_executor.models.utils
import
extract_layer_index
from
vllm.multimodal.cache
import
processor_only_cache_from_config
from
vllm.multimodal.cache
import
processor_only_cache_from_config
...
@@ -23,29 +23,24 @@ class MultiModalBudget:
...
@@ -23,29 +23,24 @@ class MultiModalBudget:
def
__init__
(
def
__init__
(
self
,
self
,
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
scheduler_config
:
SchedulerConfig
,
scheduler_config
:
SchedulerConfig
,
mm_registry
:
MultiModalRegistry
,
mm_registry
:
MultiModalRegistry
,
)
->
None
:
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
renderer_config
=
renderer_config
self
.
model_config
=
model_config
self
.
model_config
=
renderer_config
.
model_config
self
.
scheduler_config
=
scheduler_config
self
.
scheduler_config
=
scheduler_config
self
.
mm_registry
=
mm_registry
self
.
mm_registry
=
mm_registry
self
.
cache
=
cache
=
processor_only_cache_from_config
(
self
.
cache
=
cache
=
processor_only_cache_from_config
(
model_config
,
mm_registry
)
renderer_config
,
mm_registry
)
self
.
max_model_len
=
self
.
model_config
.
max_model_len
self
.
max_model_len
=
model_config
.
max_model_len
self
.
max_num_reqs
=
scheduler_config
.
max_num_seqs
self
.
max_num_reqs
=
scheduler_config
.
max_num_seqs
self
.
mm_limits
=
mm_registry
.
get_mm_limits_per_prompt
(
self
.
mm_limits
=
mm_registry
.
get_mm_limits_per_prompt
(
model_config
,
cache
=
cache
)
renderer_config
,
cache
=
cache
)
max_tokens_by_modality
=
mm_registry
.
get_max_tokens_per_item_by_modality
(
max_tokens_by_modality
=
mm_registry
.
get_max_tokens_per_item_by_modality
(
renderer
_config
,
model
_config
,
cache
=
cache
,
cache
=
cache
,
profiler_limits
=
self
.
mm_limits
,
profiler_limits
=
self
.
mm_limits
,
)
)
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment