Unverified commit e83b7e37, authored by Cyrus Leung and committed by GitHub
Browse files

Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)

parent 27f4c2fd
......@@ -85,7 +85,7 @@ class EagleProposer:
# Multi-modal data support
self.mm_registry = MULTIMODAL_REGISTRY
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
vllm_config.renderer_config
vllm_config.model_config
)
self.attn_metadata_builder: AttentionMetadataBuilder | None = None
......
......@@ -63,7 +63,7 @@ class StructuredOutputManager:
max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
if not vllm_config.renderer_config.skip_tokenizer_init:
if not self.vllm_config.model_config.skip_tokenizer_init:
# The default max_workers if not specified is the number of
# CPUs * 5, which is way too high since these tasks are CPU-bound,
# not I/O bound. We also know we would never dominate CPU usage
......@@ -71,15 +71,21 @@ class StructuredOutputManager:
# of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config)
reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
self.tokenizer = init_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
reasoning_parser_plugin = (
vllm_config.structured_outputs_config.reasoning_parser_plugin
self.vllm_config.structured_outputs_config.reasoning_parser_plugin
)
if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
if reasoning_parser:
reasoner_cls = ReasoningParserManager.get_reasoning_parser(
reasoning_parser
......@@ -87,7 +93,7 @@ class StructuredOutputManager:
self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
self.enable_in_reasoning = (
vllm_config.structured_outputs_config.enable_in_reasoning
self.vllm_config.structured_outputs_config.enable_in_reasoning
)
def grammar_init(self, request: Request) -> None:
......
......@@ -271,7 +271,6 @@ class GPUModelRunner(
device: torch.device,
):
self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.compilation_config = vllm_config.compilation_config
......@@ -336,7 +335,7 @@ class GPUModelRunner(
self.uses_mrope = model_config.uses_mrope
self.uses_xdrope_dim = model_config.uses_xdrope_dim
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
self.renderer_config
model_config
)
if self.model_config.is_encoder_decoder:
......@@ -559,7 +558,7 @@ class GPUModelRunner(
self.mm_budget = (
MultiModalBudget(
self.renderer_config,
self.model_config,
self.scheduler_config,
self.mm_registry,
)
......@@ -3874,7 +3873,7 @@ class GPUModelRunner(
assert self.mm_budget is not None
dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
renderer_config=self.renderer_config,
model_config=self.model_config,
seq_len=self.max_model_len,
mm_counts={modality: 1},
cache=self.mm_budget.cache,
......
......@@ -143,7 +143,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
original_parallel_config: ParallelConfig | None = None,
):
self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config
......@@ -223,7 +222,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.mm_registry = MULTIMODAL_REGISTRY
self.uses_mrope = model_config.uses_mrope
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
self.renderer_config
model_config
)
# TODO: Support M-RoPE (e.g, Qwen2-VL)
assert not self.uses_mrope, "TPU does not support M-RoPE yet."
......@@ -354,7 +353,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.mm_budget = (
MultiModalBudget(
self.renderer_config,
self.model_config,
self.scheduler_config,
self.mm_registry,
)
......@@ -2039,7 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
assert self.mm_budget is not None
dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
renderer_config=self.renderer_config,
model_config=self.model_config,
seq_len=self.max_model_len,
mm_counts={modality: 1},
cache=self.mm_budget.cache,
......
......@@ -7,7 +7,7 @@ import torch
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.layer import Attention
from vllm.config import RendererConfig, SchedulerConfig, VllmConfig
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from vllm.model_executor.models.utils import extract_layer_index
from vllm.multimodal.cache import processor_only_cache_from_config
......@@ -23,29 +23,24 @@ class MultiModalBudget:
def __init__(
self,
renderer_config: RendererConfig,
model_config: ModelConfig,
scheduler_config: SchedulerConfig,
mm_registry: MultiModalRegistry,
) -> None:
super().__init__()
self.renderer_config = renderer_config
self.model_config = renderer_config.model_config
self.model_config = model_config
self.scheduler_config = scheduler_config
self.mm_registry = mm_registry
self.cache = cache = processor_only_cache_from_config(
renderer_config, mm_registry
)
self.cache = cache = processor_only_cache_from_config(model_config, mm_registry)
self.max_model_len = self.model_config.max_model_len
self.max_model_len = model_config.max_model_len
self.max_num_reqs = scheduler_config.max_num_seqs
self.mm_limits = mm_registry.get_mm_limits_per_prompt(
renderer_config, cache=cache
)
self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache)
max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
renderer_config,
model_config,
cache=cache,
profiler_limits=self.mm_limits,
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment