Commit 27f4c2fd (unverified), authored by Cyrus Leung, committed by GitHub
Browse files

[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent a49d813f
......@@ -85,7 +85,7 @@ class EagleProposer:
# Multi-modal data support
self.mm_registry = MULTIMODAL_REGISTRY
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
vllm_config.model_config
vllm_config.renderer_config
)
self.attn_metadata_builder: AttentionMetadataBuilder | None = None
......
......@@ -63,7 +63,7 @@ class StructuredOutputManager:
max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
if not self.vllm_config.model_config.skip_tokenizer_init:
if not vllm_config.renderer_config.skip_tokenizer_init:
# The default max_workers if not specified is the number of
# CPUs * 5, which is way too high since these tasks are CPU-bound,
# not I/O bound. We also know we would never dominate CPU usage
......@@ -71,21 +71,15 @@ class StructuredOutputManager:
# of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config)
reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
reasoning_parser_plugin = (
self.vllm_config.structured_outputs_config.reasoning_parser_plugin
vllm_config.structured_outputs_config.reasoning_parser_plugin
)
if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
if reasoning_parser:
reasoner_cls = ReasoningParserManager.get_reasoning_parser(
reasoning_parser
......@@ -93,7 +87,7 @@ class StructuredOutputManager:
self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
self.enable_in_reasoning = (
self.vllm_config.structured_outputs_config.enable_in_reasoning
vllm_config.structured_outputs_config.enable_in_reasoning
)
def grammar_init(self, request: Request) -> None:
......
......@@ -271,6 +271,7 @@ class GPUModelRunner(
device: torch.device,
):
self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.compilation_config = vllm_config.compilation_config
......@@ -335,7 +336,7 @@ class GPUModelRunner(
self.uses_mrope = model_config.uses_mrope
self.uses_xdrope_dim = model_config.uses_xdrope_dim
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
model_config
self.renderer_config
)
if self.model_config.is_encoder_decoder:
......@@ -558,7 +559,7 @@ class GPUModelRunner(
self.mm_budget = (
MultiModalBudget(
self.model_config,
self.renderer_config,
self.scheduler_config,
self.mm_registry,
)
......@@ -3873,7 +3874,7 @@ class GPUModelRunner(
assert self.mm_budget is not None
dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
model_config=self.model_config,
renderer_config=self.renderer_config,
seq_len=self.max_model_len,
mm_counts={modality: 1},
cache=self.mm_budget.cache,
......
......@@ -143,6 +143,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
original_parallel_config: ParallelConfig | None = None,
):
self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config
......@@ -222,7 +223,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.mm_registry = MULTIMODAL_REGISTRY
self.uses_mrope = model_config.uses_mrope
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
model_config
self.renderer_config
)
# TODO: Support M-RoPE (e.g, Qwen2-VL)
assert not self.uses_mrope, "TPU does not support M-RoPE yet."
......@@ -353,7 +354,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.mm_budget = (
MultiModalBudget(
self.model_config,
self.renderer_config,
self.scheduler_config,
self.mm_registry,
)
......@@ -2038,7 +2039,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
assert self.mm_budget is not None
dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
model_config=self.model_config,
renderer_config=self.renderer_config,
seq_len=self.max_model_len,
mm_counts={modality: 1},
cache=self.mm_budget.cache,
......
......@@ -7,7 +7,7 @@ import torch
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.layer import Attention
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.config import RendererConfig, SchedulerConfig, VllmConfig
from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from vllm.model_executor.models.utils import extract_layer_index
from vllm.multimodal.cache import processor_only_cache_from_config
......@@ -23,24 +23,29 @@ class MultiModalBudget:
def __init__(
self,
model_config: ModelConfig,
renderer_config: RendererConfig,
scheduler_config: SchedulerConfig,
mm_registry: MultiModalRegistry,
) -> None:
super().__init__()
self.model_config = model_config
self.renderer_config = renderer_config
self.model_config = renderer_config.model_config
self.scheduler_config = scheduler_config
self.mm_registry = mm_registry
self.cache = cache = processor_only_cache_from_config(model_config, mm_registry)
self.cache = cache = processor_only_cache_from_config(
renderer_config, mm_registry
)
self.max_model_len = model_config.max_model_len
self.max_model_len = self.model_config.max_model_len
self.max_num_reqs = scheduler_config.max_num_seqs
self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache)
self.mm_limits = mm_registry.get_mm_limits_per_prompt(
renderer_config, cache=cache
)
max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
model_config,
renderer_config,
cache=cache,
profiler_limits=self.mm_limits,
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment