Unverified commit e83b7e37, authored by Cyrus Leung, committed by GitHub
Browse files

Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)

parent 27f4c2fd
...@@ -85,7 +85,7 @@ class EagleProposer: ...@@ -85,7 +85,7 @@ class EagleProposer:
# Multi-modal data support # Multi-modal data support
self.mm_registry = MULTIMODAL_REGISTRY self.mm_registry = MULTIMODAL_REGISTRY
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
vllm_config.renderer_config vllm_config.model_config
) )
self.attn_metadata_builder: AttentionMetadataBuilder | None = None self.attn_metadata_builder: AttentionMetadataBuilder | None = None
......
...@@ -63,7 +63,7 @@ class StructuredOutputManager: ...@@ -63,7 +63,7 @@ class StructuredOutputManager:
max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8)) max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers) self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
if not vllm_config.renderer_config.skip_tokenizer_init: if not self.vllm_config.model_config.skip_tokenizer_init:
# The default max_workers if not specified is the number of # The default max_workers if not specified is the number of
# CPUs * 5, which is way too high since these tasks are CPU-bound, # CPUs * 5, which is way too high since these tasks are CPU-bound,
# not I/O bound. We also know we would never dominate CPU usage # not I/O bound. We also know we would never dominate CPU usage
...@@ -71,15 +71,21 @@ class StructuredOutputManager: ...@@ -71,15 +71,21 @@ class StructuredOutputManager:
# of CPUs. # of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers) self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config) self.tokenizer = init_tokenizer_from_config(
reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser model_config=self.vllm_config.model_config
)
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
reasoning_parser_plugin = ( reasoning_parser_plugin = (
vllm_config.structured_outputs_config.reasoning_parser_plugin self.vllm_config.structured_outputs_config.reasoning_parser_plugin
) )
if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3: if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin) ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
if reasoning_parser: if reasoning_parser:
reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoner_cls = ReasoningParserManager.get_reasoning_parser(
reasoning_parser reasoning_parser
...@@ -87,7 +93,7 @@ class StructuredOutputManager: ...@@ -87,7 +93,7 @@ class StructuredOutputManager:
self.reasoner = reasoner_cls(tokenizer=self.tokenizer) self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
self.enable_in_reasoning = ( self.enable_in_reasoning = (
vllm_config.structured_outputs_config.enable_in_reasoning self.vllm_config.structured_outputs_config.enable_in_reasoning
) )
def grammar_init(self, request: Request) -> None: def grammar_init(self, request: Request) -> None:
......
...@@ -271,7 +271,6 @@ class GPUModelRunner( ...@@ -271,7 +271,6 @@ class GPUModelRunner(
device: torch.device, device: torch.device,
): ):
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config self.cache_config = vllm_config.cache_config
self.compilation_config = vllm_config.compilation_config self.compilation_config = vllm_config.compilation_config
...@@ -336,7 +335,7 @@ class GPUModelRunner( ...@@ -336,7 +335,7 @@ class GPUModelRunner(
self.uses_mrope = model_config.uses_mrope self.uses_mrope = model_config.uses_mrope
self.uses_xdrope_dim = model_config.uses_xdrope_dim self.uses_xdrope_dim = model_config.uses_xdrope_dim
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
self.renderer_config model_config
) )
if self.model_config.is_encoder_decoder: if self.model_config.is_encoder_decoder:
...@@ -559,7 +558,7 @@ class GPUModelRunner( ...@@ -559,7 +558,7 @@ class GPUModelRunner(
self.mm_budget = ( self.mm_budget = (
MultiModalBudget( MultiModalBudget(
self.renderer_config, self.model_config,
self.scheduler_config, self.scheduler_config,
self.mm_registry, self.mm_registry,
) )
...@@ -3874,7 +3873,7 @@ class GPUModelRunner( ...@@ -3874,7 +3873,7 @@ class GPUModelRunner(
assert self.mm_budget is not None assert self.mm_budget is not None
dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
renderer_config=self.renderer_config, model_config=self.model_config,
seq_len=self.max_model_len, seq_len=self.max_model_len,
mm_counts={modality: 1}, mm_counts={modality: 1},
cache=self.mm_budget.cache, cache=self.mm_budget.cache,
......
...@@ -143,7 +143,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -143,7 +143,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
original_parallel_config: ParallelConfig | None = None, original_parallel_config: ParallelConfig | None = None,
): ):
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config self.lora_config = vllm_config.lora_config
...@@ -223,7 +222,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -223,7 +222,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.mm_registry = MULTIMODAL_REGISTRY self.mm_registry = MULTIMODAL_REGISTRY
self.uses_mrope = model_config.uses_mrope self.uses_mrope = model_config.uses_mrope
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
self.renderer_config model_config
) )
# TODO: Support M-RoPE (e.g, Qwen2-VL) # TODO: Support M-RoPE (e.g, Qwen2-VL)
assert not self.uses_mrope, "TPU does not support M-RoPE yet." assert not self.uses_mrope, "TPU does not support M-RoPE yet."
...@@ -354,7 +353,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -354,7 +353,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.mm_budget = ( self.mm_budget = (
MultiModalBudget( MultiModalBudget(
self.renderer_config, self.model_config,
self.scheduler_config, self.scheduler_config,
self.mm_registry, self.mm_registry,
) )
...@@ -2039,7 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -2039,7 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
assert self.mm_budget is not None assert self.mm_budget is not None
dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
renderer_config=self.renderer_config, model_config=self.model_config,
seq_len=self.max_model_len, seq_len=self.max_model_len,
mm_counts={modality: 1}, mm_counts={modality: 1},
cache=self.mm_budget.cache, cache=self.mm_budget.cache,
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.layer import Attention from vllm.attention.layer import Attention
from vllm.config import RendererConfig, SchedulerConfig, VllmConfig from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from vllm.model_executor.models.utils import extract_layer_index from vllm.model_executor.models.utils import extract_layer_index
from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.cache import processor_only_cache_from_config
...@@ -23,29 +23,24 @@ class MultiModalBudget: ...@@ -23,29 +23,24 @@ class MultiModalBudget:
def __init__( def __init__(
self, self,
renderer_config: RendererConfig, model_config: ModelConfig,
scheduler_config: SchedulerConfig, scheduler_config: SchedulerConfig,
mm_registry: MultiModalRegistry, mm_registry: MultiModalRegistry,
) -> None: ) -> None:
super().__init__() super().__init__()
self.renderer_config = renderer_config self.model_config = model_config
self.model_config = renderer_config.model_config
self.scheduler_config = scheduler_config self.scheduler_config = scheduler_config
self.mm_registry = mm_registry self.mm_registry = mm_registry
self.cache = cache = processor_only_cache_from_config( self.cache = cache = processor_only_cache_from_config(model_config, mm_registry)
renderer_config, mm_registry
)
self.max_model_len = self.model_config.max_model_len self.max_model_len = model_config.max_model_len
self.max_num_reqs = scheduler_config.max_num_seqs self.max_num_reqs = scheduler_config.max_num_seqs
self.mm_limits = mm_registry.get_mm_limits_per_prompt( self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache)
renderer_config, cache=cache
)
max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
renderer_config, model_config,
cache=cache, cache=cache,
profiler_limits=self.mm_limits, profiler_limits=self.mm_limits,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment