From 951445a52df030050c9a3ed72d612d7e807ba368 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 2 Dec 2025 12:16:37 +0000
Subject: [PATCH 001/258] Remove default values from `InitVar`s so that they're
 not stored (#29859)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 benchmarks/benchmark_ngram_proposer.py        |  5 ++-
 tests/compile/test_fusion_attn.py             | 15 ++++---
 tests/lora/test_worker.py                     | 25 +++++++----
 tests/test_config.py                          | 13 ++++++
 tests/v1/attention/utils.py                   |  2 +
 tests/v1/core/test_kv_cache_utils.py          | 11 ++++-
 tests/v1/core/test_scheduler.py               | 13 +++---
 tests/v1/core/utils.py                        | 15 +++----
 tests/v1/cudagraph/test_cudagraph_dispatch.py |  4 +-
 tests/v1/engine/test_engine_core.py           | 13 +++---
 tests/v1/kv_connector/unit/utils.py           | 13 +++---
 tests/v1/spec_decode/test_eagle.py            |  5 ++-
 tests/v1/spec_decode/test_mtp.py              |  5 ++-
 tests/v1/tpu/worker/test_tpu_model_runner.py  | 11 ++---
 tests/v1/worker/test_gpu_model_runner.py      | 20 +++++----
 vllm/config/scheduler.py                      | 42 +++++++++++--------
 vllm/config/vllm.py                           |  4 +-
 17 files changed, 139 insertions(+), 77 deletions(-)

diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index dedb564ff..cac401456 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )
 
     # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index dbe12dc5d..4d213e030 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -318,13 +318,18 @@ def test_attention_quant_pattern(
     torch.set_default_dtype(dtype)
     torch.manual_seed(42)
 
+    model_config = ModelConfig(
+        model=model_name,
+        max_model_len=2048,
+        dtype=dtype,
+    )
     vllm_config = VllmConfig(
-        model_config=ModelConfig(
-            model=model_name,
-            max_model_len=2048,
-            dtype=dtype,
+        model_config=model_config,
+        scheduler_config=SchedulerConfig(
+            max_num_seqs=1024,
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
         ),
-        scheduler_config=SchedulerConfig(max_num_seqs=1024),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops_list,
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index b163559a9..54059ec56 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -33,14 +33,16 @@ def test_worker_apply_lora(qwen3_lora_files):
             lora_requests, lora_mapping
         )
 
+    model_config = ModelConfig(
+        MODEL_PATH,
+        seed=0,
+        dtype="float16",
+        max_model_len=127,
+        enforce_eager=True,
+    )
+
     vllm_config = VllmConfig(
-        model_config=ModelConfig(
-            MODEL_PATH,
-            seed=0,
-            dtype="float16",
-            max_model_len=127,
-            enforce_eager=True,
-        ),
+        model_config=model_config,
         load_config=LoadConfig(
             download_dir=None,
             load_format="dummy",
@@ -50,7 +52,14 @@ def test_worker_apply_lora(qwen3_lora_files):
             tensor_parallel_size=1,
             data_parallel_size=1,
         ),
-        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+            runner_type="generate",
+            max_num_batched_tokens=32,
+            max_num_seqs=32,
+            max_num_partial_prefills=32,
+        ),
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
diff --git a/tests/test_config.py b/tests/test_config.py
index 76e0d9442..b7ed68fea 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -6,12 +6,14 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
 from unittest.mock import patch
 
 import pytest
+from pydantic import ValidationError
 
 from vllm.compilation.backends import VllmBackend
 from vllm.config import (
     CompilationConfig,
     ModelConfig,
     PoolerConfig,
+    SchedulerConfig,
     VllmConfig,
     update_config,
 )
@@ -1095,3 +1097,14 @@ def test_vllm_config_explicit_overrides():
     # Other fields should still use defaults
     assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
+
+
+def test_scheduler_config_init():
+    with pytest.raises(ValidationError):
+        # Positional InitVars missing
+        # (InitVars cannot have defaults otherwise they will become attributes)
+        SchedulerConfig()
+
+    with pytest.raises(AttributeError):
+        # InitVar does not become an attribute
+        print(SchedulerConfig.default_factory().max_model_len)
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index df3d53332..6cab129c1 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -185,6 +185,8 @@ def create_vllm_config(
         max_num_seqs=max_num_seqs,
         max_num_batched_tokens=max_num_batched_tokens,
         enable_chunked_prefill=enable_chunked_prefill,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
 
     device_config = DeviceConfig()
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 58a7a2692..fd5cf6d3e 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1128,7 +1128,11 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
         dtype="float16",
         max_model_len=max_model_len,
     )
-    scheduler_config = SchedulerConfig(max_num_batched_tokens=32768)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens=32768,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
 
     vllm_config = VllmConfig(
         model_config=model_config,
@@ -1163,7 +1167,10 @@ def test_get_max_concurrency_for_kv_cache_config():
         max_model_len=max_model_len,
     )
     scheduler_config = SchedulerConfig(
-        max_num_batched_tokens=1024, enable_chunked_prefill=True
+        max_num_batched_tokens=1024,
+        enable_chunked_prefill=True,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
 
     vllm_config = VllmConfig(
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 0051c11d1..c6c4a5085 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1508,6 +1508,12 @@ def create_scheduler_with_priority(
     Returns:
       {class}`Scheduler` instance with priority scheduling
     """
+    model_config = ModelConfig(
+        model=model,
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+    )
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -1517,14 +1523,9 @@ def create_scheduler_with_priority(
         long_prefill_token_threshold=long_prefill_token_threshold,
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
+        is_encoder_decoder=model_config.is_encoder_decoder,
         policy="priority",  # Enable priority scheduling
     )
-    model_config = ModelConfig(
-        model=model,
-        trust_remote_code=True,
-        dtype="float16",
-        seed=42,
-    )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
         block_size=block_size,
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index 7537c7a60..f5ba613d3 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -69,6 +69,13 @@ def create_scheduler(
     Returns:
       {class}`Scheduler` instance
     """
+    model_config = ModelConfig(
+        model=model,
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+        skip_tokenizer_init=skip_tokenizer_init,
+    )
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -79,13 +86,7 @@ def create_scheduler(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=enable_chunked_prefill,
         async_scheduling=async_scheduling,
-    )
-    model_config = ModelConfig(
-        model=model,
-        trust_remote_code=True,
-        dtype="float16",
-        seed=42,
-        skip_tokenizer_init=skip_tokenizer_init,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index 314e7094e..b86534d3d 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -40,7 +40,9 @@ def _create_vllm_config(
 ) -> MagicMock:
     mock_config = MagicMock(spec=VllmConfig)
     mock_config.compilation_config = compilation_config
-    mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs)
+    mock_config.scheduler_config = SchedulerConfig.default_factory(
+        max_num_seqs=max_num_seqs,
+    )
     mock_config.parallel_config = ParallelConfig()
     mock_config.speculative_config = None  # No speculative decoding
     if not lora_config:
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 3ba8ab26f..48be8c15a 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -484,12 +484,6 @@ def test_encoder_instance_zero_kv_cache(
     vision encoder, so they don't need KV cache for text generation.
     """
     # Form vllm config
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-        disable_hybrid_kv_cache_manager=True,
-    )
     model_config = ModelConfig(
         model="llava-hf/llava-1.5-7b-hf",  # Multimodal model
         enforce_eager=True,
@@ -497,6 +491,13 @@ def test_encoder_instance_zero_kv_cache(
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        disable_hybrid_kv_cache_manager=True,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=16,
         gpu_memory_utilization=gpu_memory_utilization,
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index f35f91bb3..98f1f4492 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -92,18 +92,19 @@ def create_vllm_config(
     enable_permute_local_kv: bool = False,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-    )
     model_config = ModelConfig(
         model=model,
         trust_remote_code=True,
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_model_len=max_model_len,
+        enable_chunked_prefill=enable_chunked_prefill,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
         block_size=block_size,
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 9436ab471..616e57de3 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -66,7 +66,10 @@ def _create_proposer(
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )
 
     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py
index c5c0491ab..3b8813ceb 100644
--- a/tests/v1/spec_decode/test_mtp.py
+++ b/tests/v1/spec_decode/test_mtp.py
@@ -51,7 +51,10 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )
 
     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 7b3a07b4e..cfc06666e 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -26,16 +26,17 @@ from vllm.v1.worker.tpu_model_runner import (
 
 
 def get_vllm_config():
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-    )
     model_config = ModelConfig(
         model="facebook/opt-125m",
         dtype="bfloat16",  # TPUs typically use bfloat16
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=16,
         gpu_memory_utilization=0.9,
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 89669ee8b..0439bef12 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -79,16 +79,17 @@ def initialize_kv_cache(runner: GPUModelRunner):
 
 
 def get_vllm_config():
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-    )
     model_config = ModelConfig(
         model="facebook/opt-125m",
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=BLOCK_SIZE,
         gpu_memory_utilization=0.9,
@@ -784,14 +785,15 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     initialize_model_parallel(tensor_model_parallel_size=1)
     torch.set_default_dtype(torch.float16)
 
+    model_config = ModelConfig(
+        model="ibm-granite/granite-4.0-tiny-preview",
+        dtype="float16",
+    )
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
         max_model_len=512,
-    )
-    model_config = ModelConfig(
-        model="ibm-granite/granite-4.0-tiny-preview",
-        dtype="float16",
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     cache_config = CacheConfig(
         block_size=BLOCK_SIZE,
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 1e089b42c..8da3ae538 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -28,6 +28,19 @@ SchedulerPolicy = Literal["fcfs", "priority"]
 class SchedulerConfig:
     """Scheduler configuration."""
 
+    max_model_len: InitVar[int]
+    """Maximum length of a sequence (including prompt and generated text).
+
+    Note: This is stored in the ModelConfig, and is used only here to
+    provide fallbacks and validate other attributes."""
+
+    is_encoder_decoder: InitVar[bool]
+    """True if the model is an encoder-decoder model.
+
+    Note: This is stored in the ModelConfig, and is used only here to
+    disable chunked prefill and prefix caching for encoder-decoder models.
+    """
+
     DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
     DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128
 
@@ -73,19 +86,6 @@ class SchedulerConfig:
     is_multimodal_model: bool = False
     """True if the model is multimodal."""
 
-    max_model_len: InitVar[int] = 8192
-    """Maximum length of a sequence (including prompt and generated text).
-
-    Note: This is stored in the ModelConfig, and is used only here to
-    provide fallbacks and validate other attributes."""
-
-    is_encoder_decoder: InitVar[bool] = False
-    """True if the model is an encoder-decoder model.
-
-    Note: This is stored in the ModelConfig, and is used only here to
-    disable chunked prefill and prefix caching for encoder-decoder models.
-    """
-
     # TODO (ywang96): Make this configurable.
     max_num_encoder_input_tokens: int = Field(init=False)
     """Multimodal encoder compute budget, only used in V1.
@@ -141,6 +141,17 @@ class SchedulerConfig:
     while a larger value (e.g., 10) reduces host overhead and may increase throughput
     by batching multiple tokens before sending."""
 
+    @staticmethod
+    def default_factory(**kwargs):
+        """
+        Factory method to create `SchedulerConfig` with default values for `InitVar`s.
+        """
+        if "max_model_len" not in kwargs:
+            kwargs["max_model_len"] = 8192
+        if "is_encoder_decoder" not in kwargs:
+            kwargs["is_encoder_decoder"] = False
+        return SchedulerConfig(**kwargs)
+
     def get_scheduler_cls(self) -> type["SchedulerInterface"]:
         if self.scheduler_cls is None:
             if self.async_scheduling:
@@ -284,8 +295,3 @@ class SchedulerConfig:
             )
 
         return self
-
-    def __getattribute__(self, name: str) -> Any:
-        if name == "max_model_len" or name == "is_encoder_decoder":
-            raise AttributeError(f"{name} is an init-only parameter. ")
-        return object.__getattribute__(self, name)
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 615b1f848..5b3a9c437 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -170,7 +170,9 @@ class VllmConfig:
     """Cache configuration."""
     parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
     """Parallel configuration."""
-    scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig)
+    scheduler_config: SchedulerConfig = Field(
+        default_factory=SchedulerConfig.default_factory,
+    )
     """Scheduler configuration."""
     device_config: DeviceConfig = Field(default_factory=DeviceConfig)
     """Device configuration."""
-- 
GitLab


From 68ffbca7e462cfa6a32b46dabc9a604c7c1b918d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 2 Dec 2025 20:30:40 +0800
Subject: [PATCH 002/258] [Chore] Use `tokenizer.encode` and `tokenizer.decode`
 directly (#29851)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../multimodal/processing/test_common.py      | 14 ++++++-----
 .../multimodal/processing/test_llama4.py      |  3 +--
 vllm/entrypoints/openai/speech_to_text.py     |  2 +-
 vllm/entrypoints/renderer.py                  |  4 +--
 vllm/entrypoints/score_utils.py               |  2 +-
 .../model_executor/models/nano_nemotron_vl.py | 25 ++++++++-----------
 .../models/qwen2_5_omni_thinker.py            |  3 +--
 vllm/multimodal/processing.py                 | 19 ++++++--------
 vllm/transformers_utils/tokenizer.py          |  4 +++
 9 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 90158a028..8ef1fba8d 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -22,8 +22,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
-from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
-from vllm.transformers_utils.tokenizer import encode_tokens
+from vllm.tokenizers import (
+    MistralTokenizer,
+    TokenizerLike,
+    cached_tokenizer_from_config,
+)
 
 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import (
@@ -151,7 +154,7 @@ def get_text_token_prompts(
     mm_data: MultiModalDataDict,
 ):
     dummy_inputs = processor.dummy_inputs
-    tokenizer = processor.info.get_tokenizer()
+    tokenizer: TokenizerLike = processor.info.get_tokenizer()
     model_config = processor.info.ctx.model_config
 
     model_type = model_config.hf_config.model_type
@@ -188,10 +191,9 @@ def get_text_token_prompts(
         assert isinstance(inputs.prompt, str)
 
         text_prompt = inputs.prompt
-        token_prompt = encode_tokens(
-            tokenizer,
+        token_prompt = tokenizer.encode(
             text_prompt,
-            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
+            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
         )
 
     return text_prompt, token_prompt
diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py
index 4c0791ea3..b73246b68 100644
--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -5,7 +5,6 @@
 import pytest
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.transformers_utils.tokenizer import encode_tokens
 
 from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -48,7 +47,7 @@ def test_processor_override(
         ]
     }
     if tokenized_prompt:
-        prompt = encode_tokens(tokenizer, prompt)
+        prompt = tokenizer.encode(prompt)
 
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
     mm_data = processed_inputs["mm_kwargs"].get_data()
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index b34446d32..cea9924eb 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -37,7 +37,7 @@ from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsTranscription
 from vllm.outputs import RequestOutput
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.utils.import_utils import PlaceholderModule
 
 try:
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index 10b90bbbb..f31b309b8 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -33,7 +33,7 @@ class RenderConfig:
     `0` yields an empty list (and skips embeds).
     `-1` maps to `model_config.max_model_len`."""
 
-    add_special_tokens: bool | None = True
+    add_special_tokens: bool = True
     """Whether to add model-specific special tokens during tokenization."""
 
     cache_salt: str | None = None
@@ -315,7 +315,7 @@ class CompletionRenderer(BaseRenderer):
         text: str,
         max_length: int | None,
         truncate_prompt_tokens: int | None,
-        add_special_tokens: bool | None,
+        add_special_tokens: bool,
         cache_salt: str | None,
     ) -> EngineTokensPrompt:
         """Tokenize text input asynchronously."""
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 602f59ac0..8819c85af 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -19,7 +19,7 @@ from vllm.inputs import TokensPrompt
 from vllm.model_executor.models.interfaces import supports_score_template
 from vllm.multimodal.inputs import MultiModalDataDict
 from vllm.outputs import PoolingRequestOutput
-from vllm.transformers_utils.tokenizer import TokenizerLike
+from vllm.tokenizers import TokenizerLike
 
 ScoreContentPartParam: TypeAlias = (
     ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 0f86a1775..891a9ce08 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -75,7 +75,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
 from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.tokenizer import encode_tokens
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .utils import _merge_multimodal_embeddings
@@ -454,14 +453,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
 
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
-        self._img_start_token_ids = encode_tokens(
-            tokenizer, IMG_START, add_special_tokens=False
+        self._img_start_token_ids = tokenizer.encode(
+            IMG_START, add_special_tokens=False
         )
-        self._img_end_token_ids = encode_tokens(
-            tokenizer, IMG_END, add_special_tokens=False
-        )
-        self._img_context_token_ids = encode_tokens(
-            tokenizer, IMG_CONTEXT, add_special_tokens=False
+        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
+        self._img_context_token_ids = tokenizer.encode(
+            IMG_CONTEXT, add_special_tokens=False
         )
 
     @property
@@ -1179,14 +1176,12 @@ class NemotronH_Nano_VL_V2(
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
         tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
-        self._img_start_token_ids = encode_tokens(
-            tokenizer, IMG_START, add_special_tokens=False
-        )
-        self._img_end_token_ids = encode_tokens(
-            tokenizer, IMG_END, add_special_tokens=False
+        self._img_start_token_ids = tokenizer.encode(
+            IMG_START, add_special_tokens=False
         )
-        self._img_context_token_ids = encode_tokens(
-            tokenizer, IMG_CONTEXT, add_special_tokens=False
+        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
+        self._img_context_token_ids = tokenizer.encode(
+            IMG_CONTEXT, add_special_tokens=False
         )
 
     def pixel_shuffle(self, x, scale_factor=0.5):
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 7506ee865..1ce0fb4e4 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -88,7 +88,6 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import encode_tokens
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (
@@ -591,7 +590,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
                     tokenization_kwargs=tokenization_kwargs,
                 )
             tokenizer = self.info.get_tokenizer()
-            prompt_ids = encode_tokens(tokenizer, prompt)
+            prompt_ids = tokenizer.encode(prompt)
         else:
             prompt_ids = self._apply_hf_processor_tokens_only(prompt)
 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 2f651bd71..f241e79cf 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -25,7 +25,6 @@ from typing_extensions import TypeVar, assert_never
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike
 from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens
 from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 from vllm.utils.jsontree import JSONTree, json_map_leaves
@@ -80,9 +79,9 @@ def _cached_encode(
     tokenizer: TokenizerLike,
     text: str,
     *,
-    add_special_tokens: bool | None = None,
+    add_special_tokens: bool = True,
 ) -> list[int]:
-    return encode_tokens(tokenizer, text, add_special_tokens=add_special_tokens)
+    return tokenizer.encode(text, add_special_tokens=add_special_tokens)
 
 
 @lru_cache(maxsize=2048)
@@ -90,11 +89,9 @@ def _cached_decode(
     tokenizer: TokenizerLike,
     token_ids: tuple[int, ...],
     *,
-    skip_special_tokens: bool | None = None,
+    skip_special_tokens: bool = False,
 ) -> str:
-    return decode_tokens(
-        tokenizer, list(token_ids), skip_special_tokens=skip_special_tokens
-    )
+    return tokenizer.decode(list(token_ids), skip_special_tokens=skip_special_tokens)
 
 
 def _seq2text(
@@ -110,7 +107,7 @@ def _seq2text(
         raise ValueError("You cannot decode tokens when `skip_tokenizer_init=True`")
 
     if not use_cache:
-        return decode_tokens(tokenizer, seq)
+        return tokenizer.decode(seq)
 
     return _cached_decode(tokenizer, tuple(seq))
 
@@ -126,7 +123,7 @@ def _seq2tokens(
             raise ValueError("You cannot encode text when `skip_tokenizer_init=True`")
 
         if not use_cache:
-            return encode_tokens(tokenizer, seq, add_special_tokens=False)
+            return tokenizer.encode(seq, add_special_tokens=False)
 
         return _cached_encode(tokenizer, seq, add_special_tokens=False)
 
@@ -2198,8 +2195,8 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
         tokenizer = self.info.get_tokenizer()
         decoder_prompt_raw = self.create_decoder_prompt(prompt, mm_data)
         if isinstance(decoder_prompt_raw, str):
-            decoder_prompt_ids = encode_tokens(
-                tokenizer, decoder_prompt_raw, add_special_tokens=False
+            decoder_prompt_ids = tokenizer.encode(
+                decoder_prompt_raw, add_special_tokens=False
             )
         else:
             decoder_prompt_ids = decoder_prompt_raw
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 617d16779..32999903b 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -4,6 +4,8 @@
 import warnings
 from typing import Any
 
+from typing_extensions import deprecated
+
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike
 
@@ -73,6 +75,7 @@ def __getattr__(name: str):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
+@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
 def decode_tokens(
     tokenizer: TokenizerLike,
     token_ids: list[int],
@@ -94,6 +97,7 @@ def decode_tokens(
     return tokenizer.decode(token_ids, **kw_args)
 
 
+@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
 def encode_tokens(
     tokenizer: TokenizerLike,
     text: str,
-- 
GitLab


From 60c3d413afccab6a1f9a18cf3cd1fe11019c1040 Mon Sep 17 00:00:00 2001
From: ImaGoodFella <31959740+ImaGoodFella@users.noreply.github.com>
Date: Tue, 2 Dec 2025 14:49:02 +0100
Subject: [PATCH 003/258] [Multimodal][Core] Optimize multimodal preprocessing
 cache by hashing image bytes instead of pixel values (#29621)

Signed-off-by: Rahul Steiger <rasteiger@ethz.ch>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 tests/conftest.py                             |  7 ++++-
 tests/entrypoints/openai/test_vision.py       |  7 ++++-
 .../pooling/embed/test_online_vision.py       |  7 ++++-
 vllm/multimodal/base.py                       | 28 +++++++++++++++++++
 vllm/multimodal/hasher.py                     | 24 ++++++++++++----
 vllm/multimodal/image.py                      | 24 +++++++++-------
 vllm/multimodal/parse.py                      | 15 ++++++++++
 vllm/multimodal/processing.py                 |  2 +-
 8 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 53bbaddd0..b20c9efef 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -59,6 +59,7 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
+from vllm.multimodal.base import MediaWithBytes
 from vllm.multimodal.utils import fetch_image
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
@@ -1389,7 +1390,11 @@ class LocalAssetServer:
         return f"{self.base_url}/{name}"
 
     def get_image_asset(self, name: str) -> Image.Image:
-        return fetch_image(self.url_for(name))
+        image = fetch_image(self.url_for(name))
+        # Unwrap MediaWithBytes if present
+        if isinstance(image, MediaWithBytes):
+            image = image.media
+        return image
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index d83c6726e..ae8860ee8 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -8,6 +8,7 @@ import pytest
 import pytest_asyncio
 from transformers import AutoProcessor
 
+from vllm.multimodal.base import MediaWithBytes
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
 from ...utils import RemoteOpenAIServer
@@ -111,7 +112,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
             "content": f"{placeholder}{content}",
         }
     ]
-    images = [fetch_image(image_url)]
+    image = fetch_image(image_url)
+    # Unwrap MediaWithBytes if present
+    if isinstance(image, MediaWithBytes):
+        image = image.media
+    images = [image]
 
     prompt = processor.tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
diff --git a/tests/entrypoints/pooling/embed/test_online_vision.py b/tests/entrypoints/pooling/embed/test_online_vision.py
index 83e7048b9..eebbcdd2e 100644
--- a/tests/entrypoints/pooling/embed/test_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_online_vision.py
@@ -9,6 +9,7 @@ from transformers import AutoProcessor
 
 from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
+from vllm.multimodal.base import MediaWithBytes
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
@@ -62,7 +63,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
 
     placeholder = "<|image_1|> "
     prompt = f"{placeholder}{content}"
-    images = [fetch_image(image_url)]
+    image = fetch_image(image_url)
+    # Unwrap MediaWithBytes if present
+    if isinstance(image, MediaWithBytes):
+        image = image.media
+    images = [image]
     inputs = processor(prompt, images, return_tensors="pt")
     return inputs.input_ids.shape[1]
 
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index fef118a93..4a619fd30 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -2,12 +2,40 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Generic, TypeVar
 
+import numpy as np
+
 _T = TypeVar("_T")
 
 
+@dataclass
+class MediaWithBytes(Generic[_T]):
+    """
+    Wrapper that couples a media object with its original encoded bytes.
+
+    This ensures the raw bytes and media object remain synchronized,
+    preventing cache corruption from in-place modifications.
+
+    The wrapper delegates attribute access to the underlying media object,
+    making it behave transparently like the wrapped type (e.g., PIL.Image).
+    """
+
+    media: _T
+    original_bytes: bytes
+
+    def __array__(self, *args, **kwargs) -> np.ndarray:
+        """Allow np.array(obj) to return np.array(obj.media)."""
+        return np.array(self.media, *args, **kwargs)
+
+    def __getattr__(self, name: str):
+        """Delegate attribute access to the underlying media object."""
+        # This is only called when the attribute is not found on self
+        return getattr(self.media, name)
+
+
 class MediaIO(ABC, Generic[_T]):
     @abstractmethod
     def load_bytes(self, data: bytes) -> _T:
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index d0dcbb25f..cc50322fe 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -12,6 +12,8 @@ from PIL import Image
 
 from vllm.logger import init_logger
 
+from .base import MediaWithBytes
+
 logger = init_logger(__name__)
 
 
@@ -31,14 +33,26 @@ class MultiModalHasher:
             if Image.ExifTags.Base.ImageID in exif and isinstance(
                 exif[Image.ExifTags.Base.ImageID], uuid.UUID
             ):
-                # If the image has exif ImageID tag, use that
                 return (exif[Image.ExifTags.Base.ImageID].bytes,)
+
             data = {"mode": obj.mode, "data": np.asarray(obj)}
-            if obj.palette is not None:
-                data["palette"] = obj.palette.palette
-                if obj.palette.rawmode is not None:
-                    data["palette_rawmode"] = obj.palette.rawmode
+            palette = obj.palette
+            if palette is not None:
+                data["palette"] = palette.palette
+                if palette.rawmode is not None:
+                    data["palette_rawmode"] = palette.rawmode
+
             return cls.iter_item_to_bytes("image", data)
+
+        if isinstance(obj, MediaWithBytes) and isinstance(obj.media, Image.Image):
+            exif = obj.media.getexif()
+            if Image.ExifTags.Base.ImageID in exif and isinstance(
+                exif[Image.ExifTags.Base.ImageID], uuid.UUID
+            ):
+                return (exif[Image.ExifTags.Base.ImageID].bytes,)
+
+            return cls.iter_item_to_bytes("image", obj.original_bytes)
+
         if isinstance(obj, torch.Tensor):
             tensor_obj: torch.Tensor = obj.cpu()
             tensor_dtype = tensor_obj.dtype
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 21e8bef97..789421e9e 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -8,7 +8,7 @@ import pybase64
 import torch
 from PIL import Image
 
-from .base import MediaIO
+from .base import MediaIO, MediaWithBytes
 
 
 def rescale_image_size(
@@ -74,8 +74,12 @@ class ImageMediaIO(MediaIO[Image.Image]):
             )
         self.rgba_background_color = rgba_bg
 
-    def _convert_image_mode(self, image: Image.Image) -> Image.Image:
+    def _convert_image_mode(
+        self, image: Image.Image | MediaWithBytes[Image.Image]
+    ) -> Image.Image:
         """Convert image mode with custom background color."""
+        if isinstance(image, MediaWithBytes):
+            image = image.media
         if image.mode == self.image_mode:
             return image
         elif image.mode == "RGBA" and self.image_mode == "RGB":
@@ -83,18 +87,18 @@ class ImageMediaIO(MediaIO[Image.Image]):
         else:
             return convert_image_mode(image, self.image_mode)
 
-    def load_bytes(self, data: bytes) -> Image.Image:
+    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
         image = Image.open(BytesIO(data))
-        image.load()
-        return self._convert_image_mode(image)
+        return MediaWithBytes(self._convert_image_mode(image), data)
 
-    def load_base64(self, media_type: str, data: str) -> Image.Image:
+    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
         return self.load_bytes(pybase64.b64decode(data, validate=True))
 
-    def load_file(self, filepath: Path) -> Image.Image:
-        image = Image.open(filepath)
-        image.load()
-        return self._convert_image_mode(image)
+    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
+        with open(filepath, "rb") as f:
+            data = f.read()
+        image = Image.open(BytesIO(data))
+        return MediaWithBytes(self._convert_image_mode(image), data)
 
     def encode_base64(
         self,
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 810f29072..0d3b8289e 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -23,6 +23,7 @@ from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import LazyLoader
 
 from .audio import AudioResampler
+from .base import MediaWithBytes
 from .inputs import (
     AudioItem,
     HfAudioItem,
@@ -84,6 +85,12 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
         """Get all data items."""
         return [self.get(idx) for idx in range(self.get_count())]
 
+    def get_item_for_hash(self, index: int) -> object:
+        return self.get(index)
+
+    def get_all_items_for_hash(self) -> list[object]:
+        return [self.get_item_for_hash(idx) for idx in range(self.get_count())]
+
     @abstractmethod
     def get_processor_data(self) -> Mapping[str, object]:
         """Get the data to pass to the HF processor."""
@@ -98,10 +105,18 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
 class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
     """Base class for data items that are arranged in a list."""
 
+    def _unwrap(self, item: _T | MediaWithBytes[_T]) -> _T:
+        """Extract media from wrapper if present."""
+        return item.media if isinstance(item, MediaWithBytes) else item
+
     def get_count(self) -> int:
         return len(self.data)
 
     def get(self, index: int) -> _T:
+        return self._unwrap(self.data[index])
+
+    def get_item_for_hash(self, index: int) -> _T | MediaWithBytes[_T]:
+        # Return raw item for hashing (preserves original_bytes if present)
         return self.data[index]
 
     def get_processor_data(self) -> Mapping[str, object]:
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index f241e79cf..039077378 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1684,7 +1684,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
                 # For None entries, compute a hash; otherwise, use provided ID.
                 computed: list[str] = []
-                for i, item in enumerate(items):
+                for i, item in enumerate(items.get_all_items_for_hash()):
                     item_uuid = mm_uuids_per_modality[i]
 
                     # NOTE: Even if a item_uuid is provided, we still compute a
-- 
GitLab


From 51c57b51dd51d87715367850faae1da7a9cabaef Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Tue, 2 Dec 2025 10:52:18 -0500
Subject: [PATCH 004/258] [Bugfix] Fix DeepSeek R1 MTP weight loading (#29545)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
---
 vllm/model_executor/models/deepseek_mtp.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 6e23037b9..ca77b8322 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -346,11 +346,16 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
                     # Use expert_params_mapping to locate the destination
                     # param and delegate to its expert-aware weight_loader
                     # with expert_id.
+                    is_expert_weight = False
                     for mapping in expert_params_mapping:
                         param_name, weight_name, expert_id, shard_id = mapping
                         if weight_name not in chunk_name:
                             continue
 
+                        # Anyway, this is an expert weight and should not be
+                        # attempted to load as other weights later
+                        is_expert_weight = True
+
                         # Do not modify `name` since the loop may continue here
                         # Instead, create a new variable
                         name_mapped = chunk_name.replace(weight_name, param_name)
@@ -377,6 +382,12 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
                                 loaded_params.add(name_mapped)
                             break
                     else:
+                        if is_expert_weight:
+                            # We've checked that this is an expert weight
+                            # However it's not mapped locally to this rank
+                            # So we simply skip it
+                            continue
+
                         # Skip loading extra bias for GPTQ models.
                         if name.endswith(".bias") and name not in params_dict:
                             continue
-- 
GitLab


From 2eb4fe912916aea8998d085786df7abd7737e1f3 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <yuqi.wang@daocloud.io>
Date: Tue, 2 Dec 2025 23:54:28 +0800
Subject: [PATCH 005/258] [examples] Resettle pooling examples. (#29365)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml                 |  20 ++-
 .github/CODEOWNERS                            |   4 +-
 docs/.nav.yml                                 |   6 +-
 docs/design/io_processor_plugins.md           |   2 +-
 docs/mkdocs/hooks/generate_examples.py        | 154 ++++++++++--------
 docs/models/pooling_models.md                 |   4 +-
 docs/models/supported_models.md               |   4 +-
 docs/serving/openai_compatible_server.md      |  14 +-
 examples/offline_inference/pooling/README.md  |  57 -------
 examples/online_serving/pooling/README.md     |  97 -----------
 .../classify}/openai_classification_client.py |   0
 .../embed}/embed_jina_embeddings_v3.py        |   0
 .../embed}/embed_matryoshka_fy.py             |   0
 .../embedding_requests_base64_client.py       |   0
 .../embed}/embedding_requests_bytes_client.py |   0
 ...ai_chat_embedding_client_for_multimodal.py |   0
 .../embed}/openai_embedding_client.py         |   0
 .../openai_embedding_long_text/README.md      |   0
 .../openai_embedding_long_text/client.py      |   0
 .../openai_embedding_long_text/service.sh     |   0
 .../embed}/openai_embedding_matryoshka_fy.py  |   0
 .../plugin/prithvi_geospatial_mae_client.py}  |   0
 .../prithvi_geospatial_mae_io_processor.py    |   0
 .../plugin/prithvi_geospatial_mae_offline.py} |   0
 .../pooling/openai_pooling_client.py          |   0
 .../pooling}/vision_language_pooling.py       |   0
 .../score}/cohere_rerank_client.py            |   0
 .../score}/convert_model_to_seq_cls.py        |   0
 .../score}/jinaai_rerank_client.py            |   0
 .../score}/openai_cross_encoder_score.py      |   0
 ...enai_cross_encoder_score_for_multimodal.py |   0
 .../score}/qwen3_reranker.py                  |   0
 .../pooling => pooling/token_classify}/ner.py |   0
 .../token_classify}/ner_client.py             |   0
 .../token_embed}/multi_vector_retrieval.py    |   0
 .../multi_vector_retrieval_client.py          |   0
 36 files changed, 111 insertions(+), 251 deletions(-)
 delete mode 100644 examples/offline_inference/pooling/README.md
 delete mode 100644 examples/online_serving/pooling/README.md
 rename examples/{online_serving/pooling => pooling/classify}/openai_classification_client.py (100%)
 rename examples/{offline_inference/pooling => pooling/embed}/embed_jina_embeddings_v3.py (100%)
 rename examples/{offline_inference/pooling => pooling/embed}/embed_matryoshka_fy.py (100%)
 rename examples/{online_serving/pooling => pooling/embed}/embedding_requests_base64_client.py (100%)
 rename examples/{online_serving/pooling => pooling/embed}/embedding_requests_bytes_client.py (100%)
 rename examples/{online_serving/pooling => pooling/embed}/openai_chat_embedding_client_for_multimodal.py (100%)
 rename examples/{online_serving/pooling => pooling/embed}/openai_embedding_client.py (100%)
 rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/README.md (100%)
 rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/client.py (100%)
 rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/service.sh (100%)
 rename examples/{online_serving/pooling => pooling/embed}/openai_embedding_matryoshka_fy.py (100%)
 rename examples/{online_serving/pooling/prithvi_geospatial_mae.py => pooling/plugin/prithvi_geospatial_mae_client.py} (100%)
 rename examples/{offline_inference/pooling => pooling/plugin}/prithvi_geospatial_mae_io_processor.py (100%)
 rename examples/{offline_inference/pooling/prithvi_geospatial_mae.py => pooling/plugin/prithvi_geospatial_mae_offline.py} (100%)
 rename examples/{online_serving => pooling}/pooling/openai_pooling_client.py (100%)
 rename examples/{offline_inference => pooling/pooling}/vision_language_pooling.py (100%)
 rename examples/{online_serving/pooling => pooling/score}/cohere_rerank_client.py (100%)
 rename examples/{offline_inference/pooling => pooling/score}/convert_model_to_seq_cls.py (100%)
 rename examples/{online_serving/pooling => pooling/score}/jinaai_rerank_client.py (100%)
 rename examples/{online_serving/pooling => pooling/score}/openai_cross_encoder_score.py (100%)
 rename examples/{online_serving/pooling => pooling/score}/openai_cross_encoder_score_for_multimodal.py (100%)
 rename examples/{offline_inference/pooling => pooling/score}/qwen3_reranker.py (100%)
 rename examples/{offline_inference/pooling => pooling/token_classify}/ner.py (100%)
 rename examples/{online_serving/pooling => pooling/token_classify}/ner_client.py (100%)
 rename examples/{offline_inference/pooling => pooling/token_embed}/multi_vector_retrieval.py (100%)
 rename examples/{online_serving/pooling => pooling/token_embed}/multi_vector_retrieval_client.py (100%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 9f2107fb1..52c848c78 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -390,20 +390,24 @@ steps:
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
+    # for basic
+    - python3 offline_inference/basic/chat.py
     - python3 offline_inference/basic/generate.py --model facebook/opt-125m
     - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    # for multi-modal models
     - python3 offline_inference/audio_language.py --seed 0
     - python3 offline_inference/vision_language.py --seed 0
-    - python3 offline_inference/vision_language_pooling.py --seed 0
     - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
     # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
     - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ecb10d1a4..d6447649c 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -146,10 +146,10 @@ mkdocs.yaml @hmellor
 /requirements/kv_connectors.txt @NickLucche
 
 # Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
-/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
+/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
diff --git a/docs/.nav.yml b/docs/.nav.yml
index d30c0f12e..aa98ad52b 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -5,11 +5,7 @@ nav:
     - Getting Started:
       - getting_started/quickstart.md
       - getting_started/installation
-    - Examples:
-      - examples/README.md
-      - Offline Inference: examples/offline_inference
-      - Online Serving: examples/online_serving
-      - Others: examples/others
+      - Examples: examples
     - General:
       - usage/v1_guide.md
       - usage/*
diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md
index b4a30cda3..5a86940fa 100644
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen
 The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
 The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py).
 
-An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples.
+An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
 
 ## Using an IO Processor plugin
 
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 6e4fb039e..e886a91e6 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 import logging
-from dataclasses import dataclass, field
+from dataclasses import dataclass
+from functools import cached_property
 from pathlib import Path
 from typing import Literal
 
@@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples"
 EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
 
 
-def fix_case(text: str) -> str:
+def title(text: str) -> str:
+    # Default title case
+    text = text.replace("_", " ").replace("/", " - ").title()
+    # Custom substitutions
     subs = {
+        "io": "IO",
         "api": "API",
         "cli": "CLI",
         "cpu": "CPU",
         "llm": "LLM",
         "mae": "MAE",
+        "ner": "NER",
         "tpu": "TPU",
         "gguf": "GGUF",
         "lora": "LoRA",
@@ -48,71 +54,65 @@ class Example:
     Attributes:
         path (Path): The path to the main directory or file.
         category (str): The category of the document.
-        main_file (Path): The main file in the directory.
-        other_files (list[Path]): list of other files in the directory.
-        title (str): The title of the document.
+
+    Properties::
+        main_file() -> Path | None: Determines the main file in the given path.
+        other_files() -> list[Path]: Determines other files in the directory excluding
+        the main file.
+        title() -> str: Determines the title of the document.
 
     Methods:
-        __post_init__(): Initializes the main_file, other_files, and title attributes.
-        determine_main_file() -> Path: Determines the main file in the given path.
-        determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
-        determine_title() -> str: Determines the title of the document.
         generate() -> str: Generates the documentation content.
-    """  # noqa: E501
+    """
 
     path: Path
-    category: str = None
-    main_file: Path = field(init=False)
-    other_files: list[Path] = field(init=False)
-    title: str = field(init=False)
-
-    def __post_init__(self):
-        self.main_file = self.determine_main_file()
-        self.other_files = self.determine_other_files()
-        self.title = self.determine_title()
-
-    @property
-    def is_code(self) -> bool:
-        return self.main_file.suffix != ".md"
-
-    def determine_main_file(self) -> Path:
-        """
-        Determines the main file in the given path.
-        If the path is a file, it returns the path itself. Otherwise, it searches
-        for Markdown files (*.md) in the directory and returns the first one found.
-        Returns:
-            Path: The main file path, either the original path if it's a file or the first
-            Markdown file found in the directory.
-        Raises:
-            IndexError: If no Markdown files are found in the directory.
-        """  # noqa: E501
-        return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
-
-    def determine_other_files(self) -> list[Path]:
-        """
-        Determine other files in the directory excluding the main file.
+    category: str
 
-        This method checks if the given path is a file. If it is, it returns an empty list.
-        Otherwise, it recursively searches through the directory and returns a list of all
-        files that are not the main file.
+    @cached_property
+    def main_file(self) -> Path | None:
+        """Determines the main file in the given path.
 
-        Returns:
-            list[Path]: A list of Path objects representing the other files in the directory.
-        """  # noqa: E501
+        If path is a file, it returns the path itself. If path is a directory, it
+        searches for Markdown files (*.md) in the directory and returns the first one
+        found. If no Markdown files are found, it returns None."""
+        # Single file example
+        if self.path.is_file():
+            return self.path
+        # Multi file example with a README
+        if md_paths := list(self.path.glob("*.md")):
+            return md_paths[0]
+        # Multi file example without a README
+        return None
+
+    @cached_property
+    def other_files(self) -> list[Path]:
+        """Determine other files in the directory excluding the main file.
+
+        If path is a file, it returns an empty list. Otherwise, it returns every file
+        in the directory except the main file in a list."""
+        # Single file example
         if self.path.is_file():
             return []
+        # Multi file example
         is_other_file = lambda file: file.is_file() and file != self.main_file
-        return [file for file in self.path.rglob("*") if is_other_file(file)]
-
-    def determine_title(self) -> str:
-        if not self.is_code:
-            # Specify encoding for building on Windows
-            with open(self.main_file, encoding="utf-8") as f:
-                first_line = f.readline().strip()
-            match = re.match(r"^#\s+(?P<title>.+)$", first_line)
-            if match:
-                return match.group("title")
-        return fix_case(self.path.stem.replace("_", " ").title())
+        return sorted(file for file in self.path.rglob("*") if is_other_file(file))
+
+    @cached_property
+    def is_code(self) -> bool:
+        return self.main_file is not None and self.main_file.suffix != ".md"
+
+    @cached_property
+    def title(self) -> str:
+        # Generate title from filename if no main md file found
+        if self.main_file is None or self.is_code:
+            return title(self.path.stem)
+        # Specify encoding for building on Windows
+        with open(self.main_file, encoding="utf-8") as f:
+            first_line = f.readline().strip()
+        match = re.match(r"^#\s+(?P<title>.+)$", first_line)
+        if match:
+            return match.group("title")
+        raise ValueError(f"Title not found in {self.main_file}")
 
     def fix_relative_links(self, content: str) -> str:
         """
@@ -156,24 +156,35 @@ class Example:
         # included files containing code fences too
         code_fence = "``````"
 
-        if self.is_code:
-            content += (
-                f"{code_fence}{self.main_file.suffix[1:]}\n"
-                f'--8<-- "{self.main_file}"\n'
-                f"{code_fence}\n"
-            )
+        if self.main_file is not None:
+            # Single file example or multi file example with a README
+            if self.is_code:
+                content += (
+                    f"{code_fence}{self.main_file.suffix[1:]}\n"
+                    f'--8<-- "{self.main_file}"\n'
+                    f"{code_fence}\n"
+                )
+            else:
+                with open(self.main_file, encoding="utf-8") as f:
+                    # Skip the title from md snippets as it's been included above
+                    main_content = f.readlines()[1:]
+                content += self.fix_relative_links("".join(main_content))
+            content += "\n"
         else:
-            with open(self.main_file) as f:
-                # Skip the title from md snippets as it's been included above
-                main_content = f.readlines()[1:]
-            content += self.fix_relative_links("".join(main_content))
-        content += "\n"
+            # Multi file example without a README
+            for file in self.other_files:
+                file_title = title(str(file.relative_to(self.path).with_suffix("")))
+                content += f"## {file_title}\n\n"
+                content += (
+                    f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n'
+                )
+            return content
 
         if not self.other_files:
             return content
 
         content += "## Example materials\n\n"
-        for file in sorted(self.other_files):
+        for file in self.other_files:
             content += f'??? abstract "{file.relative_to(self.path)}"\n'
             if file.suffix != ".md":
                 content += f"    {code_fence}{file.suffix[1:]}\n"
@@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     glob_patterns = ["*.py", "*.md", "*.sh"]
     # Find categorised examples
     for category in categories:
+        logger.info("Processing category: %s", category.stem)
         globs = [category.glob(pattern) for pattern in glob_patterns]
         for path in itertools.chain(*globs):
             examples.append(Example(path, category.stem))
         # Find examples in subdirectories
-        for path in category.glob("*/*.md"):
+        globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns]
+        for path in itertools.chain(*globs):
             examples.append(Example(path.parent, category.stem))
 
     # Generate the example documentation
@@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         with open(doc_path, "w+", encoding="utf-8") as f:
             f.write(example.generate())
         logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
+    logger.info("Total examples generated: %d", len(examples))
diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index aca865f4b..e2d427e8a 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -274,7 +274,7 @@ outputs = llm.embed(
 print(outputs[0].outputs)
 ```
 
-A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py)
+A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py)
 
 ### Online Inference
 
@@ -304,7 +304,7 @@ Expected output:
 {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
 ```
 
-An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py)
+An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py)
 
 ## Deprecated Features
 
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 6ea2285b9..040107c11 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -568,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
     ```
 
 !!! note
-    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py).
+    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
 
     ```bash
     vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@@ -606,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
 | `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` |  |  |
 
 !!! note
-    Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py).
+    Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py).
 
 ## List of Multimodal Language Models
 
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 672663dc5..01453483a 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -234,7 +234,7 @@ The following extra parameters are supported:
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py)
+Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
 
 If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
 which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
@@ -335,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
         `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
         example below for details.
 
-Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py)
+Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py)
 
 #### Extra parameters
 
@@ -516,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
 
 The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
 
-Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py)
+Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py)
 
 ### Classification API
 
@@ -524,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
 
 We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
 
-Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py)
+Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py)
 
 #### Example Requests
 
@@ -640,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
 
 You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
-Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py)
+Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
 
 #### Single inference
 
@@ -821,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
         print("Scoring output:", response_json["data"][0]["score"])
         print("Scoring output:", response_json["data"][1]["score"])
         ```
-Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py)
+Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py)
 
 #### Extra parameters
 
@@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
 [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
 popular open-source tools.
 
-Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py)
+Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
 
 #### Example Request
 
diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md
deleted file mode 100644
index ad78be387..000000000
--- a/examples/offline_inference/pooling/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# Pooling models
-
-## Convert llm model to seq cls
-
-```bash
-# for BAAI/bge-reranker-v2-gemma
-# Caution: "Yes" and "yes" are two different tokens
-python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
-# for mxbai-rerank-v2
-python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
-# for Qwen3-Reranker
-python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
-```
-
-## Embed jina_embeddings_v3 usage
-
-Only text matching task is supported for now. See <https://github.com/vllm-project/vllm/pull/16120>
-
-```bash
-python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
-```
-
-## Embed matryoshka dimensions usage
-
-```bash
-python examples/offline_inference/pooling/embed_matryoshka_fy.py
-```
-
-## Multi vector retrieval usage
-
-```bash
-python examples/offline_inference/pooling/multi_vector_retrieval.py
-```
-
-## Named Entity Recognition (NER) usage
-
-```bash
-python examples/offline_inference/pooling/ner.py
-```
-
-## Prithvi Geospatial MAE usage
-
-```bash
-python examples/offline_inference/pooling/prithvi_geospatial_mae.py
-```
-
-## IO Processor Plugins for Prithvi Geospatial MAE
-
-```bash
-python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
-```
-
-## Qwen3 reranker usage
-
-```bash
-python examples/offline_inference/pooling/qwen3_reranker.py
-```
diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md
deleted file mode 100644
index b76ad21f0..000000000
--- a/examples/online_serving/pooling/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# Pooling models
-
-## Cohere rerank usage
-
-```bash
-# vllm serve BAAI/bge-reranker-base
-python examples/online_serving/pooling/cohere_rerank_client.py
-```
-
-## Embedding requests base64 encoding_format usage
-
-```bash
-# vllm serve intfloat/e5-small
-python examples/online_serving/pooling/embedding_requests_base64_client.py
-```
-
-## Embedding requests bytes encoding_format usage
-
-```bash
-# vllm serve intfloat/e5-small
-python examples/online_serving/pooling/embedding_requests_bytes_client.py
-```
-
-## Jinaai rerank usage
-
-```bash
-# vllm serve BAAI/bge-reranker-base
-python examples/online_serving/pooling/jinaai_rerank_client.py
-```
-
-## Multi vector retrieval usage
-
-```bash
-# vllm serve BAAI/bge-m3
-python examples/online_serving/pooling/multi_vector_retrieval_client.py
-```
-
-## Named Entity Recognition (NER) usage
-
-```bash
-# vllm serve boltuix/NeuroBERT-NER
-python examples/online_serving/pooling/ner_client.py
-```
-
-## OpenAI chat embedding for multimodal usage
-
-```bash
-python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
-```
-
-## OpenAI classification usage
-
-```bash
-# vllm serve jason9693/Qwen2.5-1.5B-apeach
-python examples/online_serving/pooling/openai_classification_client.py
-```
-
-## OpenAI cross_encoder score usage
-
-```bash
-# vllm serve BAAI/bge-reranker-v2-m3
-python examples/online_serving/pooling/openai_cross_encoder_score.py
-```
-
-## OpenAI cross_encoder score for multimodal usage
-
-```bash
-# vllm serve jinaai/jina-reranker-m0
-python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
-```
-
-## OpenAI embedding usage
-
-```bash
-# vllm serve intfloat/e5-small
-python examples/online_serving/pooling/openai_embedding_client.py
-```
-
-## OpenAI embedding matryoshka dimensions usage
-
-```bash
-# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
-python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
-```
-
-## OpenAI pooling usage
-
-```bash
-# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
-python examples/online_serving/pooling/openai_pooling_client.py
-```
-
-## Online Prithvi Geospatial MAE usage
-
-```bash
-python examples/online_serving/pooling/prithvi_geospatial_mae.py
-```
diff --git a/examples/online_serving/pooling/openai_classification_client.py b/examples/pooling/classify/openai_classification_client.py
similarity index 100%
rename from examples/online_serving/pooling/openai_classification_client.py
rename to examples/pooling/classify/openai_classification_client.py
diff --git a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py b/examples/pooling/embed/embed_jina_embeddings_v3.py
similarity index 100%
rename from examples/offline_inference/pooling/embed_jina_embeddings_v3.py
rename to examples/pooling/embed/embed_jina_embeddings_v3.py
diff --git a/examples/offline_inference/pooling/embed_matryoshka_fy.py b/examples/pooling/embed/embed_matryoshka_fy.py
similarity index 100%
rename from examples/offline_inference/pooling/embed_matryoshka_fy.py
rename to examples/pooling/embed/embed_matryoshka_fy.py
diff --git a/examples/online_serving/pooling/embedding_requests_base64_client.py b/examples/pooling/embed/embedding_requests_base64_client.py
similarity index 100%
rename from examples/online_serving/pooling/embedding_requests_base64_client.py
rename to examples/pooling/embed/embedding_requests_base64_client.py
diff --git a/examples/online_serving/pooling/embedding_requests_bytes_client.py b/examples/pooling/embed/embedding_requests_bytes_client.py
similarity index 100%
rename from examples/online_serving/pooling/embedding_requests_bytes_client.py
rename to examples/pooling/embed/embedding_requests_bytes_client.py
diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
similarity index 100%
rename from examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
rename to examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
diff --git a/examples/online_serving/pooling/openai_embedding_client.py b/examples/pooling/embed/openai_embedding_client.py
similarity index 100%
rename from examples/online_serving/pooling/openai_embedding_client.py
rename to examples/pooling/embed/openai_embedding_client.py
diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/pooling/embed/openai_embedding_long_text/README.md
similarity index 100%
rename from examples/online_serving/openai_embedding_long_text/README.md
rename to examples/pooling/embed/openai_embedding_long_text/README.md
diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/pooling/embed/openai_embedding_long_text/client.py
similarity index 100%
rename from examples/online_serving/openai_embedding_long_text/client.py
rename to examples/pooling/embed/openai_embedding_long_text/client.py
diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/pooling/embed/openai_embedding_long_text/service.sh
similarity index 100%
rename from examples/online_serving/openai_embedding_long_text/service.sh
rename to examples/pooling/embed/openai_embedding_long_text/service.sh
diff --git a/examples/online_serving/pooling/openai_embedding_matryoshka_fy.py b/examples/pooling/embed/openai_embedding_matryoshka_fy.py
similarity index 100%
rename from examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
rename to examples/pooling/embed/openai_embedding_matryoshka_fy.py
diff --git a/examples/online_serving/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py
similarity index 100%
rename from examples/online_serving/pooling/prithvi_geospatial_mae.py
rename to examples/pooling/plugin/prithvi_geospatial_mae_client.py
diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py b/examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py
similarity index 100%
rename from examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
rename to examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py
diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_offline.py
similarity index 100%
rename from examples/offline_inference/pooling/prithvi_geospatial_mae.py
rename to examples/pooling/plugin/prithvi_geospatial_mae_offline.py
diff --git a/examples/online_serving/pooling/openai_pooling_client.py b/examples/pooling/pooling/openai_pooling_client.py
similarity index 100%
rename from examples/online_serving/pooling/openai_pooling_client.py
rename to examples/pooling/pooling/openai_pooling_client.py
diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py
similarity index 100%
rename from examples/offline_inference/vision_language_pooling.py
rename to examples/pooling/pooling/vision_language_pooling.py
diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/pooling/score/cohere_rerank_client.py
similarity index 100%
rename from examples/online_serving/pooling/cohere_rerank_client.py
rename to examples/pooling/score/cohere_rerank_client.py
diff --git a/examples/offline_inference/pooling/convert_model_to_seq_cls.py b/examples/pooling/score/convert_model_to_seq_cls.py
similarity index 100%
rename from examples/offline_inference/pooling/convert_model_to_seq_cls.py
rename to examples/pooling/score/convert_model_to_seq_cls.py
diff --git a/examples/online_serving/pooling/jinaai_rerank_client.py b/examples/pooling/score/jinaai_rerank_client.py
similarity index 100%
rename from examples/online_serving/pooling/jinaai_rerank_client.py
rename to examples/pooling/score/jinaai_rerank_client.py
diff --git a/examples/online_serving/pooling/openai_cross_encoder_score.py b/examples/pooling/score/openai_cross_encoder_score.py
similarity index 100%
rename from examples/online_serving/pooling/openai_cross_encoder_score.py
rename to examples/pooling/score/openai_cross_encoder_score.py
diff --git a/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py b/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
similarity index 100%
rename from examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
rename to examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
diff --git a/examples/offline_inference/pooling/qwen3_reranker.py b/examples/pooling/score/qwen3_reranker.py
similarity index 100%
rename from examples/offline_inference/pooling/qwen3_reranker.py
rename to examples/pooling/score/qwen3_reranker.py
diff --git a/examples/offline_inference/pooling/ner.py b/examples/pooling/token_classify/ner.py
similarity index 100%
rename from examples/offline_inference/pooling/ner.py
rename to examples/pooling/token_classify/ner.py
diff --git a/examples/online_serving/pooling/ner_client.py b/examples/pooling/token_classify/ner_client.py
similarity index 100%
rename from examples/online_serving/pooling/ner_client.py
rename to examples/pooling/token_classify/ner_client.py
diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/pooling/token_embed/multi_vector_retrieval.py
similarity index 100%
rename from examples/offline_inference/pooling/multi_vector_retrieval.py
rename to examples/pooling/token_embed/multi_vector_retrieval.py
diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/pooling/token_embed/multi_vector_retrieval_client.py
similarity index 100%
rename from examples/online_serving/pooling/multi_vector_retrieval_client.py
rename to examples/pooling/token_embed/multi_vector_retrieval_client.py
-- 
GitLab


From 0ec84221718d920c3f46da879cc354f94b8fb59e Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 3 Dec 2025 00:03:52 +0800
Subject: [PATCH 006/258] [Bugfix] Fix incorrect channel order for idefics3 in
 edge case (#29881)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/model_executor/models/idefics3.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 9c5f9389e..7c3933c6f 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -338,6 +338,7 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo
             prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
+        mm_kwargs = {"input_data_format": "channels_last", **mm_kwargs}
         processed_outputs = super()._call_hf_processor(
             prompt,
             mm_data,
-- 
GitLab


From 52cb349fc010c3d9e8f576f7cc675e6403aadd0a Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Tue, 2 Dec 2025 08:24:45 -0800
Subject: [PATCH 007/258] [responsesAPI][3] ResponsesParser to set up non
 harmony MCP (#29413)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
---
 .../test_response_api_parsable_context.py     |  87 +++++++++++++++
 vllm/entrypoints/chat_utils.py                |   1 +
 vllm/entrypoints/context.py                   |  76 +++++++++++++
 vllm/entrypoints/openai/parser/__init__.py    |   0
 .../openai/parser/responses_parser.py         | 101 ++++++++++++++++++
 vllm/entrypoints/openai/serving_responses.py  |  45 +++++---
 vllm/entrypoints/responses_utils.py           |  30 ++++++
 vllm/envs.py                                  |   5 +
 8 files changed, 332 insertions(+), 13 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_response_api_parsable_context.py
 create mode 100644 vllm/entrypoints/openai/parser/__init__.py
 create mode 100644 vllm/entrypoints/openai/parser/responses_parser.py

diff --git a/tests/entrypoints/openai/test_response_api_parsable_context.py b/tests/entrypoints/openai/test_response_api_parsable_context.py
new file mode 100644
index 000000000..1b2795770
--- /dev/null
+++ b/tests/entrypoints/openai/test_response_api_parsable_context.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import pytest_asyncio
+from openai import OpenAI
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen3-8B"
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
+    env_dict = dict(
+        VLLM_ENABLE_RESPONSES_API_STORE="1",
+        VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
+        # uncomment for tool calling
+        # PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
+    )
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_basic(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is 13 * 24?",
+    )
+    assert response is not None
+    print("response: ", response)
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {"type": "message", "content": "Hello.", "role": "user"},
+            {
+                "type": "reasoning",
+                "id": "lol",
+                "content": [
+                    {
+                        "type": "reasoning_text",
+                        "text": "We need to respond: greeting.",
+                    }
+                ],
+                "summary": [],
+            },
+            {
+                "arguments": '{"location": "Paris", "unit": "celsius"}',
+                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
+                "name": "get_weather",
+                "type": "function_call",
+                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
+                "status": "completed",
+            },
+            {
+                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
+                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
+                "output": "The weather in Paris is 20 Celsius",
+                "status": "completed",
+                "type": "function_call_output",
+            },
+        ],
+        temperature=0.0,
+    )
+    assert response is not None
+    assert response.status == "completed"
+    # make sure we get a reasoning and text output
+    assert response.output[0].type == "reasoning"
+    assert response.output[1].type == "message"
+    assert type(response.output[1].content[0].text) is str
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 164390689..2dd5b9c8f 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1530,6 +1530,7 @@ def _parse_chat_message_content(
     role = message["role"]
     content = message.get("content")
     reasoning = message.get("reasoning") or message.get("reasoning_content")
+
     if content is None:
         content = []
     elif isinstance(content, str):
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 7a41c668d..1260f65db 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -5,6 +5,7 @@ import contextlib
 import json
 import logging
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from contextlib import AsyncExitStack
 from typing import TYPE_CHECKING, Union
 
@@ -17,9 +18,19 @@ from vllm.entrypoints.harmony_utils import (
     get_streamable_parser_for_assistant,
     render_for_completion,
 )
+from vllm.entrypoints.openai.parser.responses_parser import (
+    get_responses_parser_for_simple_context,
+)
+from vllm.entrypoints.openai.protocol import (
+    ResponseInputOutputItem,
+    ResponsesRequest,
+)
+from vllm.entrypoints.responses_utils import construct_tool_dicts
 from vllm.entrypoints.tool import Tool
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.outputs import RequestOutput
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 if TYPE_CHECKING:
     from mcp.client import ClientSession
@@ -180,6 +191,71 @@ class SimpleContext(ConversationContext):
         raise NotImplementedError("Should not be called.")
 
 
+class ParsableContext(ConversationContext):
+    def __init__(
+        self,
+        *,
+        response_messages: list[ResponseInputOutputItem],
+        tokenizer: AnyTokenizer,
+        reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser] | None,
+        request: ResponsesRequest,
+    ):
+        self.num_prompt_tokens = 0
+        self.num_output_tokens = 0
+        self.num_cached_tokens = 0
+        # TODO: num_reasoning_tokens is not implemented yet.
+        self.num_reasoning_tokens = 0
+        # not implemented yet for ParsableContext
+        self.all_turn_metrics: list[TurnMetrics] = []
+
+        if reasoning_parser_cls is None:
+            raise ValueError("reasoning_parser_cls must be provided.")
+
+        self.parser = get_responses_parser_for_simple_context(
+            tokenizer=tokenizer,
+            reasoning_parser_cls=reasoning_parser_cls,
+            response_messages=response_messages,
+            request=request,
+        )
+
+        self._tool_sessions: dict[str, ClientSession | Tool] = {}
+        self.called_tools: set[str] = set()
+
+        self.tool_dicts = construct_tool_dicts(request.tools, request.tool_choice)
+
+    def append_output(self, output: RequestOutput) -> None:
+        self.num_prompt_tokens = len(output.prompt_token_ids or [])
+        self.num_cached_tokens = output.num_cached_tokens or 0
+        self.num_output_tokens += len(output.outputs[0].token_ids or [])
+        self.parser.process(output.outputs[0])
+
+    def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
+        raise NotImplementedError("Should not be called.")
+
+    def need_builtin_tool_call(self) -> bool:
+        """Return true if the last message is a MCP tool call"""
+        return False
+
+    async def call_tool(self) -> list[ResponseInputOutputItem]:
+        raise NotImplementedError("Should not be called.")
+
+    def render_for_completion(self):
+        raise NotImplementedError("Should not be called.")
+
+    async def init_tool_sessions(
+        self,
+        tool_server: ToolServer | None,
+        exit_stack: AsyncExitStack,
+        request_id: str,
+        mcp_tools: dict[str, Mcp],
+    ):
+        pass
+
+    async def cleanup_session(self, *args, **kwargs) -> None:
+        """Can be used as coro to used in __aexit__"""
+        raise NotImplementedError("Should not be called.")
+
+
 class HarmonyContext(ConversationContext):
     def __init__(
         self,
diff --git a/vllm/entrypoints/openai/parser/__init__.py b/vllm/entrypoints/openai/parser/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py
new file mode 100644
index 000000000..1bc8e81bd
--- /dev/null
+++ b/vllm/entrypoints/openai/parser/responses_parser.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+from collections.abc import Callable
+
+from openai.types.responses.response_output_message import ResponseOutputMessage
+from openai.types.responses.response_output_text import ResponseOutputText
+from openai.types.responses.response_reasoning_item import (
+    Content,
+    ResponseReasoningItem,
+)
+
+from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
+from vllm.outputs import CompletionOutput
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import random_uuid
+
+logger = logging.getLogger(__name__)
+
+
+class ResponsesParser:
+    """Incremental parser over completion tokens with reasoning support."""
+
+    def __init__(
+        self,
+        *,
+        tokenizer: AnyTokenizer,
+        reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
+        response_messages: list[ResponseInputOutputItem],
+        request: ResponsesRequest,
+    ):
+        self.response_messages: list[ResponseInputOutputItem] = (
+            # TODO: initial messages may not be properly typed
+            response_messages
+        )
+        self.num_init_messages = len(response_messages)
+        self.tokenizer = tokenizer
+        self.request = request
+
+        self.reasoning_parser_instance = reasoning_parser_cls(tokenizer)
+
+    def process(self, output: CompletionOutput) -> "ResponsesParser":
+        reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
+            output.text, request=self.request
+        )
+        if reasoning_content:
+            self.response_messages.append(
+                ResponseReasoningItem(
+                    type="reasoning",
+                    id=f"rs_{random_uuid()}",
+                    summary=[],
+                    content=[
+                        Content(
+                            type="reasoning_text",
+                            text=reasoning_content,
+                        )
+                    ],
+                )
+            )
+
+        if content:
+            self.response_messages.append(
+                ResponseOutputMessage(
+                    type="message",
+                    id=f"msg_{random_uuid()}",
+                    status="completed",
+                    role="assistant",
+                    content=[
+                        ResponseOutputText(
+                            annotations=[],  # TODO
+                            type="output_text",
+                            text=content,
+                            logprobs=None,  # TODO
+                        )
+                    ],
+                )
+            )
+
+        return self
+
+
+def get_responses_parser_for_simple_context(
+    *,
+    tokenizer: AnyTokenizer,
+    reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
+    response_messages: list[ResponseInputOutputItem],
+    request: ResponsesRequest,
+) -> ResponsesParser:
+    """Factory function to create a ResponsesParser with
+    optional reasoning parser.
+
+    Returns:
+        ResponsesParser instance configured with the provided parser
+    """
+    return ResponsesParser(
+        tokenizer=tokenizer,
+        reasoning_parser_cls=reasoning_parser_cls,
+        response_messages=response_messages,
+        request=request,
+    )
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 81495a077..5ad86194c 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -60,6 +60,7 @@ from vllm.entrypoints.chat_utils import (
 from vllm.entrypoints.context import (
     ConversationContext,
     HarmonyContext,
+    ParsableContext,
     SimpleContext,
     StreamingHarmonyContext,
 )
@@ -96,8 +97,9 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.responses_utils import (
     construct_input_messages,
-    convert_tool_responses_to_completions_format,
+    construct_tool_dicts,
     extract_tool_types,
+    make_response_output_items_from_parsable_context,
 )
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
@@ -228,7 +230,6 @@ class OpenAIServingResponses(OpenAIServing):
         self.tool_parser = self._get_tool_parser(
             tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
         )
-        self.exclude_tools_when_tool_choice_none = False
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
@@ -413,7 +414,17 @@ class OpenAIServingResponses(OpenAIServing):
                     else:
                         context = HarmonyContext(messages, available_tools)
                 else:
-                    context = SimpleContext()
+                    if envs.VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT:
+                        # This is an feature in development for parsing
+                        # tokens during generation instead of at the end
+                        context = ParsableContext(
+                            response_messages=messages,
+                            tokenizer=tokenizer,
+                            reasoning_parser_cls=self.reasoning_parser,
+                            request=request,
+                        )
+                    else:
+                        context = SimpleContext()
 
                 if self.reasoning_parser is not None:
                     reasoning_parser = self.reasoning_parser(tokenizer)
@@ -534,15 +545,7 @@ class OpenAIServingResponses(OpenAIServing):
         prev_response: ResponsesResponse | None,
         tokenizer: TokenizerLike,
     ):
-        if request.tools is None or (
-            request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
-        ):
-            tool_dicts = None
-        else:
-            tool_dicts = [
-                convert_tool_responses_to_completions_format(tool.model_dump())
-                for tool in request.tools
-            ]
+        tool_dicts = construct_tool_dicts(request.tools, request.tool_choice)
         # Construct the input messages.
         messages = construct_input_messages(
             request_instructions=request.instructions,
@@ -642,6 +645,22 @@ class OpenAIServingResponses(OpenAIServing):
                     status = "cancelled"
             else:
                 status = "incomplete"
+        elif isinstance(context, ParsableContext):
+            response_messages = context.parser.response_messages[
+                context.parser.num_init_messages :
+            ]
+            output = make_response_output_items_from_parsable_context(response_messages)
+
+            # TODO: context for non-gptoss models doesn't use messages
+            # so we can't get them out yet
+            if request.enable_response_messages:
+                raise NotImplementedError(
+                    "enable_response_messages is currently only supported for gpt-oss"
+                )
+
+            # TODO: Calculate usage.
+            # assert final_res.prompt_token_ids is not None
+            num_tool_output_tokens = 0
         else:
             assert isinstance(context, SimpleContext)
             final_res = context.last_output
@@ -661,7 +680,7 @@ class OpenAIServingResponses(OpenAIServing):
             assert final_res.prompt_token_ids is not None
             num_tool_output_tokens = 0
 
-        assert isinstance(context, (SimpleContext, HarmonyContext))
+        assert isinstance(context, (SimpleContext, HarmonyContext, ParsableContext))
         num_prompt_tokens = context.num_prompt_tokens
         num_generated_tokens = context.num_output_tokens
         num_cached_tokens = context.num_cached_tokens
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
index 2e01cb038..5f21e2c44 100644
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
+
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam,
     ChatCompletionMessageToolCallParam,
@@ -10,6 +12,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
     Function as FunctionCallTool,
 )
 from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
+from openai.types.responses.response import ToolChoice
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
 )
@@ -24,6 +27,20 @@ from vllm.entrypoints.openai.protocol import (
 )
 
 
+def make_response_output_items_from_parsable_context(
+    response_messages: list[ResponseInputOutputItem],
+) -> list[ResponseOutputItem]:
+    """Given a list of sentences, construct ResponseOutput Items."""
+    output_messages: list[ResponseOutputItem] = []
+    for message in response_messages:
+        if not isinstance(message, ResponseFunctionToolCallOutputItem):
+            output_messages.append(message)
+        else:
+            raise NotImplementedError("tool calls not supported for response context")
+
+    return output_messages
+
+
 def construct_input_messages(
     *,
     request_instructions: str | None = None,
@@ -146,3 +163,16 @@ def convert_tool_responses_to_completions_format(tool: dict) -> dict:
         "type": "function",
         "function": tool,
     }
+
+
+def construct_tool_dicts(
+    tools: list[Tool], tool_choice: ToolChoice
+) -> list[dict[str, Any]] | None:
+    if tools is None or (tool_choice == "none"):
+        tool_dicts = None
+    else:
+        tool_dicts = [
+            convert_tool_responses_to_completions_format(tool.model_dump())
+            for tool in tools
+        ]
+    return tool_dicts
diff --git a/vllm/envs.py b/vllm/envs.py
index d0912863e..8b954fa14 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -214,6 +214,7 @@ if TYPE_CHECKING:
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
     VLLM_TUNED_CONFIG_FOLDER: str | None = None
     VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set()
+    VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: bool = False
     VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
     VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False
     VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
@@ -1444,6 +1445,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
         int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
     ),
+    # Experimental: use this to enable MCP tool calling for non harmony models
+    "VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT": lambda: bool(
+        int(os.getenv("VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT", "0"))
+    ),
     # Allows vllm to find tuned config under customized folder
     "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
     # Valid values are container,code_interpreter,web_search_preview
-- 
GitLab


From 63b1da76ba35cd8cb220c79c44556e07fa4fb0c6 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 3 Dec 2025 01:33:23 +0800
Subject: [PATCH 008/258] [Chore]: Reorganize gguf utils funtions under
 `transformers_utils` (#29891)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 tests/models/test_gguf_download.py     |  2 +-
 tests/transformers_utils/test_utils.py | 12 +++--
 vllm/config/model.py                   |  8 ++-
 vllm/engine/arg_utils.py               |  3 +-
 vllm/tokenizers/registry.py            |  6 +--
 vllm/transformers_utils/config.py      | 14 ++---
 vllm/transformers_utils/gguf_utils.py  | 71 +++++++++++++++++++++++++
 vllm/transformers_utils/processor.py   |  3 +-
 vllm/transformers_utils/utils.py       | 72 --------------------------
 9 files changed, 96 insertions(+), 95 deletions(-)

diff --git a/tests/models/test_gguf_download.py b/tests/models/test_gguf_download.py
index 155768ac9..b1674cdf7 100644
--- a/tests/models/test_gguf_download.py
+++ b/tests/models/test_gguf_download.py
@@ -203,7 +203,7 @@ class TestGGUFModelLoader:
     @patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
     @patch("vllm.config.model.get_config")
     @patch("vllm.config.model.is_gguf", return_value=False)
-    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
+    @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
     @patch("os.path.isfile", return_value=False)
     def test_prepare_weights_invalid_format(
         self,
diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py
index a8d0b9be9..0a6a65b41 100644
--- a/tests/transformers_utils/test_utils.py
+++ b/tests/transformers_utils/test_utils.py
@@ -5,13 +5,15 @@ from unittest.mock import patch
 
 import pytest
 
+from vllm.transformers_utils.gguf_utils import (
+    is_gguf,
+    is_remote_gguf,
+    split_remote_gguf,
+)
 from vllm.transformers_utils.utils import (
     is_cloud_storage,
     is_gcs,
-    is_gguf,
-    is_remote_gguf,
     is_s3,
-    split_remote_gguf,
 )
 
 
@@ -132,7 +134,7 @@ class TestSplitRemoteGGUF:
 class TestIsGGUF:
     """Test is_gguf utility function."""
 
-    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=True)
+    @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=True)
     def test_is_gguf_with_local_file(self, mock_check_gguf):
         """Test is_gguf with local GGUF file."""
         assert is_gguf("/path/to/model.gguf")
@@ -149,7 +151,7 @@ class TestIsGGUF:
         assert not is_gguf("repo/model:quant")
         assert not is_gguf("repo/model:INVALID")
 
-    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
+    @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
     def test_is_gguf_false(self, mock_check_gguf):
         """Test is_gguf returns False for non-GGUF models."""
         assert not is_gguf("unsloth/Qwen3-0.6B")
diff --git a/vllm/config/model.py b/vllm/config/model.py
index ef592ac00..5de976976 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -37,15 +37,13 @@ from vllm.transformers_utils.config import (
     uses_xdrope_dim,
 )
 from vllm.transformers_utils.gguf_utils import (
-    maybe_patch_hf_config_from_gguf,
-)
-from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
-from vllm.transformers_utils.utils import (
     is_gguf,
     is_remote_gguf,
-    maybe_model_redirect,
+    maybe_patch_hf_config_from_gguf,
     split_remote_gguf,
 )
+from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
+from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.import_utils import LazyLoader
 from vllm.utils.torch_utils import common_broadcastable_dtype
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 5a2836668..83029e09c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -86,8 +86,9 @@ from vllm.transformers_utils.config import (
     is_interleaved,
     maybe_override_with_speculators,
 )
+from vllm.transformers_utils.gguf_utils import is_gguf
 from vllm.transformers_utils.repo_utils import get_model_path
-from vllm.transformers_utils.utils import is_cloud_storage, is_gguf
+from vllm.transformers_utils.utils import is_cloud_storage
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index bf9d295de..87048f2ec 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -11,14 +11,14 @@ from typing_extensions import assert_never
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
-from vllm.transformers_utils.repo_utils import list_filtered_repo_files
-from vllm.transformers_utils.utils import (
+from vllm.transformers_utils.gguf_utils import (
     check_gguf_file,
+    get_gguf_file_path_from_hf,
     is_gguf,
     is_remote_gguf,
     split_remote_gguf,
 )
+from vllm.transformers_utils.repo_utils import list_filtered_repo_files
 from vllm.utils.import_utils import resolve_obj_by_qualname
 
 from .protocol import TokenizerLike
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 1bb5791e1..0cceab90b 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -26,8 +26,15 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.transformers_utils.utils import parse_safetensors_file_metadata
 
 from .config_parser_base import ConfigParserBase
+from .gguf_utils import (
+    check_gguf_file,
+    is_gguf,
+    is_remote_gguf,
+    split_remote_gguf,
+)
 from .repo_utils import (
     _get_hf_token,
     file_or_path_exists,
@@ -36,13 +43,6 @@ from .repo_utils import (
     try_get_local_file,
     with_retry,
 )
-from .utils import (
-    check_gguf_file,
-    is_gguf,
-    is_remote_gguf,
-    parse_safetensors_file_metadata,
-    split_remote_gguf,
-)
 
 if envs.VLLM_USE_MODELSCOPE:
     from modelscope import AutoConfig
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index cb1fc2d09..f3fd43c6a 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -2,10 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """GGUF utility functions."""
 
+from functools import cache
+from os import PathLike
 from pathlib import Path
 
 import gguf
+import regex as re
 from gguf.constants import Keys, VisionProjectorType
+from gguf.quants import GGMLQuantizationType
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
 
 from vllm.logger import init_logger
@@ -15,6 +19,73 @@ from .repo_utils import list_filtered_repo_files
 logger = init_logger(__name__)
 
 
+@cache
+def check_gguf_file(model: str | PathLike) -> bool:
+    """Check if the file is a GGUF model."""
+    model = Path(model)
+    if not model.is_file():
+        return False
+    elif model.suffix == ".gguf":
+        return True
+
+    try:
+        with model.open("rb") as f:
+            header = f.read(4)
+
+        return header == b"GGUF"
+    except Exception as e:
+        logger.debug("Error reading file %s: %s", model, e)
+        return False
+
+
+@cache
+def is_remote_gguf(model: str | Path) -> bool:
+    """Check if the model is a remote GGUF model."""
+    pattern = r"^[a-zA-Z0-9][a-zA-Z0-9._-]*/[a-zA-Z0-9][a-zA-Z0-9._-]*:[A-Za-z0-9_+-]+$"
+    model = str(model)
+    if re.fullmatch(pattern, model):
+        _, quant_type = model.rsplit(":", 1)
+        return is_valid_gguf_quant_type(quant_type)
+    return False
+
+
+def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
+    """Check if the quant type is a valid GGUF quant type."""
+    return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None
+
+
+def split_remote_gguf(model: str | Path) -> tuple[str, str]:
+    """Split the model into repo_id and quant type."""
+    model = str(model)
+    if is_remote_gguf(model):
+        parts = model.rsplit(":", 1)
+        return (parts[0], parts[1])
+    raise ValueError(
+        f"Wrong GGUF model or invalid GGUF quant type: {model}.\n"
+        "- It should be in repo_id:quant_type format.\n"
+        f"- Valid GGMLQuantizationType values: {GGMLQuantizationType._member_names_}",
+    )
+
+
+def is_gguf(model: str | Path) -> bool:
+    """Check if the model is a GGUF model.
+
+    Args:
+        model: Model name, path, or Path object to check.
+
+    Returns:
+        True if the model is a GGUF model, False otherwise.
+    """
+    model = str(model)
+
+    # Check if it's a local GGUF file
+    if check_gguf_file(model):
+        return True
+
+    # Check if it's a remote GGUF model (repo_id:quant_type format)
+    return is_remote_gguf(model)
+
+
 def detect_gguf_multimodal(model: str) -> Path | None:
     """Check if GGUF model has multimodal projector file.
 
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 63cdf6337..e9864b0c1 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -18,7 +18,8 @@ from transformers.processing_utils import ProcessorMixin
 from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar
 
-from vllm.transformers_utils.utils import convert_model_repo_to_path, is_gguf
+from vllm.transformers_utils.gguf_utils import is_gguf
+from vllm.transformers_utils.utils import convert_model_repo_to_path
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 45a873c9f..96f292f4c 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -9,8 +9,6 @@ from os import PathLike
 from pathlib import Path
 from typing import Any
 
-from gguf import GGMLQuantizationType
-
 import vllm.envs as envs
 from vllm.logger import init_logger
 
@@ -29,76 +27,6 @@ def is_cloud_storage(model_or_path: str) -> bool:
     return is_s3(model_or_path) or is_gcs(model_or_path)
 
 
-@cache
-def check_gguf_file(model: str | PathLike) -> bool:
-    """Check if the file is a GGUF model."""
-    model = Path(model)
-    if not model.is_file():
-        return False
-    elif model.suffix == ".gguf":
-        return True
-
-    try:
-        with model.open("rb") as f:
-            header = f.read(4)
-
-        return header == b"GGUF"
-    except Exception as e:
-        logger.debug("Error reading file %s: %s", model, e)
-        return False
-
-
-@cache
-def is_remote_gguf(model: str | Path) -> bool:
-    """Check if the model is a remote GGUF model."""
-    model = str(model)
-    return (
-        (not is_cloud_storage(model))
-        and (not model.startswith(("http://", "https://")))
-        and ("/" in model and ":" in model)
-        and is_valid_gguf_quant_type(model.rsplit(":", 1)[1])
-    )
-
-
-def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
-    """Check if the quant type is a valid GGUF quant type."""
-    return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None
-
-
-def split_remote_gguf(model: str | Path) -> tuple[str, str]:
-    """Split the model into repo_id and quant type."""
-    model = str(model)
-    if is_remote_gguf(model):
-        parts = model.rsplit(":", 1)
-        return (parts[0], parts[1])
-    raise ValueError(
-        "Wrong GGUF model or invalid GGUF quant type: %s.\n"
-        "- It should be in repo_id:quant_type format.\n"
-        "- Valid GGMLQuantizationType values: %s",
-        model,
-        GGMLQuantizationType._member_names_,
-    )
-
-
-def is_gguf(model: str | Path) -> bool:
-    """Check if the model is a GGUF model.
-
-    Args:
-        model: Model name, path, or Path object to check.
-
-    Returns:
-        True if the model is a GGUF model, False otherwise.
-    """
-    model = str(model)
-
-    # Check if it's a local GGUF file
-    if check_gguf_file(model):
-        return True
-
-    # Check if it's a remote GGUF model (repo_id:quant_type format)
-    return is_remote_gguf(model)
-
-
 def modelscope_list_repo_files(
     repo_id: str,
     revision: str | None = None,
-- 
GitLab


From c77b9929a04c56d369c9f6b86fbf5d4891bab285 Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Tue, 2 Dec 2025 11:52:54 -0600
Subject: [PATCH 009/258] Update AMD-CI testing mirror (as of 2025-12-02)
 (#29898)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
---
 .buildkite/test-amd.yaml | 43 ++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index d5d4043a1..67088caa8 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -51,7 +51,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker Test # 10min
   timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -64,7 +64,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -99,7 +99,7 @@ steps:
 - label: Basic Correctness Test # 20min
   timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   # grade: Blocking
   fast_check: true
   torch_nightly: true
@@ -116,7 +116,7 @@ steps:
 
 - label: Entrypoints Unit Tests # 5min
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   timeout_in_minutes: 10
   working_dir: "/vllm-workspace/tests"
@@ -131,7 +131,7 @@ steps:
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -254,7 +254,7 @@ steps:
 
 - label: EPLB Algorithm Test # 5min
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -266,7 +266,7 @@ steps:
 
 - label: EPLB Execution Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
+  agent_pool: mi355_4
   # grade: Blocking
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
@@ -281,7 +281,7 @@ steps:
 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_2
+  agent_pool: mi355_2
   # grade: Blocking
   num_gpus: 2
   source_file_dependencies:
@@ -301,7 +301,7 @@ steps:
 - label: Regression Test # 7min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -343,7 +343,7 @@ steps:
 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -544,7 +544,7 @@ steps:
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -715,6 +715,7 @@ steps:
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
   - uv pip install --system torchao==0.13.0
+  - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 15min
@@ -934,6 +935,18 @@ steps:
   commands:
     - pytest -v -s models/language/pooling_mteb_test
 
+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
 - label: Multi-Modal Processor Test # 44min
   timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
@@ -1472,14 +1485,14 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-    - pytest -v -s tests/compile/distributed/test_async_tp.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
     - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
     - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
     #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
-- 
GitLab


From 2d613de9aef3ef25e9adc8887ac0388da092b500 Mon Sep 17 00:00:00 2001
From: Benjamin Bartels <benjamin@bartels.dev>
Date: Tue, 2 Dec 2025 18:21:49 +0000
Subject: [PATCH 010/258] [CI/Build] Fixes missing runtime dependencies
 (#29822)

Signed-off-by: bbartels <benjamin@bartels.dev>
---
 docker/Dockerfile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index eb7c10507..006481b23 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -364,7 +364,12 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
     cuda-cudart-${CUDA_VERSION_DASH} \
     cuda-nvrtc-${CUDA_VERSION_DASH} \
     cuda-cuobjdump-${CUDA_VERSION_DASH} \
-    libcublas-${CUDA_VERSION_DASH} && \
+    # https://github.com/vllm-project/vllm/issues/29590
+    libcurand-dev-${CUDA_VERSION_DASH} \
+    libcublas-${CUDA_VERSION_DASH} \
+    # Fixes nccl_allocator requiring nccl.h at runtime
+    # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
+    libnccl-dev && \
     rm -rf /var/lib/apt/lists/*
 
 ARG PIP_INDEX_URL UV_INDEX_URL
-- 
GitLab


From 1d93f116754f6e81acb9287ebcca0d1d1170a944 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Tue, 2 Dec 2025 13:48:08 -0500
Subject: [PATCH 011/258] [Attention][CUDAGraph] Remove CG padding from
 attention backends (#29352)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 .../layers/mamba/mamba_mixer.py               | 17 +++++++-------
 vllm/v1/attention/backends/gdn_attn.py        | 22 +++++--------------
 vllm/v1/attention/backends/mamba1_attn.py     | 12 +++-------
 vllm/v1/attention/backends/mamba2_attn.py     | 12 +++-------
 vllm/v1/attention/backends/short_conv_attn.py |  3 +--
 5 files changed, 20 insertions(+), 46 deletions(-)

diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 90e520e24..0b63acf2d 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -252,7 +252,6 @@ class MambaMixer(MambaBase, CustomOp):
             conv_state = self_kv_cache[0].transpose(-1, -2)
             ssm_state = self_kv_cache[1]
             has_initial_states_p = attn_metadata.has_initial_states_p
-            num_padded_decodes = attn_metadata.num_padded_decodes
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
@@ -281,7 +280,7 @@ class MambaMixer(MambaBase, CustomOp):
             state_indices_tensor,
             num_prefill_tokens,
             num_prefills,
-            num_padded_decodes,
+            num_decode_tokens,
         )
         hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p
         hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d
@@ -470,24 +469,24 @@ def split_batch_to_prefill_and_decode(
     state_indices_tensor: torch.Tensor,
     num_prefill_tokens: int,
     num_prefills: int,
-    num_padded_decodes: int,
+    num_decode_tokens: int,
 ) -> PrefillDecodeSplit:
-    num_actual_tokens = num_prefill_tokens + num_padded_decodes
+    num_actual_tokens = num_prefill_tokens + num_decode_tokens
 
     # In v1, decode tokens come first, then prefill tokens.
     hidden_states_BC_d, hidden_states_BC_p = torch.split(
         hidden_states_BC[..., :num_actual_tokens],
-        [num_padded_decodes, num_prefill_tokens],
+        [num_decode_tokens, num_prefill_tokens],
         dim=-1,
     )
     gate_d, gate_p = torch.split(
-        gate[..., :num_actual_tokens], [num_padded_decodes, num_prefill_tokens], dim=-1
+        gate[..., :num_actual_tokens], [num_decode_tokens, num_prefill_tokens], dim=-1
     )
 
-    # num_padded_decodes accounts for CUDA graph padding when applicable
+    # num_decode_tokens accounts for CUDA graph padding when applicable
     state_indices_tensor_d, state_indices_tensor_p = torch.split(
-        state_indices_tensor[: num_padded_decodes + num_prefills],
-        [num_padded_decodes, num_prefills],
+        state_indices_tensor[: num_decode_tokens + num_prefills],
+        [num_decode_tokens, num_prefills],
         dim=0,
     )
 
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 69b5a6fb4..e921f8c3d 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -254,17 +254,11 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
             )
         else:
             has_initial_state = None
-        num_actual_tokens = (
-            num_prefill_tokens + num_decode_tokens + num_spec_decode_tokens
-        )
 
-        # prepare tensors for cudagraph
-        #
-        # With speculative decoding, the xgrammar backend may rollback tokens
-        # and causing some sequences has less draft tokens than self.num_spec.
-        #
-        # In above cases, the max possible batch size for n tokens, can be
-        # min(n, cudagraph_max_bs).
+        # Prepare tensors for cudagraph
+        # Note: m.num_actual_tokens is already padded by the model runner for CUDAGraph
+        batch_size = m.num_actual_tokens
+
         if (
             self.use_full_cuda_graph
             and num_prefills == 0
@@ -272,9 +266,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
             and num_spec_decodes <= self.decode_cudagraph_max_bs
             and num_spec_decode_tokens <= self.decode_cudagraph_max_bs
         ):
-            num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens)
-            batch_size = min(self.decode_cudagraph_max_bs, num_actual_tokens)
-
             self.spec_state_indices_tensor[:num_spec_decodes].copy_(
                 spec_state_indices_tensor, non_blocking=True
             )
@@ -319,9 +310,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
             and num_spec_decodes == 0
             and num_decodes <= self.decode_cudagraph_max_bs
         ):
-            num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens)
-            batch_size = num_actual_tokens
-
             self.non_spec_state_indices_tensor[:num_decodes].copy_(
                 non_spec_state_indices_tensor, non_blocking=True
             )
@@ -344,7 +332,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
             num_decode_tokens=num_decode_tokens,
             num_spec_decodes=num_spec_decodes,
             num_spec_decode_tokens=num_spec_decode_tokens,
-            num_actual_tokens=num_actual_tokens,
+            num_actual_tokens=m.num_actual_tokens,
             has_initial_state=has_initial_state,
             spec_query_start_loc=spec_query_start_loc,
             non_spec_query_start_loc=non_spec_query_start_loc,
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index 8e949e533..fcda61340 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -31,7 +31,6 @@ class Mamba1AttentionMetadata:
     num_prefill_tokens: int
     num_decodes: int
     num_decode_tokens: int
-    num_padded_decodes: int
 
     block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
     block_idx_first_scheduled_token_p: torch.Tensor  # shape: [batch,]
@@ -68,7 +67,6 @@ class Mamba1AttentionMetadataBuilder(
 
         has_initial_states_p = None
         query_start_loc_p = None
-        padded_decodes = num_decodes
         num_computed_tokens, num_computed_tokens_p = None, None
         block_idx_first_scheduled_token = None
         block_idx_first_scheduled_token_p = None
@@ -125,11 +123,10 @@ class Mamba1AttentionMetadataBuilder(
             and num_decodes <= self.decode_cudagraph_max_bs
             and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
         ):
-            padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes)
             self.state_indices_tensor[:num_decodes].copy_(
                 state_indices_tensor, non_blocking=True
             )
-            state_indices_tensor = self.state_indices_tensor[:padded_decodes]
+            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
             state_indices_tensor[num_decodes:] = PAD_SLOT_ID
 
             if self.vllm_config.cache_config.enable_prefix_caching:
@@ -137,17 +134,15 @@ class Mamba1AttentionMetadataBuilder(
                     block_idx_last_scheduled_token, non_blocking=True
                 )
                 block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
-                    :padded_decodes
+                    :num_decode_tokens
                 ]
-                block_idx_last_scheduled_token[num_decodes:] = 0
 
                 self.block_idx_last_computed_token[:num_decodes].copy_(
                     block_idx_last_computed_token, non_blocking=True
                 )
                 block_idx_last_computed_token = self.block_idx_last_computed_token[
-                    :padded_decodes
+                    :num_decode_tokens
                 ]
-                block_idx_last_computed_token[num_decodes:] = 0
 
         return Mamba1AttentionMetadata(
             query_start_loc_p=query_start_loc_p,
@@ -157,7 +152,6 @@ class Mamba1AttentionMetadataBuilder(
             num_prefill_tokens=num_prefill_tokens,
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
-            num_padded_decodes=padded_decodes,
             block_idx_last_scheduled_token=block_idx_last_scheduled_token,
             block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
             block_idx_last_computed_token=block_idx_last_computed_token,
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 888734e5d..bf1d8f09a 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -10,7 +10,6 @@ from vllm.config import VllmConfig
 from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
-    PAD_SLOT_ID,
     CommonAttentionMetadata,
     compute_causal_conv1d_metadata,
     split_decodes_and_prefills,
@@ -304,30 +303,25 @@ class Mamba2AttentionMetadataBuilder(
             num_decodes <= self.decode_cudagraph_max_bs
             and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
         ):
-            # Pad state tensor for CUDA graph
-            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes)
             self.state_indices_tensor[:num_decodes].copy_(
                 state_indices_tensor, non_blocking=True
             )
-            state_indices_tensor = self.state_indices_tensor[:num_input_tokens]
-            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
+            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
 
             if self.vllm_config.cache_config.enable_prefix_caching:
                 self.block_idx_last_scheduled_token[:num_decodes].copy_(
                     block_idx_last_scheduled_token, non_blocking=True
                 )
                 block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
-                    :num_input_tokens
+                    :num_decode_tokens
                 ]
-                block_idx_last_scheduled_token[num_decodes:] = 0
 
                 self.block_idx_last_computed_token[:num_decodes].copy_(
                     block_idx_last_computed_token, non_blocking=True
                 )
                 block_idx_last_computed_token = self.block_idx_last_computed_token[
-                    :num_input_tokens
+                    :num_decode_tokens
                 ]
-                block_idx_last_computed_token[num_decodes:] = 0
 
         attn_metadata = Mamba2AttentionMetadata(
             num_prefills=num_prefills,
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index de0cb73db..c8fe0faf7 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -83,11 +83,10 @@ class ShortConvAttentionMetadataBuilder(
             and num_decodes <= self.decode_cudagraph_max_bs
             and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
         ):
-            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes)
             self.state_indices_tensor[:num_decodes].copy_(
                 state_indices_tensor, non_blocking=True
             )
-            state_indices_tensor = self.state_indices_tensor[:num_input_tokens]
+            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
             state_indices_tensor[num_decodes:] = PAD_SLOT_ID
 
         attn_metadata = ShortConvAttentionMetadata(
-- 
GitLab


From a2b053dc858db461d8d98cff37ee7c67ba21126b Mon Sep 17 00:00:00 2001
From: Navanit Dubey <98005188+Navanit-git@users.noreply.github.com>
Date: Wed, 3 Dec 2025 00:58:35 +0530
Subject: [PATCH 012/258] feat(model): Add BitsAndBytes quantization support
 for Qwen3-Omni-MoE (#29896)

Signed-off-by: navanit-git <navanitdubey@gmail.com>
---
 .../models/qwen3_omni_moe_thinker.py          | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 39dd42552..fe825198d 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -62,6 +62,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2_audio import Qwen2AudioProcessingInfo
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
@@ -1137,6 +1138,18 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         }
     )
 
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
@@ -1763,3 +1776,13 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
 
         mrope_position_delta = llm_positions.max() + 1 - seq_len
         return llm_positions, mrope_position_delta
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="visual.merger",
+            tower_model=["visual.", "audio_tower."],
+        )
-- 
GitLab


From 1c593e117d3a818815b5d07992a096d53b519a15 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Tue, 2 Dec 2025 20:40:56 +0000
Subject: [PATCH 013/258] Fix boolean nested params, add dict format support,
 and enhance plotting for vllm bench sweep (#29025)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Luka Govedič <luka.govedic@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
---
 tests/benchmarks/test_param_sweep.py  | 257 ++++++++++++++++++++++++++
 tests/benchmarks/test_plot_filters.py | 171 +++++++++++++++++
 vllm/benchmarks/sweep/param_sweep.py  |  85 ++++++++-
 vllm/benchmarks/sweep/plot.py         | 109 ++++++++++-
 vllm/benchmarks/sweep/serve.py        |  14 +-
 5 files changed, 614 insertions(+), 22 deletions(-)
 create mode 100644 tests/benchmarks/test_param_sweep.py
 create mode 100644 tests/benchmarks/test_plot_filters.py

diff --git a/tests/benchmarks/test_param_sweep.py b/tests/benchmarks/test_param_sweep.py
new file mode 100644
index 000000000..0d47cfd9d
--- /dev/null
+++ b/tests/benchmarks/test_param_sweep.py
@@ -0,0 +1,257 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem
+
+
+class TestParameterSweepItem:
+    """Test ParameterSweepItem functionality."""
+
+    @pytest.mark.parametrize(
+        "input_dict,expected",
+        [
+            (
+                {"compilation_config.use_inductor_graph_partition": False},
+                "--compilation-config.use_inductor_graph_partition=false",
+            ),
+            (
+                {"compilation_config.use_inductor_graph_partition": True},
+                "--compilation-config.use_inductor_graph_partition=true",
+            ),
+            (
+                {"compilation_config.use_inductor": False},
+                "--compilation-config.use_inductor=false",
+            ),
+            (
+                {"compilation_config.use_inductor": True},
+                "--compilation-config.use_inductor=true",
+            ),
+        ],
+    )
+    def test_nested_boolean_params(self, input_dict, expected):
+        """Test that nested boolean params use =true/false syntax."""
+        item = ParameterSweepItem.from_record(input_dict)
+        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
+        assert expected in cmd
+
+    @pytest.mark.parametrize(
+        "input_dict,expected",
+        [
+            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching"),
+            ({"enable_prefix_caching": True}, "--enable-prefix-caching"),
+            ({"disable_log_stats": False}, "--no-disable-log-stats"),
+            ({"disable_log_stats": True}, "--disable-log-stats"),
+        ],
+    )
+    def test_non_nested_boolean_params(self, input_dict, expected):
+        """Test that non-nested boolean params use --no- prefix."""
+        item = ParameterSweepItem.from_record(input_dict)
+        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
+        assert expected in cmd
+
+    @pytest.mark.parametrize(
+        "compilation_config",
+        [
+            {"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True},
+            {
+                "cudagraph_mode": "piecewise",
+                "mode": 3,
+                "use_inductor_graph_partition": False,
+            },
+        ],
+    )
+    def test_nested_dict_value(self, compilation_config):
+        """Test that nested dict values are serialized as JSON."""
+        item = ParameterSweepItem.from_record(
+            {"compilation_config": compilation_config}
+        )
+        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
+        assert "--compilation-config" in cmd
+        # The dict should be JSON serialized
+        idx = cmd.index("--compilation-config")
+        assert json.loads(cmd[idx + 1]) == compilation_config
+
+    @pytest.mark.parametrize(
+        "input_dict,expected_key,expected_value",
+        [
+            ({"model": "test-model"}, "--model", "test-model"),
+            ({"max_tokens": 100}, "--max-tokens", "100"),
+            ({"temperature": 0.7}, "--temperature", "0.7"),
+        ],
+    )
+    def test_string_and_numeric_values(self, input_dict, expected_key, expected_value):
+        """Test that string and numeric values are handled correctly."""
+        item = ParameterSweepItem.from_record(input_dict)
+        cmd = item.apply_to_cmd(["vllm", "serve"])
+        assert expected_key in cmd
+        assert expected_value in cmd
+
+    @pytest.mark.parametrize(
+        "input_dict,expected_key,key_idx_offset",
+        [
+            ({"max_tokens": 200}, "--max-tokens", 1),
+            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0),
+        ],
+    )
+    def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset):
+        """Test that existing parameters in cmd are replaced."""
+        item = ParameterSweepItem.from_record(input_dict)
+
+        if key_idx_offset == 1:
+            # Key-value pair
+            cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"])
+            assert expected_key in cmd
+            idx = cmd.index(expected_key)
+            assert cmd[idx + 1] == "200"
+            assert "100" not in cmd
+        else:
+            # Boolean flag
+            cmd = item.apply_to_cmd(
+                ["vllm", "serve", "--enable-prefix-caching", "model"]
+            )
+            assert expected_key in cmd
+            assert "--enable-prefix-caching" not in cmd
+
+
+class TestParameterSweep:
+    """Test ParameterSweep functionality."""
+
+    def test_from_records_list(self):
+        """Test creating ParameterSweep from a list of records."""
+        records = [
+            {"max_tokens": 100, "temperature": 0.7},
+            {"max_tokens": 200, "temperature": 0.9},
+        ]
+        sweep = ParameterSweep.from_records(records)
+        assert len(sweep) == 2
+        assert sweep[0]["max_tokens"] == 100
+        assert sweep[1]["max_tokens"] == 200
+
+    def test_read_from_dict(self):
+        """Test creating ParameterSweep from a dict format."""
+        data = {
+            "experiment1": {"max_tokens": 100, "temperature": 0.7},
+            "experiment2": {"max_tokens": 200, "temperature": 0.9},
+        }
+        sweep = ParameterSweep.read_from_dict(data)
+        assert len(sweep) == 2
+
+        # Check that items have the _benchmark_name field
+        names = {item["_benchmark_name"] for item in sweep}
+        assert names == {"experiment1", "experiment2"}
+
+        # Check that parameters are preserved
+        for item in sweep:
+            if item["_benchmark_name"] == "experiment1":
+                assert item["max_tokens"] == 100
+                assert item["temperature"] == 0.7
+            elif item["_benchmark_name"] == "experiment2":
+                assert item["max_tokens"] == 200
+                assert item["temperature"] == 0.9
+
+    def test_read_json_list_format(self):
+        """Test reading JSON file with list format."""
+        records = [
+            {"max_tokens": 100, "temperature": 0.7},
+            {"max_tokens": 200, "temperature": 0.9},
+        ]
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+            json.dump(records, f)
+            temp_path = Path(f.name)
+
+        try:
+            sweep = ParameterSweep.read_json(temp_path)
+            assert len(sweep) == 2
+            assert sweep[0]["max_tokens"] == 100
+            assert sweep[1]["max_tokens"] == 200
+        finally:
+            temp_path.unlink()
+
+    def test_read_json_dict_format(self):
+        """Test reading JSON file with dict format."""
+        data = {
+            "experiment1": {"max_tokens": 100, "temperature": 0.7},
+            "experiment2": {"max_tokens": 200, "temperature": 0.9},
+        }
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+            json.dump(data, f)
+            temp_path = Path(f.name)
+
+        try:
+            sweep = ParameterSweep.read_json(temp_path)
+            assert len(sweep) == 2
+
+            # Check that items have the _benchmark_name field
+            names = {item["_benchmark_name"] for item in sweep}
+            assert names == {"experiment1", "experiment2"}
+        finally:
+            temp_path.unlink()
+
+    def test_unique_benchmark_names_validation(self):
+        """Test that duplicate _benchmark_name values raise an error."""
+        # Test with duplicate names in list format
+        records = [
+            {"_benchmark_name": "exp1", "max_tokens": 100},
+            {"_benchmark_name": "exp1", "max_tokens": 200},
+        ]
+
+        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
+            ParameterSweep.from_records(records)
+
+    def test_unique_benchmark_names_multiple_duplicates(self):
+        """Test validation with multiple duplicate names."""
+        records = [
+            {"_benchmark_name": "exp1", "max_tokens": 100},
+            {"_benchmark_name": "exp1", "max_tokens": 200},
+            {"_benchmark_name": "exp2", "max_tokens": 300},
+            {"_benchmark_name": "exp2", "max_tokens": 400},
+        ]
+
+        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
+            ParameterSweep.from_records(records)
+
+    def test_no_benchmark_names_allowed(self):
+        """Test that records without _benchmark_name are allowed."""
+        records = [
+            {"max_tokens": 100, "temperature": 0.7},
+            {"max_tokens": 200, "temperature": 0.9},
+        ]
+        sweep = ParameterSweep.from_records(records)
+        assert len(sweep) == 2
+
+    def test_mixed_benchmark_names_allowed(self):
+        """Test that mixing records with and without _benchmark_name is allowed."""
+        records = [
+            {"_benchmark_name": "exp1", "max_tokens": 100},
+            {"max_tokens": 200, "temperature": 0.9},
+        ]
+        sweep = ParameterSweep.from_records(records)
+        assert len(sweep) == 2
+
+
+class TestParameterSweepItemKeyNormalization:
+    """Test key normalization in ParameterSweepItem."""
+
+    def test_underscore_to_hyphen_conversion(self):
+        """Test that underscores are converted to hyphens in CLI."""
+        item = ParameterSweepItem.from_record({"max_tokens": 100})
+        cmd = item.apply_to_cmd(["vllm", "serve"])
+        assert "--max-tokens" in cmd
+
+    def test_nested_key_preserves_suffix(self):
+        """Test that nested keys preserve the suffix format."""
+        # The suffix after the dot should preserve underscores
+        item = ParameterSweepItem.from_record(
+            {"compilation_config.some_nested_param": "value"}
+        )
+        cmd = item.apply_to_cmd(["vllm", "serve"])
+        # The prefix (compilation_config) gets converted to hyphens,
+        # but the suffix (some_nested_param) is preserved
+        assert any("compilation-config.some_nested_param" in arg for arg in cmd)
diff --git a/tests/benchmarks/test_plot_filters.py b/tests/benchmarks/test_plot_filters.py
new file mode 100644
index 000000000..2b58a9912
--- /dev/null
+++ b/tests/benchmarks/test_plot_filters.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pandas as pd
+import pytest
+
+from vllm.benchmarks.sweep.plot import (
+    PlotEqualTo,
+    PlotFilterBase,
+    PlotFilters,
+    PlotGreaterThan,
+    PlotGreaterThanOrEqualTo,
+    PlotLessThan,
+    PlotLessThanOrEqualTo,
+    PlotNotEqualTo,
+)
+
+
+class TestPlotFilters:
+    """Test PlotFilter functionality including 'inf' edge case."""
+
+    def setup_method(self):
+        """Create sample DataFrames for testing."""
+        # DataFrame with numeric values
+        self.df_numeric = pd.DataFrame(
+            {
+                "request_rate": [1.0, 5.0, 10.0, 50.0, 100.0],
+                "value": [10, 20, 30, 40, 50],
+            }
+        )
+
+        # DataFrame with float('inf') - note: string "inf" values are coerced
+        # to float when loading data, so we only test with float('inf')
+        self.df_inf_float = pd.DataFrame(
+            {
+                "request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")],
+                "value": [10, 20, 30, 40, 50],
+            }
+        )
+
+    @pytest.mark.parametrize(
+        "target,expected_count",
+        [
+            ("5.0", 1),
+            ("10.0", 1),
+            ("1.0", 1),
+        ],
+    )
+    def test_equal_to_numeric(self, target, expected_count):
+        """Test PlotEqualTo with numeric values."""
+        filter_obj = PlotEqualTo("request_rate", target)
+        result = filter_obj.apply(self.df_numeric)
+        assert len(result) == expected_count
+
+    def test_equal_to_inf_float(self):
+        """Test PlotEqualTo with float('inf')."""
+        filter_obj = PlotEqualTo("request_rate", "inf")
+        result = filter_obj.apply(self.df_inf_float)
+        # Should match both float('inf') entries because float('inf') == float('inf')
+        assert len(result) == 2
+
+    @pytest.mark.parametrize(
+        "target,expected_count",
+        [
+            ("5.0", 4),  # All except 5.0
+            ("1.0", 4),  # All except 1.0
+        ],
+    )
+    def test_not_equal_to_numeric(self, target, expected_count):
+        """Test PlotNotEqualTo with numeric values."""
+        filter_obj = PlotNotEqualTo("request_rate", target)
+        result = filter_obj.apply(self.df_numeric)
+        assert len(result) == expected_count
+
+    def test_not_equal_to_inf_float(self):
+        """Test PlotNotEqualTo with float('inf')."""
+        filter_obj = PlotNotEqualTo("request_rate", "inf")
+        result = filter_obj.apply(self.df_inf_float)
+        # Should exclude float('inf') entries
+        assert len(result) == 3
+
+    @pytest.mark.parametrize(
+        "target,expected_count",
+        [
+            ("10.0", 2),  # 1.0, 5.0
+            ("50.0", 3),  # 1.0, 5.0, 10.0
+            ("5.0", 1),  # 1.0
+        ],
+    )
+    def test_less_than(self, target, expected_count):
+        """Test PlotLessThan with numeric values."""
+        filter_obj = PlotLessThan("request_rate", target)
+        result = filter_obj.apply(self.df_numeric)
+        assert len(result) == expected_count
+
+    @pytest.mark.parametrize(
+        "target,expected_count",
+        [
+            ("10.0", 3),  # 1.0, 5.0, 10.0
+            ("5.0", 2),  # 1.0, 5.0
+        ],
+    )
+    def test_less_than_or_equal_to(self, target, expected_count):
+        """Test PlotLessThanOrEqualTo with numeric values."""
+        filter_obj = PlotLessThanOrEqualTo("request_rate", target)
+        result = filter_obj.apply(self.df_numeric)
+        assert len(result) == expected_count
+
+    @pytest.mark.parametrize(
+        "target,expected_count",
+        [
+            ("10.0", 2),  # 50.0, 100.0
+            ("5.0", 3),  # 10.0, 50.0, 100.0
+        ],
+    )
+    def test_greater_than(self, target, expected_count):
+        """Test PlotGreaterThan with numeric values."""
+        filter_obj = PlotGreaterThan("request_rate", target)
+        result = filter_obj.apply(self.df_numeric)
+        assert len(result) == expected_count
+
+    @pytest.mark.parametrize(
+        "target,expected_count",
+        [
+            ("10.0", 3),  # 10.0, 50.0, 100.0
+            ("5.0", 4),  # 5.0, 10.0, 50.0, 100.0
+        ],
+    )
+    def test_greater_than_or_equal_to(self, target, expected_count):
+        """Test PlotGreaterThanOrEqualTo with numeric values."""
+        filter_obj = PlotGreaterThanOrEqualTo("request_rate", target)
+        result = filter_obj.apply(self.df_numeric)
+        assert len(result) == expected_count
+
+    @pytest.mark.parametrize(
+        "filter_str,expected_var,expected_target,expected_type",
+        [
+            ("request_rate==5.0", "request_rate", "5.0", PlotEqualTo),
+            ("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo),
+            ("request_rate<50.0", "request_rate", "50.0", PlotLessThan),
+            ("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo),
+            ("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan),
+            ("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo),
+            ("request_rate==inf", "request_rate", "inf", PlotEqualTo),
+            ("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo),
+        ],
+    )
+    def test_parse_str(self, filter_str, expected_var, expected_target, expected_type):
+        """Test parsing filter strings."""
+        filter_obj = PlotFilterBase.parse_str(filter_str)
+        assert isinstance(filter_obj, expected_type)
+        assert filter_obj.var == expected_var
+        assert filter_obj.target == expected_target
+
+    def test_parse_str_inf_edge_case(self):
+        """Test parsing 'inf' string in filter."""
+        filter_obj = PlotFilterBase.parse_str("request_rate==inf")
+        assert isinstance(filter_obj, PlotEqualTo)
+        assert filter_obj.var == "request_rate"
+        assert filter_obj.target == "inf"
+
+    def test_parse_multiple_filters(self):
+        """Test parsing multiple filters."""
+        filters = PlotFilters.parse_str("request_rate>5.0,value<=40")
+        assert len(filters) == 2
+        assert isinstance(filters[0], PlotGreaterThan)
+        assert isinstance(filters[1], PlotLessThanOrEqualTo)
+
+    def test_parse_empty_filter(self):
+        """Test parsing empty filter string."""
+        filters = PlotFilters.parse_str("")
+        assert len(filters) == 0
diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
index 986561ed8..a438a3288 100644
--- a/vllm/benchmarks/sweep/param_sweep.py
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -9,8 +9,26 @@ class ParameterSweep(list["ParameterSweepItem"]):
     @classmethod
     def read_json(cls, filepath: os.PathLike):
         with open(filepath, "rb") as f:
-            records = json.load(f)
+            data = json.load(f)
 
+        # Support both list and dict formats
+        if isinstance(data, dict):
+            return cls.read_from_dict(data)
+
+        return cls.from_records(data)
+
+    @classmethod
+    def read_from_dict(cls, data: dict[str, dict[str, object]]):
+        """
+        Read parameter sweep from a dict format where keys are names.
+
+        Example:
+            {
+                "experiment1": {"max_tokens": 100, "temperature": 0.7},
+                "experiment2": {"max_tokens": 200, "temperature": 0.9}
+            }
+        """
+        records = [{"_benchmark_name": name, **params} for name, params in data.items()]
         return cls.from_records(records)
 
     @classmethod
@@ -21,6 +39,15 @@ class ParameterSweep(list["ParameterSweepItem"]):
                 f"but found type: {type(records)}"
             )
 
+        # Validate that all _benchmark_name values are unique if provided
+        names = [r["_benchmark_name"] for r in records if "_benchmark_name" in r]
+        if names and len(names) != len(set(names)):
+            duplicates = [name for name in names if names.count(name) > 1]
+            raise ValueError(
+                f"Duplicate _benchmark_name values found: {set(duplicates)}. "
+                f"All _benchmark_name values must be unique."
+            )
+
         return cls(ParameterSweepItem.from_record(record) for record in records)
 
 
@@ -38,6 +65,18 @@ class ParameterSweepItem(dict[str, object]):
     def __or__(self, other: dict[str, Any]):
         return type(self)(super().__or__(other))
 
+    @property
+    def name(self) -> str:
+        """
+        Get the name for this parameter sweep item.
+
+        Returns the '_benchmark_name' field if present, otherwise returns a text
+        representation of all parameters.
+        """
+        if "_benchmark_name" in self:
+            return self["_benchmark_name"]
+        return self.as_text(sep="-")
+
     # In JSON, we prefer "_"
     def _iter_param_key_candidates(self, param_key: str):
         # Inner config arguments are not converted by the CLI
@@ -63,29 +102,57 @@ class ParameterSweepItem(dict[str, object]):
     def has_param(self, param_key: str) -> bool:
         return any(k in self for k in self._iter_param_key_candidates(param_key))
 
+    def _normalize_cmd_kv_pair(self, k: str, v: object) -> list[str]:
+        """
+        Normalize a key-value pair into command-line arguments.
+
+        Returns a list containing either:
+        - A single element for boolean flags (e.g., ['--flag'] or ['--flag=true'])
+        - Two elements for key-value pairs (e.g., ['--key', 'value'])
+        """
+        if isinstance(v, bool):
+            # For nested params (containing "."), use =true/false syntax
+            if "." in k:
+                return [f"{self._normalize_cmd_key(k)}={'true' if v else 'false'}"]
+            else:
+                return [self._normalize_cmd_key(k if v else "no-" + k)]
+        else:
+            return [self._normalize_cmd_key(k), str(v)]
+
     def apply_to_cmd(self, cmd: list[str]) -> list[str]:
         cmd = list(cmd)
 
         for k, v in self.items():
+            # Skip the '_benchmark_name' field, not a parameter
+            if k == "_benchmark_name":
+                continue
+
+            # Serialize dict values as JSON
+            if isinstance(v, dict):
+                v = json.dumps(v)
+
             for k_candidate in self._iter_cmd_key_candidates(k):
                 try:
                     k_idx = cmd.index(k_candidate)
 
-                    if isinstance(v, bool):
-                        cmd[k_idx] = self._normalize_cmd_key(k if v else "no-" + k)
+                    # Replace existing parameter
+                    normalized = self._normalize_cmd_kv_pair(k, v)
+                    if len(normalized) == 1:
+                        # Boolean flag
+                        cmd[k_idx] = normalized[0]
                     else:
-                        cmd[k_idx + 1] = str(v)
+                        # Key-value pair
+                        cmd[k_idx] = normalized[0]
+                        cmd[k_idx + 1] = normalized[1]
 
                     break
                 except ValueError:
                     continue
             else:
-                if isinstance(v, bool):
-                    cmd.append(self._normalize_cmd_key(k if v else "no-" + k))
-                else:
-                    cmd.extend([self._normalize_cmd_key(k), str(v)])
+                # Add new parameter
+                cmd.extend(self._normalize_cmd_kv_pair(k, v))
 
         return cmd
 
     def as_text(self, sep: str = ", ") -> str:
-        return sep.join(f"{k}={v}" for k, v in self.items())
+        return sep.join(f"{k}={v}" for k, v in self.items() if k != "_benchmark_name")
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 9947d6170..163d51793 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -65,6 +65,18 @@ class PlotEqualTo(PlotFilterBase):
         return df[df[self.var] == target]
 
 
+@dataclass
+class PlotNotEqualTo(PlotFilterBase):
+    @override
+    def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+        try:
+            target = float(self.target)
+        except ValueError:
+            target = self.target
+
+        return df[df[self.var] != target]
+
+
 @dataclass
 class PlotLessThan(PlotFilterBase):
     @override
@@ -96,6 +108,7 @@ class PlotGreaterThanOrEqualTo(PlotFilterBase):
 # NOTE: The ordering is important! Match longer op_keys first
 PLOT_FILTERS: dict[str, type[PlotFilterBase]] = {
     "==": PlotEqualTo,
+    "!=": PlotNotEqualTo,
     "<=": PlotLessThanOrEqualTo,
     ">=": PlotGreaterThanOrEqualTo,
     "<": PlotLessThan,
@@ -167,6 +180,27 @@ def _json_load_bytes(path: Path) -> list[dict[str, object]]:
         return json.load(f)
 
 
+def _convert_inf_nan_strings(data: list[dict[str, object]]) -> list[dict[str, object]]:
+    """
+    Convert string values "inf", "-inf", and "nan" to their float equivalents.
+
+    This handles the case where JSON serialization represents inf/nan as strings.
+    """
+    converted_data = []
+    for record in data:
+        converted_record = {}
+        for key, value in record.items():
+            if isinstance(value, str):
+                if value in ["inf", "-inf", "nan"]:
+                    converted_record[key] = float(value)
+                else:
+                    converted_record[key] = value
+            else:
+                converted_record[key] = value
+        converted_data.append(converted_record)
+    return converted_data
+
+
 def _get_metric(run_data: dict[str, object], metric_key: str):
     try:
         return run_data[metric_key]
@@ -178,12 +212,15 @@ def _get_group(run_data: dict[str, object], group_keys: list[str]):
     return tuple((k, str(_get_metric(run_data, k))) for k in group_keys)
 
 
-def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]):
+def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...], fig_name: str):
     parts = list[str]()
+
+    # Start with figure name (always provided, defaults to "FIGURE")
+    parts.append(fig_name)
+
+    # Always append group data if present
     if group:
-        parts.extend(("FIGURE-", *(f"{k}={v}" for k, v in group)))
-    else:
-        parts.append("figure")
+        parts.extend(f"{k}={v}" for k, v in group)
 
     return fig_dir / sanitize_filename("-".join(parts) + ".png")
 
@@ -217,6 +254,10 @@ def _plot_fig(
     scale_x: str | None,
     scale_y: str | None,
     dry_run: bool,
+    fig_name: str,
+    error_bars: bool,
+    fig_height: float,
+    fig_dpi: int,
 ):
     fig_group, fig_data = fig_group_data
 
@@ -230,7 +271,7 @@ def _plot_fig(
         for _, row_data in row_groups
     )
 
-    fig_path = _get_fig_path(fig_dir, fig_group)
+    fig_path = _get_fig_path(fig_dir, fig_group, fig_name)
 
     print("[BEGIN FIGURE]")
     print(f"Group: {dict(fig_group)}")
@@ -241,6 +282,8 @@ def _plot_fig(
         print("[END FIGURE]")
         return
 
+    # Convert string "inf", "-inf", and "nan" to their float equivalents
+    fig_data = _convert_inf_nan_strings(fig_data)
     df = pd.DataFrame.from_records(fig_data)
 
     if var_x not in df.columns:
@@ -275,6 +318,10 @@ def _plot_fig(
     df = filter_by.apply(df)
     df = bin_by.apply(df)
 
+    # Sort by curve_by columns alphabetically for consistent legend ordering
+    if curve_by:
+        df = df.sort_values(by=curve_by)
+
     df["row_group"] = (
         pd.concat(
             [k + "=" + df[k].astype(str) for k in row_by],
@@ -293,7 +340,7 @@ def _plot_fig(
         else "(All)"
     )
 
-    g = sns.FacetGrid(df, row="row_group", col="col_group")
+    g = sns.FacetGrid(df, row="row_group", col="col_group", height=fig_height)
 
     if row_by and col_by:
         g.set_titles("{row_name}\n{col_name}")
@@ -320,6 +367,7 @@ def _plot_fig(
             style=style,
             size=size,
             markers=True,
+            errorbar="sd" if error_bars else None,
         )
 
         g.add_legend(title=hue)
@@ -339,11 +387,12 @@ def _plot_fig(
             y=var_y,
             hue="curve_group",
             markers=True,
+            errorbar="sd" if error_bars else None,
         )
 
         g.add_legend()
 
-    g.savefig(fig_path)
+    g.savefig(fig_path, dpi=fig_dpi)
     plt.close(g.figure)
 
     print("[END FIGURE]")
@@ -364,6 +413,10 @@ def plot(
     scale_x: str | None,
     scale_y: str | None,
     dry_run: bool,
+    fig_name: str = "FIGURE",
+    error_bars: bool = True,
+    fig_height: float = 6.4,
+    fig_dpi: int = 300,
 ):
     all_data = [
         run_data
@@ -398,6 +451,10 @@ def plot(
                     scale_x=scale_x,
                     scale_y=scale_y,
                     dry_run=dry_run,
+                    fig_name=fig_name,
+                    error_bars=error_bars,
+                    fig_height=fig_height,
+                    fig_dpi=fig_dpi,
                 ),
                 fig_groups,
             )
@@ -419,6 +476,10 @@ class SweepPlotArgs:
     scale_x: str | None
     scale_y: str | None
     dry_run: bool
+    fig_name: str = "FIGURE"
+    error_bars: bool = True
+    fig_height: float = 6.4
+    fig_dpi: int = 300
 
     parser_name: ClassVar[str] = "plot"
     parser_help: ClassVar[str] = "Plot performance curves from parameter sweep results."
@@ -448,6 +509,10 @@ class SweepPlotArgs:
             scale_x=args.scale_x,
             scale_y=args.scale_y,
             dry_run=args.dry_run,
+            fig_name=args.fig_name,
+            error_bars=not args.no_error_bars,
+            fig_height=args.fig_height,
+            fig_dpi=args.fig_dpi,
         )
 
     @classmethod
@@ -541,6 +606,32 @@ class SweepPlotArgs:
             "Currently only accepts string values such as 'log' and 'sqrt'. "
             "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html",
         )
+        parser.add_argument(
+            "--fig-name",
+            type=str,
+            default="FIGURE",
+            help="Name prefix for the output figure file. "
+            "Group data is always appended when present. "
+            "Default: 'FIGURE'. Example: --fig-name my_performance_plot",
+        )
+        parser.add_argument(
+            "--no-error-bars",
+            action="store_true",
+            help="If set, disables error bars on the plot. "
+            "By default, error bars are shown.",
+        )
+        parser.add_argument(
+            "--fig-height",
+            type=float,
+            default=6.4,
+            help="Height of each subplot in inches. Default: 6.4",
+        )
+        parser.add_argument(
+            "--fig-dpi",
+            type=int,
+            default=300,
+            help="Resolution of the output figure in dots per inch. Default: 300",
+        )
         parser.add_argument(
             "--dry-run",
             action="store_true",
@@ -566,6 +657,10 @@ def run_main(args: SweepPlotArgs):
         scale_x=args.scale_x,
         scale_y=args.scale_y,
         dry_run=args.dry_run,
+        fig_name=args.fig_name,
+        error_bars=args.error_bars,
+        fig_height=args.fig_height,
+        fig_dpi=args.fig_dpi,
     )
 
 
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 1298e4acb..6626707cf 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -138,9 +138,9 @@ def _get_comb_base_path(
 ):
     parts = list[str]()
     if serve_comb:
-        parts.extend(("SERVE-", serve_comb.as_text(sep="-")))
+        parts.extend(("SERVE-", serve_comb.name))
     if bench_comb:
-        parts.extend(("BENCH-", bench_comb.as_text(sep="-")))
+        parts.extend(("BENCH-", bench_comb.name))
 
     return output_dir / sanitize_filename("-".join(parts))
 
@@ -345,8 +345,9 @@ class SweepServeArgs:
             "--serve-params",
             type=str,
             default=None,
-            help="Path to JSON file containing a list of parameter combinations "
-            "for the `vllm serve` command. "
+            help="Path to JSON file containing parameter combinations "
+            "for the `vllm serve` command. Can be either a list of dicts or a dict "
+            "where keys are benchmark names. "
             "If both `serve_params` and `bench_params` are given, "
             "this script will iterate over their Cartesian product.",
         )
@@ -354,8 +355,9 @@ class SweepServeArgs:
             "--bench-params",
             type=str,
             default=None,
-            help="Path to JSON file containing a list of parameter combinations "
-            "for the `vllm bench serve` command. "
+            help="Path to JSON file containing parameter combinations "
+            "for the `vllm bench serve` command. Can be either a list of dicts or "
+            "a dict where keys are benchmark names. "
             "If both `serve_params` and `bench_params` are given, "
             "this script will iterate over their Cartesian product.",
         )
-- 
GitLab


From afb1e5b380ff623e478d19a246b42b2903b9331f Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Tue, 2 Dec 2025 14:46:10 -0600
Subject: [PATCH 014/258] [CI][ROCm][tests/v1/e2e] Fix multiprocessing launch
 for the test (#29123)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
---
 tests/v1/e2e/test_kv_sharing_fast_prefill.py | 22 +++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
index 2778b0c5e..f895fb72e 100644
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -7,6 +7,7 @@ import pytest
 
 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig, CompilationMode
+from vllm.platforms import current_platform
 
 from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts
 
@@ -43,15 +44,26 @@ def test_prompts():
     return prompts
 
 
-@fork_new_process_for_each_test
+use_fork_for_test = (
+    fork_new_process_for_each_test if not current_platform.is_rocm() else lambda x: x
+)
+
+
+@use_fork_for_test
 @pytest.mark.parametrize("kv_sharing_fast_prefill", [False, True])
 @pytest.mark.parametrize("enforce_eager", [True, False])
 def test_kv_sharing_fast_prefill(
     monkeypatch: pytest.MonkeyPatch,
     kv_sharing_fast_prefill: bool,
     enforce_eager: bool,
-    test_prompts: list[str],
 ):
+    if not enforce_eager and current_platform.is_rocm():
+        # Relevant context: https://github.com/vllm-project/vllm/pull/29244
+        pytest.skip(
+            "ROCm: torch.compile produces incorrect output for gemma-3n's GELU "
+            "with tanh approximation. Use enforce_eager=True instead."
+        )
+
     sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
     compilation_config = CompilationConfig(
         # This allows vLLM compilation backend to handle allocating and
@@ -65,7 +77,11 @@ def test_kv_sharing_fast_prefill(
 
     with monkeypatch.context() as m:
         # Make scheduling deterministic for reproducibility
-        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+        if current_platform.is_rocm():
+            # Use spawn to prevent cuda re-initialization error
+            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+        else:
+            m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
         prompts, answer, indices = prep_prompts(batch_size)
 
-- 
GitLab


From 1528e079e2b2cf8a807e4dce86ef05540e16a430 Mon Sep 17 00:00:00 2001
From: jthomson04 <jwillthomson19@gmail.com>
Date: Tue, 2 Dec 2025 13:25:52 -0800
Subject: [PATCH 015/258] [Perf] Avoid pageable HtoD transfer in
 MinTokensLogitsProcessor (#29826)

Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
---
 vllm/v1/sample/logits_processor/builtin.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py
index 4ee7dc288..82743f72b 100644
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -110,7 +110,7 @@ class MinPLogitsProcessor(LogitsProcessor):
         # Identify valid tokens using threshold comparison
         invalid_token_mask = probability_values < adjusted_min_p
         # Apply mask using boolean indexing
-        logits[invalid_token_mask] = -float("inf")
+        logits.masked_fill_(invalid_token_mask, -float("inf"))
         return logits
 
 
@@ -178,6 +178,10 @@ class MinTokensLogitsProcessor(LogitsProcessor):
             self._device_tensor([], torch.int32),
         )
 
+        self.neg_inf_tensor = torch.tensor(
+            -float("inf"), dtype=torch.float32, device=self.device
+        )
+
     def is_argmax_invariant(self) -> bool:
         """By censoring stop tokens, min-tokens can change the outcome
         of the argmax operation in greedy sampling."""
@@ -229,7 +233,7 @@ class MinTokensLogitsProcessor(LogitsProcessor):
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
         if self.min_toks:
             # Inhibit EOS token for requests which have not reached min length
-            logits[self.logits_slice] = -float("inf")
+            logits.index_put_(self.logits_slice, self.neg_inf_tensor)
         return logits
 
 
-- 
GitLab


From 3ff5b53bc2330688ea85d72ae79fe84eed63547c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 2 Dec 2025 21:29:32 +0000
Subject: [PATCH 016/258] Bump actions/setup-python from 6.0.0 to 6.1.0
 (#29768)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/cleanup_pr_body.yml | 2 +-
 .github/workflows/pre-commit.yml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 861290ea4..56fbe5ca7 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
         uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
         with:
           python-version: '3.12'
 
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index d5e70f30e..a03b979ad 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
       with:
         python-version: "3.12"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-- 
GitLab


From 6fc5841db14efedae7e6a8d1abdde3516c6c35a1 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 2 Dec 2025 21:49:44 +0000
Subject: [PATCH 017/258] Fix some more Transformers nightly tests (#29872)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 examples/offline_inference/vision_language.py |  5 +++-
 .../vision_language_multi_image.py            |  5 +++-
 tests/models/registry.py                      |  5 +++-
 vllm/model_executor/models/qwen2_vl.py        |  9 ------
 vllm/tokenizers/mistral.py                    | 28 +++++++++++++------
 vllm/transformers_utils/config.py             | 24 +++++++++++-----
 vllm/transformers_utils/configs/__init__.py   |  2 ++
 vllm/transformers_utils/configs/tarsier2.py   | 24 ++++++++++++++++
 8 files changed, 75 insertions(+), 27 deletions(-)
 create mode 100644 vllm/transformers_utils/configs/tarsier2.py

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 8f72bf6f0..0888a9d60 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1801,7 +1801,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_name,
         max_model_len=4096,
-        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+        hf_overrides={
+            "architectures": ["Tarsier2ForConditionalGeneration"],
+            "model_type": "tarsier2",
+        },
         limit_mm_per_prompt={modality: 1},
     )
 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 7ba4e64b5..2193b1ca9 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1222,7 +1222,10 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=32768,
         limit_mm_per_prompt={"image": len(image_urls)},
-        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+        hf_overrides={
+            "architectures": ["Tarsier2ForConditionalGeneration"],
+            "model_type": "tarsier2",
+        },
     )
 
     prompt = (
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 26351089f..6b1d24b1c 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -831,7 +831,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),
     "Tarsier2ForConditionalGeneration": _HfExamplesInfo(
         "omni-research/Tarsier2-Recap-7b",
-        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+        hf_overrides={
+            "architectures": ["Tarsier2ForConditionalGeneration"],
+            "model_type": "tarsier2",
+        },
     ),
     "VoxtralForConditionalGeneration": _HfExamplesInfo(
         "mistralai/Voxtral-Mini-3B-2507",
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8fbd89622..b74876849 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1576,15 +1576,6 @@ class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
         }
     )
 
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        # Tarsier2 uses llava as model_type, which will create a Qwen2VLConfig
-        # as text_config, we need to reconstruct Qwen2VLConfig from LlavaConfig.
-        config = vllm_config.model_config.hf_config
-        qwen2vl_config = config.text_config
-        qwen2vl_config.architectures = config.architectures
-        vllm_config.model_config.hf_config = qwen2vl_config
-        super().__init__(vllm_config=vllm_config, prefix=prefix)
-
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         skip_prefixes = []
         if self.visual is None:
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index 96d1e78ce..37d67607c 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -14,13 +14,19 @@ if TYPE_CHECKING:
     )
     from mistral_common.tokens.tokenizers.tekken import Tekkenizer
     from transformers import BatchEncoding
-    from transformers.tokenization_mistral_common import (
-        MistralCommonTokenizer as TransformersMistralTokenizer,
-    )
 
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
     from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 
+    try:
+        # Transformers v5
+        from transformers.tokenization_mistral_common import MistralCommonBackend
+    except ImportError:
+        # Transformers v4
+        from transformers.tokenization_mistral_common import (
+            MistralCommonTokenizer as MistralCommonBackend,
+        )
+
 logger = init_logger(__name__)
 
 
@@ -208,11 +214,17 @@ class MistralTokenizer(TokenizerLike):
         **kwargs,
     ) -> "MistralTokenizer":
         from mistral_common.protocol.instruct.validator import ValidationMode
-        from transformers.tokenization_mistral_common import (
-            MistralCommonTokenizer as TransformersMistralTokenizer,
-        )
 
-        tokenizer = TransformersMistralTokenizer.from_pretrained(
+        try:
+            # Transformers v5
+            from transformers.tokenization_mistral_common import MistralCommonBackend
+        except ImportError:
+            # Transformers v4
+            from transformers.tokenization_mistral_common import (
+                MistralCommonTokenizer as MistralCommonBackend,
+            )
+
+        tokenizer = MistralCommonBackend.from_pretrained(
             path_or_repo_id,
             *args,
             mode=ValidationMode.test,
@@ -223,7 +235,7 @@ class MistralTokenizer(TokenizerLike):
 
         return cls(tokenizer)
 
-    def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
+    def __init__(self, tokenizer: "MistralCommonBackend") -> None:
         super().__init__()
 
         from mistral_common.protocol.instruct.validator import ValidationMode
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 0cceab90b..2911dcff2 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -89,6 +89,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     step3_text="Step3TextConfig",
     qwen3_next="Qwen3NextConfig",
     lfm2_moe="Lfm2MoeConfig",
+    tarsier2="Tarsier2Config",
 )
 
 _CONFIG_ATTRS_MAPPING: dict[str, str] = {
@@ -127,6 +128,9 @@ class HFConfigParser(ConfigParserBase):
                 if config_dict.get("speculators_config") is not None
                 else model_type
             )
+        # Allow hf_overrides to override model_type before checking _CONFIG_REGISTRY
+        if (hf_overrides := kwargs.pop("hf_overrides", None)) is not None:
+            model_type = hf_overrides.get("model_type", model_type)
 
         if model_type in _CONFIG_REGISTRY:
             config_class = _CONFIG_REGISTRY[model_type]
@@ -310,7 +314,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
             config.rope_parameters["rope_theta"] = rope_theta
 
     # No RoPE parameters to patch
-    if not hasattr(config, "rope_parameters"):
+    if getattr(config, "rope_parameters", None) is None:
         return
 
     # Add original_max_position_embeddings if present
@@ -351,7 +355,10 @@ def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
         rope_parameters["rope_type"] = "longrope"
         logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
     elif rope_parameters["rope_type"] == "mrope":
-        assert "mrope_section" in rope_parameters
+        if "mrope_section" not in rope_parameters:
+            raise ValueError(
+                "Legacy rope_type 'mrope' requires 'mrope_section' in rope_parameters"
+            )
         rope_parameters["rope_type"] = "default"
         logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
 
@@ -584,6 +591,7 @@ def get_config(
         trust_remote_code=trust_remote_code,
         revision=revision,
         code_revision=code_revision,
+        hf_overrides=hf_overrides_kw,
         **kwargs,
     )
     # Special architecture mapping check for GGUF models
@@ -915,11 +923,13 @@ def get_hf_text_config(config: PretrainedConfig):
     """
     text_config = config.get_text_config()
 
-    if text_config is not config:
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(text_config, "num_attention_heads")
+    if text_config is not config and not hasattr(text_config, "num_attention_heads"):
+        raise ValueError(
+            "The text_config extracted from the model config does not have "
+            "`num_attention_heads` attribute. This indicates a mismatch "
+            "between the model config and vLLM's expectations. Please "
+            "ensure that the model config is compatible with vLLM."
+        )
 
     return text_config
 
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 109f2b698..0e8d16788 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -48,6 +48,7 @@ from vllm.transformers_utils.configs.step3_vl import (
     Step3VisionEncoderConfig,
     Step3VLConfig,
 )
+from vllm.transformers_utils.configs.tarsier2 import Tarsier2Config
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
 __all__ = [
@@ -81,4 +82,5 @@ __all__ = [
     "Step3VisionEncoderConfig",
     "Step3TextConfig",
     "Qwen3NextConfig",
+    "Tarsier2Config",
 ]
diff --git a/vllm/transformers_utils/configs/tarsier2.py b/vllm/transformers_utils/configs/tarsier2.py
new file mode 100644
index 000000000..12ebb4b7f
--- /dev/null
+++ b/vllm/transformers_utils/configs/tarsier2.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import Qwen2VLConfig
+
+
+class Tarsier2Config(Qwen2VLConfig):
+    """
+    Tarsier2's config.json is written such that AutoConfig.from_pretrained will create
+    a deeply nested config consisting of:
+
+    - LlavaConfig
+      - Qwen2VLConfig
+        - Qwen2VLTextConfig
+        - Qwen2VLVisionConfig
+      - Qwen2VLConfig
+        - Qwen2VLTextConfig
+        - Qwen2VLVisionConfig
+
+    When it should really just be a single Qwen2VLConfig.
+
+    This class is a hack to stop AutoConfig from creating the nested config structure.
+    """
+
+    model_type = "tarsier2"
-- 
GitLab


From e6f114ac25967b073954f7f3dc733672d173124c Mon Sep 17 00:00:00 2001
From: Sage Moore <sage@neuralmagic.com>
Date: Tue, 2 Dec 2025 14:20:22 -0800
Subject: [PATCH 018/258] [Bugfix][EPLB] Prevent user-provided EPLB config from
 being overwritten with defaults (#29911)

Signed-off-by: Sage Moore <sage@neuralmagic.com>
---
 tests/distributed/test_eplb_spec_decode.py | 16 +++++++++-------
 vllm/engine/arg_utils.py                   | 14 --------------
 2 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
index c055b7a3f..868cc7028 100644
--- a/tests/distributed/test_eplb_spec_decode.py
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -22,7 +22,14 @@ def get_model_args(
         "num_speculative_tokens": 1,
         "max_model_len": model_max_len,
     }
-
+    eplb_config = {
+        "num_redundant_experts": tp_size,
+        "window_size": 128,
+        "step_interval": 1024,
+        "log_balancedness": False,
+    }
+    if use_async:
+        eplb_config["use_async"] = True
     model_args = {
         "pretrained": model_name,
         "dtype": "auto",
@@ -31,15 +38,10 @@ def get_model_args(
         "gpu_memory_utilization": 0.7,
         "speculative_config": speculative_config,
         "enable_expert_parallel": True,
-        "num_redundant_experts": tp_size,
-        "eplb_window_size": 128,
-        "eplb_step_interval": 1024,
-        "eplb_log_balancedness": False,
+        "eplb_config": eplb_config,
         "enable_eplb": True,
         "max_model_len": model_max_len,
     }
-    if use_async:
-        model_args["eplb_config"] = {"use_async": True}
     return model_args
 
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 83029e09c..096217da4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -421,10 +421,6 @@ class EngineArgs:
     )
     _api_process_count: int = ParallelConfig._api_process_count
     _api_process_rank: int = ParallelConfig._api_process_rank
-    num_redundant_experts: int = EPLBConfig.num_redundant_experts
-    eplb_window_size: int = EPLBConfig.window_size
-    eplb_step_interval: int = EPLBConfig.step_interval
-    eplb_log_balancedness: bool = EPLBConfig.log_balancedness
     max_parallel_loading_workers: int | None = (
         ParallelConfig.max_parallel_loading_workers
     )
@@ -1582,16 +1578,6 @@ class EngineArgs:
             )
             self.disable_nccl_for_dp_synchronization = True
 
-        # Forward the deprecated CLI args to the EPLB config.
-        if self.num_redundant_experts is not None:
-            self.eplb_config.num_redundant_experts = self.num_redundant_experts
-        if self.eplb_window_size is not None:
-            self.eplb_config.window_size = self.eplb_window_size
-        if self.eplb_step_interval is not None:
-            self.eplb_config.step_interval = self.eplb_step_interval
-        if self.eplb_log_balancedness is not None:
-            self.eplb_config.log_balancedness = self.eplb_log_balancedness
-
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
-- 
GitLab


From 0a9caca9f5e130acbf39d5acd0b79fb492d6c4a3 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Wed, 3 Dec 2025 06:42:28 +0800
Subject: [PATCH 019/258] [Bugfix] fix --scheduling-policy=priority & n>1
 crashes engine (#29764)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
---
 .../v1/core/test_priority_scheduler_random.py | 12 +++++++++-
 vllm/v1/core/sched/request_queue.py           | 24 ++++++++-----------
 vllm/v1/request.py                            | 13 ++++++++++
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py
index b4805be80..429b179b6 100644
--- a/tests/v1/core/test_priority_scheduler_random.py
+++ b/tests/v1/core/test_priority_scheduler_random.py
@@ -219,7 +219,17 @@ def test_priority_scheduling_blast(
             vllm_config=scheduler.vllm_config,
         )
         scheduler.add_request(req)
-
+    num_initial_requests = 2
+    for _ in range(num_initial_requests):
+        req = _create_random_request(
+            max_tokens_range=(1, max_output_tokens),
+            num_tokens_range=(1, max_input_tokens),
+            arrival_time_range=(0, 0),
+            priority_range=(4, 4),
+            num_mm_item_range=(0, 2),
+            vllm_config=scheduler.vllm_config,
+        )
+        scheduler.add_request(req)
     for _ in range(20000):
         if len(scheduler.waiting) == 0:
             num_new_requests = random.randint(0, 2)
diff --git a/vllm/v1/core/sched/request_queue.py b/vllm/v1/core/sched/request_queue.py
index 7bc1010db..a00ca1912 100644
--- a/vllm/v1/core/sched/request_queue.py
+++ b/vllm/v1/core/sched/request_queue.py
@@ -137,31 +137,30 @@ class PriorityRequestQueue(RequestQueue):
     """
     A priority queue that supports heap operations.
 
-    Requests with a smaller value of `priority` are processed first.
+    Respects the ordering defined in the Request class, where
+    requests with a smaller value of `priority` are processed first.
     If multiple requests have the same priority, the one with the earlier
     `arrival_time` is processed first.
     """
 
     def __init__(self) -> None:
-        self._heap: list[tuple[int, float, Request]] = []
+        self._heap: list[Request] = []
 
     def add_request(self, request: Request) -> None:
         """Add a request to the queue according to priority policy."""
-        heapq.heappush(self._heap, (request.priority, request.arrival_time, request))
+        heapq.heappush(self._heap, request)
 
     def pop_request(self) -> Request:
         """Pop a request from the queue according to priority policy."""
         if not self._heap:
             raise IndexError("pop from empty heap")
-        _, _, request = heapq.heappop(self._heap)
-        return request
+        return heapq.heappop(self._heap)
 
     def peek_request(self) -> Request:
         """Peek at the next request in the queue without removing it."""
         if not self._heap:
             raise IndexError("peek from empty heap")
-        _, _, request = self._heap[0]
-        return request
+        return self._heap[0]
 
     def prepend_request(self, request: Request) -> None:
         """Add a request to the queue according to priority policy.
@@ -180,15 +179,13 @@ class PriorityRequestQueue(RequestQueue):
 
     def remove_request(self, request: Request) -> None:
         """Remove a specific request from the queue."""
-        self._heap = [(p, t, r) for p, t, r in self._heap if r != request]
+        self._heap.remove(request)
         heapq.heapify(self._heap)
 
     def remove_requests(self, requests: Iterable[Request]) -> None:
         """Remove multiple specific requests from the queue."""
-        requests_to_remove = set(requests)
-        self._heap = [
-            (p, t, r) for p, t, r in self._heap if r not in requests_to_remove
-        ]
+        requests_to_remove = requests if isinstance(requests, set) else set(requests)
+        self._heap = [r for r in self._heap if r not in requests_to_remove]
         heapq.heapify(self._heap)
 
     def __bool__(self) -> bool:
@@ -203,8 +200,7 @@ class PriorityRequestQueue(RequestQueue):
         """Iterate over the queue according to priority policy."""
         heap_copy = self._heap[:]
         while heap_copy:
-            _, _, request = heapq.heappop(heap_copy)
-            yield request
+            yield heapq.heappop(heap_copy)
 
     def __reversed__(self) -> Iterator[Request]:
         """Iterate over the queue in reverse priority order."""
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index f2dfd2eed..33762fe34 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -227,6 +227,19 @@ class Request:
         events, self.events = self.events, []
         return events
 
+    def __lt__(self, other: "Request") -> bool:
+        """
+        Compare two requests based on priority, arrival time, and request ID.
+        Used in priority scheduling.
+        """
+        if self.priority != other.priority:
+            return self.priority < other.priority
+        if self.arrival_time != other.arrival_time:
+            return self.arrival_time < other.arrival_time
+        if self.request_id != other.request_id:
+            return self.request_id < other.request_id
+        return id(self) < id(other)
+
 
 class RequestStatus(enum.IntEnum):
     """Status of a request."""
-- 
GitLab


From 5e5646e2064f925f97ff533aa688a43834e9ff96 Mon Sep 17 00:00:00 2001
From: Julien Denize <40604584+juliendenize@users.noreply.github.com>
Date: Tue, 2 Dec 2025 23:51:20 +0100
Subject: [PATCH 020/258] [BUGFIX] llama_4_scaling wrongly passed to
 DeepseekAttention (#29908)

Signed-off-by: juliendenize <julien.denize@mistral.ai>
---
 vllm/model_executor/models/deepseek_v2.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index d8a081af1..a8eb4a69b 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1135,6 +1135,8 @@ class DeepseekV2DecoderLayer(nn.Module):
             dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
         )
 
+        self.use_mha = use_mha
+
         if use_mha:
             attn_cls = DeepseekAttention
         elif model_config.use_mla:
@@ -1196,11 +1198,14 @@ class DeepseekV2DecoderLayer(nn.Module):
             hidden_states = self.input_layernorm(hidden_states)
         else:
             hidden_states, residual = self.input_layernorm(hidden_states, residual)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-            llama_4_scaling=llama_4_scaling,
-        )
+
+        attn_kwargs = {
+            "positions": positions,
+            "hidden_states": hidden_states,
+        }
+        if not self.use_mha:
+            attn_kwargs["llama_4_scaling"] = llama_4_scaling
+        hidden_states = self.self_attn(**attn_kwargs)
 
         if (
             not isinstance(self.self_attn, DeepseekAttention)
-- 
GitLab


From 1b1e35aaf9d9561e1b5bf5b8e08b03565188e537 Mon Sep 17 00:00:00 2001
From: Julien Denize <40604584+juliendenize@users.noreply.github.com>
Date: Tue, 2 Dec 2025 23:51:58 +0100
Subject: [PATCH 021/258] [BUGFIX] Fix regex pattern for Mistral Tool Call
 (#29918)

Signed-off-by: juliendenize <julien.denize@mistral.ai>
---
 .../language/generation/test_mistral.py       | 35 +++++++++++++++++++
 .../tool_parsers/mistral_tool_parser.py       |  2 +-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index 1377776a6..e2d6271e2 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -315,3 +315,38 @@ def test_mistral_function_call_nested_json():
     assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
     # No additional content outside the tool call should be returned.
     assert parsed.content is None
+
+    # multiple calls
+    multiple_args_dict = [
+        {
+            "city": "Dallas",
+            "state": "TX",
+            "unit": "fahrenheit",
+            "sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
+        },
+        {},
+        {"a": 0},
+        {"a": 1, "b": "c"},
+    ]
+    names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
+
+    model_output = "".join(
+        [
+            f"{parser.bot_token}{name}{json.dumps(args)}"
+            for name, args in zip(names, multiple_args_dict)
+        ]
+    )
+
+    parsed = parser.extract_tool_calls(model_output, None)
+
+    # Assertions: the tool call is detected and the full nested JSON is parsed
+    # without truncation.
+    assert parsed.tools_called
+    assert len(parsed.tool_calls) == len(multiple_args_dict)
+
+    for i, tool_call in enumerate(parsed.tool_calls):
+        assert MistralToolCall.is_valid_id(tool_call.id)
+        assert tool_call.function.name == names[i]
+        assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
+        # No additional content outside the tool call should be returned.
+        assert parsed.content is None
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index 89b882d6c..b89db6054 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -80,7 +80,7 @@ class MistralToolParser(ToolParser):
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
         if _is_fn_name_regex_support(self.model_tokenizer):
             self.fn_name_regex = re.compile(
-                r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)?", re.DOTALL
+                r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL
             )
         else:
             self.fn_name_regex = None
-- 
GitLab


From c014de1ec777554d2954655bd564493476d92061 Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Tue, 2 Dec 2025 16:54:36 -0600
Subject: [PATCH 022/258] [ROCm][CI] Fix test_cudagraph_mode.py Failure For AMD
 CI (#29808)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 tests/v1/cudagraph/test_cudagraph_mode.py | 40 ++++++++---------------
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py
index 12621d493..b1895e83b 100644
--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -100,32 +100,20 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
 
 # test cudagraph_mode with different compilation mode.
 # (backend_name, cudagraph_mode, compilation_mode, supported)
-if current_platform.is_rocm():
-    combo_cases_2 = [
-        ("RocmAttn", "FULL", CompilationMode.NONE, True),
-        ("RocmAttn", "FULL", CompilationMode.VLLM_COMPILE, True),
-        ("RocmAttn", "PIECEWISE", CompilationMode.NONE, False),
-        ("RocmAttn", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
-        ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
-        ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
-        ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
-        ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
-        ("RocmAttn", "NONE", CompilationMode.NONE, True),
-        ("RocmAttn", "NONE", CompilationMode.VLLM_COMPILE, True),
-    ]
-else:
-    combo_cases_2 = [
-        ("FA2", "FULL", CompilationMode.NONE, True),
-        ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
-        ("FA2", "PIECEWISE", CompilationMode.NONE, True),
-        ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
-        ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, True),
-        ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
-        ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
-        ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
-        ("FA2", "NONE", CompilationMode.NONE, True),
-        ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
-    ]
+attn_backend = "RocmAttn" if current_platform.is_rocm() else "FA2"
+
+combo_cases_2 = [
+    (attn_backend, "FULL", CompilationMode.NONE, True),
+    (attn_backend, "FULL", CompilationMode.VLLM_COMPILE, True),
+    (attn_backend, "PIECEWISE", CompilationMode.NONE, True),
+    (attn_backend, "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.NONE, True),
+    (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    (attn_backend, "FULL_DECODE_ONLY", CompilationMode.NONE, True),
+    (attn_backend, "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
+    (attn_backend, "NONE", CompilationMode.NONE, True),
+    (attn_backend, "NONE", CompilationMode.VLLM_COMPILE, True),
+]
 
 
 @pytest.mark.parametrize(
-- 
GitLab


From 5d91d2b292be9b1d6b121d36d242d5077a031e4b Mon Sep 17 00:00:00 2001
From: maang-h <55082429+maang-h@users.noreply.github.com>
Date: Wed, 3 Dec 2025 07:23:09 +0800
Subject: [PATCH 023/258] [Doc] Add allocate_slots parameter docs (#29777)

Signed-off-by: maang <maang_h@163.com>
Signed-off-by: maang-h <55082429+maang-h@users.noreply.github.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/v1/core/kv_cache_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 382338488..33e8c8151 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -230,6 +230,9 @@ class KVCacheManager:
             delay_cache_blocks: Whether to skip caching the blocks. This is
                 used by P/D when allocating blocks used in a KV transfer
                 which will complete in a future step.
+            num_encoder_tokens: The number of encoder tokens to allocate for
+                cross-attention in encoder-decoder models(e.g., Whisper).
+                For decoder-only models, this should be 0.
 
         Blocks layout:
         ```
-- 
GitLab


From 5f67361fd12851bfe8faad4cc173ca24565611e4 Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Tue, 2 Dec 2025 18:40:02 -0600
Subject: [PATCH 024/258] Reverting re-direction to amd_mi355_X. (#29914)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
---
 .buildkite/test-amd.yaml | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 67088caa8..ee4fdebae 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -51,7 +51,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker Test # 10min
   timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -64,7 +64,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -99,7 +99,7 @@ steps:
 - label: Basic Correctness Test # 20min
   timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   # grade: Blocking
   fast_check: true
   torch_nightly: true
@@ -116,7 +116,7 @@ steps:
 
 - label: Entrypoints Unit Tests # 5min
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   grade: Blocking
   timeout_in_minutes: 10
   working_dir: "/vllm-workspace/tests"
@@ -131,7 +131,7 @@ steps:
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -254,7 +254,7 @@ steps:
 
 - label: EPLB Algorithm Test # 5min
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   grade: Blocking
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -266,7 +266,7 @@ steps:
 
 - label: EPLB Execution Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi355_4
+  agent_pool: mi325_4
   # grade: Blocking
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
@@ -281,7 +281,7 @@ steps:
 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi355_2
+  agent_pool: mi325_2
   # grade: Blocking
   num_gpus: 2
   source_file_dependencies:
@@ -301,7 +301,7 @@ steps:
 - label: Regression Test # 7min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -343,7 +343,7 @@ steps:
 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -544,7 +544,7 @@ steps:
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi355_1
+  agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
-- 
GitLab


From 5cdd66450910589c8e1a3d25e80711b0b6e51eb1 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 2 Dec 2025 19:56:54 -0500
Subject: [PATCH 025/258] [BugFix] Fix assert in `build_for_cudagraph_capture`
 (#29893)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ee28f477a..8c22ada02 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -4000,7 +4000,7 @@ class GPUModelRunner(
                 num_reqs=num_reqs_padded,
                 max_query_len=max_query_len,
                 ubatch_slices=ubatch_slices,
-                for_cudagraph_capture=True,
+                for_cudagraph_capture=is_graph_capturing,
             )
 
         with self.maybe_dummy_run_with_lora(
-- 
GitLab


From 4dd79783744adbfdc86f9454bffb5a92715a7f61 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Tue, 2 Dec 2025 18:33:45 -0800
Subject: [PATCH 026/258] [Bugfix] Fix regression on pooling models from
 PR#29621 (#29921)

Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/multimodal/parse.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 0d3b8289e..650368dcb 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -134,11 +134,17 @@ class EmbeddingItems(
     or a list of embedding tensors (one per item).
     """
 
+    def _unwrap(
+        self, item: torch.Tensor | MediaWithBytes[torch.Tensor]
+    ) -> torch.Tensor:
+        """Extract media from wrapper if present."""
+        return item.media if isinstance(item, MediaWithBytes) else item
+
     def get_count(self) -> int:
         return len(self.data)
 
     def get(self, index: int) -> torch.Tensor:
-        return self.data[index]
+        return self._unwrap(self.data[index])
 
     def get_processor_data(self) -> Mapping[str, object]:
         return {}
-- 
GitLab


From 506ed87e876e9dca3c64ac83e21051c02e9cb2e3 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Tue, 2 Dec 2025 20:36:49 -0600
Subject: [PATCH 027/258] [ROCm][CI][Bugfix] Disable Flash/MemEfficient SDP on
 ROCm to avoid HF Transformers accuracy issues (#29909)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm                        |  6 +-----
 requirements/rocm-test.txt                    |  4 ++--
 .../models/multimodal/generation/conftest.py  | 19 +++++++++++++++++++
 3 files changed, 22 insertions(+), 7 deletions(-)
 create mode 100644 tests/models/multimodal/generation/conftest.py

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 4aabe2661..1b6bdabc7 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -65,7 +65,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
-# Centralized v1 package - copied to both test and final stages
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
 
 # -----------------------
@@ -98,7 +97,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
-# Copy in the v1 package
+# Copy in the v1 package (for python-only install test group)
 COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
 
 # Source code is used in the `python_only_compile.sh` test
@@ -130,9 +129,6 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     && pip uninstall -y vllm \
     && uv pip install --system *.whl
 
-# Copy in the v1 package
-COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
-
 ARG COMMON_WORKDIR
 
 # Copy over the benchmark scripts as well
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index ae61d4c6c..394728b67 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -70,8 +70,8 @@ torchgeo==0.7.0
 mteb==2.1.2
 
 # Data processing
-xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@eafd4db51b78acc64b3f0764ef27dfd206c28628
-    # Test async scheduling
+xgrammar==0.1.27
+# Test async scheduling
 
 # Utilities
 num2words==0.5.14
diff --git a/tests/models/multimodal/generation/conftest.py b/tests/models/multimodal/generation/conftest.py
new file mode 100644
index 000000000..ee3ecdb10
--- /dev/null
+++ b/tests/models/multimodal/generation/conftest.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM tests."""
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_configure(config):
+    """Disable Flash/MemEfficient SDP on ROCm to avoid HF
+    Transformers accuracy issues.
+    """
+    if not current_platform.is_rocm():
+        return
+
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
-- 
GitLab


From d7284a2604ef3fe96f0779309caafb59860704bb Mon Sep 17 00:00:00 2001
From: Arpit Khandelwal <60464796+arpitkh101@users.noreply.github.com>
Date: Tue, 2 Dec 2025 22:38:55 -0500
Subject: [PATCH 028/258] [Core] Rename PassConfig flags as per RFC #27995
 (#29646)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: arpitkh101 <arpit5khandelwal@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
---
 tests/compile/distributed/test_async_tp.py    |   4 +-
 .../distributed/test_fusion_all_reduce.py     |   2 +-
 tests/compile/distributed/test_fusions_e2e.py |  16 +--
 .../distributed/test_sequence_parallelism.py  |  18 +--
 tests/compile/fullgraph/test_full_graph.py    |   4 +-
 tests/compile/test_config.py                  |  77 +++++++++--
 tests/compile/test_functionalization.py       |   6 +-
 tests/compile/test_fusion.py                  |   4 +-
 tests/compile/test_fusion_attn.py             |   2 +-
 tests/compile/test_noop_elimination.py        |   4 +-
 tests/compile/test_pass_manager.py            |   7 +-
 tests/compile/test_qk_norm_rope_fusion.py     |   2 +-
 tests/compile/test_silu_mul_quant_fusion.py   |   2 +-
 tests/distributed/test_sequence_parallel.py   |  34 +++--
 tests/test_config.py                          |  12 +-
 vllm/compilation/pass_manager.py              |  13 +-
 vllm/config/compilation.py                    | 120 +++++++++++++++---
 vllm/config/utils.py                          |  29 +++++
 vllm/config/vllm.py                           |  76 ++++++-----
 vllm/v1/worker/gpu_model_runner.py            |   5 +-
 vllm/v1/worker/gpu_worker.py                  |   2 +-
 vllm/v1/worker/utils.py                       |   2 +-
 22 files changed, 318 insertions(+), 123 deletions(-)

diff --git a/tests/compile/distributed/test_async_tp.py b/tests/compile/distributed/test_async_tp.py
index 86d409f1e..2eb18e25c 100644
--- a/tests/compile/distributed/test_async_tp.py
+++ b/tests/compile/distributed/test_async_tp.py
@@ -326,7 +326,7 @@ def async_tp_pass_on_test_model(
     vllm_config = VllmConfig()
     vllm_config.compilation_config = CompilationConfig(
         pass_config=PassConfig(
-            enable_async_tp=True,
+            fuse_gemm_comms=True,
         ),
     )
     vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
@@ -413,7 +413,7 @@ def test_async_tp_pass_correctness(
         "mode": CompilationMode.VLLM_COMPILE,
         "compile_sizes": [2, 4, 8],
         "splitting_ops": [],
-        "pass_config": {"enable_async_tp": async_tp_enabled},
+        "pass_config": {"fuse_gemm_comms": async_tp_enabled},
     }
 
     async_tp_args = [
diff --git a/tests/compile/distributed/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py
index d401d5703..fc8d1f98e 100644
--- a/tests/compile/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/distributed/test_fusion_all_reduce.py
@@ -295,7 +295,7 @@ def all_reduce_fusion_pass_on_test_model(
         )
     )
     vllm_config.compilation_config.pass_config = PassConfig(
-        enable_fi_allreduce_fusion=True, enable_noop=True
+        fuse_allreduce_rms=True, eliminate_noops=True
     )
     vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
     vllm_config.parallel_config.rank = local_rank  # Setup rank for debug path
diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 661172e19..5d2786e12 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -192,7 +192,7 @@ def test_attn_quant(
         splitting_ops=splitting_ops,
         # Common
         mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
+        pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
         # Inductor caches custom passes by default as well via uuid
         inductor_compile_config={"force_disable_caches": True},
     )
@@ -282,9 +282,9 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
         # Common
         mode=CompilationMode.VLLM_COMPILE,
         pass_config=PassConfig(
-            enable_attn_fusion=True,
-            enable_noop=True,
-            enable_fi_allreduce_fusion=True,
+            fuse_attn_quant=True,
+            eliminate_noops=True,
+            fuse_allreduce_rms=True,
         ),
         # Inductor caches custom passes by default as well via uuid
         inductor_compile_config={"force_disable_caches": True},
@@ -384,10 +384,10 @@ def test_tp2_attn_quant_async_tp(
         # Common
         level=CompilationMode.VLLM_COMPILE,
         pass_config=PassConfig(
-            enable_attn_fusion=True,
-            enable_noop=True,
-            enable_sequence_parallelism=True,
-            enable_async_tp=True,
+            fuse_attn_quant=True,
+            eliminate_noops=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
         ),
         # Inductor caches custom passes by default as well via uuid
         inductor_compile_config={"force_disable_caches": True},
diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py
index 30084dfd5..d9fdc3acc 100644
--- a/tests/compile/distributed/test_sequence_parallelism.py
+++ b/tests/compile/distributed/test_sequence_parallelism.py
@@ -153,7 +153,7 @@ class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
         ]
 
     def ops_in_model(self):
-        if self.vllm_config.compilation_config.pass_config.enable_fusion:
+        if self.vllm_config.compilation_config.pass_config.fuse_norm_quant:
             return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
         elif RMSNorm.enabled():
             return [
@@ -183,7 +183,7 @@ class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("enable_fusion", [True, False])
+@pytest.mark.parametrize("fuse_norm_quant", [True, False])
 @pytest.mark.parametrize("dynamic", [False, True])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_sequence_parallelism_pass(
@@ -193,7 +193,7 @@ def test_sequence_parallelism_pass(
     seq_len: int,
     hidden_size: int,
     dtype: torch.dtype,
-    enable_fusion: bool,
+    fuse_norm_quant: bool,
     dynamic: bool,
 ):
     num_processes = 2
@@ -211,7 +211,7 @@ def test_sequence_parallelism_pass(
                 seq_len,
                 hidden_size,
                 dtype,
-                enable_fusion,
+                fuse_norm_quant,
                 dynamic,
             ),
             nprocs=nprocs,
@@ -229,7 +229,7 @@ def sequence_parallelism_pass_on_test_model(
     seq_len: int,
     hidden_size: int,
     dtype: torch.dtype,
-    enable_fusion: bool,
+    fuse_norm_quant: bool,
     dynamic: bool,
 ):
     current_platform.seed_everything(0)
@@ -260,9 +260,9 @@ def sequence_parallelism_pass_on_test_model(
         cudagraph_mode=CUDAGraphMode.NONE,  # avoid piecewise warnings
         custom_ops=custom_ops_list,
         pass_config=PassConfig(
-            enable_sequence_parallelism=True,
-            enable_fusion=enable_fusion,
-            enable_noop=True,
+            enable_sp=True,
+            fuse_norm_quant=fuse_norm_quant,
+            eliminate_noops=True,
         ),
     )  # NoOp needed for fusion
     device_config = DeviceConfig(device=torch.device("cuda"))
@@ -297,7 +297,7 @@ def sequence_parallelism_pass_on_test_model(
             sequence_parallelism_pass,
         ]
 
-        if enable_fusion:
+        if fuse_norm_quant:
             fusion_pass = RMSNormQuantFusionPass(vllm_config)
             passes_for_backend.append(fusion_pass)
 
diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index 2c11ecef7..3cd1d4be2 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -122,7 +122,9 @@ def test_full_graph(
             CompilationConfig(
                 mode=CompilationMode.VLLM_COMPILE,
                 custom_ops=["+rms_norm"],
-                pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+                pass_config=PassConfig(
+                    fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
+                ),
             ),
             *model_info,
         )
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index a9e5ccee5..9e912c6d8 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
+import logging
 from contextlib import nullcontext
 from unittest.mock import patch
 
@@ -10,8 +11,9 @@ from pydantic import ValidationError
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.config.compilation import CompilationMode
+from vllm.config.compilation import CompilationMode, PassConfig
 from vllm.engine.arg_utils import EngineArgs
+from vllm.logger import _print_warning_once
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import _is_torch_equal_or_newer
 
@@ -191,7 +193,7 @@ def test_splitting_ops_dynamic():
     config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            pass_config={"enable_attn_fusion": True, "enable_noop": True},
+            pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
             custom_ops=["+quant_fp8"],
             cudagraph_mode=CUDAGraphMode.PIECEWISE,
         )
@@ -206,7 +208,7 @@ def test_splitting_ops_dynamic():
         config = VllmConfig(
             compilation_config=CompilationConfig(
                 mode=CompilationMode.VLLM_COMPILE,
-                pass_config={"enable_attn_fusion": True, "enable_noop": True},
+                pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
                 custom_ops=["+quant_fp8"],
                 cudagraph_mode=CUDAGraphMode.PIECEWISE,
                 # work around for accessing all attntion ops
@@ -219,7 +221,7 @@ def test_splitting_ops_dynamic():
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             use_inductor_graph_partition=True,
-            pass_config={"enable_attn_fusion": True, "enable_noop": True},
+            pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
             custom_ops=["+quant_fp8"],
             cudagraph_mode=CUDAGraphMode.PIECEWISE,
         )
@@ -227,7 +229,7 @@ def test_splitting_ops_dynamic():
     # With inductor graph partition, attn_fusion and splitting_ops
     # work together. Default splitting_ops include attention ops.
     assert config.compilation_config.splitting_ops_contain_attention()
-    # enable_attn_fusion is directly supported under
+    # fuse_attn_quant is directly supported under
     # use_inductor_graph_partition=True, and cudagraph_mode
     # is unchanged.
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
@@ -301,7 +303,7 @@ def test_should_split():
         "cudagraph_capture_sizes",
         "max_cudagraph_capture_size",
         "tp_size",
-        "enable_sequence_parallelism",
+        "enable_sp",
         "max_num_batched_tokens",
         "cudagraph_mode",
         "expected_max_size",
@@ -339,7 +341,7 @@ def test_cudagraph_sizes_post_init(
     cudagraph_capture_sizes,
     max_cudagraph_capture_size,
     tp_size,
-    enable_sequence_parallelism,
+    enable_sp,
     max_num_batched_tokens,
     cudagraph_mode,
     expected_max_size,
@@ -355,11 +357,12 @@ def test_cudagraph_sizes_post_init(
         compilation_config = CompilationConfig(
             cudagraph_capture_sizes=cudagraph_capture_sizes,
             max_cudagraph_capture_size=max_cudagraph_capture_size,
-            pass_config={
-                "enable_sequence_parallelism": enable_sequence_parallelism,
-                "enable_fusion": True,
-                "enable_noop": True,
-            },
+            pass_config=PassConfig(
+                enable_sp=enable_sp,
+                fuse_norm_quant=True,
+                fuse_act_quant=True,
+                eliminate_noops=True,
+            ),
             cudagraph_mode=cudagraph_mode,
         )
         engine_args = EngineArgs(
@@ -375,3 +378,53 @@ def test_cudagraph_sizes_post_init(
             vllm_config.compilation_config.max_cudagraph_capture_size
             == expected_max_size
         )
+
+
+def test_pass_config_deprecation(caplog_vllm):
+    caplog_vllm.set_level(logging.WARNING)
+
+    # Clear cache to ensure warnings are re-issued
+    _print_warning_once.cache_clear()
+
+    # Test enable_fusion -> fuse_norm_quant, fuse_act_quant
+    caplog_vllm.clear()
+    config = PassConfig(enable_fusion=True)
+    assert "enable_fusion is deprecated" in caplog_vllm.text
+    assert config.fuse_norm_quant is True
+    assert config.fuse_act_quant is True
+    assert config.enable_fusion is None
+
+    # Test enable_attn_fusion -> fuse_attn_quant
+    caplog_vllm.clear()
+    config = PassConfig(enable_attn_fusion=True)
+    assert "enable_attn_fusion is deprecated" in caplog_vllm.text
+    assert config.fuse_attn_quant is True
+    assert config.enable_attn_fusion is None
+
+    # Test enable_noop -> eliminate_noops
+    caplog_vllm.clear()
+    config = PassConfig(enable_noop=True)
+    assert "enable_noop is deprecated" in caplog_vllm.text
+    assert config.eliminate_noops is True
+    assert config.enable_noop is None
+
+    # Test enable_sequence_parallelism -> enable_sp
+    caplog_vllm.clear()
+    config = PassConfig(enable_sequence_parallelism=True)
+    assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
+    assert config.enable_sp is True
+    assert config.enable_sequence_parallelism is None
+
+    # Test enable_async_tp -> fuse_gemm_comms
+    caplog_vllm.clear()
+    config = PassConfig(enable_async_tp=True)
+    assert "enable_async_tp is deprecated" in caplog_vllm.text
+    assert config.fuse_gemm_comms is True
+    assert config.enable_async_tp is None
+
+    # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
+    caplog_vllm.clear()
+    config = PassConfig(enable_fi_allreduce_fusion=True)
+    assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
+    assert config.fuse_allreduce_rms is True
+    assert config.enable_fi_allreduce_fusion is None
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index 515e0a93a..758591589 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -223,7 +223,11 @@ def test_fix_functionalization(
         model_config=ModelConfig(dtype=dtype),
         compilation_config=CompilationConfig(
             custom_ops=["all"],
-            pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True),
+            pass_config=PassConfig(
+                fuse_norm_quant=do_fusion,
+                fuse_act_quant=do_fusion,
+                eliminate_noops=True,
+            ),
         ),
     )
 
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 286f22763..d0ba8385f 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -159,7 +159,9 @@ def test_fusion_rmsnorm_quant(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops,
-            pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+            pass_config=PassConfig(
+                fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
+            ),
         ),
     )
     with vllm.config.set_current_vllm_config(vllm_config):
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 4d213e030..9b4486e56 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -373,7 +373,7 @@ def test_attention_quant_pattern(
 
     # Run model with attn fusion enabled
     vllm_config.compilation_config.pass_config = PassConfig(
-        enable_attn_fusion=True, enable_noop=True
+        fuse_attn_quant=True, eliminate_noops=True
     )
     with (
         set_current_vllm_config(vllm_config),
diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py
index 0ccc1a016..bfe08382f 100644
--- a/tests/compile/test_noop_elimination.py
+++ b/tests/compile/test_noop_elimination.py
@@ -51,7 +51,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            pass_config=PassConfig(enable_noop=True),
+            pass_config=PassConfig(eliminate_noops=True),
         )
     )
     with vllm.config.set_current_vllm_config(vllm_config):
@@ -99,7 +99,7 @@ def test_non_noop_slice_preserved():
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            pass_config=PassConfig(enable_noop=True),
+            pass_config=PassConfig(eliminate_noops=True),
         )
     )
     with vllm.config.set_current_vllm_config(vllm_config):
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index 1c40c599f..6d0ba6b65 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -64,8 +64,11 @@ def test_pass_manager_uuid(callable):
 
     # UUID should be different due to config change
     config2 = copy.deepcopy(config)
-    config2.compilation_config.pass_config.enable_fusion = (
-        not config2.compilation_config.pass_config.enable_fusion
+    config2.compilation_config.pass_config.fuse_norm_quant = (
+        not config2.compilation_config.pass_config.fuse_norm_quant
+    )
+    config2.compilation_config.pass_config.fuse_act_quant = (
+        not config2.compilation_config.pass_config.fuse_act_quant
     )
     pass_manager3 = PostGradPassManager()
     pass_manager3.configure(config2)
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index 5ebb95b6d..e0968ac79 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -140,7 +140,7 @@ def test_qk_norm_rope_fusion(
             custom_ops=custom_ops,
             pass_config=PassConfig(
                 enable_qk_norm_rope_fusion=True,
-                enable_noop=True,
+                eliminate_noops=True,
             ),
         ),
     )
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index 0ddb82b7c..c336a4595 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -168,7 +168,7 @@ def test_fusion_silu_and_mul_quant(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops,
-            pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+            pass_config=PassConfig(fuse_act_quant=True, eliminate_noops=True),
         ),
     )
 
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index f38c50977..0a7907aad 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -32,7 +32,8 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
 class ParallelSetup(NamedTuple):
     tp_size: int
     pp_size: int
-    enable_fusion: bool
+    fuse_norm_quant: bool
+    fuse_act_quant: bool
     eager_mode: bool
     chunked_prefill: bool
 
@@ -66,7 +67,8 @@ class SPTestSettings:
                         ParallelSetup(
                             tp_size=tp_base,
                             pp_size=pp_multiplier * pp_base,
-                            enable_fusion=False,
+                            fuse_norm_quant=False,
+                            fuse_act_quant=False,
                             eager_mode=eager_mode_val,
                             chunked_prefill=chunked_prefill_val,
                         )
@@ -97,7 +99,8 @@ class SPTestSettings:
                         ParallelSetup(
                             tp_size=tp_base,
                             pp_size=pp_multiplier * pp_base,
-                            enable_fusion=False,
+                            fuse_norm_quant=False,
+                            fuse_act_quant=False,
                             eager_mode=eager_mode_val,
                             chunked_prefill=chunked_prefill_val,
                         )
@@ -126,7 +129,8 @@ class SPTestSettings:
                 ParallelSetup(
                     tp_size=tp_base,
                     pp_size=pp_base,
-                    enable_fusion=fusion_val,
+                    fuse_norm_quant=fusion_val,
+                    fuse_act_quant=fusion_val,
                     eager_mode=True,
                     chunked_prefill=False,
                 )
@@ -162,7 +166,7 @@ def _compare_sp(
     test_options: SPTestOptions,
     num_gpus_available: int,
     use_inductor_graph_partition: bool,
-    enable_async_tp: bool,
+    fuse_gemm_comms: bool,
     *,
     method: Literal["generate", "encode"],
     is_multimodal: bool,
@@ -170,7 +174,8 @@ def _compare_sp(
     (
         tp_size,
         pp_size,
-        enable_fusion,
+        fuse_norm_quant,
+        fuse_act_quant,
         eager_mode,
         chunked_prefill,
     ) = parallel_setup
@@ -248,10 +253,11 @@ def _compare_sp(
         "mode": CompilationMode.VLLM_COMPILE,
         "compile_sizes": [4, 8],
         "pass_config": {
-            "enable_sequence_parallelism": True,
-            "enable_async_tp": enable_async_tp,
-            "enable_fusion": enable_fusion,
-            "enable_noop": True,
+            "enable_sp": True,
+            "fuse_gemm_comms": fuse_gemm_comms,
+            "fuse_norm_quant": fuse_norm_quant,
+            "fuse_act_quant": fuse_act_quant,
+            "eliminate_noops": True,
         },
         "use_inductor_graph_partition": use_inductor_graph_partition,
     }
@@ -309,7 +315,7 @@ SP_TEST_MODELS = [
     ],
 )
 @pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
-@pytest.mark.parametrize("enable_async_tp", [False])  # TODO: enable async TP
+@pytest.mark.parametrize("fuse_gemm_comms", [False])  # TODO: enable async TP
 @create_new_process_for_each_test()
 def test_tp_sp_generation(
     model_id: str,
@@ -319,7 +325,7 @@ def test_tp_sp_generation(
     test_options: SPTestOptions,
     num_gpus_available,
     use_inductor_graph_partition: bool,
-    enable_async_tp: bool,
+    fuse_gemm_comms: bool,
 ):
     if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
@@ -328,7 +334,7 @@ def test_tp_sp_generation(
     if (
         "fp8" in model_id.lower()
         and current_platform.get_device_capability() < (9, 0)
-        and (not enable_async_tp)
+        and (not fuse_gemm_comms)
     ):
         pytest.skip("FP8 reduction support begins with sm90 capable devices.")
 
@@ -340,7 +346,7 @@ def test_tp_sp_generation(
         test_options,
         num_gpus_available,
         use_inductor_graph_partition,
-        enable_async_tp=enable_async_tp,
+        fuse_gemm_comms=fuse_gemm_comms,
         method="generate",
         is_multimodal=False,
     )
diff --git a/tests/test_config.py b/tests/test_config.py
index b7ed68fea..019c0d6d8 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1023,17 +1023,17 @@ def test_vllm_config_explicit_overrides():
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
 
     # Explicit pass config flags to override defaults
-    pass_config = PassConfig(enable_noop=True, enable_attn_fusion=True)
+    pass_config = PassConfig(eliminate_noops=True, fuse_attn_quant=True)
     compilation_config = CompilationConfig(pass_config=pass_config)
     config = VllmConfig(
         optimization_level=OptimizationLevel.O0,
         compilation_config=compilation_config,
     )
-    assert config.compilation_config.pass_config.enable_noop is True
-    assert config.compilation_config.pass_config.enable_attn_fusion is True
+    assert config.compilation_config.pass_config.eliminate_noops is True
+    assert config.compilation_config.pass_config.fuse_attn_quant is True
 
     # Explicit cudagraph mode override on quantized model at O2
-    pass_config = PassConfig(enable_async_tp=True)
+    pass_config = PassConfig(fuse_gemm_comms=True)
     compilation_config = CompilationConfig(
         cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config
     )
@@ -1043,7 +1043,7 @@ def test_vllm_config_explicit_overrides():
         compilation_config=compilation_config,
     )
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
-    assert config.compilation_config.pass_config.enable_async_tp is True
+    assert config.compilation_config.pass_config.fuse_gemm_comms is True
     # Mode should still use default for O2
     assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
 
@@ -1093,7 +1093,7 @@ def test_vllm_config_explicit_overrides():
         compilation_config=compilation_config,
     )
     # Explicit override should be respected
-    assert config.compilation_config.pass_config.enable_noop is False
+    assert config.compilation_config.pass_config.eliminate_noops is False
     # Other fields should still use defaults
     assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index fe2547d7f..37f48721e 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -92,22 +92,23 @@ class PostGradPassManager(CustomGraphPass):
 
         # Set the current vllm config to allow tracing CustomOp instances
         with set_current_vllm_config(config, check_compile=False):
-            if self.pass_config.enable_noop:
+            if self.pass_config.eliminate_noops:
                 self.passes += [NoOpEliminationPass(config)]
 
-            if self.pass_config.enable_sequence_parallelism:
+            if self.pass_config.enable_sp:
                 self.passes += [SequenceParallelismPass(config)]
-                if self.pass_config.enable_async_tp:
+                if self.pass_config.fuse_gemm_comms:
                     self.passes += [AsyncTPPass(config)]
 
-            if self.pass_config.enable_fi_allreduce_fusion:
+            if self.pass_config.fuse_allreduce_rms:
                 self.passes += [AllReduceFusionPass(config)]
 
-            if self.pass_config.enable_fusion:
+            if self.pass_config.fuse_norm_quant:
                 self.passes += [RMSNormQuantFusionPass(config)]
+            if self.pass_config.fuse_act_quant:
                 self.passes += [ActivationQuantFusionPass(config)]
 
-            if self.pass_config.enable_attn_fusion:
+            if self.pass_config.fuse_attn_quant:
                 self.passes += [AttnFusionPass(config)]
 
             if self.pass_config.enable_qk_norm_rope_fusion:
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 0f876c381..963b09193 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
-from vllm.config.utils import config
+from vllm.config.utils import config, handle_deprecated
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -105,18 +105,43 @@ class PassConfig:
     improper state.
     """
 
+    # New flags
+    fuse_norm_quant: bool = Field(default=None)
+    """Fuse the custom RMSNorm + quant ops."""
+    fuse_act_quant: bool = Field(default=None)
+    """Fuse the custom SiluMul + quant ops."""
+    fuse_attn_quant: bool = Field(default=None)
+    """Fuse the custom attention + quant ops."""
+    eliminate_noops: bool = Field(default=None)
+    """Eliminate no-op ops."""
+    enable_sp: bool = Field(default=None)
+    """Enable sequence parallelism."""
+    fuse_gemm_comms: bool = Field(default=None)
+    """Enable async TP."""
+    fuse_allreduce_rms: bool = Field(default=None)
+    """Enable flashinfer allreduce fusion."""
+
+    # Deprecated flags
     enable_fusion: bool = Field(default=None)
-    """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
+    """Deprecated in: v0.12.0. Use fuse_norm_quant and fuse_act_quant 
+    instead. Will be removed in v0.13.0 or v1.0.0, whichever is sooner.
+    """
     enable_attn_fusion: bool = Field(default=None)
-    """Whether to enable the custom attention+quant fusion pass."""
+    """Deprecated in: v0.12.0. Use fuse_attn_quant instead. 
+    Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
     enable_noop: bool = Field(default=None)
-    """Whether to enable the custom no-op elimination pass."""
+    """Deprecated in: v0.12.0. Use eliminate_noops instead. 
+    Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
     enable_sequence_parallelism: bool = Field(default=None)
-    """Whether to enable sequence parallelism."""
+    """Deprecated in: v0.12.0. Use enable_sp instead. 
+    Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
     enable_async_tp: bool = Field(default=None)
-    """Whether to enable async TP."""
+    """Deprecated in: v0.12.0. Use fuse_gemm_comms instead. 
+    Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
     enable_fi_allreduce_fusion: bool = Field(default=None)
-    """Whether to enable flashinfer allreduce fusion."""
+    """Deprecated in: v0.12.0. Use fuse_allreduce_rms instead. 
+    Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
+
     fi_allreduce_fusion_max_size_mb: float | None = None
     """The threshold of the communicated tensor sizes under which
     vllm should use flashinfer fused allreduce. Specified as a
@@ -136,7 +161,7 @@ class PassConfig:
             },
         }, where key is the device capability"""
     enable_qk_norm_rope_fusion: bool = False
-    """Whether to enable the fused Q/K RMSNorm + RoPE pass."""
+    """Enable fused Q/K RMSNorm + RoPE pass."""
 
     # TODO(luka) better pass enabling system.
 
@@ -174,6 +199,13 @@ class PassConfig:
         return InductorPass.hash_dict(asdict(self))
 
     @field_validator(
+        "fuse_norm_quant",
+        "fuse_act_quant",
+        "fuse_attn_quant",
+        "eliminate_noops",
+        "enable_sp",
+        "fuse_gemm_comms",
+        "fuse_allreduce_rms",
         "enable_fusion",
         "enable_attn_fusion",
         "enable_noop",
@@ -190,18 +222,71 @@ class PassConfig:
         return handler(value)
 
     def __post_init__(self) -> None:
-        if not self.enable_noop:
-            if self.enable_fusion:
+        # Handle deprecation and defaults
+
+        # Map old flags to new flags and issue warnings
+        handle_deprecated(
+            self,
+            "enable_fusion",
+            ["fuse_norm_quant", "fuse_act_quant"],
+            "v0.13.0 or v1.0.0, whichever is sooner",
+        )
+
+        handle_deprecated(
+            self,
+            "enable_attn_fusion",
+            "fuse_attn_quant",
+            "v0.13.0 or v1.0.0, whichever is sooner",
+        )
+
+        handle_deprecated(
+            self,
+            "enable_sequence_parallelism",
+            "enable_sp",
+            "v0.13.0 or v1.0.0, whichever is sooner",
+        )
+
+        handle_deprecated(
+            self,
+            "enable_async_tp",
+            "fuse_gemm_comms",
+            "v0.13.0 or v1.0.0, whichever is sooner",
+        )
+
+        handle_deprecated(
+            self,
+            "enable_fi_allreduce_fusion",
+            "fuse_allreduce_rms",
+            "v0.13.0 or v1.0.0, whichever is sooner",
+        )
+
+        handle_deprecated(
+            self,
+            "enable_noop",
+            "eliminate_noops",
+            "v0.13.0 or v1.0.0, whichever is sooner",
+        )
+
+        # Force old flags to None to ensure they are not used
+        self.enable_fusion = None
+        self.enable_attn_fusion = None
+        self.enable_noop = None
+        self.enable_sequence_parallelism = None
+        self.enable_async_tp = None
+        self.enable_fi_allreduce_fusion = None
+
+        if not self.eliminate_noops:
+            if self.fuse_norm_quant or self.fuse_act_quant:
                 logger.warning_once(
                     "Fusion enabled but reshape elimination disabled. "
                     "RMSNorm/SiluMul + quant (fp8) fusion might not work"
                 )
-            if self.enable_attn_fusion:
+            if self.fuse_attn_quant:
                 logger.warning_once(
                     "Fusion enabled but reshape elimination disabled. "
                     "Attention + quant (fp8) fusion might not work"
                 )
-            if self.enable_fi_allreduce_fusion:
+            if self.fuse_allreduce_rms:
                 logger.warning_once(
                     "Fusion enabled but reshape elimination disabled. "
                     "Allreduce + rms norm + quant (fp8) fusion might not work"
@@ -873,7 +958,7 @@ class CompilationConfig:
             self.set_splitting_ops_for_inductor_graph_partition()
             return
 
-        if self.pass_config.enable_attn_fusion:
+        if self.pass_config.fuse_attn_quant:
             # here use_inductor_graph_partition is False
             self.set_splitting_ops_for_attn_fusion()
             return
@@ -915,12 +1000,12 @@ class CompilationConfig:
             self.splitting_ops = list(self._attention_ops)
 
     def set_splitting_ops_for_attn_fusion(self):
-        assert self.pass_config.enable_attn_fusion
+        assert self.pass_config.fuse_attn_quant
         if self.splitting_ops is None:
             self.splitting_ops = []
             if self.cudagraph_mode.has_piecewise_cudagraphs():
                 logger.warning_once(
-                    "enable_attn_fusion is incompatible with piecewise "
+                    "fuse_attn_quant is incompatible with piecewise "
                     "cudagraph when use_inductor_graph_partition is off. "
                     "In this case, splitting_ops will be set to empty "
                     "list, and cudagraph_mode will be set to FULL. "
@@ -931,8 +1016,7 @@ class CompilationConfig:
                 self.cudagraph_mode = CUDAGraphMode.FULL
 
         assert not self.splitting_ops_contain_attention(), (
-            "attention ops should not be in splitting_ops "
-            "when enable_attn_fusion is True"
+            "attention ops should not be in splitting_ops when fuse_attn_quant is True"
         )
 
     def splitting_ops_contain_attention(self) -> bool:
@@ -1008,7 +1092,7 @@ class CompilationConfig:
         self, uniform_decode_query_len: int, tensor_parallel_size: int
     ):
         multiple_of = uniform_decode_query_len
-        if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism:
+        if tensor_parallel_size > 1 and self.pass_config.enable_sp:
             multiple_of = max(uniform_decode_query_len, tensor_parallel_size)
             if (
                 multiple_of % uniform_decode_query_len != 0
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 02f2b75f6..3124fcf00 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -19,6 +19,10 @@ import torch
 from pydantic.fields import FieldInfo
 from typing_extensions import runtime_checkable
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
 else:
@@ -293,3 +297,28 @@ def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, ob
 def hash_factors(items: dict[str, object]) -> str:
     """Return a SHA-256 hex digest of the canonical items structure."""
     return hashlib.sha256(json.dumps(items, sort_keys=True).encode()).hexdigest()
+
+
+def handle_deprecated(
+    config: ConfigT,
+    old_name: str,
+    new_name_or_names: str | list[str],
+    removal_version: str,
+) -> None:
+    old_val = getattr(config, old_name)
+    if old_val is None:
+        return
+
+    if isinstance(new_name_or_names, str):
+        new_names = [new_name_or_names]
+    else:
+        new_names = new_name_or_names
+
+    msg = (
+        f"{old_name} is deprecated and will be removed in {removal_version}. "
+        f"Use {', '.join(new_names)} instead."
+    )
+    logger.warning(msg)
+
+    for new_name in new_names:
+        setattr(config, new_name, old_val)
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 5b3a9c437..735b0afba 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -83,22 +83,33 @@ IS_DENSE = False
 # See https://github.com/vllm-project/vllm/issues/25689.
 
 
-def enable_fusion(cfg: "VllmConfig") -> bool:
-    """Returns True if RMS norm or quant FP8 is enabled."""
+def enable_norm_fusion(cfg: "VllmConfig") -> bool:
+    """Enable if either RMS norm or quant FP8 custom op is active;
+    otherwise Inductor handles fusion."""
+
     return cfg.compilation_config.is_custom_op_enabled(
         "rms_norm"
     ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8")
 
 
+def enable_act_fusion(cfg: "VllmConfig") -> bool:
+    """Enable if either SiLU+Mul or quant FP8 custom op is active;
+    otherwise Inductor handles fusion."""
+    return cfg.compilation_config.is_custom_op_enabled(
+        "silu_and_mul"
+    ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8")
+
+
 OPTIMIZATION_LEVEL_00 = {
     "compilation_config": {
         "pass_config": {
-            "enable_noop": False,
-            "enable_fusion": False,
-            "enable_fi_allreduce_fusion": False,
-            "enable_attn_fusion": False,
-            "enable_sequence_parallelism": False,
-            "enable_async_tp": False,
+            "eliminate_noops": False,
+            "fuse_norm_quant": False,
+            "fuse_act_quant": False,
+            "fuse_allreduce_rms": False,
+            "fuse_attn_quant": False,
+            "enable_sp": False,
+            "fuse_gemm_comms": False,
         },
         "cudagraph_mode": CUDAGraphMode.NONE,
         "use_inductor_graph_partition": False,
@@ -107,12 +118,13 @@ OPTIMIZATION_LEVEL_00 = {
 OPTIMIZATION_LEVEL_01 = {
     "compilation_config": {
         "pass_config": {
-            "enable_noop": True,
-            "enable_fusion": enable_fusion,
-            "enable_fi_allreduce_fusion": False,
-            "enable_attn_fusion": False,
-            "enable_sequence_parallelism": False,
-            "enable_async_tp": False,
+            "eliminate_noops": True,
+            "fuse_norm_quant": enable_norm_fusion,
+            "fuse_act_quant": enable_act_fusion,
+            "fuse_allreduce_rms": False,
+            "fuse_attn_quant": False,
+            "enable_sp": False,
+            "fuse_gemm_comms": False,
         },
         "cudagraph_mode": CUDAGraphMode.PIECEWISE,
         "use_inductor_graph_partition": False,
@@ -121,12 +133,13 @@ OPTIMIZATION_LEVEL_01 = {
 OPTIMIZATION_LEVEL_02 = {
     "compilation_config": {
         "pass_config": {
-            "enable_noop": True,
-            "enable_fusion": enable_fusion,
-            "enable_fi_allreduce_fusion": False,
-            "enable_attn_fusion": IS_QUANTIZED,
-            "enable_sequence_parallelism": IS_DENSE,
-            "enable_async_tp": IS_DENSE,
+            "eliminate_noops": True,
+            "fuse_norm_quant": enable_norm_fusion,
+            "fuse_act_quant": enable_act_fusion,
+            "fuse_allreduce_rms": False,
+            "fuse_attn_quant": IS_QUANTIZED,
+            "enable_sp": IS_DENSE,
+            "fuse_gemm_comms": IS_DENSE,
         },
         "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
         "use_inductor_graph_partition": False,
@@ -135,12 +148,13 @@ OPTIMIZATION_LEVEL_02 = {
 OPTIMIZATION_LEVEL_03 = {
     "compilation_config": {
         "pass_config": {
-            "enable_noop": True,
-            "enable_fusion": enable_fusion,
-            "enable_fi_allreduce_fusion": False,
-            "enable_attn_fusion": IS_QUANTIZED,
-            "enable_sequence_parallelism": IS_DENSE,
-            "enable_async_tp": IS_DENSE,
+            "eliminate_noops": True,
+            "fuse_norm_quant": enable_norm_fusion,
+            "fuse_act_quant": enable_act_fusion,
+            "fuse_allreduce_rms": False,
+            "fuse_attn_quant": IS_QUANTIZED,
+            "enable_sp": IS_DENSE,
+            "fuse_gemm_comms": IS_DENSE,
         },
         "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
         "use_inductor_graph_partition": False,
@@ -645,9 +659,9 @@ class VllmConfig:
 
         # async tp is built on top of sequence parallelism
         # and requires it to be enabled.
-        if self.compilation_config.pass_config.enable_async_tp:
-            self.compilation_config.pass_config.enable_sequence_parallelism = True
-        if self.compilation_config.pass_config.enable_sequence_parallelism:
+        if self.compilation_config.pass_config.fuse_gemm_comms:
+            self.compilation_config.pass_config.enable_sp = True
+        if self.compilation_config.pass_config.enable_sp:
             if "-rms_norm" in self.compilation_config.custom_ops:
                 logger.warning(
                     "RMS norm force disabled, sequence parallelism might break"
@@ -797,7 +811,7 @@ class VllmConfig:
         # Do this after all the updates to compilation_config.mode
         self.compilation_config.set_splitting_ops_for_v1()
 
-        if self.compilation_config.pass_config.enable_sequence_parallelism:
+        if self.compilation_config.pass_config.enable_sp:
             # With pipeline parallelism or dynamo partitioning,
             # native rms norm tracing errors due to incorrect residual shape.
             # Use custom rms norm to unblock. In the future,
@@ -1062,7 +1076,7 @@ class VllmConfig:
 
             if (
                 self.parallel_config.tensor_parallel_size > 1
-                and self.compilation_config.pass_config.enable_sequence_parallelism
+                and self.compilation_config.pass_config.enable_sp
             ):
                 cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
                     cudagraph_capture_sizes
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 8c22ada02..1b250a8bd 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2417,10 +2417,7 @@ class GPUModelRunner(
         # Pad tokens to multiple of tensor_parallel_size when
         # enabled collective fusion for SP
         tp_size = self.vllm_config.parallel_config.tensor_parallel_size
-        if (
-            self.compilation_config.pass_config.enable_sequence_parallelism
-            and tp_size > 1
-        ):
+        if self.compilation_config.pass_config.enable_sp and tp_size > 1:
             return round_up(num_scheduled_tokens, tp_size)
         return num_scheduled_tokens
 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index ed6fb32bc..edba07a42 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -552,7 +552,7 @@ class Worker(WorkerBase):
 
         if (
             parallel_config.pipeline_parallel_size > 1
-            and compilation_config.pass_config.enable_sequence_parallelism
+            and compilation_config.pass_config.enable_sp
             and forward_pass
         ):
             # currently only supported by V1 GPUModelRunner
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index bd88cb1b2..427a0d296 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -342,7 +342,7 @@ def is_residual_scattered_for_sp(
       partition), SP is always applied
     - Otherwise, SP is only applied for specific shapes in compile_sizes
     """
-    if not vllm_config.compilation_config.pass_config.enable_sequence_parallelism:
+    if not vllm_config.compilation_config.pass_config.enable_sp:
         return False
 
     tp = vllm_config.parallel_config.tensor_parallel_size
-- 
GitLab


From b08025a83bf416d97d0547ac52c3909356e118c4 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 2 Dec 2025 23:57:28 -0500
Subject: [PATCH 029/258] [Docs] Discuss api key limitations in security guide
 (#29922)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 docs/usage/security.md         | 110 +++++++++++++++++++++++++++++++++
 vllm/entrypoints/cli/openai.py |   4 ++
 2 files changed, 114 insertions(+)

diff --git a/docs/usage/security.md b/docs/usage/security.md
index 9d10b66a5..74060d86f 100644
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -108,6 +108,116 @@ networks.
 Consult your operating system or application platform documentation for specific
 firewall configuration instructions.
 
+## API Key Authentication Limitations
+
+### Overview
+
+The `--api-key` flag (or `VLLM_API_KEY` environment variable) provides authentication for vLLM's HTTP server, but **only for OpenAI-compatible API endpoints under the `/v1` path prefix**. Many other sensitive endpoints are exposed on the same HTTP server without any authentication enforcement.
+
+**Important:** Do not rely exclusively on `--api-key` for securing access to vLLM. Additional security measures are required for production deployments.
+
+### Protected Endpoints (Require API Key)
+
+When `--api-key` is configured, the following `/v1` endpoints require Bearer token authentication:
+
+- `/v1/models` - List available models
+- `/v1/chat/completions` - Chat completions
+- `/v1/completions` - Text completions
+- `/v1/embeddings` - Generate embeddings
+- `/v1/audio/transcriptions` - Audio transcription
+- `/v1/audio/translations` - Audio translation
+- `/v1/messages` - Anthropic-compatible messages API
+- `/v1/responses` - Response management
+- `/v1/score` - Scoring API
+- `/v1/rerank` - Reranking API
+
+### Unprotected Endpoints (No API Key Required)
+
+The following endpoints **do not require authentication** even when `--api-key` is configured:
+
+**Inference endpoints:**
+
+- `/invocations` - SageMaker-compatible endpoint (routes to the same inference functions as `/v1` endpoints)
+- `/inference/v1/generate` - Generate completions
+- `/pooling` - Pooling API
+- `/classify` - Classification API
+- `/score` - Scoring API (non-`/v1` variant)
+- `/rerank` - Reranking API (non-`/v1` variant)
+
+**Operational control endpoints (always enabled):**
+
+- `/pause` - Pause generation (causes denial of service)
+- `/resume` - Resume generation
+- `/scale_elastic_ep` - Trigger scaling operations
+
+**Utility endpoints:**
+
+- `/tokenize` - Tokenize text
+- `/detokenize` - Detokenize tokens
+- `/health` - Health check
+- `/ping` - SageMaker health check
+- `/version` - Version information
+- `/load` - Server load metrics
+
+**Tokenizer information endpoint (only when `--enable-tokenizer-info-endpoint` is set):**
+
+This endpoint is **only available when the `--enable-tokenizer-info-endpoint` flag is set**. It may expose sensitive information such as chat templates and tokenizer configuration:
+
+- `/tokenizer_info` - Get comprehensive tokenizer information including chat templates and configuration
+
+**Development endpoints (only when `VLLM_SERVER_DEV_MODE=1`):**
+
+These endpoints are **only available when the environment variable `VLLM_SERVER_DEV_MODE` is set to `1`**. They are intended for development and debugging purposes and should never be enabled in production:
+
+- `/server_info` - Get detailed server configuration
+- `/reset_prefix_cache` - Reset prefix cache (can disrupt service)
+- `/reset_mm_cache` - Reset multimodal cache (can disrupt service)
+- `/sleep` - Put engine to sleep (causes denial of service)
+- `/wake_up` - Wake engine from sleep
+- `/is_sleeping` - Check if engine is sleeping
+- `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous)
+
+**Profiler endpoints (only when `VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE` are set):**
+
+These endpoints are only available when profiling is enabled and should only be used for local development:
+
+- `/start_profile` - Start PyTorch profiler
+- `/stop_profile` - Stop PyTorch profiler
+
+**Note:** The `/invocations` endpoint is particularly concerning as it provides unauthenticated access to the same inference capabilities as the protected `/v1` endpoints.
+
+### Security Implications
+
+An attacker who can reach the vLLM HTTP server can:
+
+1. **Bypass authentication** by using non-`/v1` endpoints like `/invocations`, `/inference/v1/generate`, `/pooling`, `/classify`, `/score`, or `/rerank` to run arbitrary inference without credentials
+2. **Cause denial of service** by calling `/pause` or `/scale_elastic_ep` without a token
+3. **Access operational controls** to manipulate server state (e.g., pausing generation)
+4. **If `--enable-tokenizer-info-endpoint` is set:** Access sensitive tokenizer configuration including chat templates, which may reveal prompt engineering strategies or other implementation details
+5. **If `VLLM_SERVER_DEV_MODE=1` is set:** Execute arbitrary RPC commands via `/collective_rpc`, reset caches, put the engine to sleep, and access detailed server configuration
+
+### Recommended Security Practices
+
+#### 1. Minimize Exposed Endpoints
+
+**CRITICAL:** Never set `VLLM_SERVER_DEV_MODE=1` in production environments. Development endpoints expose extremely dangerous functionality including:
+
+- Arbitrary RPC execution via `/collective_rpc`
+- Cache manipulation that can disrupt service
+- Detailed server configuration disclosure
+
+Similarly, never enable profiler endpoints (`VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE`) in production.
+
+**Be cautious with `--enable-tokenizer-info-endpoint`:** Only enable the `/tokenizer_info` endpoint if you need to expose tokenizer configuration information. This endpoint reveals chat templates and tokenizer settings that may contain sensitive implementation details or prompt engineering strategies.
+
+#### 2. Deploy Behind a Reverse Proxy
+
+The most effective approach is to deploy vLLM behind a reverse proxy (such as nginx, Envoy, or a Kubernetes Gateway) that:
+
+- Explicitly allowlists only the endpoints you want to expose to end users
+- Blocks all other endpoints, including the unauthenticated inference and operational control endpoints
+- Implements additional authentication, rate limiting, and logging at the proxy layer
+
 ## Reporting Security Vulnerabilities
 
 If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index fb49be370..1c18b193d 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -109,6 +109,10 @@ def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser
         help=(
             "API key for OpenAI services. If provided, this api key "
             "will overwrite the api key obtained through environment variables."
+            " It is important to note that this option only applies to the "
+            "OpenAI-compatible API endpoints and NOT other endpoints that may "
+            "be present in the server. See the security guide in the vLLM docs "
+            "for more details."
         ),
     )
     return parser
-- 
GitLab


From c719c40540a85c1e6aeee9af20f29db581da27f0 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Wed, 3 Dec 2025 13:15:50 +0800
Subject: [PATCH 030/258] [Bugfix] Defunctionalize TRTLLM AR+Norm op for
 avoiding extra clone kernel before it (#29631)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
---
 vllm/compilation/fix_functionalization.py | 12 ++++++++++++
 vllm/compilation/fx_utils.py              |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index 126ad35e5..76068f86e 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -103,6 +103,18 @@ class FixFunctionalizationPass(VllmInductorPass):
             ]:
                 mutated_args = {1: "result"}
                 self.defunctionalize(graph, node, mutated_args)
+            elif (
+                at_target
+                == torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default
+            ):
+                mutated_args = {
+                    1: "allreduce_in",
+                    2: "residual",
+                    3: "norm_out",
+                    4: "quant_out",
+                    5: "scale_out",
+                }
+                self.defunctionalize(graph, node, mutated_args)
             # For some reason we need to specify the args for both
             # silu_and_mul and silu_and_mul_quant. The kwargs
             # pathway gets the wrong answer.
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
index f2497950f..3650ee6b4 100644
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -75,8 +75,8 @@ def find_op_nodes(
         return
 
     assert isinstance(op, OpOverload)
-    if not op._schema.is_mutable:
-        yield from graph.find_nodes(op="call_function", target=op)
+
+    yield from graph.find_nodes(op="call_function", target=op)
 
     for n in graph.find_nodes(op="call_function", target=auto_functionalized):
         if n.args[0] == op:
-- 
GitLab


From 0bec63fa317e1fbd62e19b0fc31c43c81bf89077 Mon Sep 17 00:00:00 2001
From: JackieWu <wkcn@live.cn>
Date: Wed, 3 Dec 2025 14:20:37 +0800
Subject: [PATCH 031/258] [BugFix] fix imgs_pos in hunyuan_vl (#29879)

Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/transformers_utils/processors/hunyuan_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/processors/hunyuan_vl.py b/vllm/transformers_utils/processors/hunyuan_vl.py
index 615a8bff8..f32ce115c 100644
--- a/vllm/transformers_utils/processors/hunyuan_vl.py
+++ b/vllm/transformers_utils/processors/hunyuan_vl.py
@@ -123,7 +123,7 @@ class HunYuanVLProcessor(ProcessorMixin):
 
         attention_mask = input_ids.ne(self.pad_id)
         text_inputs["attention_mask"] = attention_mask
-        text_inputs["imgs_pos"] = [self.get_imgs_pos(input_ids)]
+        text_inputs["imgs_pos"] = [self.get_imgs_pos(e) for e in input_ids]
         # image_inputs["imgs"] = [[image_inputs["pixel_values"]]]
 
         return_tensors = kwargs.pop("return_tensors", None)
-- 
GitLab


From bbfb55c29e7febb91e90f261dd9adb4200ee3a09 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 3 Dec 2025 15:49:34 +0800
Subject: [PATCH 032/258] [Misc] Allow `fetch_*` utils to access local files by
 default (#29932)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/multimodal/utils.py | 38 ++++++++++++++++++++++++++++++--------
 vllm/multimodal/video.py |  2 +-
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 1020554e2..184022085 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -67,8 +67,9 @@ class MediaConnector:
                              to set num_frames for video, set
                              `--media-io-kwargs '{"video":{"num_frames":40}}'`
             connection: HTTP connection client to download media contents.
-            allowed_local_media_path: A local directory to load media files
-                                      from.
+            allowed_local_media_path: A local directory to load media files from.
+            allowed_media_domains: If set, only media URLs that belong to this
+                                   domain can be used for multi-modal inputs.
         """
         super().__init__()
 
@@ -123,16 +124,16 @@ class MediaConnector:
                 "Cannot load local files without `--allowed-local-media-path`."
             )
 
-        filepath = Path(url2pathname(url_spec.path))
+        filepath = Path(url2pathname(url_spec.netloc + url_spec.path))
         if allowed_local_media_path not in filepath.resolve().parents:
             raise ValueError(
                 f"The file path {filepath} must be a subpath "
-                f"of `--allowed-local-media-path` {allowed_local_media_path}."
+                f"of `--allowed-local-media-path {allowed_local_media_path}`."
             )
 
         return media_io.load_file(filepath)
 
-    def _assert_url_in_allowed_media_domains(self, url_spec) -> None:
+    def _assert_url_in_allowed_media_domains(self, url_spec: ParseResult) -> None:
         if (
             self.allowed_media_domains
             and url_spec.hostname not in self.allowed_media_domains
@@ -489,9 +490,16 @@ def fetch_audio(
     Args:
         audio_url: URL of the audio file to fetch.
         audio_io_kwargs: Additional kwargs passed to handle audio IO.
+
+    Warning:
+        This method has direct access to local files and is only intended
+        to be called by user code. Never call this from the online server!
     """
     media_io_kwargs = None if not audio_io_kwargs else {"audio": audio_io_kwargs}
-    media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
+    media_connector = MediaConnector(
+        media_io_kwargs=media_io_kwargs,
+        allowed_local_media_path="/",
+    )
     return media_connector.fetch_audio(audio_url)
 
 
@@ -503,9 +511,16 @@ def fetch_image(
     Args:
         image_url: URL of the image file to fetch.
         image_io_kwargs: Additional kwargs passed to handle image IO.
+
+    Warning:
+        This method has direct access to local files and is only intended
+        to be called by user code. Never call this from the online server!
     """
     media_io_kwargs = None if not image_io_kwargs else {"image": image_io_kwargs}
-    media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
+    media_connector = MediaConnector(
+        media_io_kwargs=media_io_kwargs,
+        allowed_local_media_path="/",
+    )
     return media_connector.fetch_image(image_url)
 
 
@@ -517,7 +532,14 @@ def fetch_video(
     Args:
         video_url: URL of the video file to fetch.
         video_io_kwargs: Additional kwargs passed to handle video IO.
+
+    Warning:
+        This method has direct access to local files and is only intended
+        to be called by user code. Never call this from the online server!
     """
     media_io_kwargs = None if not video_io_kwargs else {"video": video_io_kwargs}
-    media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
+    media_connector = MediaConnector(
+        media_io_kwargs=media_io_kwargs,
+        allowed_local_media_path="/",
+    )
     return media_connector.fetch_video(video_url)
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 763f90fde..abfc226a6 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -267,7 +267,7 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
         return frames, metadata
 
 
-class VideoMediaIO(MediaIO[npt.NDArray]):
+class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
     def __init__(
         self,
         image_io: ImageMediaIO,
-- 
GitLab


From 3a7751485b71ce5ef927e4aa03b28602cb90811c Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Tue, 2 Dec 2025 23:59:23 -0800
Subject: [PATCH 033/258] [responsesAPI] support input output messages for non
 harmony models (#29549)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
---
 .../openai/test_response_api_simple.py        | 18 +++++++++++++++
 vllm/entrypoints/context.py                   | 22 +++++++++++++++++++
 vllm/entrypoints/openai/protocol.py           | 22 +++++++++++++++----
 vllm/entrypoints/openai/serving_responses.py  | 13 +++++------
 4 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/tests/entrypoints/openai/test_response_api_simple.py b/tests/entrypoints/openai/test_response_api_simple.py
index 425b8199a..aee03199b 100644
--- a/tests/entrypoints/openai/test_response_api_simple.py
+++ b/tests/entrypoints/openai/test_response_api_simple.py
@@ -42,6 +42,24 @@ async def test_basic(client: OpenAI, model_name: str):
     assert response.status == "completed"
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_enable_response_messages(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="Hello?",
+        extra_body={"enable_response_messages": True},
+    )
+    assert response.status == "completed"
+    assert response.input_messages[0]["type"] == "raw_message_tokens"
+    assert type(response.input_messages[0]["message"]) is str
+    assert len(response.input_messages[0]["message"]) > 10
+    assert type(response.input_messages[0]["tokens"][0]) is int
+    assert type(response.output_messages[0]["message"]) is str
+    assert len(response.output_messages[0]["message"]) > 10
+    assert type(response.output_messages[0]["tokens"][0]) is int
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_reasoning_item(client: OpenAI, model_name: str):
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 1260f65db..43783c926 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -23,6 +23,7 @@ from vllm.entrypoints.openai.parser.responses_parser import (
 )
 from vllm.entrypoints.openai.protocol import (
     ResponseInputOutputItem,
+    ResponseRawMessageAndToken,
     ResponsesRequest,
 )
 from vllm.entrypoints.responses_utils import construct_tool_dicts
@@ -148,6 +149,8 @@ def _create_json_parse_error_messages(
 
 
 class SimpleContext(ConversationContext):
+    """This is a context that cannot handle MCP tool calls"""
+
     def __init__(self):
         self.last_output = None
         self.num_prompt_tokens = 0
@@ -158,6 +161,9 @@ class SimpleContext(ConversationContext):
         # not implemented yet for SimpleContext
         self.all_turn_metrics = []
 
+        self.input_messages: list[ResponseRawMessageAndToken] = []
+        self.output_messages: list[ResponseRawMessageAndToken] = []
+
     def append_output(self, output) -> None:
         self.last_output = output
         if not isinstance(output, RequestOutput):
@@ -166,6 +172,22 @@ class SimpleContext(ConversationContext):
         self.num_cached_tokens = output.num_cached_tokens or 0
         self.num_output_tokens += len(output.outputs[0].token_ids or [])
 
+        if len(self.input_messages) == 0:
+            output_prompt = output.prompt or ""
+            output_prompt_token_ids = output.prompt_token_ids or []
+            self.input_messages.append(
+                ResponseRawMessageAndToken(
+                    message=output_prompt,
+                    tokens=output_prompt_token_ids,
+                )
+            )
+        self.output_messages.append(
+            ResponseRawMessageAndToken(
+                message=output.outputs[0].text,
+                tokens=output.outputs[0].token_ids,
+            )
+        )
+
     def append_tool_output(self, output) -> None:
         raise NotImplementedError("Should not be called.")
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 0f4b2b4d7..2d34a6a0c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1598,6 +1598,20 @@ def serialize_messages(msgs):
     return [serialize_message(msg) for msg in msgs] if msgs else None
 
 
+class ResponseRawMessageAndToken(OpenAIBaseModel):
+    """Class to show the raw message.
+    If message / tokens diverge, tokens is the source of truth"""
+
+    message: str
+    tokens: list[int]
+    type: Literal["raw_message_tokens"] = "raw_message_tokens"
+
+
+ResponseInputOutputMessage: TypeAlias = (
+    list[ChatCompletionMessageParam] | list[ResponseRawMessageAndToken]
+)
+
+
 class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
@@ -1631,8 +1645,8 @@ class ResponsesResponse(OpenAIBaseModel):
     # These are populated when enable_response_messages is set to True
     # NOTE: custom serialization is needed
     # see serialize_input_messages and serialize_output_messages
-    input_messages: list[ChatCompletionMessageParam] | None = None
-    output_messages: list[ChatCompletionMessageParam] | None = None
+    input_messages: ResponseInputOutputMessage | None = None
+    output_messages: ResponseInputOutputMessage | None = None
     # --8<-- [end:responses-extra-params]
 
     # NOTE: openAI harmony doesn't serialize TextContent properly,
@@ -1658,8 +1672,8 @@ class ResponsesResponse(OpenAIBaseModel):
         output: list[ResponseOutputItem],
         status: ResponseStatus,
         usage: ResponseUsage | None = None,
-        input_messages: list[ChatCompletionMessageParam] | None = None,
-        output_messages: list[ChatCompletionMessageParam] | None = None,
+        input_messages: ResponseInputOutputMessage | None = None,
+        output_messages: ResponseInputOutputMessage | None = None,
     ) -> "ResponsesResponse":
         incomplete_details: IncompleteDetails | None = None
         if status == "incomplete":
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 5ad86194c..3c9ae8e8c 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -86,6 +86,7 @@ from vllm.entrypoints.openai.protocol import (
     ResponseCompletedEvent,
     ResponseCreatedEvent,
     ResponseInProgressEvent,
+    ResponseInputOutputMessage,
     ResponseReasoningPartAddedEvent,
     ResponseReasoningPartDoneEvent,
     ResponsesRequest,
@@ -629,8 +630,8 @@ class OpenAIServingResponses(OpenAIServing):
         # "completed" is implemented as the "catch-all" for now.
         status: ResponseStatus = "completed"
 
-        input_messages = None
-        output_messages = None
+        input_messages: ResponseInputOutputMessage | None = None
+        output_messages: ResponseInputOutputMessage | None = None
         if self.use_harmony:
             assert isinstance(context, HarmonyContext)
             output = self._make_response_output_items_with_harmony(context)
@@ -670,12 +671,10 @@ class OpenAIServingResponses(OpenAIServing):
 
             output = self._make_response_output_items(request, final_output, tokenizer)
 
-            # TODO: context for non-gptoss models doesn't use messages
-            # so we can't get them out yet
             if request.enable_response_messages:
-                raise NotImplementedError(
-                    "enable_response_messages is currently only supported for gpt-oss"
-                )
+                input_messages = context.input_messages
+                output_messages = context.output_messages
+
             # Calculate usage.
             assert final_res.prompt_token_ids is not None
             num_tool_output_tokens = 0
-- 
GitLab


From 69520bc695ff8fa7fda66ef7c1a16761824ad354 Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Date: Tue, 2 Dec 2025 23:01:48 -1000
Subject: [PATCH 034/258] Add logging for cudagraph related info (#29825)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
---
 vllm/compilation/cuda_graph.py     | 94 ++++++++++++++++++++++++++++++
 vllm/config/observability.py       |  4 ++
 vllm/engine/arg_utils.py           |  6 ++
 vllm/v1/core/sched/scheduler.py    |  8 ++-
 vllm/v1/metrics/loggers.py         | 14 +++++
 vllm/v1/metrics/stats.py           |  3 +
 vllm/v1/outputs.py                 |  4 ++
 vllm/v1/worker/gpu_model_runner.py | 32 ++++++++--
 vllm/v1/worker/gpu_worker.py       |  2 +-
 9 files changed, 161 insertions(+), 6 deletions(-)

diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index a2e0abfeb..0748643a5 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
+from collections import Counter
 from collections.abc import Callable
 from contextlib import ExitStack
 from typing import Any
@@ -22,6 +23,99 @@ from vllm.utils.torch_utils import weak_ref_tensors
 logger = init_logger(__name__)
 
 
+@dataclasses.dataclass(frozen=True)
+class CUDAGraphStat:
+    num_unpadded_tokens: int
+    num_padded_tokens: int
+    num_paddings: int
+    runtime_mode: str
+
+
+class CUDAGraphLogging:
+    """Aggregate and log cudagraph metrics"""
+
+    COLUMN_HEADERS = [
+        "Unpadded Tokens",
+        "Padded Tokens",
+        "Num Paddings",
+        "Runtime Mode",
+        "Count",
+    ]
+
+    def __init__(self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None):
+        self.reset()
+        self.cg_mode = str(cg_mode)
+        self.cg_capture_sizes = str(cg_capture_sizes or [])
+
+        self.settings_header = (
+            "**CUDAGraph Config Settings:**\n\n"
+            f"- Mode: {self.cg_mode}\n"
+            f"- Capture sizes: {self.cg_capture_sizes}\n\n"
+            "**CUDAGraph Stats:**\n\n"
+        )
+
+    def reset(self):
+        self.stats = []
+
+    def observe(self, cudagraph_stat: CUDAGraphStat):
+        self.stats.append(cudagraph_stat)
+
+    def generate_metric_table(self) -> str:
+        stats_counts = Counter(self.stats)
+
+        # Convert stats to rows of strings, in descending order of observed frequencies
+        rows = []
+        for stat, count in sorted(
+            stats_counts.items(), key=lambda item: item[1], reverse=True
+        ):
+            rows.append(
+                [
+                    str(stat.num_unpadded_tokens),
+                    str(stat.num_padded_tokens),
+                    str(stat.num_paddings),
+                    stat.runtime_mode,
+                    str(count),
+                ]
+            )
+
+        # Calculate column widths (max of header and data)
+        col_widths = []
+        for i, header_text in enumerate(self.COLUMN_HEADERS):
+            max_width = len(header_text)
+            for row in rows:
+                max_width = max(max_width, len(row[i]))
+            col_widths.append(max_width)
+
+        table_header_list = [
+            h.ljust(w) for h, w in zip(self.COLUMN_HEADERS, col_widths)
+        ]
+        table_header = "| " + " | ".join(table_header_list) + " |\n"
+
+        table_separator = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|\n"
+
+        # Create data rows with proper alignment
+        data_rows = []
+        for row in rows:
+            formatted_row = [
+                str(val).ljust(width) for val, width in zip(row, col_widths)
+            ]
+            data_rows.append("| " + " | ".join(formatted_row) + " |")
+
+        return (
+            self.settings_header
+            + table_header
+            + table_separator
+            + "\n".join(data_rows)
+            + "\n"
+        )
+
+    def log(self, log_fn=logger.info):
+        if not self.stats:
+            return
+        log_fn(self.generate_metric_table())
+        self.reset()
+
+
 @dataclasses.dataclass
 class CUDAGraphEntry:
     batch_descriptor: BatchDescriptor
diff --git a/vllm/config/observability.py b/vllm/config/observability.py
index 656a5f8a9..fdc27aee3 100644
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -55,6 +55,10 @@ class ObservabilityConfig:
     kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
     """Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
 
+    cudagraph_metrics: bool = False
+    """Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
+    dispatch modes, and their observed frequencies at every logging interval)."""
+
     @cached_property
     def collect_model_forward_time(self) -> bool:
         """Whether to collect model forward time for the request."""
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 096217da4..fd07cded7 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -518,6 +518,7 @@ class EngineArgs:
     kv_cache_metrics_sample: float = get_field(
         ObservabilityConfig, "kv_cache_metrics_sample"
     )
+    cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
     scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
 
@@ -1021,6 +1022,10 @@ class EngineArgs:
             "--kv-cache-metrics-sample",
             **observability_kwargs["kv_cache_metrics_sample"],
         )
+        observability_group.add_argument(
+            "--cudagraph-metrics",
+            **observability_kwargs["cudagraph_metrics"],
+        )
 
         # Scheduler arguments
         scheduler_kwargs = get_kwargs(SchedulerConfig)
@@ -1698,6 +1703,7 @@ class EngineArgs:
             collect_detailed_traces=self.collect_detailed_traces,
             kv_cache_metrics=self.kv_cache_metrics,
             kv_cache_metrics_sample=self.kv_cache_metrics_sample,
+            cudagraph_metrics=self.cudagraph_metrics,
         )
 
         # Compilation config overrides
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 52b98ef65..75a7385df 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -7,6 +7,7 @@ from collections.abc import Iterable
 from typing import Any
 
 from vllm import envs
+from vllm.compilation.cuda_graph import CUDAGraphStat
 from vllm.config import VllmConfig
 from vllm.distributed.ec_transfer.ec_connector.base import (
     ECConnectorMetadata,
@@ -1037,6 +1038,7 @@ class Scheduler(SchedulerInterface):
         pooler_outputs = model_runner_output.pooler_output
         num_nans_in_logits = model_runner_output.num_nans_in_logits
         kv_connector_output = model_runner_output.kv_connector_output
+        cudagraph_stats = model_runner_output.cudagraph_stats
 
         outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
         spec_decoding_stats: SpecDecodingStats | None = None
@@ -1219,7 +1221,9 @@ class Scheduler(SchedulerInterface):
             finished_req_ids.clear()
 
         if (
-            stats := self.make_stats(spec_decoding_stats, kv_connector_stats)
+            stats := self.make_stats(
+                spec_decoding_stats, kv_connector_stats, cudagraph_stats
+            )
         ) is not None:
             # Return stats to only one of the front-ends.
             if (eco := next(iter(engine_core_outputs.values()), None)) is None:
@@ -1420,6 +1424,7 @@ class Scheduler(SchedulerInterface):
         self,
         spec_decoding_stats: SpecDecodingStats | None = None,
         kv_connector_stats: KVConnectorStats | None = None,
+        cudagraph_stats: CUDAGraphStat | None = None,
     ) -> SchedulerStats | None:
         if not self.log_stats:
             return None
@@ -1444,6 +1449,7 @@ class Scheduler(SchedulerInterface):
             kv_cache_eviction_events=eviction_events,
             spec_decoding_stats=spec_stats,
             kv_connector_stats=connector_stats_payload,
+            cudagraph_stats=cudagraph_stats,
         )
 
     def make_spec_decoding_stats(
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index dec0e2d00..6961e15c2 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -10,6 +10,7 @@ from typing import TypeAlias
 from prometheus_client import Counter, Gauge, Histogram
 
 import vllm.envs as envs
+from vllm.compilation.cuda_graph import CUDAGraphLogging
 from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
     KVConnectorLogging,
@@ -106,6 +107,12 @@ class LoggingStatLogger(StatLoggerBase):
         self.spec_decoding_logging = SpecDecodingLogging()
         kv_transfer_config = self.vllm_config.kv_transfer_config
         self.kv_connector_logging = KVConnectorLogging(kv_transfer_config)
+        self.cudagraph_logging = None
+        if self.vllm_config.observability_config.cudagraph_metrics:
+            self.cudagraph_logging = CUDAGraphLogging(
+                self.vllm_config.compilation_config.cudagraph_mode,
+                self.vllm_config.compilation_config.cudagraph_capture_sizes,
+            )
         self.last_prompt_throughput: float = 0.0
         self.last_generation_throughput: float = 0.0
         self.engine_is_idle = False
@@ -161,6 +168,11 @@ class LoggingStatLogger(StatLoggerBase):
                 self.spec_decoding_logging.observe(scheduler_stats.spec_decoding_stats)
             if kv_connector_stats := scheduler_stats.kv_connector_stats:
                 self.kv_connector_logging.observe(kv_connector_stats)
+            if (
+                self.cudagraph_logging is not None
+                and scheduler_stats.cudagraph_stats is not None
+            ):
+                self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats)
             if not self.aggregated:
                 self.last_scheduler_stats = scheduler_stats
         if mm_cache_stats:
@@ -240,6 +252,8 @@ class LoggingStatLogger(StatLoggerBase):
 
         self.spec_decoding_logging.log(log_fn=log_fn)
         self.kv_connector_logging.log(log_fn=log_fn)
+        if self.cudagraph_logging is not None:
+            self.cudagraph_logging.log(log_fn=log_fn)
 
     def log_engine_initialized(self):
         if self.vllm_config.cache_config.num_gpu_blocks:
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index a3078eaa7..733d3ae12 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -7,6 +7,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
 
 import vllm.envs as envs
+from vllm.compilation.cuda_graph import CUDAGraphStat
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 
 if TYPE_CHECKING:
@@ -183,6 +184,8 @@ class SchedulerStats:
     waiting_lora_adapters: dict[str, int] = field(default_factory=dict)
     running_lora_adapters: dict[str, int] = field(default_factory=dict)
 
+    cudagraph_stats: CUDAGraphStat | None = None
+
 
 @dataclass
 class RequestStateStats:
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index 8110deb5a..88ac6b4ae 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, NamedTuple
 import numpy as np
 import torch
 
+from vllm.compilation.cuda_graph import CUDAGraphStat
 from vllm.v1.core.sched.output import SchedulerOutput
 
 if TYPE_CHECKING:
@@ -169,6 +170,9 @@ class ModelRunnerOutput:
     # req_id -> num_nans_in_logits
     num_nans_in_logits: dict[str, int] | None = None
 
+    # information related to cudagraph execution
+    cudagraph_stats: CUDAGraphStat | None = None
+
 
 # ModelRunnerOutput wrapper for async scheduling.
 class AsyncModelRunnerOutput(ABC):
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 1b250a8bd..3f043e3b2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -27,7 +27,7 @@ from vllm.attention.backends.abstract import (
 )
 from vllm.attention.layer import Attention, MLAAttention
 from vllm.compilation.counter import compilation_counter
-from vllm.compilation.cuda_graph import CUDAGraphWrapper
+from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import (
     CompilationMode,
@@ -257,6 +257,7 @@ class ExecuteModelState(NamedTuple):
     sample_hidden_states: torch.Tensor
     aux_hidden_states: list[torch.Tensor] | None
     ec_connector_output: ECConnectorOutput | None
+    cudagraph_stats: CUDAGraphStat | None
 
 
 class GPUModelRunner(
@@ -2755,7 +2756,11 @@ class GPUModelRunner(
         force_uniform_decode: bool | None = None,
         force_has_lora: bool | None = None,
     ) -> tuple[
-        CUDAGraphMode, BatchDescriptor, UBatchSlices | None, torch.Tensor | None
+        CUDAGraphMode,
+        BatchDescriptor,
+        UBatchSlices | None,
+        torch.Tensor | None,
+        CUDAGraphStat | None,
     ]:
         num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
         uniform_decode = (
@@ -2820,7 +2825,22 @@ class GPUModelRunner(
                 # num_tokens_across_dp will no-longer be valid
                 assert batch_descriptor.num_tokens == num_tokens_padded
 
-        return cudagraph_mode, batch_descriptor, ubatch_slices, num_tokens_across_dp
+        cudagraph_stats = None
+        if self.vllm_config.observability_config.cudagraph_metrics:
+            cudagraph_stats = CUDAGraphStat(
+                num_unpadded_tokens=num_tokens,
+                num_padded_tokens=batch_descriptor.num_tokens,
+                num_paddings=batch_descriptor.num_tokens - num_tokens,
+                runtime_mode=str(cudagraph_mode),
+            )
+
+        return (
+            cudagraph_mode,
+            batch_descriptor,
+            ubatch_slices,
+            num_tokens_across_dp,
+            cudagraph_stats,
+        )
 
     @torch.inference_mode()
     def execute_model(
@@ -2918,6 +2938,7 @@ class GPUModelRunner(
                     batch_desc,
                     ubatch_slices,
                     num_tokens_across_dp,
+                    cudagraph_stats,
                 ) = self._determine_batch_execution_and_padding(
                     num_tokens=num_tokens_unpadded,
                     num_reqs=num_reqs,
@@ -3067,6 +3088,7 @@ class GPUModelRunner(
             sample_hidden_states,
             aux_hidden_states,
             ec_connector_output,
+            cudagraph_stats,
         )
         self.kv_connector_output = kv_connector_output
         return None
@@ -3102,6 +3124,7 @@ class GPUModelRunner(
             sample_hidden_states,
             aux_hidden_states,
             ec_connector_output,
+            cudagraph_stats,
         ) = self.execute_model_state
         # Clear ephemeral state.
         self.execute_model_state = None
@@ -3217,6 +3240,7 @@ class GPUModelRunner(
                 if self.supports_mm_inputs
                 else None,
                 num_nans_in_logits=num_nans_in_logits,
+                cudagraph_stats=cudagraph_stats,
             )
 
         if not self.use_async_scheduling:
@@ -3937,7 +3961,7 @@ class GPUModelRunner(
 
         num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
 
-        _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp = (
+        _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp, _ = (
             self._determine_batch_execution_and_padding(
                 num_tokens=num_tokens_unpadded,
                 num_reqs=num_reqs,
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index edba07a42..a133575cb 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -564,7 +564,7 @@ class Worker(WorkerBase):
             # TODO(lucas): This is pretty gross; ideally we should only ever call
             # `_determine_batch_execution_and_padding` once (will get called again
             # in `execute_model`) but this requires a larger refactor of PP.
-            _, batch_desc, _, _ = (
+            _, batch_desc, _, _, _ = (
                 self.model_runner._determine_batch_execution_and_padding(
                     num_tokens=num_scheduled_tokens,
                     num_reqs=len(num_scheduled_tokens_np),
-- 
GitLab


From 3f42b05fbc53e50813a1619f5fc770f17ac2a1b6 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Wed, 3 Dec 2025 17:26:39 +0800
Subject: [PATCH 035/258] [Refactor] [1/N] to simplify the vLLM serving
 architecture (#28040)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 tests/entrypoints/openai/test_basic.py        |   2 +-
 vllm/entrypoints/api_server.py                |   1 +
 vllm/entrypoints/openai/api_server.py         | 455 +-----------------
 vllm/entrypoints/openai/serving_engine.py     |   3 +-
 vllm/entrypoints/sagemaker/routes.py          |   2 +-
 vllm/entrypoints/serve/__init__.py            |  60 +++
 vllm/entrypoints/serve/disagg/__init__.py     |   0
 vllm/entrypoints/serve/disagg/api_router.py   | 110 +++++
 vllm/entrypoints/serve/disagg/protocol.py     |  90 ++++
 .../disagg/serving.py}                        |  10 +-
 vllm/entrypoints/serve/elastic_ep/__init__.py |   0
 .../serve/elastic_ep/api_router.py            |  96 ++++
 .../serve/elastic_ep/middleware.py            |  49 ++
 .../serve/instrumentator/__init__.py          |   0
 .../serve/instrumentator/health.py            |  33 ++
 .../serve/instrumentator/metrics.py           |  46 ++
 vllm/entrypoints/serve/lora/__init__.py       |   0
 .../lora/api_router.py}                       |  19 +-
 vllm/entrypoints/serve/profile/__init__.py    |   0
 vllm/entrypoints/serve/profile/api_router.py  |  49 ++
 vllm/entrypoints/serve/rlhf/__init__.py       |   0
 vllm/entrypoints/serve/rlhf/api_router.py     | 102 ++++
 vllm/entrypoints/serve/sleep/__init__.py      |   0
 vllm/entrypoints/serve/sleep/api_router.py    |  60 +++
 vllm/entrypoints/serve/tokenize/__init__.py   |   0
 vllm/entrypoints/serve/tokenize/api_router.py | 118 +++++
 .../tokenize/serving.py}                      |   0
 27 files changed, 850 insertions(+), 455 deletions(-)
 create mode 100644 vllm/entrypoints/serve/__init__.py
 create mode 100644 vllm/entrypoints/serve/disagg/__init__.py
 create mode 100644 vllm/entrypoints/serve/disagg/api_router.py
 create mode 100644 vllm/entrypoints/serve/disagg/protocol.py
 rename vllm/entrypoints/{openai/serving_tokens.py => serve/disagg/serving.py} (99%)
 create mode 100644 vllm/entrypoints/serve/elastic_ep/__init__.py
 create mode 100644 vllm/entrypoints/serve/elastic_ep/api_router.py
 create mode 100644 vllm/entrypoints/serve/elastic_ep/middleware.py
 create mode 100644 vllm/entrypoints/serve/instrumentator/__init__.py
 create mode 100644 vllm/entrypoints/serve/instrumentator/health.py
 create mode 100644 vllm/entrypoints/serve/instrumentator/metrics.py
 create mode 100644 vllm/entrypoints/serve/lora/__init__.py
 rename vllm/entrypoints/{dynamic_lora.py => serve/lora/api_router.py} (80%)
 create mode 100644 vllm/entrypoints/serve/profile/__init__.py
 create mode 100644 vllm/entrypoints/serve/profile/api_router.py
 create mode 100644 vllm/entrypoints/serve/rlhf/__init__.py
 create mode 100644 vllm/entrypoints/serve/rlhf/api_router.py
 create mode 100644 vllm/entrypoints/serve/sleep/__init__.py
 create mode 100644 vllm/entrypoints/serve/sleep/api_router.py
 create mode 100644 vllm/entrypoints/serve/tokenize/__init__.py
 create mode 100644 vllm/entrypoints/serve/tokenize/api_router.py
 rename vllm/entrypoints/{openai/serving_tokenization.py => serve/tokenize/serving.py} (100%)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 3d581a300..1ff30de31 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -232,7 +232,7 @@ async def test_server_load(server: RemoteOpenAIServer):
 @pytest.mark.asyncio
 async def test_health_check_engine_dead_error():
     # Import the health function directly to test it in isolation
-    from vllm.entrypoints.openai.api_server import health
+    from vllm.entrypoints.serve.instrumentator.health import health
 
     # Create a mock request that simulates what FastAPI would provide
     mock_request = Mock(spec=Request)
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 154cdeb42..b59f71205 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -118,6 +118,7 @@ async def init_app(
         )
     )
     app.state.engine_client = engine
+    app.state.args = args
     return app
 
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index cdc316b65..2fa6afa2b 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -20,21 +20,15 @@ from http import HTTPStatus
 from typing import Annotated, Any, Literal
 
 import model_hosting_container_standards.sagemaker as sagemaker_standards
-import prometheus_client
 import pydantic
-import regex as re
 import uvloop
 from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
-from prometheus_client import make_asgi_app
-from prometheus_fastapi_instrumentator import Instrumentator
 from starlette.concurrency import iterate_in_threadpool
 from starlette.datastructures import URL, Headers, MutableHeaders, State
-from starlette.routing import Mount
 from starlette.types import ASGIApp, Message, Receive, Scope, Send
-from typing_extensions import assert_never
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
@@ -56,17 +50,11 @@ from vllm.entrypoints.openai.protocol import (
     ChatCompletionResponse,
     CompletionRequest,
     CompletionResponse,
-    DetokenizeRequest,
-    DetokenizeResponse,
     ErrorInfo,
     ErrorResponse,
-    GenerateRequest,
-    GenerateResponse,
     ResponsesRequest,
     ResponsesResponse,
     StreamingResponsesResponse,
-    TokenizeRequest,
-    TokenizeResponse,
     TranscriptionRequest,
     TranscriptionResponseVariant,
     TranslationRequest,
@@ -80,8 +68,6 @@ from vllm.entrypoints.openai.serving_models import (
     OpenAIServingModels,
 )
 from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
-from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization
-from vllm.entrypoints.openai.serving_tokens import ServingTokens
 from vllm.entrypoints.openai.serving_transcription import (
     OpenAIServingTranscription,
     OpenAIServingTranslation,
@@ -92,6 +78,11 @@ from vllm.entrypoints.pooling.classify.serving import ServingClassification
 from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
 from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling
 from vllm.entrypoints.pooling.score.serving import ServingScores
+from vllm.entrypoints.serve.disagg.serving import ServingTokens
+from vllm.entrypoints.serve.elastic_ep.middleware import (
+    ScalingMiddleware,
+)
+from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
 from vllm.entrypoints.tool_server import DemoToolServer, MCPToolServer, ToolServer
 from vllm.entrypoints.utils import (
     cli_env_setup,
@@ -109,8 +100,6 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.gc_utils import freeze_gc_heap
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.utils.system_utils import decorate_logs, set_ulimit
-from vllm.v1.engine.exceptions import EngineDeadError
-from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
@@ -245,39 +234,6 @@ async def build_async_engine_client_from_engine_args(
 router = APIRouter()
 
 
-class PrometheusResponse(Response):
-    media_type = prometheus_client.CONTENT_TYPE_LATEST
-
-
-def mount_metrics(app: FastAPI):
-    """Mount prometheus metrics to a FastAPI app."""
-
-    registry = get_prometheus_registry()
-
-    # `response_class=PrometheusResponse` is needed to return an HTTP response
-    # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
-    # instead of the default "application/json" which is incorrect.
-    # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364
-    Instrumentator(
-        excluded_handlers=[
-            "/metrics",
-            "/health",
-            "/load",
-            "/ping",
-            "/version",
-            "/server_info",
-        ],
-        registry=registry,
-    ).add().instrument(app).expose(app, response_class=PrometheusResponse)
-
-    # Add prometheus asgi middleware to route /metrics requests
-    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))
-
-    # Workaround for 307 Redirect for /metrics
-    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
-    app.routes.append(metrics_route)
-
-
 def base(request: Request) -> OpenAIServing:
     # Reuse the existing instance
     return tokenization(request)
@@ -323,16 +279,6 @@ def generate_tokens(request: Request) -> ServingTokens | None:
     return request.app.state.serving_tokens
 
 
-@router.get("/health", response_class=Response)
-async def health(raw_request: Request) -> Response:
-    """Health check."""
-    try:
-        await engine_client(raw_request).check_health()
-        return Response(status_code=200)
-    except EngineDeadError:
-        return Response(status_code=503)
-
-
 @router.get("/load")
 async def get_server_load_metrics(request: Request):
     # This endpoint returns the current server load metrics.
@@ -352,167 +298,6 @@ async def get_server_load_metrics(request: Request):
     return JSONResponse(content={"server_load": request.app.state.server_load_metrics})
 
 
-@router.post("/pause")
-async def pause_generation(
-    raw_request: Request,
-    wait_for_inflight_requests: bool = Query(False),
-    clear_cache: bool = Query(True),
-) -> JSONResponse:
-    """Pause generation requests to allow weight updates.
-
-    Args:
-        wait_for_inflight_requests: When ``True`` waits for in-flight
-            requests to finish before pausing. When ``False`` (default),
-            aborts any in-flight requests immediately.
-        clear_cache: Whether to clear KV/prefix caches after draining.
-    """
-
-    engine = engine_client(raw_request)
-
-    try:
-        await engine.pause_generation(
-            wait_for_inflight_requests=wait_for_inflight_requests,
-            clear_cache=clear_cache,
-        )
-        return JSONResponse(
-            content={"status": "paused"},
-            status_code=HTTPStatus.OK.value,
-        )
-
-    except ValueError as err:
-        return JSONResponse(
-            content={"error": str(err)},
-            status_code=HTTPStatus.BAD_REQUEST.value,
-        )
-    except Exception as err:  # pragma: no cover - defensive
-        logger.exception("Failed to pause generation")
-        return JSONResponse(
-            content={"error": f"Failed to pause generation: {err}"},
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-        )
-
-
-@router.post("/resume")
-async def resume_generation(raw_request: Request) -> JSONResponse:
-    """Resume generation after a pause."""
-
-    engine = engine_client(raw_request)
-
-    try:
-        await engine.resume_generation()
-        return JSONResponse(
-            content={"status": "resumed"},
-            status_code=HTTPStatus.OK.value,
-        )
-    except Exception as err:  # pragma: no cover - defensive
-        logger.exception("Failed to resume generation")
-        return JSONResponse(
-            content={"error": f"Failed to resume generation: {err}"},
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-        )
-
-
-@router.get("/is_paused")
-async def is_paused(raw_request: Request) -> JSONResponse:
-    """Return the current pause status."""
-
-    engine = engine_client(raw_request)
-
-    try:
-        paused = await engine.is_paused()
-    except Exception as err:  # pragma: no cover - defensive
-        logger.exception("Failed to fetch pause status")
-        return JSONResponse(
-            content={"error": f"Failed to fetch pause status: {err}"},
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-        )
-
-    return JSONResponse(content={"is_paused": paused})
-
-
-@router.post(
-    "/tokenize",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
-    },
-)
-@with_cancellation
-async def tokenize(request: TokenizeRequest, raw_request: Request):
-    handler = tokenization(raw_request)
-
-    try:
-        generator = await handler.create_tokenize(request, raw_request)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
-        ) from e
-    except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
-
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(
-            content=generator.model_dump(), status_code=generator.error.code
-        )
-    elif isinstance(generator, TokenizeResponse):
-        return JSONResponse(content=generator.model_dump())
-
-    assert_never(generator)
-
-
-@router.post(
-    "/detokenize",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-@with_cancellation
-async def detokenize(request: DetokenizeRequest, raw_request: Request):
-    handler = tokenization(raw_request)
-
-    try:
-        generator = await handler.create_detokenize(request, raw_request)
-    except OverflowError as e:
-        raise RequestValidationError(errors=[str(e)]) from e
-    except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
-
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(
-            content=generator.model_dump(), status_code=generator.error.code
-        )
-    elif isinstance(generator, DetokenizeResponse):
-        return JSONResponse(content=generator.model_dump())
-
-    assert_never(generator)
-
-
-def maybe_register_tokenizer_info_endpoint(args):
-    """Conditionally register the tokenizer info endpoint if enabled."""
-    if getattr(args, "enable_tokenizer_info_endpoint", False):
-
-        @router.get("/tokenizer_info")
-        async def get_tokenizer_info(raw_request: Request):
-            """Get comprehensive tokenizer information."""
-            result = await tokenization(raw_request).get_tokenizer_info()
-            return JSONResponse(
-                content=result.model_dump(),
-                status_code=result.error.code
-                if isinstance(result, ErrorResponse)
-                else 200,
-            )
-
-
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
     handler = models(raw_request)
@@ -898,33 +683,6 @@ if envs.VLLM_SERVER_DEV_MODE:
         await engine_client(raw_request).reset_mm_cache()
         return Response(status_code=200)
 
-    @router.post("/sleep")
-    async def sleep(raw_request: Request):
-        # get POST params
-        level = raw_request.query_params.get("level", "1")
-        await engine_client(raw_request).sleep(int(level))
-        # FIXME: in v0 with frontend multiprocessing, the sleep command
-        # is sent but does not finish yet when we return a response.
-        return Response(status_code=200)
-
-    @router.post("/wake_up")
-    async def wake_up(raw_request: Request):
-        tags = raw_request.query_params.getlist("tags")
-        if tags == []:
-            # set to None to wake up all tags if no tags are provided
-            tags = None
-        logger.info("wake up the engine with tags: %s", tags)
-        await engine_client(raw_request).wake_up(tags)
-        # FIXME: in v0 with frontend multiprocessing, the wake-up command
-        # is sent but does not finish yet when we return a response.
-        return Response(status_code=200)
-
-    @router.get("/is_sleeping")
-    async def is_sleeping(raw_request: Request):
-        logger.info("check whether the engine is sleeping")
-        is_sleeping = await engine_client(raw_request).is_sleeping()
-        return JSONResponse(content={"is_sleeping": is_sleeping})
-
     @router.post("/collective_rpc")
     async def collective_rpc(raw_request: Request):
         try:
@@ -952,138 +710,13 @@ if envs.VLLM_SERVER_DEV_MODE:
             return Response(status_code=200)
         response: list[Any] = []
         for result in results:
-            if result is None or isinstance(result, (dict, list)):
+            if result is None or isinstance(result, dict | list):
                 response.append(result)
             else:
                 response.append(str(result))
         return JSONResponse(content={"results": response})
 
 
-@router.post(
-    "/scale_elastic_ep",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.OK.value: {"model": dict},
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.REQUEST_TIMEOUT.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-async def scale_elastic_ep(raw_request: Request):
-    try:
-        body = await raw_request.json()
-    except json.JSONDecodeError as e:
-        raise HTTPException(status_code=400, detail="Invalid JSON format") from e  # noqa: B904
-
-    new_data_parallel_size = body.get("new_data_parallel_size")
-    drain_timeout = body.get("drain_timeout", 120)  # Default 2 minutes
-
-    if new_data_parallel_size is None:
-        raise HTTPException(
-            status_code=400, detail="new_data_parallel_size is required"
-        )
-
-    if not isinstance(new_data_parallel_size, int) or new_data_parallel_size <= 0:
-        raise HTTPException(
-            status_code=400, detail="new_data_parallel_size must be a positive integer"
-        )
-
-    if not isinstance(drain_timeout, int) or drain_timeout <= 0:
-        raise HTTPException(
-            status_code=400, detail="drain_timeout must be a positive integer"
-        )
-
-    # Set scaling flag to prevent new requests
-    global _scaling_elastic_ep
-    _scaling_elastic_ep = True
-    client = engine_client(raw_request)
-    try:
-        await client.scale_elastic_ep(new_data_parallel_size, drain_timeout)
-        return JSONResponse(
-            {
-                "message": f"Scaled to {new_data_parallel_size} data parallel engines",
-            }
-        )
-    except TimeoutError as e:
-        raise HTTPException(
-            status_code=408,
-            detail="Scale failed due to request drain timeout "
-            f"after {drain_timeout} seconds",
-        ) from e
-    except Exception as e:
-        logger.error("Scale failed: %s", e)
-        raise HTTPException(status_code=500, detail="Scale failed") from e
-    finally:
-        _scaling_elastic_ep = False
-
-
-@router.post("/is_scaling_elastic_ep")
-async def is_scaling_elastic_ep(raw_request: Request):
-    return JSONResponse({"is_scaling_elastic_ep": _scaling_elastic_ep})
-
-
-@router.post(
-    "/inference/v1/generate",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-@with_cancellation
-@load_aware_call
-async def generate(request: GenerateRequest, raw_request: Request):
-    handler = generate_tokens(raw_request)
-    if handler is None:
-        return base(raw_request).create_error_response(
-            message="The model does not support generate tokens API"
-        )
-    try:
-        generator = await handler.serve_tokens(request, raw_request)
-    except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(
-            content=generator.model_dump(), status_code=generator.error.code
-        )
-
-    elif isinstance(generator, GenerateResponse):
-        return JSONResponse(content=generator.model_dump())
-
-    return StreamingResponse(content=generator, media_type="text/event-stream")
-
-
-if envs.VLLM_TORCH_PROFILER_DIR:
-    logger.warning_once(
-        "Torch Profiler is enabled in the API server. This should ONLY be "
-        "used for local development!"
-    )
-elif envs.VLLM_TORCH_CUDA_PROFILE:
-    logger.warning_once(
-        "CUDA Profiler is enabled in the API server. This should ONLY be "
-        "used for local development!"
-    )
-if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
-
-    @router.post("/start_profile")
-    async def start_profile(raw_request: Request):
-        logger.info("Starting profiler...")
-        await engine_client(raw_request).start_profile()
-        logger.info("Profiler started.")
-        return Response(status_code=200)
-
-    @router.post("/stop_profile")
-    async def stop_profile(raw_request: Request):
-        logger.info("Stopping profiler...")
-        await engine_client(raw_request).stop_profile()
-        logger.info("Profiler stopped.")
-        return Response(status_code=200)
-
-
 def load_log_config(log_config_file: str | None) -> dict | None:
     if not log_config_file:
         return None
@@ -1176,41 +809,6 @@ class XRequestIdMiddleware:
         return self.app(scope, receive, send_with_request_id)
 
 
-# Global variable to track scaling state
-_scaling_elastic_ep = False
-
-
-class ScalingMiddleware:
-    """
-    Middleware that checks if the model is currently scaling and
-    returns a 503 Service Unavailable response if it is.
-
-    This middleware applies to all HTTP requests and prevents
-    processing when the model is in a scaling state.
-    """
-
-    def __init__(self, app: ASGIApp) -> None:
-        self.app = app
-
-    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
-        if scope["type"] != "http":
-            return self.app(scope, receive, send)
-
-        # Check global scaling state
-        global _scaling_elastic_ep
-        if _scaling_elastic_ep:
-            # Return 503 Service Unavailable response
-            response = JSONResponse(
-                content={
-                    "error": "The model is currently scaling. Please try again later."
-                },
-                status_code=503,
-            )
-            return response(scope, receive, send)
-
-        return self.app(scope, receive, send)
-
-
 def _extract_content_from_chunk(chunk_data: dict) -> str:
     """Extract content from a streaming response chunk."""
     try:
@@ -1353,15 +951,10 @@ def build_app(args: Namespace) -> FastAPI:
         )
     else:
         app = FastAPI(lifespan=lifespan)
+    app.state.args = args
+    from vllm.entrypoints.serve import register_vllm_serve_api_routers
 
-    if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
-        logger.warning(
-            "LoRA dynamic loading & unloading is enabled in the API server. "
-            "This should ONLY be used for local development!"
-        )
-        from vllm.entrypoints.dynamic_lora import register_dynamic_lora_routes
-
-        register_dynamic_lora_routes(router)
+    register_vllm_serve_api_routers(app)
 
     from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
 
@@ -1370,8 +963,6 @@ def build_app(args: Namespace) -> FastAPI:
 
     app.root_path = args.root_path
 
-    mount_metrics(app)
-
     from vllm.entrypoints.pooling import register_pooling_api_routers
 
     register_pooling_api_routers(app)
@@ -1462,31 +1053,6 @@ def build_app(args: Namespace) -> FastAPI:
             )
 
     app = sagemaker_standards.bootstrap(app)
-    # Optional endpoints
-    if args.tokens_only:
-
-        @app.post("/abort_requests")
-        async def abort_requests(raw_request: Request):
-            """
-            Abort one or more requests. To be used in a
-            Disaggregated Everything setup.
-            """
-            try:
-                body = await raw_request.json()
-            except json.JSONDecodeError as e:
-                raise HTTPException(
-                    status_code=HTTPStatus.BAD_REQUEST.value,
-                    detail=f"JSON decode error: {e}",
-                ) from e
-            request_ids = body.get("request_ids")
-            if request_ids is None:
-                raise HTTPException(
-                    status_code=HTTPStatus.BAD_REQUEST.value,
-                    detail="Missing 'request_ids' in request body",
-                )
-            # Abort requests in background
-            asyncio.create_task(engine_client(raw_request).abort(request_ids))
-            return Response(status_code=200)
 
     return app
 
@@ -1515,7 +1081,7 @@ async def init_app_state(
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
     state.vllm_config = vllm_config
-
+    state.args = args
     supported_tasks = await engine_client.get_supported_tasks()
     logger.info("Supported tasks: %s", supported_tasks)
 
@@ -1839,7 +1405,6 @@ async def run_server_worker(
         args,
         client_config=client_config,
     ) as engine_client:
-        maybe_register_tokenizer_info_endpoint(args)
         app = build_app(args)
 
         await init_app_state(engine_client, app.state, args)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 1d89aa011..67291f45a 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -74,8 +74,6 @@ from vllm.entrypoints.openai.protocol import (
     ErrorResponse,
     FunctionCall,
     FunctionDefinition,
-    GenerateRequest,
-    GenerateResponse,
     ResponsesRequest,
     TokenizeChatRequest,
     TokenizeCompletionRequest,
@@ -87,6 +85,7 @@ from vllm.entrypoints.openai.protocol import (
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
+from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
diff --git a/vllm/entrypoints/sagemaker/routes.py b/vllm/entrypoints/sagemaker/routes.py
index 108fdd773..ea88c0fc4 100644
--- a/vllm/entrypoints/sagemaker/routes.py
+++ b/vllm/entrypoints/sagemaker/routes.py
@@ -16,7 +16,6 @@ from vllm.entrypoints.openai.api_server import (
     completion,
     create_chat_completion,
     create_completion,
-    health,
     validate_json_request,
 )
 from vllm.entrypoints.openai.protocol import (
@@ -38,6 +37,7 @@ from vllm.entrypoints.pooling.score.api_router import (
     score,
 )
 from vllm.entrypoints.pooling.score.protocol import RerankRequest, ScoreRequest
+from vllm.entrypoints.serve.instrumentator.health import health
 
 # TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
 # (requires typing_extensions >= 4.13)
diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py
new file mode 100644
index 000000000..c4fcc92db
--- /dev/null
+++ b/vllm/entrypoints/serve/__init__.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from fastapi import FastAPI
+
+
+def register_vllm_serve_api_routers(app: FastAPI):
+    from vllm.entrypoints.serve.lora.api_router import (
+        attach_router as attach_lora_router,
+    )
+
+    attach_lora_router(app)
+    from vllm.entrypoints.serve.elastic_ep.api_router import (
+        attach_router as attach_elastic_ep_router,
+    )
+
+    attach_elastic_ep_router(app)
+
+    from vllm.entrypoints.serve.profile.api_router import (
+        attach_router as attach_profile_router,
+    )
+
+    attach_profile_router(app)
+
+    from vllm.entrypoints.serve.sleep.api_router import (
+        attach_router as attach_sleep_router,
+    )
+
+    attach_sleep_router(app)
+
+    from vllm.entrypoints.serve.tokenize.api_router import (
+        attach_router as attach_tokenize_router,
+    )
+
+    attach_tokenize_router(app)
+
+    from vllm.entrypoints.serve.disagg.api_router import (
+        attach_router as attach_disagg_router,
+    )
+
+    attach_disagg_router(app)
+
+    from vllm.entrypoints.serve.rlhf.api_router import (
+        attach_router as attach_rlhf_router,
+    )
+
+    attach_rlhf_router(app)
+
+    from vllm.entrypoints.serve.instrumentator.metrics import (
+        attach_router as attach_metrics_router,
+    )
+
+    attach_metrics_router(app)
+
+    from vllm.entrypoints.serve.instrumentator.health import (
+        attach_router as attach_health_router,
+    )
+
+    attach_health_router(app)
diff --git a/vllm/entrypoints/serve/disagg/__init__.py b/vllm/entrypoints/serve/disagg/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py
new file mode 100644
index 000000000..c38ede30d
--- /dev/null
+++ b/vllm/entrypoints/serve/disagg/api_router.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import asyncio
+import json
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, Response
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.openai.api_server import validate_json_request
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+)
+from vllm.entrypoints.serve.disagg.protocol import (
+    GenerateRequest,
+    GenerateResponse,
+)
+from vllm.entrypoints.serve.disagg.serving import (
+    ServingTokens,
+)
+from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
+from vllm.entrypoints.utils import (
+    load_aware_call,
+    with_cancellation,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def tokenization(request: Request) -> OpenAIServingTokenization:
+    return request.app.state.openai_serving_tokenization
+
+
+def generate_tokens(request: Request) -> ServingTokens | None:
+    return request.app.state.serving_tokens
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+router = APIRouter()
+
+
+@router.post(
+    "/inference/v1/generate",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def generate(request: GenerateRequest, raw_request: Request):
+    handler = generate_tokens(raw_request)
+    if handler is None:
+        return tokenization(raw_request).create_error_response(
+            message="The model does not support generate tokens API"
+        )
+    try:
+        generator = await handler.serve_tokens(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+
+    elif isinstance(generator, GenerateResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    return StreamingResponse(content=generator, media_type="text/event-stream")
+
+
+def attach_router(app: FastAPI):
+    if getattr(app.state.args, "tokens_only", False):
+
+        @router.post("/abort_requests")
+        async def abort_requests(raw_request: Request):
+            """
+            Abort one or more requests. To be used in a
+            Disaggregated Everything setup.
+            """
+            try:
+                body = await raw_request.json()
+            except json.JSONDecodeError as e:
+                raise HTTPException(
+                    status_code=HTTPStatus.BAD_REQUEST.value,
+                    detail=f"JSON decode error: {e}",
+                ) from e
+            request_ids = body.get("request_ids")
+            if request_ids is None:
+                raise HTTPException(
+                    status_code=HTTPStatus.BAD_REQUEST.value,
+                    detail="Missing 'request_ids' in request body",
+                )
+            # Abort requests in background
+            asyncio.create_task(engine_client(raw_request).abort(request_ids))
+            return Response(status_code=200)
+
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/disagg/protocol.py b/vllm/entrypoints/serve/disagg/protocol.py
new file mode 100644
index 000000000..251fcf12e
--- /dev/null
+++ b/vllm/entrypoints/serve/disagg/protocol.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionLogProbs,
+    Logprob,
+    SamplingParams,
+    StreamOptions,
+)
+from vllm.utils import random_uuid
+
+
+####### Tokens IN <> Tokens OUT #######
+class GenerateRequest(BaseModel):
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    token_ids: list[int]
+    """The token ids to generate text from."""
+
+    # features: MultiModalFeatureSpec
+    # TODO (NickLucche): implement once Renderer work is completed
+    features: str | None = None
+    """The processed MM inputs for the model."""
+
+    sampling_params: SamplingParams
+    """The sampling parameters for the model."""
+
+    model: str | None = None
+
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker to guess prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bit)."
+        ),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+
+
+class GenerateResponseChoice(BaseModel):
+    index: int
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    token_ids: list[int] | None = None
+
+
+class GenerateResponse(BaseModel):
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    choices: list[GenerateResponseChoice]
+
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
diff --git a/vllm/entrypoints/openai/serving_tokens.py b/vllm/entrypoints/serve/disagg/serving.py
similarity index 99%
rename from vllm/entrypoints/openai/serving_tokens.py
rename to vllm/entrypoints/serve/disagg/serving.py
index daa739e41..5c1d17156 100644
--- a/vllm/entrypoints/openai/serving_tokens.py
+++ b/vllm/entrypoints/serve/disagg/serving.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
 import asyncio
 import time
 from collections.abc import AsyncGenerator
@@ -14,15 +16,17 @@ from vllm.entrypoints.openai.protocol import (
     ChatCompletionLogProbs,
     ChatCompletionLogProbsContent,
     ErrorResponse,
-    GenerateRequest,
-    GenerateResponse,
-    GenerateResponseChoice,
     PromptTokenUsageInfo,
     RequestResponseMetadata,
     UsageInfo,
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.serve.disagg.protocol import (
+    GenerateRequest,
+    GenerateResponse,
+    GenerateResponseChoice,
+)
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
diff --git a/vllm/entrypoints/serve/elastic_ep/__init__.py b/vllm/entrypoints/serve/elastic_ep/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py
new file mode 100644
index 000000000..21d5d2e60
--- /dev/null
+++ b/vllm/entrypoints/serve/elastic_ep/api_router.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import json
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.openai.api_server import validate_json_request
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+)
+from vllm.entrypoints.serve.elastic_ep.middleware import (
+    get_scaling_elastic_ep,
+    set_scaling_elastic_ep,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+router = APIRouter()
+
+
+@router.post(
+    "/scale_elastic_ep",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"model": dict},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.REQUEST_TIMEOUT.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+async def scale_elastic_ep(raw_request: Request):
+    try:
+        body = await raw_request.json()
+    except json.JSONDecodeError as e:
+        raise HTTPException(status_code=400, detail="Invalid JSON format") from e  # noqa: B904
+
+    new_data_parallel_size = body.get("new_data_parallel_size")
+    drain_timeout = body.get("drain_timeout", 120)  # Default 2 minutes
+
+    if new_data_parallel_size is None:
+        raise HTTPException(
+            status_code=400, detail="new_data_parallel_size is required"
+        )
+
+    if not isinstance(new_data_parallel_size, int) or new_data_parallel_size <= 0:
+        raise HTTPException(
+            status_code=400,
+            detail="new_data_parallel_size must be a positive integer",
+        )
+
+    if not isinstance(drain_timeout, int) or drain_timeout <= 0:
+        raise HTTPException(
+            status_code=400, detail="drain_timeout must be a positive integer"
+        )
+
+    # Set scaling flag to prevent new requests
+    set_scaling_elastic_ep(True)
+    client = engine_client(raw_request)
+    try:
+        await client.scale_elastic_ep(new_data_parallel_size, drain_timeout)
+        return JSONResponse(
+            {
+                "message": f"Scaled to {new_data_parallel_size} data parallel engines",
+            }
+        )
+    except TimeoutError as e:
+        raise HTTPException(
+            status_code=408,
+            detail="Scale failed due to request drain timeout "
+            f"after {drain_timeout} seconds",
+        ) from e
+    except Exception as e:
+        logger.error("Scale failed: %s", e)
+        raise HTTPException(status_code=500, detail="Scale failed") from e
+    finally:
+        set_scaling_elastic_ep(False)
+
+
+@router.post("/is_scaling_elastic_ep")
+async def is_scaling_elastic_ep(raw_request: Request):
+    return JSONResponse({"is_scaling_elastic_ep": get_scaling_elastic_ep()})
+
+
+def attach_router(app: FastAPI):
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/elastic_ep/middleware.py b/vllm/entrypoints/serve/elastic_ep/middleware.py
new file mode 100644
index 000000000..23f45eafe
--- /dev/null
+++ b/vllm/entrypoints/serve/elastic_ep/middleware.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Awaitable
+
+from fastapi.responses import JSONResponse
+from starlette.types import ASGIApp, Receive, Scope, Send
+
+# Global variable to track scaling state
+_scaling_elastic_ep = False
+
+
+def get_scaling_elastic_ep():
+    return _scaling_elastic_ep
+
+
+def set_scaling_elastic_ep(value):
+    global _scaling_elastic_ep
+    _scaling_elastic_ep = value
+
+
+class ScalingMiddleware:
+    """
+    Middleware that checks if the model is currently scaling and
+    returns a 503 Service Unavailable response if it is.
+
+    This middleware applies to all HTTP requests and prevents
+    processing when the model is in a scaling state.
+    """
+
+    def __init__(self, app: ASGIApp) -> None:
+        self.app = app
+
+    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
+        if scope["type"] != "http":
+            return self.app(scope, receive, send)
+
+        # Check global scaling state
+        if get_scaling_elastic_ep():
+            # Return 503 Service Unavailable response
+            response = JSONResponse(
+                content={
+                    "error": "The model is currently scaling. Please try again later."
+                },
+                status_code=503,
+            )
+            return response(scope, receive, send)
+
+        return self.app(scope, receive, send)
diff --git a/vllm/entrypoints/serve/instrumentator/__init__.py b/vllm/entrypoints/serve/instrumentator/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/serve/instrumentator/health.py b/vllm/entrypoints/serve/instrumentator/health.py
new file mode 100644
index 000000000..029ef677a
--- /dev/null
+++ b/vllm/entrypoints/serve/instrumentator/health.py
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from fastapi import APIRouter, Request
+from fastapi.responses import Response
+
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+from vllm.v1.engine.exceptions import EngineDeadError
+
+logger = init_logger(__name__)
+
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+@router.get("/health", response_class=Response)
+async def health(raw_request: Request) -> Response:
+    """Health check."""
+    try:
+        await engine_client(raw_request).check_health()
+        return Response(status_code=200)
+    except EngineDeadError:
+        return Response(status_code=503)
+
+
+def attach_router(app):
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/instrumentator/metrics.py b/vllm/entrypoints/serve/instrumentator/metrics.py
new file mode 100644
index 000000000..efe0c63a9
--- /dev/null
+++ b/vllm/entrypoints/serve/instrumentator/metrics.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import re
+
+import prometheus_client
+from fastapi import FastAPI, Response
+from prometheus_client import make_asgi_app
+from prometheus_fastapi_instrumentator import Instrumentator
+from starlette.routing import Mount
+
+from vllm.v1.metrics.prometheus import get_prometheus_registry
+
+
+class PrometheusResponse(Response):
+    media_type = prometheus_client.CONTENT_TYPE_LATEST
+
+
+def attach_router(app: FastAPI):
+    """Mount prometheus metrics to a FastAPI app."""
+
+    registry = get_prometheus_registry()
+
+    # `response_class=PrometheusResponse` is needed to return an HTTP response
+    # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
+    # instead of the default "application/json" which is incorrect.
+    # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364
+    Instrumentator(
+        excluded_handlers=[
+            "/metrics",
+            "/health",
+            "/load",
+            "/ping",
+            "/version",
+            "/server_info",
+        ],
+        registry=registry,
+    ).add().instrument(app).expose(app, response_class=PrometheusResponse)
+
+    # Add prometheus asgi middleware to route /metrics requests
+    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))
+
+    # Workaround for 307 Redirect for /metrics
+    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
+    app.routes.append(metrics_route)
diff --git a/vllm/entrypoints/serve/lora/__init__.py b/vllm/entrypoints/serve/lora/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/dynamic_lora.py b/vllm/entrypoints/serve/lora/api_router.py
similarity index 80%
rename from vllm/entrypoints/dynamic_lora.py
rename to vllm/entrypoints/serve/lora/api_router.py
index cc0f437e5..6a57e73f3 100644
--- a/vllm/entrypoints/dynamic_lora.py
+++ b/vllm/entrypoints/serve/lora/api_router.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
 import model_hosting_container_standards.sagemaker as sagemaker_standards
-from fastapi import APIRouter, Depends, Request
+from fastapi import APIRouter, Depends, FastAPI, Request
 from fastapi.responses import JSONResponse, Response
 
+from vllm import envs
 from vllm.entrypoints.openai.api_server import models, validate_json_request
 from vllm.entrypoints.openai.protocol import (
     ErrorResponse,
@@ -14,9 +17,18 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
+router = APIRouter()
 
 
-def register_dynamic_lora_routes(router: APIRouter):
+def attach_router(app: FastAPI):
+    if not envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+        """If LoRA dynamic loading & unloading is not enabled, do nothing."""
+        return
+    logger.warning(
+        "LoRA dynamic loading & unloading is enabled in the API server. "
+        "This should ONLY be used for local development!"
+    )
+
     @sagemaker_standards.register_load_adapter_handler(
         request_shape={
             "lora_name": "body.name",
@@ -54,4 +66,5 @@ def register_dynamic_lora_routes(router: APIRouter):
 
         return Response(status_code=200, content=response)
 
-    return router
+    # register the router
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/profile/__init__.py b/vllm/entrypoints/serve/profile/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/serve/profile/api_router.py b/vllm/entrypoints/serve/profile/api_router.py
new file mode 100644
index 000000000..166f13764
--- /dev/null
+++ b/vllm/entrypoints/serve/profile/api_router.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from fastapi import APIRouter, FastAPI, Request
+from fastapi.responses import Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+@router.post("/start_profile")
+async def start_profile(raw_request: Request):
+    logger.info("Starting profiler...")
+    await engine_client(raw_request).start_profile()
+    logger.info("Profiler started.")
+    return Response(status_code=200)
+
+
+@router.post("/stop_profile")
+async def stop_profile(raw_request: Request):
+    logger.info("Stopping profiler...")
+    await engine_client(raw_request).stop_profile()
+    logger.info("Profiler stopped.")
+    return Response(status_code=200)
+
+
+def attach_router(app: FastAPI):
+    if envs.VLLM_TORCH_PROFILER_DIR:
+        logger.warning_once(
+            "Torch Profiler is enabled in the API server. This should ONLY be "
+            "used for local development!"
+        )
+    elif envs.VLLM_TORCH_CUDA_PROFILE:
+        logger.warning_once(
+            "CUDA Profiler is enabled in the API server. This should ONLY be "
+            "used for local development!"
+        )
+    if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
+        app.include_router(router)
diff --git a/vllm/entrypoints/serve/rlhf/__init__.py b/vllm/entrypoints/serve/rlhf/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/serve/rlhf/api_router.py b/vllm/entrypoints/serve/rlhf/api_router.py
new file mode 100644
index 000000000..3b37840ae
--- /dev/null
+++ b/vllm/entrypoints/serve/rlhf/api_router.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from http import HTTPStatus
+
+from fastapi import APIRouter, FastAPI, Query, Request
+from fastapi.responses import JSONResponse
+
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+router = APIRouter()
+
+
+@router.post("/pause")
+async def pause_generation(
+    raw_request: Request,
+    wait_for_inflight_requests: bool = Query(False),
+    clear_cache: bool = Query(True),
+) -> JSONResponse:
+    """Pause generation requests to allow weight updates.
+
+    Args:
+        wait_for_inflight_requests: When ``True`` waits for in-flight
+            requests to finish before pausing. When ``False`` (default),
+            aborts any in-flight requests immediately.
+        clear_cache: Whether to clear KV/prefix caches after draining.
+    """
+
+    engine = engine_client(raw_request)
+
+    try:
+        await engine.pause_generation(
+            wait_for_inflight_requests=wait_for_inflight_requests,
+            clear_cache=clear_cache,
+        )
+        return JSONResponse(
+            content={"status": "paused"},
+            status_code=HTTPStatus.OK.value,
+        )
+
+    except ValueError as err:
+        return JSONResponse(
+            content={"error": str(err)},
+            status_code=HTTPStatus.BAD_REQUEST.value,
+        )
+    except Exception as err:  # pragma: no cover - defensive
+        logger.exception("Failed to pause generation")
+        return JSONResponse(
+            content={"error": f"Failed to pause generation: {err}"},
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+        )
+
+
+@router.post("/resume")
+async def resume_generation(raw_request: Request) -> JSONResponse:
+    """Resume generation after a pause."""
+
+    engine = engine_client(raw_request)
+
+    try:
+        await engine.resume_generation()
+        return JSONResponse(
+            content={"status": "resumed"},
+            status_code=HTTPStatus.OK.value,
+        )
+    except Exception as err:  # pragma: no cover - defensive
+        logger.exception("Failed to resume generation")
+        return JSONResponse(
+            content={"error": f"Failed to resume generation: {err}"},
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+        )
+
+
+@router.get("/is_paused")
+async def is_paused(raw_request: Request) -> JSONResponse:
+    """Return the current pause status."""
+
+    engine = engine_client(raw_request)
+
+    try:
+        paused = await engine.is_paused()
+    except Exception as err:  # pragma: no cover - defensive
+        logger.exception("Failed to fetch pause status")
+        return JSONResponse(
+            content={"error": f"Failed to fetch pause status: {err}"},
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+        )
+
+    return JSONResponse(content={"is_paused": paused})
+
+
+def attach_router(app: FastAPI):
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/sleep/__init__.py b/vllm/entrypoints/serve/sleep/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/sleep/api_router.py
new file mode 100644
index 000000000..bc01e1853
--- /dev/null
+++ b/vllm/entrypoints/serve/sleep/api_router.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from fastapi import APIRouter, FastAPI, Request
+from fastapi.responses import JSONResponse, Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+router = APIRouter()
+
+
+@router.post("/sleep")
+async def sleep(raw_request: Request):
+    # get POST params
+    level = raw_request.query_params.get("level", "1")
+    await engine_client(raw_request).sleep(int(level))
+    # FIXME: in v0 with frontend multiprocessing, the sleep command
+    # is sent but does not finish yet when we return a response.
+    return Response(status_code=200)
+
+
+@router.post("/wake_up")
+async def wake_up(raw_request: Request):
+    tags = raw_request.query_params.getlist("tags")
+    if tags == []:
+        # set to None to wake up all tags if no tags are provided
+        tags = None
+    logger.info("wake up the engine with tags: %s", tags)
+    await engine_client(raw_request).wake_up(tags)
+    # FIXME: in v0 with frontend multiprocessing, the wake-up command
+    # is sent but does not finish yet when we return a response.
+    return Response(status_code=200)
+
+
+@router.get("/is_sleeping")
+async def is_sleeping(raw_request: Request):
+    logger.info("check whether the engine is sleeping")
+    is_sleeping = await engine_client(raw_request).is_sleeping()
+    return JSONResponse(content={"is_sleeping": is_sleeping})
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    logger.warning(
+        "SECURITY WARNING: Development endpoints are enabled! "
+        "This should NOT be used in production!"
+    )
+
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/tokenize/__init__.py b/vllm/entrypoints/serve/tokenize/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/entrypoints/serve/tokenize/api_router.py b/vllm/entrypoints/serve/tokenize/api_router.py
new file mode 100644
index 000000000..a10e78c8d
--- /dev/null
+++ b/vllm/entrypoints/serve/tokenize/api_router.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
+from typing_extensions import assert_never
+
+from vllm.entrypoints.openai.api_server import validate_json_request
+from vllm.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
+from vllm.entrypoints.utils import (
+    with_cancellation,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def tokenization(request: Request) -> OpenAIServingTokenization:
+    return request.app.state.openai_serving_tokenization
+
+
+router = APIRouter()
+
+
+@router.post(
+    "/tokenize",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+async def tokenize(request: TokenizeRequest, raw_request: Request):
+    handler = tokenization(raw_request)
+
+    try:
+        generator = await handler.create_tokenize(request, raw_request)
+    except NotImplementedError as e:
+        raise HTTPException(
+            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
+        ) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+    elif isinstance(generator, TokenizeResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post(
+    "/detokenize",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+async def detokenize(request: DetokenizeRequest, raw_request: Request):
+    handler = tokenization(raw_request)
+
+    try:
+        generator = await handler.create_detokenize(request, raw_request)
+    except OverflowError as e:
+        raise RequestValidationError(errors=[str(e)]) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+    elif isinstance(generator, DetokenizeResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+def attach_router(app: FastAPI):
+    if getattr(app.state.args, "enable_tokenizer_info_endpoint", False):
+        """Conditionally register the tokenizer info endpoint if enabled."""
+
+        @router.get("/tokenizer_info")
+        async def get_tokenizer_info(raw_request: Request):
+            """Get comprehensive tokenizer information."""
+            result = await tokenization(raw_request).get_tokenizer_info()
+            return JSONResponse(
+                content=result.model_dump(),
+                status_code=result.error.code
+                if isinstance(result, ErrorResponse)
+                else 200,
+            )
+
+    app.include_router(router)
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/serve/tokenize/serving.py
similarity index 100%
rename from vllm/entrypoints/openai/serving_tokenization.py
rename to vllm/entrypoints/serve/tokenize/serving.py
-- 
GitLab


From 7fe9c1a2232275ee4cc7d65af3bc5b648543f367 Mon Sep 17 00:00:00 2001
From: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Date: Wed, 3 Dec 2025 17:51:08 +0800
Subject: [PATCH 036/258] [CI] Add Async Eplb nightly CI tests (#29385)

Signed-off-by: David Chen <530634352@qq.com>
Signed-off-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 .../deepseek_v2_lite_ep_async_eplb.sh         | 73 ++++++++++++++++++
 .../deepseek_v2_lite_ep_eplb.sh               |  1 +
 .../qwen3_next_mtp_async_eplb.sh              | 74 +++++++++++++++++++
 .buildkite/test-pipeline.yaml                 | 20 ++++-
 vllm/distributed/eplb/rebalance_execute.py    |  3 -
 5 files changed, 167 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
 create mode 100644 .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh

diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
new file mode 100644
index 000000000..d7167161b
--- /dev/null
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8030}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="deepseek-ai/DeepSeek-V2-lite"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
index 8106f50f1..693418da6 100644
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
     --data-parallel-size 2 \
     --enable-expert-parallel \
     --enable-eplb \
+    --eplb-config '{"window_size":200,"step_interval":600}' \
     --trust-remote-code \
     --max-model-len 2048 \
     --port $PORT &
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
new file mode 100644
index 000000000..937a43d1a
--- /dev/null
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8040}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 4 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --gpu-memory-utilization 0.9 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 52c848c78..f79e92665 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1373,4 +1373,22 @@ steps:
   num_gpus: 2
   working_dir: "/vllm-workspace"
   commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index 376dad8a7..55856d940 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -322,9 +322,6 @@ async def transfer_layer(
     num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
     assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
     assert num_physical_experts == ep_size * num_local_physical_experts
-    # A buffer to hold the expert weights in one layer during the exchange.
-    # NOTE: Currently we assume the same weights across different layers
-    # have the same shape.
 
     is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
         num_local_experts=num_local_physical_experts,
-- 
GitLab


From a21cd9ed239b853bd587ffe3c9140fe68cd41f59 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 3 Dec 2025 18:05:10 +0800
Subject: [PATCH 037/258] [Bugfix] Fix incorrect `image_grid_thw` rank for
 HunyuanOCR from missing `merge_by_field_config=True` (#29950)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../vision_language_multi_image.py            | 23 +++++++++++++++++++
 vllm/model_executor/models/hunyuan_vision.py  |  1 +
 2 files changed, 24 insertions(+)

diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 2193b1ca9..560ca768d 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -309,6 +309,28 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+# HunyuanOCR
+def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "tencent/HunyuanOCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholder = (
+        "<｜hy_place▁holder▁no▁100｜><｜hy_place▁holder▁no▁102｜><｜hy_place▁holder▁no▁101｜>"  # noqa: E501
+    ) * len(image_urls)
+    prompt = f"<｜hy_begin▁of▁sentence｜>{placeholder}{question}<｜hy_User｜>"
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_hyperclovax_seed_vision(
     question: str, image_urls: list[str]
 ) -> ModelRequestData:
@@ -1322,6 +1344,7 @@ model_example_map = {
     "deepseek_ocr": load_deepseek_ocr,
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
+    "hunyuan_vl": load_hunyuan_vl,
     "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "idefics3": load_idefics3,
     "interns1": load_interns1,
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 2950db571..6537b6df8 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -785,6 +785,7 @@ class HunYuanVLForConditionalGeneration(
     SupportsQuant,
     SupportsXDRoPE,
 ):
+    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     # To ensure correct weight loading and mapping.
-- 
GitLab


From cc4e296ea62226632de5285621fd0cd287621ddc Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 3 Dec 2025 18:27:36 +0800
Subject: [PATCH 038/258] [CI/Build] Avoid duplicate empty inputs test for
 common multimodal generation tests (#29907)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../multimodal/generation/test_common.py      |  14 +--
 .../generation/vlm_utils/case_filtering.py    | 114 +++++++++---------
 .../multimodal/generation/vlm_utils/types.py  |   4 +-
 3 files changed, 69 insertions(+), 63 deletions(-)

diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index deaeea059..0eaf7198f 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "qwen2_5_omni": VLMTestInfo(
@@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForTextToWaveform,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "qwen3_vl": VLMTestInfo(
@@ -173,7 +173,7 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[
             pytest.mark.core_model,
         ],
@@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = {
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
         stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],
-        image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
+        image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
     ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
@@ -707,7 +707,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForCausalLM,
-        image_size_factors=[(), (0.25,)],
+        image_size_factors=[(0.25,)],
         marks=[
             pytest.mark.skipif(
                 Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
@@ -760,7 +760,7 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.cpu_model],
     ),
     "skywork_r1v": VLMTestInfo(
@@ -812,7 +812,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.skip("Model initialization hangs")],
     ),
     ### Tensor parallel / multi-gpu broadcast tests
diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
index d42150bcb..116eead7a 100644
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -62,6 +62,65 @@ def get_filtered_test_settings(
     return matching_tests
 
 
+def get_model_type_cases(
+    model_type: str,
+    test_info: VLMTestInfo,
+    test_type: VLMTestType,
+):
+    # Ensure that something is wrapped as an iterable it's not already
+    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
+
+    # This is essentially the same as nesting a bunch of mark.parametrize
+    # decorators, but we do it programmatically to allow overrides for on
+    # a per-model basis, while still being able to execute each of these
+    # as individual test cases in pytest.
+    iter_kwargs = OrderedDict(
+        [
+            ("model", ensure_wrapped(test_info.models)),
+            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
+            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
+            ("dtype", ensure_wrapped(test_info.dtype)),
+            (
+                "distributed_executor_backend",
+                ensure_wrapped(test_info.distributed_executor_backend),
+            ),
+        ]
+    )
+
+    # num_frames is video only
+    if test_type == VLMTestType.VIDEO:
+        iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
+        iter_kwargs["needs_video_metadata"] = ensure_wrapped(
+            test_info.needs_video_metadata
+        )
+
+    # No sizes passed for custom inputs, since inputs are directly provided
+    if test_type not in (
+        VLMTestType.CUSTOM_INPUTS,
+        VLMTestType.AUDIO,
+    ):
+        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
+        if wrapped_sizes is None:
+            raise ValueError(f"Sizes must be set for test type {test_type}")
+        iter_kwargs["size_wrapper"] = wrapped_sizes
+
+    # Otherwise expand the custom test options instead
+    elif test_type == VLMTestType.CUSTOM_INPUTS:
+        if test_info.custom_test_opts is None:
+            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
+        iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
+
+    # Wrap all model cases in a pytest parameter & pass marks through
+    return [
+        pytest.param(
+            model_type,
+            ExpandableVLMTestArgs(**{k: v for k, v in zip(iter_kwargs.keys(), case)}),
+            marks=test_info.marks if test_info.marks is not None else [],
+        )
+        for case in list(itertools.product(*iter_kwargs.values()))
+    ]
+
+
 def get_parametrized_options(
     test_settings: dict[str, VLMTestInfo],
     test_type: VLMTestType,
@@ -76,64 +135,11 @@ def get_parametrized_options(
         test_settings, test_type, create_new_process_for_each_test
     )
 
-    # Ensure that something is wrapped as an iterable it's not already
-    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
-
-    def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
-        # This is essentially the same as nesting a bunch of mark.parametrize
-        # decorators, but we do it programmatically to allow overrides for on
-        # a per-model basis, while still being able to execute each of these
-        # as individual test cases in pytest.
-        iter_kwargs = OrderedDict(
-            [
-                ("model", ensure_wrapped(test_info.models)),
-                ("max_tokens", ensure_wrapped(test_info.max_tokens)),
-                ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
-                ("dtype", ensure_wrapped(test_info.dtype)),
-                (
-                    "distributed_executor_backend",
-                    ensure_wrapped(test_info.distributed_executor_backend),
-                ),
-            ]
-        )
-
-        # num_frames is video only
-        if test_type == VLMTestType.VIDEO:
-            iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
-            iter_kwargs["needs_video_metadata"] = ensure_wrapped(
-                test_info.needs_video_metadata
-            )
-
-        # No sizes passed for custom inputs, since inputs are directly provided
-        if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
-            wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
-            if wrapped_sizes is None:
-                raise ValueError(f"Sizes must be set for test type {test_type}")
-            iter_kwargs["size_wrapper"] = wrapped_sizes
-
-        # Otherwise expand the custom test options instead
-        elif test_type == VLMTestType.CUSTOM_INPUTS:
-            if test_info.custom_test_opts is None:
-                raise ValueError("Test has type CUSTOM_INPUTS, but none given")
-            iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
-
-        # Wrap all model cases in a pytest parameter & pass marks through
-        return [
-            pytest.param(
-                model_type,
-                ExpandableVLMTestArgs(
-                    **{k: v for k, v in zip(iter_kwargs.keys(), case)}
-                ),
-                marks=test_info.marks if test_info.marks is not None else [],
-            )
-            for case in list(itertools.product(*iter_kwargs.values()))
-        ]
-
     # Get a list per model type, where each entry contains a tuple of all of
     # that model type's cases, then flatten them into the top level so that
     # we can consume them in one mark.parametrize call.
     cases_by_model_type = [
-        get_model_type_cases(model_type, test_info)
+        get_model_type_cases(model_type, test_info, test_type)
         for model_type, test_info in matching_tests.items()
     ]
     return list(itertools.chain(*cases_by_model_type))
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index 0c03c8449..ae2f75481 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL
 VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
 
 
-IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
-EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
+IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
+EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
 RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
 
 
-- 
GitLab


From 42c194964341bea9fc59e0d35db04dfafc3c473d Mon Sep 17 00:00:00 2001
From: Tsukasa OI <floss_llm@irq.a4lg.com>
Date: Wed, 3 Dec 2025 19:33:46 +0900
Subject: [PATCH 039/258] [Bugfix][Quantization] Support BF16 tensors on GGUF
 (#29948)

Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
---
 tests/models/quantization/test_gguf.py           |  7 +++++++
 vllm/model_executor/model_loader/weight_utils.py | 12 +++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index 3b9597507..064ca94f3 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
     gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
 )
 
+QWEN3_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen3-0.6B",
+    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
+    gguf_filename="Qwen3-0.6B-BF16.gguf",
+)
+
 PHI3_CONFIG = GGUFTestConfig(
     original_model="microsoft/Phi-3.5-mini-instruct",
     gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
 MODELS = [
     # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
     QWEN2_CONFIG,
+    QWEN3_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 0809bdfa9..0496b7a84 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -921,7 +921,17 @@ def gguf_quant_weights_iterator(
             name = gguf_to_hf_name_map[tensor.name]
             if weight_type.name not in ("F32", "BF16", "F16"):
                 name = name.replace("weight", "qweight")
-            param = torch.tensor(weight)
+            if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
+                # BF16 is currently the only "quantization" type that isn't
+                # actually quantized but is read as a raw byte tensor.
+                # Reinterpret as `torch.bfloat16` tensor.
+                weight = weight.view(np.uint16)
+                if reader.byte_order == "S":
+                    # GGUF endianness != system endianness
+                    weight = weight.byteswap()
+                param = torch.tensor(weight).view(torch.bfloat16)
+            else:
+                param = torch.tensor(weight)
             yield name, param
 
 
-- 
GitLab


From 787b84a9fc9d1744f82addf40912e9fb84c0b4c5 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Wed, 3 Dec 2025 02:42:49 -0800
Subject: [PATCH 040/258] [Bugfix] Follow-up fix on MediaWithBytes (#29951)

Signed-off-by: Roger Wang <hey@rogerw.io>
---
 vllm/multimodal/base.py   | 2 ++
 vllm/multimodal/inputs.py | 3 ++-
 vllm/multimodal/parse.py  | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 4a619fd30..53eb4c591 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]):
 
     The wrapper delegates attribute access to the underlying media object,
     making it behave transparently like the wrapped type (e.g., PIL.Image).
+
+    NOTE: Currently, this wrapper is used only for the image modality.
     """
 
     media: _T
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index f4e38b1f3..397684fa2 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     from PIL.Image import Image
     from transformers.feature_extraction_utils import BatchFeature
 
+    from .base import MediaWithBytes
     from .processing import MultiModalHashes
 
 else:
@@ -59,7 +60,7 @@ Represents a single audio
 item, which can be passed to a HuggingFace `AudioProcessor`.
 """
 
-ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
+ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
 """
 A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 650368dcb..c3c7cc2c3 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -484,7 +484,7 @@ class MultiModalDataParser:
             return ImageEmbeddingItems(data)
 
         if (
-            isinstance(data, PILImage.Image)
+            isinstance(data, (PILImage.Image, MediaWithBytes))
             or isinstance(data, (np.ndarray, torch.Tensor))
             and data.ndim == 3
         ):
-- 
GitLab


From b294e28db2c5dee61bc25157664edcada8b90b31 Mon Sep 17 00:00:00 2001
From: HDCharles <39544797+HDCharles@users.noreply.github.com>
Date: Wed, 3 Dec 2025 06:00:56 -0500
Subject: [PATCH 041/258] [refactor] CTMoEMethods to use QuantizationArgs
 (#28871)

Signed-off-by: HDCharles <charlesdavidhernandez@gmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../compressed_tensors/compressed_tensors.py  |   6 +-
 .../compressed_tensors_moe.py                 | 155 +++++++++---------
 2 files changed, 86 insertions(+), 75 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 02086c3c0..b91ecb59f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -767,8 +767,10 @@ class CompressedTensorsConfig(QuantizationConfig):
                 targets=self.target_scheme_map.keys(),
                 fused_mapping=self.packed_modules_mapping,
             )
-
-            return self.target_scheme_map[matched_target]
+            scheme_dict = self.target_scheme_map[matched_target]
+            if scheme_dict.get("format") is None:
+                scheme_dict["format"] = self.quant_format
+            return scheme_dict
 
         return None
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 80ee443d4..c7368bf42 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -7,7 +7,11 @@ from enum import Enum
 
 import torch
 from compressed_tensors import CompressionFormat
-from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy
+from compressed_tensors.quantization import (
+    ActivationOrdering,
+    QuantizationArgs,
+    QuantizationStrategy,
+)
 from torch.nn.parameter import Parameter
 
 import vllm.envs as envs
@@ -142,10 +146,26 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
         # are supported + check if the layer is being ignored.
         weight_quant = scheme_dict.get("weights")
         input_quant = scheme_dict.get("input_activations")
+        format = scheme_dict.get("format")
 
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
             # group_size=None means channelwise
             group_size = weight_quant.group_size or -1
+
+            valid_format_and_bits = (
+                weight_quant.num_bits in WNA16_SUPPORTED_BITS
+                and format == CompressionFormat.pack_quantized.value
+            )
+
+            if not valid_format_and_bits:
+                raise ValueError(
+                    "For Fused MoE layers, only format: ",
+                    f"{CompressionFormat.pack_quantized.value} ",
+                    f" and bits: {WNA16_SUPPORTED_BITS} is supported ",
+                    f"but got format: {CompressionFormat.pack_quantized.value} "
+                    f" and bits: {weight_quant.num_bits}",
+                )
+
             # Prefer to use the MarlinMoE kernel when it is supported.
             if (
                 not check_moe_marlin_supports_layer(layer, group_size)
@@ -161,12 +181,12 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
                     )
                 logger.info_once("Using CompressedTensorsWNA16MoEMethod")
                 return CompressedTensorsWNA16MoEMethod(
-                    quant_config, layer.moe_config, layer_name
+                    weight_quant, input_quant, layer.moe_config
                 )
             else:
                 logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                 return CompressedTensorsWNA16MarlinMoEMethod(
-                    quant_config, layer.moe_config, layer_name
+                    weight_quant, input_quant, layer.moe_config
                 )
         elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
             return CompressedTensorsW4A4Nvfp4MoEMethod(layer.moe_config, layer_name)
@@ -176,15 +196,15 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
             or quant_config._is_fp8_w8a8(weight_quant, input_quant)
         ):
             return CompressedTensorsW8A8Fp8MoEMethod(
-                quant_config, layer.moe_config, layer_name
+                weight_quant, input_quant, layer.moe_config
             )
         elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8Int8MoEMethod(
-                quant_config, layer.moe_config, layer_name
+                weight_quant, input_quant, layer.moe_config
             )
         elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
             return CompressedTensorsW4A8Int8MoEMethod(
-                quant_config, layer.moe_config, layer_name
+                weight_quant, input_quant, layer.moe_config
             )
         else:
             raise RuntimeError(
@@ -650,17 +670,19 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
 class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
     def __init__(
         self,
-        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs,
         moe: FusedMoEConfig,
         layer_name: str | None = None,
     ):
-        super().__init__(moe)
-        self.quant_config = quant_config
-        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
-        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
-            "input_activations"
+        from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+            CompressedTensorsConfig,
         )
 
+        super().__init__(moe)
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
+
         per_tensor = (
             self.weight_quant.strategy == QuantizationStrategy.TENSOR
             and self.input_quant.strategy == QuantizationStrategy.TENSOR
@@ -698,11 +720,13 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
 
         # cutlass path
-        self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100(
+        self.is_fp8_w8a8_sm100 = CompressedTensorsConfig._is_fp8_w8a8_sm100(
             self.weight_quant, self.input_quant
         )
         self.use_cutlass = not self.block_quant and (
-            quant_config._is_fp8_w8a8_sm90(self.weight_quant, self.input_quant)
+            CompressedTensorsConfig._is_fp8_w8a8_sm90(
+                self.weight_quant, self.input_quant
+            )
             or self.is_fp8_w8a8_sm100
         )
         self.disable_expert_map = False
@@ -1261,16 +1285,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
     def __init__(
         self,
-        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs,
         moe: FusedMoEConfig,
         layer_name: str | None = None,
     ):
         super().__init__(moe)
-        self.quant_config = quant_config
-        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
-        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
-            "input_activations"
-        )
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
 
         per_channel = (
             self.weight_quant.strategy == QuantizationStrategy.CHANNEL
@@ -1414,36 +1436,27 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
 class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
     def __init__(
         self,
-        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs | None,
         moe: FusedMoEConfig,
         layer_name: str | None = None,
     ):
         super().__init__(moe)
-        self.quant_config = quant_config
-        # TODO: @dsikka: refactor this to use schemes as other kernels
-        # are supported + check if the layer is being ignored.
-        config = self.quant_config.target_scheme_map["Linear"].get("weights")
-        self.num_bits = config.num_bits
-        self.packed_factor = 32 // config.num_bits
-        self.strategy = config.strategy
-        self.group_size = config.group_size
-        self.actorder = config.actorder
-        self.layer_name = layer_name
-        self.marlin_input_dtype = get_marlin_input_dtype(layer_name)
-        assert config.symmetric, "Only symmetric quantization is supported for MoE"
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
+        assert weight_quant.symmetric, (
+            "Only symmetric quantization is supported for MoE"
+        )
+        # Extract properties from weight_quant
+        self.num_bits = weight_quant.num_bits
+        self.packed_factor = 32 // weight_quant.num_bits
+        self.strategy = weight_quant.strategy
+        self.group_size = weight_quant.group_size
+        self.actorder = weight_quant.actorder
 
-        if not (
-            self.quant_config.quant_format == CompressionFormat.pack_quantized.value
-            and self.num_bits in WNA16_SUPPORTED_BITS
-        ):
-            raise ValueError(
-                "For Fused MoE layers, only ",
-                f"{CompressionFormat.pack_quantized.value} ",
-                "is supported for the following bits: ",
-                f"{WNA16_SUPPORTED_BITS}",
-            )
         self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits]
         self.use_marlin = True
+        self.marlin_input_dtype = get_marlin_input_dtype(layer_name)
 
     def create_weights(
         self,
@@ -1812,35 +1825,26 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
 class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
     def __init__(
         self,
-        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs | None,
         moe: FusedMoEConfig,
         layer_name: str | None = None,
     ):
         super().__init__(moe)
-        self.quant_config = quant_config
-        # TODO: @dsikka: refactor this to use schemes as other kernels
-        # are supported + check if the layer is being ignored.
-        config = self.quant_config.target_scheme_map["Linear"].get("weights")
-        self.num_bits = config.num_bits
-        self.packed_factor = 32 // config.num_bits
-        self.strategy = config.strategy
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
+        # Extract properties from weight_quant
+        self.num_bits = weight_quant.num_bits
+        self.packed_factor = 32 // weight_quant.num_bits
+        self.strategy = weight_quant.strategy
         # channelwise is not supported by this kernel
-        assert config.strategy == "group"
-        self.group_size = config.group_size
+        assert weight_quant.strategy == "group"
+        self.group_size = weight_quant.group_size
         # grouped actorder isn't supported by this kernel
-        assert config.actorder != "group"
-        assert config.symmetric, "Only symmetric quantization is supported for MoE"
-
-        if not (
-            self.quant_config.quant_format == CompressionFormat.pack_quantized.value
-            and self.num_bits in WNA16_SUPPORTED_BITS
-        ):
-            raise ValueError(
-                "For Fused MoE layers, only ",
-                f"{CompressionFormat.pack_quantized.value} ",
-                "is supported for the following bits: ",
-                f"{WNA16_SUPPORTED_BITS}",
-            )
+        assert weight_quant.actorder != "group"
+        assert weight_quant.symmetric, (
+            "Only symmetric quantization is supported for MoE"
+        )
 
     def create_weights(
         self,
@@ -2065,28 +2069,33 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
 
     def __init__(
         self,
-        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs,
         moe: FusedMoEConfig,
         layer_name: str | None = None,
     ):
         super().__init__(moe)
         self.has_bias = self.moe.has_bias
-        self.quant_config = quant_config
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
 
         # Validate scheme: weights=W4 (channel or group),
         # activations=dynamic TOKEN (A8)
-        wq = self.quant_config.target_scheme_map["Linear"].get("weights")
-        aq = self.quant_config.target_scheme_map["Linear"].get("input_activations")
 
         # Must be dynamic per-token activations
-        if aq.strategy != QuantizationStrategy.TOKEN or not aq.dynamic:
+        if (
+            input_quant.strategy != QuantizationStrategy.TOKEN
+            or not input_quant.dynamic
+        ):
             raise ValueError(
                 "W4A8-int MoE needs dynamic per-token activation quantization."
             )
 
         # Weight can be channel-wise (group_size=None) or group-wise
-        self.group_size = wq.group_size if (wq.group_size is not None) else -1
-        if wq.num_bits != 4:
+        self.group_size = (
+            weight_quant.group_size if (weight_quant.group_size is not None) else -1
+        )
+        if weight_quant.num_bits != 4:
             raise ValueError("This method only supports 4-bit weights (num_bits=4).")
 
         # CPU only
-- 
GitLab


From 78f4bb0ba8c70f8876311cc414938d3d68997fc6 Mon Sep 17 00:00:00 2001
From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Date: Wed, 3 Dec 2025 11:36:58 +0000
Subject: [PATCH 042/258] [DOC] Add Arm to list of compute resouces providers
 (#29894)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
---
 README.md                  | 1 +
 docs/community/sponsors.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index abbb63158..5c040fe4a 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,7 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
+- Arm
 - AWS
 - Crusoe Cloud
 - Databricks
diff --git a/docs/community/sponsors.md b/docs/community/sponsors.md
index 8abb07caa..fd1c82376 100644
--- a/docs/community/sponsors.md
+++ b/docs/community/sponsors.md
@@ -18,6 +18,7 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
+- Arm
 - AWS
 - Crusoe Cloud
 - Databricks
-- 
GitLab


From f5d3d93c40417c296c20dc301100e55708a17f3f Mon Sep 17 00:00:00 2001
From: Amr Mahdi <amramahdi@gmail.com>
Date: Wed, 3 Dec 2025 03:41:53 -0800
Subject: [PATCH 043/258] [docker] Build CUDA kernels in separate Docker stage
 for faster rebuilds (#29452)

Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
---
 docker/Dockerfile                             |  66 +++++++++++++++---
 .../dockerfile-stages-dependency.png          | Bin 149377 -> 177867 bytes
 setup.py                                      |  14 +++-
 vllm/envs.py                                  |   5 ++
 4 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 006481b23..8bcd7f118 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -150,8 +150,8 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
 
-#################### WHEEL BUILD IMAGE ####################
-FROM base AS build
+#################### CSRC BUILD IMAGE ####################
+FROM base AS csrc-build
 ARG TARGETPLATFORM
 
 ARG PIP_INDEX_URL UV_INDEX_URL
@@ -172,10 +172,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+WORKDIR /workspace
+
+COPY pyproject.toml setup.py CMakeLists.txt ./
+COPY cmake cmake/
+COPY csrc csrc/
+COPY vllm/envs.py vllm/envs.py
+COPY vllm/__init__.py vllm/__init__.py
 
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -195,9 +198,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 ARG VLLM_USE_PRECOMPILED=""
 ARG VLLM_MAIN_CUDA_VERSION=""
 
+# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
+ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
+
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
         && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
@@ -223,7 +228,6 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git  \
     if [ "$USE_SCCACHE" != "1" ]; then \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
@@ -232,6 +236,52 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
+#################### CSRC BUILD IMAGE ####################
+
+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+ARG TARGETPLATFORM
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
+# install build dependencies
+COPY requirements/build.txt requirements/build.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+WORKDIR /workspace
+
+COPY --from=csrc-build /workspace/dist /precompiled-wheels
+
+COPY . .
+
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+
+ARG vllm_target_device="cuda"
+ENV VLLM_TARGET_DEVICE=${vllm_target_device}
+
+# Skip adding +precompiled suffix to version (preserves git-derived version)
+ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "${vllm_target_device}" = "cuda" ]; then \
+        export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
+    fi && \
+    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
 
 # Install DeepGEMM from source
 ARG DEEPGEMM_GIT_REF
diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
index b327eb2151f50e4d682fe533fa57e12f0b6118c2..7420ca4d89441e6dd320657092aaf3e1c0491e9c 100644
GIT binary patch
literal 177867
zcmbq+c_5W**Z!m4>YT=NP9sgHq9PHI%$nO2GLxaf5Q;J+GIcuDDUI8dS%b)&Aw!`_
zX3Cf%8W2L6$?&_@raJHY{rTIk&Zlh8zMuPE_gdF=t?Pci^Qy`_W=!Ro$}r3f*3PYJ
z4D*`_!%RH(%VhkEI$z=({GZ7Oly+=oM(KZ{8R4D`^C!dFx=G_i;OAP0>zWN|67_Y#
zT$}#pI{b^pVVz$tOp<;j{`)^WR6Spv6x=Dc;iA(YhaWGoR6Z<n=px?+!F|yY5hnzn
zz5PvO^6!%--QMZ9-_5N*?aHk`>zR`aI=-d%diP|u8k7vi#Ay4(_ql4_&=H(B5rg>m
z>rqwE-aYOA`S(B3_dVC{O#k11vkm3{Kfk@`Q2zhEZQT@q(f|Fo!c+erm%Eg|m~%aM
zzPgWjp6jh@Qg>Znx6E_p?)=Zyi3N*)-FLn;-M%&HaJ6W`+7x}g6f1-FjXwr3<TgH<
zc!kKQsIKlZm8N&+mKGNXX9a2-I`S=b{<10UAeY#JBX8~(O<s8T>ZJ{@S4+329=u=Y
z`c2Hd^mfg?AD42yIyL*iJ)QR7IR8>^ooH54EI&F>Qgh<NCC4w1r>sBr{+Z2=3H^_C
zO#Ms}x{9UEvW9z=N`hs}QjdMW558Zh|Hejbb&3gwX|#Cq_pfO!kx@29;#D7q`|3hI
z_*cg44Ob6W);00gEKM*eNILZLz}K_?ImFyi4)SMtu57q<MJ}I)f9%8aqGJYc?jNyz
zwf|hh^@Er56xYNgU046{IYag1i}NUDcaW60ufcI$q4ZBTF?lyCGaG#N3O#5~w{+_M
zbWO-0{o@Y#!99Lz6U6kFx^(Y3J7b~Grqh#6zSKYFmOg$sX1G8wao1(p-dBOnoq6W9
z>GpW%_q!20F+RbnZ|X*RQ|KK7{U5F<+0LCix7xnSz_$D4j_|!HR(GSL1rKg^U+Qz<
z-t}c$&VG!Hip_s?YT|m!%IMNetr)$A&xs|<9d9$#_w3p8-0S<*nHlWy$Z>8yJxeZk
z&yL03Q<j_v6|0Kab8C^^i%pWl-DQc-c`Us|7Bj5Qmphg=Jeek37&1E4v@1j|L#!@L
z$yfhmTXOS@8(X-`xV~RjY-5v?{!Gr_(uRBAB#sQWR$1i6R~epc=&3R)!LY-ZO;nJ5
zdH=|pp|4lm3Z*+9dOn>gEqG@0GFB&I$n3)<iDFG{1I@<2zh9D>;W0i^`F#}?6$5S7
zZUIIkoxXlDjy7yzw=XA~f*cxq-&mJOwMMJs*9!A61OBbC4}`Xyn_1-48sp3BCvtH6
zWx2smpR-0s{G{wH{AT~0<>_Y-dV&tWa6WEcR*C$mOomfW{=qA<TK>}Y?h|bqytVH{
zMn+!DV|0Ei3s)(*8y@UmAoM`c$fY|aZ)Jcn*3!J}&ThY-lOXuAIA*oO*hk=sa^D2y
zhnrQDW^nt9nX{FIGsGm`p15^(I-_*`&hCIlwvtd@twUXq&)U5!S!ZW04UW10TA(BI
zo7KOsSUoj6Ziz1^OFWwO<hsbggb$ac8{1Oql-beoPo}2f{R!Fg_IgQjGxj}=0oHd;
z_O&~X4)=NTw!Ka+KxF&(RPK#?`1r@9`s`}b@{%4~H;<CrEYH$gJFcj0IeWbAb;G@!
zC1qE+LQj3d+6Fr|`q{OV^XK_qI&u81W7E*Lx=|t9A9=`~&-44@{D!@?JH@KDvs6i#
zkM%mmI>ao*WkYLRmaCs9SNyu<BX2frIXj)zR2<~VyVSLRKT?a&*AF?q81ne*5qrnj
z_H|$Cj$l4LQ)>0ovon;~NtRVYvt+wfwG8YlL|0ARw@2rAwMo~biL;Dr?W=aP9}YAI
zRNI!wsv8&>G#Y=u7^5_A_t=$P8KKPn_ce}9K`b_MKoervtWxLTCDHH+`}XMVKQp(p
z{`4$UZ%r*0TV~*+r(}bddVU}k8_sKAp4#ep9@#eM#{Txwu`_Q~VwtjZSbDk?b|TGu
zX~xMuo16*zL>$}GniAUXzcGD&k(+<cRL%})3O?y0J#YMC%p-Pv%>|oP<Ja=ra71sW
zZ1)QdZS7So*S-gJ=DEz5iHW$MCPW2`J;x(^k>kf}7?jF?jcB}HC`|0F8y&9Wmb5Wu
z8-Hs{Dd}(<{U*9<Lg*q2_M!Llvv#tb2b;qsNBTb$1dc^GK6ziB>c8*k-N|=n<|iO+
z-MMpTWs$#l0p-Bnwl}^QwFz!E<o3aw%iQv=&TI<xX3U_U`j7oyZuze;dd?pn$oBX5
z=greK3kq8{@zJcB3u|JpD+#6D7HfZ#Vr^OyZ18;R_o-ldjHqrFEaq$!_cN{6b}Y#g
zdhnUetBbtS^?afA?LD`)n<!6GaCw1bBhyv5p2ZHZO6SL&8u`usIfT7GhcK<K1JhXA
zJNEeupIiUarHXHc2YZxu@7~Q__T!_=t;ch#*t!zgKK)>q-fCV}xN=aZhtbf@!t+do
z3YDp$kE`Ta?2C&eMf>X9-2VNMMn8VtZKNwi=}p!^k+s^McX5AlMYJJD3m-9Z>8UW;
z^&`uuX3f2T5?8Z~GXkBBE0qJU(@ygc&fM8r8M7Y6LJavDH8H{rvlNtEWhmW|(Pt<i
zAkf+Vpu_Z%c;#vb!SBO&A8gdWvV)Th4k;`-QGd$$YxaB=d*aNE3R@;KjS)9*E<d-_
z^#ogZq1g0A8{=_5-(M}O#`grzRg@v``+cttUJ+f%6`I*v5v6VZt+j8~_jfWQD*s-c
z^r*XcsSJ8y$A+5}_;5>D>GR-EyAD%+u<cDsN$#~3RE;k#m6y}h)NHss27G3`Mm6ai
zY>hKRIrbAXS3|AUYv0Kz1tOzzfQ4|Dvy^>{m$#PQ-I{BR``Nad?G4JDK|Ff$wOG2t
zc&Ps?iyd_QHH&WTF6v!cXZ{*0ZV$ganO1!)XEA^Z=lwhvs}s$d&dfK=4c&CQ6E}v$
zZJ5^-;%10J&M2O`Ci?0niMJacd`?iJ+B|cyl$Z1$%!^0kwfu#`!=cWCu(skLX<w0F
z8Kn!mH&owF8tyC*wj4)d(;f#{XZl6OWlE*g*eJ<iSK8!o9s2V8(n8;JOPp3yhEm4A
zMyECUOMVJHZPWkmT)4r5;|-UmFk;O2fcY&hbI`*4<kJ}vf(QR#eC8Ns895KjjRlHe
z>f_T>nyAdqR*|`evSev=Lp8T0DlS+#NXq2&j14;<9B<=57eKPz{$l*<2i*BelFZ9Q
zfv0xi?*zIGT?ZN-Vu}b~m=~Sw$nbR?9&8A-EzWZru5xQAS6AtApUs3SivdF8b?DEx
zcu(o^b2As`U;mFgUv^c;{1rPc)8Oc(V<Sr`u`Q{Ls&kdUb!D_JCGPgx2jM6&y5?ct
z`Ch+;zFR1AVC&==JYEEDa<4ACi)e|~O|+rX9Q&X>Klo(3Ut+OTfrAC(-VvrcWZEjr
zxxCL#f=WxAzdYv6Lk&_x=rnF+1b_9HunxYf9&U<z&RrAtP;DG{#y?wtyK)_FHEOyn
z-5w#H_13AiA>!8M{7q99djaZr^DllhtLoFsTefY<RgLMbu?47<ie28`-q@e6o*2Oz
zB)ByPH#7TwwYA!oHk&hu_@lc5H;S;-_UyUVSQOwZIRlFympOf~!euOoucvTLoJ>hD
znMZV}KS#be$l<LYO8APL#SVce<0c9D0#P3e*OwGhu*;8j$(vWj8nAC8oz6FMK1Si4
zv*cuSAwQQp-}&)sIpdiZzdD<j;GclNP>-;)$s>0zlnh(vpkvRu8!o5pjL95qvCZf`
zW0KKR8E|kINI@vcHvUlZp_j{9odZoFzLH~MU+$)UXIGxwx6htiW=isgxb*6>jkD)!
z7rJvz>_q68UXkt9i3rJNz6`HTE?1Kr{B)}e8Rd@K=!mXUbC@SD7P}DpEU<34-2uzS
zZ>z}cW7Y3X<4nzrt@m_x{bBTlYu_p7irl8CZ(~Jdomt1jF6H~LEuw0Gufoy@6*^+d
zZC-4eVjet8Rb_2Pcd2Agxq8=Y%Q%zppyP_24NbS20&SHELO2h7%`s8_g(-I!Nt(j>
z7_mcBmO39rhHS3*4KR0AwTZ7m>CJ5(pMgYx*IE`;WsHt|3u|q!imO|m*IBvO+PnBz
zK7Y6%+m`eDjMp1=6<P1K_g^iK%jnf(-$sT?`_}%T>x$dZiZ#*kn*pWP$ye&z7Rd~K
zd79S$-lGgl#hdrxihM9(^o+i@Yiy;MP`NU%v9f~jsMIxUpo)ZbvgxaJXacHP>%hzT
zIQPkxFG4pBG^@DzpP#*IB}>Zg)7AY?&xV<m1e+*Ne)N~XoT-xU{+=yroO_j^3rp1I
zW3F9uDW7mgw;F-E9`Ere00Kn0)cerV(jv+?_Q{7{wIgo3TPoH7Q!26TI^N0(Km4*8
zAE63tQ@#z6)iu;rVm91UrNX|rSb76-DxgL~{8Yi^_BfGz^haY~)~E}IYgtwb$~txD
zTDx`yeCJAsj-H*)%?3UfPBbl2WPg6pk;%RB=stgOOD$YRa5`Z|V860t%PQWyzki*b
z05shglk!#oz%MlO<&Q}!1B{=}Ey@ezS412O5Y`7dm0k*`J877hxsvYYGEZJ)$U|T;
zSz%yTG)Yg&BQ>LB`s+_uTRmuF^L}`KNeS3d)U@zA5FXPqd%R9@uEj9*qu)?g%c8X7
zb_L5wvJ$YiO$ct~ZD&H>F=AT@wO|bckg|DMog?2m!nNZJ3G@SbGCgPHXZ$F{G%_}(
z8=OdP;LFolp1g04q!?fb_r)yABMO1>y-=A%PkgDTG|plJy$P)fK5>8`PtTe0WmBC+
zH;vt|zv1<vThXcSFPPV)9`i9cUTyd3#WJK*2cfK?P9(J{UD4~0yiTe)`g8^X)Ur&M
zZ(yCvFD;Z%4%l#{6w}+?_C}&arfU~sKWP1xC9+bhg6z~PjogL-2LyKqY$!Ur(CT{K
z=t!R<W-_bmRmxkZHD<w%rs;OgY)vg)K2!k~lBO2$M)#+eE9gfQ3{4lOy$5A0IyGg<
zbrd?<59Tj#Zo|gO=x(o-2FV#ZP^4BTgDpBx5$Ea+UM5i#AfXba71Q})>FCCDi)?lV
zpLl=XCTC2T&^a{N)9MR$$;;9Le;&enc9KpIIMqG-28NjiQl8jds<etV(j6|}fX`WL
zD}ep9dd~dkDMFmcUj5+mV(CUy;GG!n%H94N?HdcjDT@L;2X`#JZ=F(mToSdW41b?9
z^^D%PB|yH5Y;u=zBYI_8qwhWEAskDbJWMyyG`Y7Hsiztgqwz#db9hLwZL#DkhnizM
zX3F)~?+TK#zuc;2=-43Hcpa!idE>jkX%Ud${LG38T&)T`x|!Z7cI35PaBEXeRb2{|
z$=jaF`A{)}-ykBSuSmZCXN3)NQPEc=hxrC}d#JigE|JC}+qJ#!+1n+1vbRvSucj+z
zAh}Y{^78Jr&o3_yz?LZfy2v2iE)OY5ylRML^PJm^s)OLArlu7)QPmo$V(pfw?G9?4
zDfKCIsAP0Vr5NNj&~0RZ#m3Ya4|X?q$xLZz8X3z8Mo84Y3Du?n#5oW=#UQ5}a!MST
zL$9pTo9`?-`iP6gCfy+}YsBewlDT@L2BX0JQ-1ogrRUSFE2gb6DMI4LIu5n=wqV(&
zh!;OeTT?mke3{PN6$1^zZn{#9=T`;imbmsN0`(?n#oX9(?#_-YORxlXfTA{qB00vd
z;`jS)Jazz@&wAIP{^?6(Hp&m=sjzu(ZNHR{bR9s1@06`PuE<%O*H`yjVV+MPMj1<k
z8lbMO?uj(fh~!Z6#yTrVIJ1Ab=PH%rX~GY$sw+!|mAmhM_RsHBrkbkf)PED0=P{5i
z?0OAvmEEz}ULzQDL@8kQDrF8x3YI<TQ)vLOP5W!TZDZj!c339Ss&t1A!`6&C#0LSF
zrmhnC5<tOVqNL{*AJ?&Z`1Lud0VcbqaZ-$`1Tta%<0(r6ksN;VmvZEi5M&l<RwrVt
zQ6g(#H_?EGJL7rQ>oIneHw_|KF^d>^n6F_z*OWz90X-#Yr4kB+0svn5v2QJ7(_11t
zqN~DHLevu7v0;RvlbPJFu_+)R42+^{mDW%ivp0pZqGWsOhs?xEV6!0LKqpNtEuUT&
zpnS=XSGSa+b_E`PZSL4pp-t)KBKP{0zDBN&P!~jV{EeKyZo~S8G|*P#P-iMTgS~A5
z_LaJ;uvd3?H5Qen*)|0_ymjQ3vOA=gbV#JGqp7c_Dr-c1mR$cn%7#M&C2qxc0<8Rj
zhzRdjAAfVimdYUT<Jy>|Vuw_b5c*;`>8g1+9VEE1wPxa!nSzTGL7a+ghrtyzT0;ty
zrQ5_t=MTy*avS<e%vtG!hfw-zi8^Q7`@n9L4A<%<<<8rC_!<c|6DLp4g>+>Lxlyom
z*I4fN$j6-e<NlELzdm2sge@vyyPU~y1~M<fpAg<ONE=<C@II2iGpfqtK>N`FEx~z-
zeQ!^i<$Lk7*huO%xCQgqNzM9KBoUm6*y2UU>IffW8N$Tgx{VHJbq&qeKe`D!s7sc;
z3c2av%3as50=o$kk4!-Hnqea<LmBL9H5zq_HBy#_RzN7r=eZj|Y2si@R7sBvII4N^
z#t#=eypL@fYLXw#i*xOX`i!h&YtVI0erQ>cw1dzk6!j+$aYYZAy^=E)ZV-e*cEtQ4
zbfu5aW~&6Y4o;jUvkq0xn)Xii?_;!V-wl84Jb<-_yew?DPT#B{zDqpf0!X#_=4a+=
z<?{LJtMP<5wcO>O=V7wzKaAIRaVGxy3=qL?uOME^X^JY^{X#B--F_d2z!P;WWk4S~
zq7%#2luWSn0atF<ZNS#<ZmkmDdnCDHQD5Jumml`p0suox)mP`SM$W;OcNHzMUF_Qb
zuC6-bz<$cyS+;IBR3nl}M$pzjmVNWGrk1b14J$_26IIS3I4961pWj1t3?4jMZi{Gc
zLKRkGliFAfsTUH3``|uEy9E>vyaR|wZ|wUF@K<-6@1SE|qGeph{mN7n_%8dJ4Q~#6
zf`7SLW@<)hB|pdR?v)8CK8l(irXyIK{PJlH61><BloI_u>*NY8%nRi*^LD5Uh1fTW
z9|khsA~x*WYaCB?u8lygG>%EG^lwVHs!gwO>aflnXe=);7tB9ovfjR>oY=HjY-FT=
z+{z^j565q!vb*j2n4Z7yUBJ5in}q;;c_Ak6?tD-;97{@q@YFM#iviC|`!B99zTVU)
z0+P!bSE+R_QW`})DNVh3^FU@u64g+%#zH@g)4VRz#XJ*}u6yv3xZ(#;Pf02&s5hQS
zQD^4M<w52P#@HuGnSya8Kc7~5=pD34#d|u1C}EijO@T%Wty14P$3)>xJ;5r-ny3$h
z%kwm>Q2(7xi~Kcttm7X~wmm#qbQKDwp2oN$Kkqz#Zy8txi@gZztBV-Or=>B5(9JKK
zTVEtmS4i?Gn3{s*_cn#c%YS}fp=i{i$fnc>k<%{7LQ6yABfzW@Az52e@*vqKzQ3@>
z6vAW)Y1+gD?4?xL(MXTeoD09adun+J8=+0ANk`}P{Ug$KmRWDTma$OBz0WKTH#~ua
zhd1b1#tg{a2eY0gJ*G52T5b>W;d38^)ZjTUQ?8w}$aa_cH|$Fx(AZG@ajNT{eF$Pg
z^7FNO@9VF&2Uf_DLG7zP;IY=CNCk}#h-Y=S^{1xXLCd1rA}YoPA7_SAT-PcSqnE4~
z+LL`iB*b;lJSHvOu#5T=fGLidv>hEQE#5vHa1KeqOjq_JsQ1=?R{AS_s7vo^-x$XX
ztqBIPLD5;rVlO`aYMK1V*DIWOj{s>1J12+8S_cfg2AOT&vF|Ko-YqQD!^#s#I);am
zC8~`*Wxjn*q%>;#@!Ag4FmJ8M<SZS*Mikm2((i#n)*wgMO$fOKlszrU6`2f>=dO6l
zy12Ah0fENRTWssbU>V2GCzsWQcln8iRi;9^?s7S7V$$&B@)*>M&#ggH3~PfbAAOO<
zM3g}038ZYq5{2l3dY?-BF?;vV5RQzrxReD>b;kAbk^9vvwGHewp?`XU5j8-#EEb50
zR<kT3$(m3*HlF^crqOhSUl`=Q<i>f~;=(iQ3Tr|4MEP`+G93*dvJ0z^r3v?Kt*F2%
zohBceu_3)d&3lM1XuV#73dHTnFb&^ZXM)>_=f!-5iZHJ_4SS9XtNzi)lSl=(tuk<E
z$dMm8X6^<)8zSc-y8p@F+x+*ZHTXn{Ygbvm1^!hRoYyokh~|QHiQcr*Ki>zWA>6Ac
z70VGa&^wge(h@NG5W3yQx*Vu6_WN_ju$d0;_Dek8DRr*WtWv0yX9lZhF*8bh_FDNh
z_!`NvI$vy=1=N<#nX{kUz7j#Mp{eQp976cJ3F4FcGX>VHA$l`?shkWTg}u3J=f^y5
znG;`@%l54dIILzVbKu!O+nh2deFY9F?tAS}=aN#H3f^U_uf2MX#U-=iK*^HV9tn~;
ze+@R;Us@z-lX=D9NALv_Y%1+VsOn>X2>yx>O9Wb3IncR&1SlZ<e?~_?F52+M>p3;z
zuqm5bU*TP=(4)Z@6(w`m`bVdrR#dxwYirWIg?!gsm*p0dh6OD$>hEZd66SMmfD(7T
z;~<J_1NI4cu+{z?MfStq*Opz$R;9OD$^N340Tb}pkPojB4HY~XV^deOQ3#LbVN@7h
zi@RZpAGD2J#4-Q=aR;0FhR5WX)hSf4lQOWE#)LX_79)~|d!S9c_+mb;)p=C^zGKlI
zOJ{6{lB#3)uaWsTGx=}TDZN>Q7BtP!IfxNAL_e{yHMOG^y@XwqrKlW0vk^J?{31)V
zsnuYCA*9ud3ni?ATS?tQ;w?Q<=hC-(TXvwlsV<Z@%S&|t4js|pD4({yU2t#{@wStn
z@9i+hF8c_YXdIm50wOq$N*JY!xTEBxjK`A~dT~@n#FF>@EZ3!643FG6=iMiQ`c<2N
z-jdgIf(6g7EFBHXA&oZ;$&<|s$d?B}Z6HQ`soT)8C4{VkM*&i!#771_7tK2}S6g3X
z$5IzN^P%@1!i_i1E%9$U2JGu<{=lK*tq1R3dwYB0rycqTd0S}zKF<L@o6EvW^sC~K
zhyeyz=0hOJu4coQv1u!_4;+)5ExaUWshcylmAUidN%IT4IHLYrgE9H$Q4PTy7Mp6V
z{NT$g`GJ$&`5sd{?)69uw9TP)KdOW-1kJT6HSBq7^>|rwqeESvF>pUQm%&9<RpsR$
z|5<T4Hw*L0%NiXSwjBO6+UnHT9;6~aY!}-U40WOyU4YUPLH3n-_up83a}hSl?CGeJ
zN4eT>?wq^qoL8)wax8dz4dRlchwr<)Xi3j^FTgeN%MsDhg%p=GJ+YPmr%4Y{al^i%
z%Ml2AT5RuvB%BB+>hrv>#<oO1t|zT2xM16S120syfs?n5>^9Uncj;XcTjJ9FQtuLf
zW;4=L34Z36pP&C7riB(0Y~(esPB4ivtzgzk0MiQ{Q3>355Bc<G=3IT6069KJMK9UH
z`(we<3~SVf0W^Dk5P!xHTB+&Mp82gEY*SpK)=r)M5h%A93o2EW4rF4w0i0GJLFkB%
z$a>(pLTH=l;uTYIA-?7K>Bc!WFL-^)42ha}9oL<8(28r2I&|HJdpvm&<ayf{9`OS8
zTDM~R#a!SaQ3NfC0@Vm=sxd2B&3f~^(LR)dBt!y@&68hOeI$}Ysxw9SVkt4A!Dy~&
z>(zm}>+N3K*hnNHBaWJ-m|`y|R4b@%nidE80gwwr6gC8X315WH1!a)=6@0GN#IKKo
z+n1(V*JV~?jO!1-iYKN=+7fb*EwsQpsFk{wG0CK?`-+Hg_`##001$k(mt5RLcgKN2
z*)6C`{rmaRIXjTHWsKT6ES5fAxX}9HHB{%)`Ts<WzIE;@A{pe0{7An;Y!qUG8doUW
z(sSn~pA;$ylw42KO+9_;Eq2+z#V&0?7efWj%dF%MC}E1w%NwJLMRFE-LZ7zDtXyg2
zn$`yBPP(g_x_VcT<!Ecx$eJo6Hy0Fxw1HUpRm5(Vom(WD<AR1%!FOf}0{mXz5-F0f
z7Hx2BKhFWF4)BG5w;3nH)6<<|ErCKjX;q2-b&|4o+0qcCHyU`3bbi80Moml5kMS{b
z9f*c<Jb*XJmh@mA9h`bU55x?h&Nn~=C<ZI+56U4doQ7Mot<cnN02PNjf*3aXZdJ|=
zW?yYaIWt#(E&{(sIvVL*L*P6D4vQF{-;8SIH@rRZ0k!I3%`rsuhaWu~M&5X_lpWyc
z77{;eT>5Gq-=AMasvb$~pn5yM^|mDfT5xtcfSh;^kJ8#?4Qhfea{6?GTh95!<e7`E
zlk7$jPU1x2b&(Ym!&h9pZ&6{W2KFRD@=ij6jfl|^>WAn<(;x;y#PK7!P{5EyL1ExC
z;f;z;P$adCAPmVm2lC1x!x4~5L#LM0p+KcBMtTe6)dkSxj1E=12=F>n!QQPW$|%gT
zDwbBJ6B;BE-GB;Edm=P{V|-j80WWCCf)|wZ4naWmA?ysoVLUQemDTt)XK4wL549FH
z9C>X_<_1)L?~LBpQG%m#V+f*L^M>kb2dZr?X`5k1@R1&?bM{TFCdPSb`%=zY4E^R<
zD#_HH`!<Cc`tzr>!};0RfBuiNsOJ$8_{}%2;tMT{w3=h{(ma)=?%HJ#X~ar0*H?U(
zS?i}n(<lGXNfk1}#v~4fiT4Uc3wT**+_g-(&lINctKZ>eK{MFXI3|YEhxc6h-qzR(
z1(D4wE-ucoS(GvHx37b;vPU0W`7zQjZa-R2diM|Oz77U$0+frt9T6K_OkI)k%zwR^
z?|Z}YAGku2lVkyd(6HRc62Ujv<h;opE5@!YIB-WLBp98Vk60o{vLB$v+=fy}{Dn+I
zO@B=j%0+KzencaZmkl~_#_0VDRvd<Q3!v;7Yw+L>au56tW{M%3xEO^kKicdsq1u)9
zqA$*YPyCs#S^b0K%^~+}GtdN;8>%}|+0f3_@ppqSN2#mMxfZ?2I$eBlM<Gffr#nOX
z<o3&p{4j?f@!ni?J+7h@iG>H+C=w(LbRBF4UWZ1B2;xPITaiXt19TeZPu3T}X^xtO
z7U3>mA%g^{jt%Ir=i+`cHJeZzgbZ@B=Nsw!$KfKaJ-y9kW?XOgpo^FKW6JT(zD?gz
z(u*(MD4*E%+UR1B^0Io-H6;rKr&GJjk4;br^2cEwaSJa1gm5&Cn$al6b{47fhr4Ds
z6l+l+9}N&a@=3+-e<}*E7VHdWU|N8$M_oHTLwH#Pkx8>eHXJ^XW{LrM{TPtc@z2)*
z@c0Hgo9CcKPL#Q02~r_13v-HmTM#(r=&C>0j243m@cTWT=pMf!R2B{Y(MG5Nz5WMd
zXx*Xn319jKof?d6{hPyyyOHk`5WL2oyEo*aqvUgH;*>jps4O-xtT4J#ul3hsidL;R
zE8fK>HH<pTiDZO;;00Gzp-sVBU(k*Q$byAf%)X#1>3xsfOYH-pjE4=N;-?vQ5ZmpL
z09%>&`;SN^3748)Y-bdz<Y3pCOdVoksY(BrcPuvg%i$oHCdy;GT)}|)1dyo@Dt<s3
zxowo?YHLu3D!vFPGVCi99a@F2dt0eZOMTt2uc#cOL6SnL^?zaAO*h?~$S7ot3z7;G
z$%|6j3sAL}n6LvOU-TB(!dXYhu^PJ#sIVBnsrZ2?I8c0@V<9z%G&GzCW+%dTgE$}B
zYbl3LncfFLdl$2!Fdsbu!lXmbCo}F&aeLCnsp6N<fMEy>6$=Xs8zA{6EP;xNi3yB<
zs<Vb6DDxkIGT6>BLd4f|lx7IKBp=5NAR|cvUYZ737=fjQtvt?B)7A9@HWUT3k5R8h
z<sqHlJ`!1SodiWc>W?0eaAKh-b>5^(K9C)VDj<@!Hk)@8omn#bp?a<i69-|Bv@ZlP
z4uTrjEWJ>U)IUo2^SVVDCe*NWtcMlK#Yy?tX{OZCL(vQ(LkE`ZQ3@eLGW4){7amF4
zLj@1k8iwJCr08+`_NGqMB;TtLSb?<YLcjH0q_IPV>wjy%1ACk-YIXj^Hd<@E-X)bO
zlLaLv4N$i{jsGc0?9@bJq2=uj1<S0fNTQ_v@#$6#2gpCfeOfdy$vT}GAlBq`Gbbbx
z4qD%3{~0|Y9gF;RKztAJ(`xiVl)g!08;Muytbi`4$gTmv;TE^hP==R@+CrCj2F~-q
zJR30jeiCKNL;J`eG@VCmHOpq8!50RHM=AR7RE4zxaS*9uRE=cNY~d)(pvum@)n)@=
z1O!dU^k*OW4MErh)7@z05UiuE<z6>XjW2iA9ljGEU)bh6?En;jur|mSGA!cSQdP3B
z@04Y^jV4)C2%rHJR<+`a3>!c)y|{;qap!ybL;HcW1CcuUIdwf_?o20(6*{8iP6Kh5
zWr+e7c7Lv34@|Ax*IwII7BV764Twxl2>Cn{zVE^71`f626kfD#WV&B$A*K61S}5>o
zg$ADhV74vWEy2fvsWk}ZA7a3a`(VAW+kjGY$y6g<Oa_+i1_H3o-?Ac76WCQZJUuGr
zzFsWQm7Xmss3J0Oi-rV?+zT``@W(YpG4L4J$3A$ueIYeE{X|U^$<Bk2Gy&3zv8}~-
zd@OSRP~&)@Gvg2{Pf*L{1tQC5$qyaX(lZYuJ6~88Qjd01A;Ua+A^}^j-UlLLRp?*$
zMT#H){K~qhE%n1?M_9Yc!0CoMMn^!&;jkHiIWL&_6LE9sS~f^6O`)e}M#7_nsLy;c
z9`&avyOC3h`fDsU+-z%+&8cPA;!&m|FN@vAVgn`JaqMd3Br3sw$hJtmbvc9uH5^C{
zCIJ$y;|I$2B$Zef?FPn!WT3olTXtXUw*sRPQ8L>AD_lc@55uk7pVv)*$)*W3PiJ*<
zra6+N9<Ti<T83-c>BJlp$ohs}jc|sKp;I$Ju~dZ%>M=E*oGl>_gp>f%=!L)LObqD9
z87)f`8r#4+<oHems3|OqoJly@o))e(@f@-lb$npKJWL%neftW{YUHtbdO{4ddj9ud
z359d&Pg4Gg7!dv8v@^$zdnOy8&r%6yvC-OVgeYy#ae_^20j|$O*Gb#b3VRHT4vT^r
zbdv(=`M@RTJCYJB|JnNO!!*;Q9y(BS%TlcC&KULus8kQ<qiXAfrxUlq#Ck0q{<Om!
zZ3Q0jPE9TB`UR+57ME0HzCP2pd%4ZyqdrwLie64zW^|-ul;f&FvQ%p=2`3&0O9u!0
zK*-SNm4wId*jI^a4ltUbiiAAa*AZfv;l#%x?~EBb*Sp|4D+rwKZUgS^ba~@@4wlLz
z|DYfeEKGOE7F7UaqUeh2#gP@NSbi(nTFf7yf+9EXM&PPfrB)wKETYb2McT|6XcWpO
zJej)27=Vgc2^phc7pp?L14m*5Um@Z<L~x%06<T}O4u9(_LHT&F4h8JdtWx;1R4qv~
z+2&ddz)FPxyvd{(f;S100|@etG5r*aiY#i3esPxz#_4nDySGY}m?+HzR(}=Qt(B#k
zZx2x!f_E<eBYB{xUB^oq)Arf#JG{^^V#E#;L!gGVW&8ZB{%t`4qIjdEB;_lZZm7GD
z-DDp*#kLIHP{hLi8&$)AT8eLga;cfH{mPOcxH!(ZBBuJ6L8RSn!4ZYBokjo=$jMRr
zz@Y?wrR(U<ugvo;S~hvsl0cX*tnW1<^_{tUvu0dMv>Snr;v2l-7?n~8rGnJw+7C7*
z3A+uOmxvOn8yXt@5wOb73#-sJ8s2M*%ouB13-#g-!U-#L>FjjDYlMHH@ClI=g(gX2
zbW*7zi%k$(**<)CfDajx9f3}|kjKq>KUYs%fzb<!NIA3zl8NK_DA>!qPgx#R4Z~Ge
z-&S31{4w`haxdCQ>mUMxEDp=G61~DI#)(THN(PE?7d?A!*^J?geJ?4XM^Ka-uWp%%
zPJ(TrNOMh^E#RIGc}#>zs6cWyaC-lV+br;%9D=p(3v`%|V5qsYOuw(#ZcJ4XIk`Zp
zqJ|_fq*W7GA#-^wqpzVAwH{fqakJudrC0)0woS#!jew@tfL0=v?a_v8gYzU@jKoHo
z4x|-%$aE3vN3g>Jp>byQ9V1tYP})Z}ZyvmhouKc=XbxX*x;J~}u3QkttGv~44I~0)
z7agm`n8b~AD^XbVKA8CEHcHz9c)2uvl^EM*HPk%bGD}vBMYw>ROC;aGHxj=Qb_8*R
zq#0QTd0E&cXoQ7(NzP@M@}-n7$<>EY`yBstCdZ!zrCNj>r$LtJXH|2r+owS83nH+9
zM!WIud7CW`ogg#$<{PTvJkV|;#Dl@o4I%4@S$rITvrkZ$DB9~_u+EfTNLEFNVSJ@Q
zf3a+n{`zco*+a*!v6fVvEqN!1uibrKLme4M{AtpK{S=Dg+e5c5HKzBrR_#UK-fz8G
z<!gCNacNKR(I>xl$>7!nr;hKwA!Z1$OhDG99mtM_(!3sfnPaggTahc`s0mt{5<42U
zv9xz*7s4*PVl#G(;<jzuNFlK<a_h>eRTxfzjr}8RW4kG1LJ_z7{7M8ampqaa+Yd2J
zyaF+$(q2&f%;eFi5?8h{T!InF)<A*9oc^RdrkVU#YEbl@GKm?$yS1B5DjJ|$heKm4
zHY9=3wDyqy)fA<xdW^E(eta`yab~L%7t-j<k@>W~^;1HzwWuzTDe5AOI@Ck!g+@rp
z32kNbv=3bIS1X`ll19gg9Xii}&&!k?d4(3Ck1^UuXExtZ7N?{~zAz{yJ0T+_pfZt>
z-C*ahr?O2T%oISuFtoJ7uCyN+MOSURF`OI4WCS9y*w9Ga1z6<__QUZ4Y-n#@%C#e3
z&dGnwU2$a>@ShDj9=`rUhjA(D9cJ1Okvs5;+~Dd}A<j00N8rfjML(_p@<a%=Y=HQk
z?~L<3=+pKvA6({TL8>7^<#s|sr*u~aDl=^cDq=$@*hOfN`5?b>0)|R7gycp@t2^0Z
z)^BAbpiDYc){u=4OoPx--sr)O;-`_ZU0GMMW_r{=xoLXfP;6LQN9ev!?^2Be)ja)K
z=J$SIc;tEGeSg!!m#%uISr(<G?EF7=E2+&^RiBkJqh_1_QPyM!gp2-fy2Zmg9GZ%M
zFPPJxAhWcp!ak@=a;aQh`0%nxP2c27NuwDC7SckaCJE;bGQgYIyu!l5IB9b*tE+A$
zR#a7H*JY!H_G9FdVIov#E|H0-uiqCWX}i5tMPjj_Zo*bMIl0NcT+D@np4O_lzyE$?
z(zJOy*^yc?k#pB*MWEc#4;jXx2@qIOSy{`*CgFH{&0X~OBcZepf~{Xvo~U|it$wQN
z=_#`^&0S!@;@NmKlxrKnSn3HUCjkiwZA5`8e(|NGyXVt)FQ6XIS-9|IfvmV?WjTzt
z^k;9o?)}4lC^IohNlA4N8}XiQbb%s4O}AyjwParuB(06@8Vu8Z-y64ISfG~`aNm|g
zwWzBhq%M6<cusb9_BOPuy_YXvZq@o)f%P6r660NJd=`9&^HShTbnG(iX)fDQU0uz-
zjaK-ffzE~$XWL0k=umEPaV0)C$*ISJ#XfiL9BC^f!`~L_)BM$B;9OG+q!o#1`_{_c
z3pX5I7x9aN$#07n#}F9J&&^HUFS~Z_Zs2<H2pp`KGsP-$o356sYAAr&E@Y=mi1k|@
z^MxZ7y#-K8u0iyVK^$t1jErPCD!aO7&J)s)KzGdo))gt2!K7IIwB^>;*5XHAYhWvD
zZ{7Or$PDFvT})%<=*Y<<)d|}WyX0~43J8dTc<~8ypbafS{LqU(v6L#Yv9T#G?HA^*
zp?S0b6p}jG8_k<nQlj$s@narYr=;}bZBB2@xcJU@Kz-9toQ{mjdAaAWmXy@JBGdT~
z`fV#%oqc^(e@&WJo^jIQbO8dv0$mFpnG^94fcaPuN~)fmnO2cvZA3q0+~12Ftygft
z<}7X!w<uRtP;fuIA>foIBj5oA;t;Ua>onVNtYLUW#Ok!v)CEBRH|-N7Dkd=P|EZHl
zh<Ys;?`~n?Dj;4*00w=6@t%EKKr`|a)Db$nWi0A9J$529v;=ddZfty;=86@m9)8(J
z$RH95!vfBd%?SatCp_2k>fSBX<Ol$uqg%w>Z7lI?{!;dl6DCZUvtR)kA|kLT+s(Kb
z)w@U4*E03WUxp_#?uzGcjM*Nrk^F*WPio<4&r3Ku11-U0;*oFG%fh%a*@ug<+Od86
zS!8VSBER)HX_c8SCuu(*ql?7T6|OFtd9Js;HVRGoPx;<!$mw=bd4jv>LD+VjfP;cc
zWMyQ^Z)}<30?w8Wh&@AaA`{wKe@Bmb6jVPR!_WU76C@Y~KCN)&#kkfkl7)7y^yH-^
z-16eaX{5KCfq{!oPvc^?<rNf^Ar;<4_5XDC)O0Y-zcyIa<SI}sFR*B@NlVa+NeBGL
z=Cg!1XBP;*nA9{T-wPY7_KOTQE!egyE<8hE0>&~h(0A+1nKP4p87AK|B+Gheh_peC
znLndOvAC#cHw+aoF*%_S!&vNC!;DC%VFt5gzPN*1-hKE`=HLo0hVANfx@2get0KWf
z2~AumBD(C`NC7PXTovfy38|>4oL+;Zqgu_kYSkJ!Im4!s5QkXGS<TJOleM@Mszu#K
zoZDXA+Y;j3c^*Jm5{B{3Xn=TO)4nvvuy_m&4cWJqe1*96CorppZES1;J8pda`c)(q
zx9_zQN$BW;#bbaJ9BCoK=jOeGIW4eCYi-(e%C@$y&d6=BEJVz?|LDNCZwqyPVZ6(5
z@B$*J5x;<dqPe+wvAt*BZO|^Qj;zs4i;75-eakShUq;9CU?l;I(9l08O`a^UZk;L@
z7gx>U=}gvgVPTc__V(>&xH)sOz)79>5^7Nu8&g=g>lSR@4Udq{?9hB=vC$!j*yb^h
zTOZS`nD;Y-F;+Nz6batl{H-IaO4#k<=f=i6Fy{#f3hqMrivSQY-N(mh9L~ACxD4I;
zxl5M{2Nd0@si|3Ln4%--gb1q$Q<{HzDsJEXra`8&?keR#Rcwe$%3SV>W+?Jf$KL;4
zXqx!f>C>7SPWHE8Kv<oKToDO1A|FdEDJf}VYr9_RS4Q=DTXo_NbZW2h%A(6jje*CM
z^}W`Oo2MD(sO8@KhhIXgyp@`|v7xD{t~)&>Bm}LTD4Yzsi@e_1)8ic%7l&PCs<4*v
zQo(LjhYv6zA;GTmBa0ex1RSwAR<B#%zrP9TrWH*i&+5Y)Lg~EO-0Fq`-(=TeC%+<7
z#}g+O1Wja8C*r716~HndOEXeq4>AzC+PC4YtFA{C$PW$=an<W;D)H?udoc(az=H{1
z-I_VvyLb5AEVrw?XesU;gt~5i6ufCQpC|W+F|5xpsbOGu7AS8tL1^pRZribAjjU|#
zfD<;qsHkX?O@rdZHC&<Zql~gL;XS|k?&-NG0IF=)%%|_J<{1n0+EP&8pF3}!1yZ-(
zy#r^oqIJkw%r7r*1Qe(R>(HfQq&4OFLi#VTrhz4KMp>~)<^?4sS11?Fp?4SGn2tF)
zuzDg>FF_{ufmCeS$v#-l(&wt*OZ_8j%G>>4KZv^yoxp^sAo>%k55K;5@LX}Qtfk!&
zMq?w&uLiC(hleK;0=)*_CH(TnuONhFz%$18bm(hCfF|DZ$Acx9?EJGB%6v)ZQy7Y>
z9ICH6H#fIlqUoj2#~<Kp=Aw;%CRQJ|g9v9*Dr|jcx3cna*G};C6n0?XXXlzbyBAJ`
zLZe{5UK086C8n42a_?NTV#O9nJX)rvXm=%ed3%SUO!5S2@(0Sh8NvgniB=+_^XUAC
z4<CMPZCztzWK<Druov>D+UdzN6qEPkSczVOao!9|WZYvj%|DfeUvYMJR!pYDBYZ$a
zDuA<<D6PCK^w_j<OnB~+C9!Y?zN(jE6pn4gaw7ijAbxp*CNK(qo_>B<5&5}u=0sq+
zC7cIw52I^Wt=fUhXeldSZmEik1GDq_v=1Ov_0F#2>(j^Yg;C8M7#Ki}E%|g&@!jiW
zOT}HgyoyZK4<C+J4wkv=<uwOIG53=g)1Z$$Q&U^(?d#j5ucV@)t)by{^XAQu?d`D$
zG669$4bw#3Q>RXKe)+O4egUqY{kXQ)0D6=rjsZ}$6+isy#V1}yL8Pm%PeKY5TeKC4
zaF0UJhP>t9y#NaC8xFq;1$bzM@Hq5#26I$tLjdhx_U-uib(66hRP(N5DV~1%acojG
z2{&eeroLXf-Ce9HQh1B=5#RtGDZ7YM_KW}<acTirjab6HmBTsf9TH;n@hv7$VD;*q
zn>KBlI&-Gh&Yl0@=9JQC<Vd(@>7wvdJApBFqy2I$5x)7&)X|5~r^716QbfGGwPPV|
z#VErQ?<V0UmDs?DYoaPFIQwv$C-)CnpunvLRE4<&+T%xc$ji$UXxT9UCFyVI@X`~f
za%s#%#i+u18mGK5NqLiuj@RDywz{k=zyrMt_83wwsv~5ZiqvBVZ>b}V)PP0zfI&%y
zq#Svpr4^+~R#$uAO&*=e%%j>jS#DYA<ME$+3S)xPPPDI_5!m%B=L;KpWo2a%{6J6!
z$vCRPqA89t%6J9mS#9UADw+Y+YN}oZzQR>>CLehMa@BCl2olpL5(2}d7;jC4vlyV;
z3?8waY`ej3a}s~CT<A5HLX!zRIdS5|OZg}U5gJIp$xxW7)ZDvwud}O5-NGXBr7q(>
z)vCZ}t%OWnvZXJ9%`C`5ljj1|ID;qOxO(-s7~RC}1R4>T6^Ujlw-T{aciGu-peD-+
z1ZP1m_9vgQ&77XEdNgi&g+7}zp0EJW311;kM8(H{xu01Hw2cwsB)PylMW){#WijDR
z8@_nfn_fga`heXZxZIguC~kWpHmDvy-aDgp%B&?CFgs0;oWN+z!$e-VbV(E2<No35
zo9^z+kyr7LKM69-g>1OZqU49(&0_OLRYbmerE}T?gJl6GV;$~-X(+Pk7I7YqTOSZo
zvD#ST7N&jd6)~Mw<Uwt0fB%WK2I(58aH+R!VbYq5pkF;3fpG_7Ex)K}=_fo^!(H(P
zijl;0<1f9vS~@z{r%s(ZJ(@;G2?o>OjRRI&U})Dunt#bc5)MF*EfR+SsjK(?{d*+K
zm8kPPfU>JsWDa#IH7T5Z@j{iezgT|%y}tx*c6Zy2$1Nav84_wo%R{0eNV}10VTvMi
zCn!{3_4B(65kmzz09YMd%%47&F;PZkq@@HT&b3I6M5i*+Fe4o+kC*B)YG4#Edxl^7
z_19k~&*x(D|3Xi{8OLK@Ak_I-aCjLXg~{5x>|ba?br-;qmzURV)aSKegXrq;-%pg4
zl}$D&n3s&|^{V(k?g`6RuH3F7H*lLb4*&o)$20rDwd@0Xpz?ft_iq1DKp%x_RUMtx
z^XJdMwVp;3`h<V=YE|4}vfHRDOMbY<3CF4@YjZId0YW#@fgZNlzDLYj!-ucxk+1gs
z_Jg4@M<p;Fx3Q1%OcTvP&eEL+Y|iU9!JmWunW3@VY@+*1^q23_cyg;KapRn3!GU|L
zk$#Xdu{qW>Ky$%MH4wS<4*t+gIYC*kc<5+Lc;*FsnB5S<cCv?thP(i$U{0biT9=PY
zRoqoy)zHvz7mPrW#d-;~%h(Q3W8LH}+?=c8KdG}F9UZ6s`s+)mze+=TM3t~H1U&OJ
z-d9avRv4R_TEJqZsHn(B?gwGB>apB~GBoKw|H(aY4_^^T<U_+qy7^~@hMdnTaRE3A
z>clIk$o6=3IilqvW}N%mjKxwKC^#FyAXKZ76><moKj9SBMQ9sGkP{#MLSvXZwG=8m
zAL~QTl`5QJyLf2=!)zyLF#d9Afi|Ol;>3ybhPNQBVB`DZmKD3D7*(G|(#Llle_((w
z#0xOZv$cJ1;iZ{6V+Nt{SkNu61AJ&gSOWm~&l5I`M$1$UZK5*BcGk4DO`ZT>04`bI
z4Gpb7+k=D^34L<1CKvOkiLr468qDcO-~Gkg3>Ux)bj_#7PGG$51BmHD!}aDR%+Alx
zPZcf|e0jk?KgbqSb7{WeiA~570{6Obe&wVr`Z0XR<ruFiAONU5_|nU8N)}ze-ulEa
zB*}7Y$mx-|>C|ehwwP_>uA&h6I>!}P0s;aqFOiLO>S^KDpMn@YiibS7KyUSzc8Fzr
zdQv=~EYwlPAw_2fx9C>3MzVz_A0OXcG>a%;6$?OqyJ7aigA-(By{})7z&vtt&h(#X
zi)nxEy+)g~h(FS}H;Rg?f$Rzb?u9=5)^PPKV!0VP<Q7E5{S->>WoR*qq2;x(5(nG4
zih^bLf@iIVYT%w=2}O|DKMYOgc9<6SZC^tt7P`;AyeRpqEEe%`2~3!3DWy#08G`U^
zy60`%xARag%xC_!nNsds*To{LTcNIIA2&soNj{h}6QMKrjUS`>J0STQY3aQvC!$jL
z)kn=(Q4%Lmy-kw!=5ll2#V7dfT6wFC6HqDda2X1GD7m~W<VSjTPQR`gqp|N>UxzmG
zffuk7%LyIjZWxZnhV#qM;ixWp>-z1;rNz?sb&9c3)ejuF2@ZatK?YGqH5Qq9rG$h8
zm1KoU_i$pU=hMs6`%ltmt11V{x#&ULeu4Ajd@L}kK}d*zgO*3RAvA@BhXa`yN<sVP
z<p+EM_wxoDd~AYTbaCI$4Uv5#K3;X6$2Pn<vo#S-$_RQ+$%06r)D=MCm<YA~`)}Z`
zR(fPF_sG79T$GD^{YhqgL%DCFCIlq3rrqgu|Ik-Hc63;C#&G-2ok&0pq6fTpu@g`U
zr|-qAW<C1>%OSH7?1(jx4w82pxm@xKHZV42LraU^MH9x|$P<~kzP>)7vbFr~En;;@
zYA69M<##=~2)96_J$|?@)4)2T$A^;O`0601&D{zRAUMwhM1+@x1pBhHp`ZscEf1uS
z)V;XfN*p?}|N4F#OAW^|y{=vj#hGJ{PLh8;&pG->?aaOnRz@tbqrcU%JmLn9@?SWG
z6qHYymbjCvtE)tG2-FY}5fLXe)+^F%%_ua<#*eoCEgW>?(HOhj1yPqlJLGsB_%*Zu
zP0;5in;ADMWdwtq8wTMa0_|t^ZB#}3_8MJWlH|sAFQ|mrO$2mNOo0nx3^V*O3MO>t
z%5zWOt&>8U<GkE2{E5<p=D>?H)8@B-YK3_+1|6t7*P}4)J1`?-)C4}$ThY<cd)tq^
zym@BxmMtb50!;4dd~9noxDbYm-d-f@qyYoU8TcP&q&;}B5Hymp+5(79)fR%}!b%u0
zZ^DUuZBr`ae&86=h%C-mk(R}qXDn(CuMl-AWMpJWma2)x;saf%dCXgD6RR7eMJb1>
z_$i9U=>dg5!=b$t3R({V4D3wX*tE1XFptsuB@4t3U4<T~^M@njF0C;JcllPX6yWD)
zt<j1Gk~)6vq=fa`XrwzI8<gMG%6Mo*1*}$~@S-S#gABsS*#oD_LK4`l)Pb9P1a)sh
zPt(BxhikmUJ#lUonBhw^CNdhULAvRzIN7?Jz~_qQV4cuO<p%amT};ai!OE_&keekx
z;}so&??%oSDV)JDLp(qC_ECxZR<)5hKKlt4kK*JnUyhQk16(Pjv$NB+Yr318TU$+<
zI!c;Qo(A*CAit!9Sh`7KwQ*u~5-5B_V`F|!&JIYJA1(oXdZ9`LmEwG97(h~8{E|6y
zZa;gr^)z<squN)QE)QTDn@B(cClEJCO<`2Ic8ACrpiR8T$|{yO56~1Wr3~pzGt1TG
zbSoI#5!@!JZOH%>O?p5vbD`amzKh#r@_+r;ovfNVZCVe^OwEuQ2%_<^peH$j4b0)@
z=4a7Lqwx1dHKnGe9%{(<M#WtF)s#{A3aFA$8D|tnJxb7>?M=w_(i%EC5rC=YkWJaQ
z(MQq7!wAYDXlg>lqjBM7QUP6~CtQA*Fq&Pgqmz>b8j((rn~<N)_ASP7ZjkFx*eNVV
zh6bSY*u~CTBBO)u@9%Ef5Gvnf|BNWtIDiZO<;yo?quIyWx4qVcA1xV)wNZbKprBXw
zftM{6QPz%?=vLpvb9F}7zBHsEJc{$Ue7O=}Au4OQ&6-El_#afeOUj&m<~0$@-ynRU
zq$Pm4NLJO>)`~;t<t%V@VmLI&Q{Iefntqn60T{_3;tsT{r{K*hsAH~pe3&L|!RzAD
z&-L2!Loj6O`*c!|-dMh3h5g8oBa0ms6-5Xg{)#=2$Ryh2k;;)89i9fnyH+A$9(nsv
zcgd0^eBiOG-@qp9MyfET-n;s+7SI-rHB^lsx0q-AekBL0Jz6a3Hu8`}QHpz1hm3dP
z;PtYwXSIL~BFWz&WT5dYGnd*DkZY2yYNfJsaxAf^`NhSV`&H0e=`L{_QKuHM;}yh7
z>g<2rz;XR_J67Do&I_RKfuy3A2TIzOgX|g&zeYZ?0Opm5hAedmxz7Tf;`7}Tn0#N<
zP63jQ9%74uEc;+d;1jwDGPpqJht{-sl}d0qNGUWLL4c__Y1j->xJC?CZxy)aKz~0t
zY?PN0SLhHrRHfM-^Krl-jEFYck`goa_v=E_qXLfG>T+0%kuUt$$Wsp2rfZU%jba*P
z`R?62e@WXr7M0PCd(C9v*7uh^dB1C5oR-f&xdzRVyuv~a<S6R_QxO*QPVccSI`@x1
zz<le}1VK!Z+iVDyVNOl#CNuRyZM6@y!SFhJdws&g!vQO9piPnc^ywC~oB+8a5n&!Z
z{=g!GXkCZ{f1#i%YRBsHS9WnRDaT=4jUeC-T7x7hOeh3(PGKPDXRup^p_*eGT!T?u
z6QG^NewdNbgO1;t<;&kY1lbRCn&3nu3l{iW)NVmX>x+jQ<F1a(uBN7jyn`<yKY@XP
z%lmMGDbnRzFEF**(WCJ=OuWY0x<gHfqo{=*@VIitx57dba^gyr;KHMmnbkR3vHEvm
zAR1|u!-8O$BXH-)b<{e-FhJ<V=^w!~JG;9z0Gt~>d{{AS)|I_8PxLqFk~RtWgZhM#
zB%SQDLa0T~8TrVle#M=NTh-`5Slt7*UJ<Ucl$ZnjN)V11)q6kHRri}zxPJfrcXD{y
zuKn{*1r#1lB=Gi6h0p|>(VwTMSCF7QR~wSc1jxAl(1iYcP%8v>48SfRFQ1j0fo<h4
zYH|VHH-mg{T1ZA?7q&d14h$~P5_k#1{vIgG)R?F_-&z$Hg>b|c*%190j1Yt%3y!fh
zKx<ndiD--od3NufOnREmgv!|1+mpuY?Bb${$14I27eTqgeTw{$iQPwgaiq^2hvKEM
zV{eDN<rOzATri#r9PE#Sd4slGSTUPXJ+Gf;vkhqyvYKa+X&KNM66_~um#^=$dDiW7
zKXnFFVio#VFmK$)se@_QD5MHA0{=in0P@s2+57oeY(<=r5x9iIu<Zdxq4^|E8_u0Q
z`{1h{$dIvU8r{XYzO&+-nP`7yJpZS%uzC&oSJAFsEhAGSdG!X~2bAZ8(;gQZF5RDQ
zj6UIc>`>6rPjCp(69B!gUAuv?XaS?B0uRjj<BtP&%NZuD9o3g-!{O`DNp2!nw4h-?
znmVKLCmv>Vom#9wUipp_+i<Aku>!&^JX$MiS(W{<V=rIbJD7aDO>Z&;JnRQaQ=5&8
zX5L!C87=;^?ln|xB9LKWVUkr(o<0=^H-K1M_K5TTN5arGKVh~j(9pHV16!J$2oJK^
z%jeIR3rDceiL0s6hqT=aC&NJiXD@Y#Y1$wtB=lKSMej|C1qF>k;QPE5hoB9K^?+#B
zDNX{wUo<QxO`1f}g#JO$L~d*ZakJt@Yjxt?f#2K}-vNc{#qT@C5uw3vvhpe_G><>1
z#av>e3<9uHFPm=7{STz28^~VkuR_YW2e^0_XJ5mPXP{QD;p00)a1c?n>C4`o^bKAL
zVfNjh_Mj7xfivg%1qGhG*f-S61eFs{oe6z>J(7$pyl?4HS1RD`y0k!KGw=-(sHoHo
z#(g=JD%zM^xBlJRQ5Q$05TeY5F@?olw9q2ir(Yc0hzj8_o~#1xZOzwK&Lk+F%X@cp
z<HgHkLqRS~%6|6j5F(nxfk`(3HhKB%{Op0G^%!Z{tsS!f=-wu^x3ol}ABu>9FhCjx
zDTtxJG427xchG7UmqPRs|G6C&Mq<wDdtD2~Ep88VHLY2<F8*08=hN3c`<JF8;QjgY
z;S8M9zi{D#O)Oz^N(;zogg+OoTDEKx9bl-n(uOpFY<Hoisk!+si71CR;42E$i6f$X
zR{&Gj<4ns7Q0xoCH9&Q6>zH+Z{mS2)$ob^66eDA=(9Ty0CLYxJNJ<iG(mQbi2fl9-
zL)(XPY_bF573%mJB%Z(j{!iGQJ8x~`e0_a4Z{FMuXwj<THmJ_8HIC~awE?S?V{doE
zil~WR`|A6)84n+dflh4Ox^)hyD_-uR=WcLffWE&F7aFQS{s17ea5NfO?39!g9F~ZM
zcFM!y{>K4eV+<Z>L9sl4{`_m~^xY@%_?KUCe#c>QIp<zf<rc{~>!6MULl7SpxCGuL
zfcZl6gmlX7+qd5fuDf?|2LXB~oT9$aqfaAI@O%2BCMjee|CswbMtKNbi{j)*j}*wc
zfIfmagdF$IW^DVhra&18nmv3!E^s2z*#l}^;^oDHGXV;G7+~g2n2}nbIEx}N2t*UE
z6B1Gez@nZDmdIn6XQd|#&M~8ISa9;6KE3s-gaaVFB!8BA4SfVSkH~?)h4fSZWZ}2U
zkUmgeHm0Sgb0#J`9#0X0=(PitQiMbRn%QQYuqcP$n~xP79PEU<?a5ygM?@Gsw1@?0
z{iCfI2gwk6?WS$DE7ei!u<t;1_;`5_J^pKwgIx^FU^{S7E9}XWC+!;+(q&a$f0i!Q
zuf}F(t?)*d0x|_!!fIR!;~Y|xF$GI1)Rb5%r_?D`+=F@oMPUg?T2!n7<rcH~xD$+D
zx(eqqZXmCN?vPO5K)xj}NHip7;v81am8HTqM&JQNBvH`rs)WcvZ@S7$Yd&?>tkK%4
zwu&fUu+|lBZle!c#wJhNv-Tp4f@81Hjf6W1DFFg5{T|dOY}X*7%K@HxaEj_KP(y-_
zAifCB0qS5W-T$~65)ubFO`E7X@lUcP1R+bR-UPIWLRZETW@buwpm%?w<;^KnxQFMk
zbR^j|s}T;(zAzU>R>;PP$n%mVLXg6-De{d)Ym>rphywSbYGu`a#NZ93mz>LJXMFhB
z(&D7;Hj_rLFkt-Or6F}Mn4uHQ8MOZ2;ga~l=hGp4Jwq>y`C<vnnEEiDi;}rN1}KlP
zkgDocq;dj!4h{~~HqLH94kuKW3@Z(*6awjOY6;-&9IKGmWo)2Ba0}}_JaG^7&)RM#
z<7JLwd%?p)-P!pe%>{~iKreZNHg43}zkfg3Nrr}pty@1sdoagMXQjnKU#AD;;iKIg
z;0r-QeRBP?sU2sZ(qA1~0spBh^~dblL6#7HiSbt4X3d^h05;PWfwWEJ9o9|hDoEU8
zSdW*LnVH!F#)AGfzED$4G=!M?nebaRL$9Ld-t7=Mb32^;cFtc-y%s!Thj0>45Eo;j
z2G|JsI)b_ix(Of8hRh!+g>&~7&M|F8%Lx13)7Q7EBh#nCVz}tWbUZ1hoP<<-%5Xan
zEnagy{<}!jEB~uV+zzp7fz2%%87$`#9v$el9!-ZzuoU9Q=ou}RbME&6f$TFp^Y`Cl
zLh`zQer57cLBZNWQO0{0(1oBoG&K@i@mkOfzX=e^h4bh4=<5ffy?UVrr$=UEGw{pE
z=pzb!Jv<5w4Dj@mThyVu76;g>iAOA+qC?>i(#OE>lj}bv3xzbVpr9D?N5ex>oD8D}
zeS<&|C9BIEU!;EIfLyB-%bu1BSdI=~`v&g;T)zX-gIi?)w$L5bK6EI8JOH#ksN50m
zK@-6J>A6`%y$~p67`8noG@v&%J)ImBm|t*6@+o*^+(JEvG@rIV2ERNAgFYF^4rEb_
znhCA>seXqJRpZFp*_n$Y@#sGJ;^cq+`2~+?*n$@9IvT4NsZKBwG|8jgewFvTs|!Hk
zsB|p)?hwEiVA=6ny$$n4t#fpH<{<GQ+~=po0k6=OpEj)w$G%pA1^v+mL-9x$Xu4Dg
z3_&eVXUQA?^Cw-PKI<peRdxN?)MN?2F6juEHyja{lk*ojcs3Zvhe;4nFehmQvOYL;
zLq|s(OghB`m^rUe6~rhkRqsE1pcD+TZC+X&+#qoC;0x01wFeJQ-W7*!P7^*_Zxlh!
zjs{&8Ot$0{>ioGsm5%&#5F-xZJON6QT`bK2Aq70yWjb*F*elc+58WgqhC1XE!td~@
z)N7L)3hhj7>}MZgqgZ@fmy`p|g^Ye^p#iEwT{Ko`Rzdp<5VQv-=(~uSvtt@&zRbUd
zka_+30!Y{JDl4%+!pj1$hXu^_APt1s7>)moHjNG~g$;+W3GT_}*fEi@fMG~7#3Oeh
zja!sc@2BN;aui~}92FWG3XlFTp$RyUq=`XNn{NNp$#d;J6e01#)c|!HUz%}-vJ(|K
zYn*!UXU+67Kn<pv+*lmcs&847dJo@30U>duA^|oK(p^Y8SSDXy&{uvCFHj_Wkp~IT
zAr;LLBzo=)JiQgli`KO`h>A4!slWU}mN;_L(6PwTbCXnW0z;zDwvrM7I1E|GuYGIB
zChuu`qYVVy0y6`2Ma4Awe*ggO@n1*-(OtHZdKO5o|0lXN#0qXSJdgkG1@Kv0h7Ouj
zLDlf^Fx1bvJUjx4)zQ!#ybdg;3x|q=1^9yjy1e-Yg|K<ii!WXAXQ^MbZ<%|3J!&&$
z5oB=mYK5Tae#`{R$hSx)+Yy@n)6a7zQDe@(CUGr_4aGQIcenrgYXXaX<;oSf!)Fg8
z7{xeCtAT-!wR3QB*z#-E_H8gP-HM4M9|5GDtSmQvA5J0@@I-X-^YU(Giye6O3mv<A
z3DlhI^6fZ!HRfQZ7|2k?dT=iiO88M%aEX?hv_yyuN-LHsNcsc^#{w2mRYfp^9Xv3<
zf>KSTZQi^)c>LjJ2z?TvW3&9nPmMvL`dI&${nU&@@1-~ynhn2{l)uC=ST5AmA??A3
zP~CPMUol2mhQ7i^LN;-E#%d6IdSJUm=8A_zLlZpL0QeU*k`EpN7{dtre{ni~{62}M
zaHU9(O5&(nTyQfu&-f1D^P3qONNrR_#S4^pX3xF_9Ua>p-D=T*B1$}PtCEPFA4Z?R
zcuD<hTb=#yZPn<k9sz~zF*abR=(m9x&49sUNEpHRddA7p($aDeBGmU#JMxx7<tHN|
zY&WQabMSbaXe8%?H*faBttaZYD=U2z<rii#&kaza;YB2x1!sxGXF6~JcKVuKI9YY3
zeuk4Knu`~%T&ZI7DOpLD(*Y?A+!IO)%uY~zwZ_N3;KC0q&m1&(cmm1dQYp-XZU1Nc
zsV2eVh(+fY5!r(!q34of@E40GxT{@5SA%-<$bLv@8sMxP1vBEy<mn4S`ILNirYGKT
z$A%@{p2m$<Oe-QA9<&P3DL30k6WuQ~;l9Y4MHvGo3k>mXQVuvO(hCx-6To`K&27`h
zlg8@xKi#V9icz@5@d#56FogqI48W=)V$W0J-R^H3;9=IhtCs!g;(_%zQUd0^4X{yM
zYLWsshn2!Wm+m^8&T9_jGze-rg9>;=Kc0&cH#!h9dX1OkESx+e{X-(ME7Ct73%+_a
zOw)xGzbkP2qKl0cM|iKnN^Gm$t_JTOD)9fK>rLQl&foX{gE3|>8Y4^Aj1ZGGTQW3^
z<rJw9k+l>`C|azI8H_DCSrb_biBLpJ41=OXNz#T&D239h`aSOxGxPcWAOG|CK0e=>
zk<<JATJGh#ulu@9Fd;nmDXXyK^*)~xrO(WN<J-x@<(BrHyPin;dHa}W+1bYkBIJlm
zetp=whu3dSNxP>^pFX|cqx7q<UM=97OqAL?`NW*%%{=?hL$_hepMNfGX6w%4e54h;
zjZ`7c450ttuW$#>pZ`)%F@I{E-x7v5HN;?Uv!oSV;L>(q_eYm>+|Hdl$+2W3TDER|
zf+7yARrRs{)Ozr-FWyM8`yPmZtTHFSAcYJekVgW|$x@>m*~r+Kg0k@PA2r*z*Q0<(
zL@D|ueUiNhkY7`pPzA6~64Xn<N3WGhkgGy>26>Snt(XF1)tGH<j?hzc55SFVN8x$*
zh|Z?|(kbe;qN8VM7k&LnMB@_4(!kvaxw6;Cf9~D8xA1RCb!;zA7XrbT^RW*8y_dlk
zY~=7);3^c=%%xDE=2CnP+Ar@X^_V=JDB~3Hiqy4Np5dm{B^BKDQ}~X};c*m-0Y%fl
zb0F=Lnip{WPks8F{o(3x&(QpW)UY=Xk8P5pPMi0gS4bat8sB=pG$tiws-%=ubtz^p
zzhhrbx{Z`qgw8_Rfu~<yI(OOp`hAU7uDtTu$WTxjN*}tE`pL(HDz;(MrhKSX`k$?%
z^Wy&hav{1FzPPout$}j@RIohtWsR<>bYOeq>vkkJCrA1}jmM2Mv#=0K@bsK${}Hhd
z>aYg-3!t#9JRdGv6<fT#_UPaX$B>)5Mn67j-bHv|LN9vkP+xZh3Dn<9h4lIRSB)C|
zL?w(>;B-ymhl*bL-MBH54C)n0^`fU2g0Ki%A~cPZ)K4F9EhSU}A<stuL(h3ONb94s
zvqzAvN)hiArl!YUwUaSB@}C4XR6>2qS?TX$X9BSjp?$qX!8t7NLzPY}AJFX&Km2e3
zo(!o}%Ls}hGcgkEmf(;yW35X`S@PiUZ(f~7gy5FM&`4}9H4YAqQU!-y_{Adn{OJ;#
zh5bG%VKq&~ssQrXHzIJC9VS%@&;z219z;l)nVGkCe|z{VVMdB!{r`2y*yuoO@HN88
z4Fgkey;8rQ{M)Tv<fY%X=q9Zl_kU|jKIA)MUG&jo$09bv4LbY{Sg;iM(FF;j+(xx$
zA?KiyK9dmqYV57t+FIWcFe-t7Q{-o`s_7puYEYemf?}fT$jl2sZ(WL?Y@#J`z783W
zE}Uu5Y^gjIRr<<9d8Q>%(#gxFjil8*t7e;o{}~+|rr5QOm|RdeJl!6c!U)PK;ne40
z3!I#sh?dn{L4sD@fdi+|FzAK<)`GOg$EY8qWc5GsKW6XS(|AOlkWANxp97pIdeAIM
zC<MS(=+35?U02}$Q7O79I0L-{@!jA_=j7PJPTe2>GBVn7<jD2K@B1V;JUMrSXfIIu
zI0VCkalC?%V8h0Zix4}IbXx1O8aC;AT4!VbPpj$qcrf!|qi2Oe^d&O+ocBV+n3WSv
zGAQ%}l3sBoR0Jk{c7T&j{|%iqgE4CA?(06Y_mm`^C_w|;Qo0qL%54Ee6ra{oj~v`|
zp)glDg@n9?l?BvD3_I19c{hTX9uqT8dR(Le%9`gc9U>K7lBI<t-=x2JGihq$v*mI4
z;u+vTr@+pj;<&A_T-MB8u{2MJ`(Sk9Oa=|IgycKAkc4G(oq4N60?I6UtKV7izx8;t
z8u8syEgJ|$KW{p=xP+NF!Pq(JlTbXn1es9$^IghGlzWy#;+xUz5imt1oy`LP{qwYY
z^8y(8d?xmoi3ihH2dF)_(~k2B<N?s*0b7v!($(HS!7cZ0OOI8n;3Jx0B9pN}TO%~D
zp8-=EGvd>`-{;Mn_diaY#BD0+iJYp7j;V>B7V2eq*Tkn{9F|`Fxju6SH7p7$Yi=5C
zBx9tGvR&<na<5B045Gf$f=Z&3f%h+!3l*biJ|Wl%gsNd&xzKDBG+s$;xxRbGsK0Tt
zh)VhB5JtSg_(Wy9oSJ$zgot>Y%C#u-%=-1~=e=%VcHz;Z$rJYX9}UxkY#<4drr`4O
za{S`tV9N;{6)Dak%6EikSqNw(Wd=U~<+VKXqEak?(wmIPOIV!VN(pL^pgqP(8r~I1
z?Xcf8den>0r#eQTIlGXQhJ!;>#_LZ-(OfyeJHf)G__othzWn6rwH@zjD>)UnoCOxa
zpccN-Qw)|066vS7agv9V;>Ibd@7}db2VO>c%DK6tB$wmVNYMbqf%2{-jScERawGG9
z<E6`8h+h)Pkx%oONfkY&b4jVqJfz}3f2nNw+8x|qgHBn6W?w5YXIr*ydj#Eq0?Lm{
zf_>yzc9EW(;0an%pyV^zlI?XHUW46am`{-5@ZcehhAF)9rc&sq7si60odWxtmZ}T2
z_cGUvoofE^ZS6JFapOid@itme-y&dC-|u(*`b)qcKbU<#*}M15zw(0Hltj1Gj96It
zr*o4Z0$Vqk&~)^|o!wLqng{4=C)PjWQ@=-X)$g5(mulw*^y^WSGTkaa&NkstzMo%C
z+@ZMD;Z~16h47FL#B>sK${uRz+X>1~cUG&eYP?b5bxx~n&-5=b!u5J!97)jSmKVRA
zpj2($x;1l6*-Y^g&dx_W_QlRQsl2@fFq}!X9ke6~khuS&i8xnrywwP8@8H<$C{0aG
zSHjhh{$(dMoas4!sLRW3Jtv=>b17urtu2eV(G?$w1%l0vj9gesMkv^#v5MgZ9ja-7
zh}{!sG|Tl<`7$wd{GLhxGH65ZjM~A_qOfw3-C0RP<mBkp9!*qg|Kd5m?L>1mWW;uO
z-ek9q3m*hO(Jw{X1)8+({fHhLr@3-9Y#&h_)BOH>ZKVKm+}yoEHPmhTNZw1+2!8{P
zKVsMB6!-O$2Ya4&fps(G^t6X>gjGK71Y}uyaeCe=YOQN#1OVV&S7L1W{r88l`NPf3
zDuCKgO%77&1YsJmyHmnne*IxRPHiy+D)8k;0Lja-;I?U`1VP*_K|ANnIlQG+uS##9
zx);(L^Y8oEVE-Obh^8*{)H((Jv=B}x-G%q%dMWzb6g)XE&Y-K30;~lzKK%@PHU&+o
zQ_#i8MSwR73goa)5`4{i=1l#o8Vaq}v?2(ticW?^2~VKV2Gb;^xc&>VO%endY)Nly
zhzTiW60o>M#_<IYkDj8%#413fJ9^H%a>00yUXTFht1CXZqNo1JpG&_u{jxc26ykz0
z=e~}Sz*(|of|2+RT3l(=S)J-Cyo1gEDZ_eS!RjOWH?}P>q)MgI$=Lrq53O_h%?|3B
za#}s(ge6`=EA2Ry9A;2Dd#fzTL(m)5Y~FlsNH9%!!U6*qoS(W@F{F?dBr!!ONcK`z
z!UPYdWNM|JJf$UMN}fov_e!RGX|fXhD|ubN<hN|xQI5+i&V?#~_aI@<pZn|A&QTy<
zi&zd}sdBScP#|ygRFe}R^nc803Ej#;{PI>hb?!;y+tIQ+>Q4_t=bS$U&;|B9kk33M
zD=E$3OD?H*Fmxo?hVd@lBQkG}^CD2po{|I51L##0w<b*mv@+RD-S-zzmn5ikelBUL
zH?6Np>9^X8Nk%0Jz{Pe^uB4(Ys17ZT)BGfjv3kv$P;`>yJ&^ObP#Q9qz4-N1+G%hD
zLW_b?#^2=V@f}aY>~}z}xCr9Qd;XN_N{IDHTmsR%5QlrtiYgyvst)ZGdi^s~-+Zwg
zo_-56qdcUhY+$nIf9!^H=nyfep*MsKkpH-sQkZELj(bU$d9~tvn(^!&Zr~rW-M%}D
zLMATu`V){_s^vnRkoricPX2o9oD?0Qtquv(9`!mg@S^+sd5z^F=ln8yi>CBh5r@RB
zAdlUFddm@{2`11%_yE*QO6zOaaU(S68%0aJEVvkD(UGB9c+xA<nF%5y2w&>{8CW<z
zF*}BIQU3$~bI#@Cn<)Zw@s(#HLV#TX_IL803+n91Md}UvcATY$4<5p7;UD0XXUhAW
zbWE-FUsn@Eys?m0OB(HFh8J|{D!ejEH&Xm&C*t+Se4R5Nj<YH(EQ~{s)#y&S`udx3
zF}aw6o*Eh&fQb^boH9GIK6$4?JHe2+OFF5!bFVmr<SpSte{kN9MH1F9zvke5{}I9h
zd(Cfuh_Yt@s~MbKPh<4?dADH9cZLkkRlZ*swaLfFhc}THRPhQ#<E^@JaE(lFmA6;)
zFUoxHLCiYk_;{`0E`fLL0tOANJJq0ivr{iGJ0|%rRO5a5(1nA@S%a58ztO525w4Nz
z0twh|-V8fE0&~4v*L-AG<6UME&bgDJ4QuWNusuuOW}g~8$)?Rl=}ndBkl*zb7B0;b
zAIwU>Q&cEpRb2^Dj{)^b2D*rBM&5s_iLr5_SSIihF*8tqQ$PjIcr{dbm{R<ka`e}4
zq4QXS7)^yTbVlZ6dA9%Ky@6s_`#|H}7E+KqHRUJTa%S-rp+gg|-k$W24LU3bk1U>(
zJ_YlsLm#UNuQ(Ya(MxTD3YMaAz>f69PTU$qv1k$sL^-*+(&)9j=M-a7q?b1QGhyL@
z9U1BA9bf2=OM?R!NCOP}Y6L~};?QP_97Cb)%eo0-#MW8fspn~pcdB`S^iK);->+zF
zmjAf=1g!uPR|&~Tw(ZRjIhEgJyC}|CHf!hq@3Vi1<duU-te#$7btxhQ0z5=>C(|K2
zp-Or>IzPK!kJKhvFVx%m;Raac(>v&o!lw+t3{YaauQ$~7EH3F3hy_97AsqE#+;l8_
zDfn=$0vdXz_aLqkt|l%)I)tFE&z7}k<2x<L&&iQURFF@&(dr+cOZ4nJ%p(+ijDV^>
zMHf>YPgzHVdbFPrVr@W@CQK}K81l7ediQSKZV&VNnJG@vn@!P7Y=XuG*e1gI5l{yh
zA!YpqdGY$Zc+A>}rK?T<@r^MFSho|D+ItQiI$cEx@)R=`j7ZaValUW8=5KkHe5;{B
zb>w3oaMDbtt6-7D*k|}{f{U+2hE>n_8-+(!tWS3cWQ)#NM_5qye?Ua{Had$`QH}(4
zl2%>6^7OMOLRv`$SEemSe%#QwrNFJisb*bF(Iq*r+gXvbhUZQq&6D^aTm`T4@Y5mM
zG3c%+Awq<XJH6F$4yKXoyw%MVkMI4J?Xu-d4q&u)xtJU%mN2IdMuhOQ;3P->F*(}R
zy=O<Ct}Uyr@#nqm2iur)@T7T5urX4zR^7hx{3HZmIk@z~)znliak&axiS%V4yUpTH
z+xNZVv=ASKt3vbQ=8Q@z1;QAVleVRGA1CvnikNGr{=?6}12Ps4r%+}>`j{Q|6qfQD
zU{+3u_l9qY6})%1@o!*sf7i|1d7ATCZ25S8vg4k~m(jgv6OvDiC3FbR^bEC_|F~-c
zh>)92#ac#4ky5(07_2rQ7}siFGljMinuK5jLbRo|Ezx>5k)FgnAnT7eG*E1XA21dH
z)8j0LV+h+A&S;omDxR|j9f-fNT%k~2SyL4*jo<Vo#}vNY!EW>f!KX5RuCLl3sZUac
zb4<D4(K@J)SDF1c2tD<mFCmB$i;>80im2d46%nT76b%0f5ptCE)F}65+dGYz#d)QS
zxpe~vEjEJRDK9GhQ^<<-P0{Ey+#2R~zjcxIa+<)IegtS<Lb&0ZkGPqjY(ZEA&6W*3
z+v{?@q=6lRL`!-Pm6Rsb1wE9CPa6GJzg2fRE-U*DZ{(SN=f!)2e*z2b5kpp&pOe4>
zZKi$s(x65SpS$L)w#ACXBHVyE7gt&@TxJq336W*WNwdFh*eVob{S4hqB~u5wieDi_
z2%z%GK^9$`_|!8hm4*hPh!H<!Sf^Zcm?{HOq%~(ltU}>6WbLQR(kLz-FrRXHh%n7C
zfOS4#WnX#J_@nPD-0yx=9}1qoBTF!ha5+XV<{9@CB<K}ec3c~$p>RG(WE2btW%tWQ
z4vGUsTjH!h{w(JQj2R94_ayzop(w@vQ=DAl)_H&;rz;VYR1M><xd0zck6s-i5*1G6
z3Qy~!*iK<5O5w-A(9lGMlyhrbeLl?WRwwwSo?YbZ7TPD5r`xsJxO4GUc@X*NwE(ja
zfeZ-meN%84CIl@~Ly|XFcy3FL@(7dA0iH9XPspl?lTKngTXk<hZb3p`f`aJ2*0JLI
zjvP5c+svgQS_<2PIJzL7mx}n#<$1&25})#4dTsTor|2&#87o=e6{Q7MN(<b|1-tmR
zHH`>-y?>W&kIL)pm2k&!{)&IJTl0kggI%)w<!itrJjo)WcI&3(K%Um3%CHEjDl6~j
zl!7dO{q?JcS_<vG0w>{^g{q)T#GO2%FB&*H^@Xj%dmx_nINOmK_52uSk({yn;MEGF
z-mOV$z#@{*%$Ta8l<pzDQy^yHybvuyH;ENK9N%Ox{KP+0tennJmgGW<k;Mv)Q?U<$
z@Lv}IFN)o`C3eHJq=|;cRGt{by}LSiMVqy|SSMmu@0OI#E>bk5x6xPyKnNK8_jijG
z&R0Q%h3Uy}Pz4a)<%5en8`eva(_FGq-awMN*QURnhx-I);R!DWqo=iajkJ!B>$W-Y
z=P!Vyo|M;C6lKmN_wScNJYXz2AAySAc*!Jf)A1@L5X$q}C{=*;g^8Am-@${uOutsd
zEJCTr5|b|c2sPzP%4t;DtayRMKCs}4tWVQv)mbTx2*Ob{FzDysR*^H1=s^NCe4~z1
z+#;dz?b<wkqqE|6p2B;bZE^YPVgFunjeU?&gi1ha#e~)a2Uy2rJG+eiF!5LZPR?{p
z4TK+mgWu6?=p_m=z-rTF6=%+~v1y6O2sJ(vm2jZh$CkaOKVns$T?VIvy7UI~rV0RF
zT}Zs8*{Xr(umVaXl4%JqSgGQB0Vv}P#6~uebNH!yyE}Q=5kGwCFLpjJx-!I@V(zWH
zrJeOD9TH%pJp>!|w7%E3>BoZ9B(8jZelVwCsXkYF8DWJ~y52lK^OJHYsS$vV?XsSV
zrPctS;s}a2?jYEHO_d>gqlO|To8aU)C}S{3NfjXd9mplMI>AQ9T$4Lov+H2COUUid
z5vr&4BEm_Cf)5F)bj6m6DiCF{oa}6yWdhV{yP&gZb7tE4kt)KsKnzUKxSc=B7Y=KR
zZ<1DNiItVY@DSDgt&{3`+6eH(`BPfp_DfKMnKZ4b;={M-<Vc?{FQf`UYkZ_+3b%sz
z9Ur=@H*<bU-LZSE?see5D>UK>xdyB(Suo+yFe|byRRAHN8XhHutBY9T46(g!2P`;D
z*%+TFIC$fxO)m^>rFho4$#>t;zj4sE&Bn$pdj`Pt*{9#Go{{MVsB}Wz2AYSaAnl>R
z5p!1PDXxzSpYvCJ_5gi*$d>dk-%U$Pdoh_Osc6K_@Z6P~UHsnEYR^A@@GS)=jh_)Q
z^BuD6D0_;LqNdOZ6M{ABu0w+&+<2g6AGKnIrmezrwKR*tRNJW|Tmr~Et-9Cu+y#gm
z4DGG!55eodZ~>^rK!GEGNPkyPQSnGRL8Qr0CF2`q;sk$>JsrWGDhltuc^?tqw`rH3
z){Fa<PiP9NY6GaiQOhz>!R-;$R&-oUO&>jD^Nlj!5mh*#$)E}Ve3phrXE(P?Ls}}X
z_vDtCsaq;W)Q(e$=7w~Tu;ZSMWE+ICts~THsCj!y#$*RABi!E~6@?VCQ)Qy0Qry^S
zf*)uX*$}v0p4tkN{BH>Ed|9pnOM~MYsD_Xv)dgJnY);O=78HT3q4ZdSV+y2m+Dbpz
zrk+ACoJeuSs|UZR0Hz00DH}&sPoAw-@=u{fQQhCeV~Ozw=dAlXpLaF!V7vNmZ=ZHO
z5O2o~Qg}vfCEL+9uo^JDQ^KQ#bqj|R5stury+zL-e_X#_wmim#BF}N!JWW)LG4YVR
zT1P2pHKoR`N`{7<;3m#r_M_rl8S;Tr<x&N}gLb(|SF4JS6ODoJCR7*f6FN#u^$W2{
z9_=_+M$Vy&Ry*l8y%}?N&gZA_j3ZP^La=>+#|K-gs7jT@K_E#&V^q6jMd&P$5P}Rf
z@DdxVl-^FZ?KBBqOlPj}BYc<E#v>g4SGaLb`gtyeC>x!ccK)bSW>k-<Q>S(?aQ>C3
zIFvup-ZYH}08i9QNeH-y%z59=)FBeLe$DOld}RAq`Rb@i;W}SDzwHy^wIc%9iYQT&
z{ui7wrVI639ix0WO;6D*5nj+?43!B7U-3lXZ1_exO4(I3Rpzo#mu(8uBd&N9AJ=(}
zYELc(%<~{|e2}yo7JkxL3aVIl&JV~rxA1BC-}t}T+%_8_CR3^HB26r6UJ`oduK?;S
z5Xb1nU{_p|KZ)0Y9fam9n$Vcye2SC1e}_FyzH@(2l#dV{vI=*&Rjs<!Q{+r;e}_wD
zJ3?Ogg01W4Ju&*SCnb>_a|BG!($MKLleLzqaZ=Wn2vK2FYPj^7r$>4|xi}%_G{&5J
z%Lb4!h$5N9smMr72{dlWp^UkMvv9m0Qpz~SzEzw;D;%}pphRrlK^H}df|y!1Mqza5
zFJgyhe~wp&HTII%%E#y_?6VWNgMAIPxFix`V(!&okM`_oxQvR++%WZfL8Y32o%dOO
z!ssT3egP|Ak#Tk1rXie_fWLvsDBr4<=Y8#s<Cr1i*|nF<<=QU&)$Y!}xr$;dKe>?A
z#=brxbk<QK)*{Fy9n(ifZfI=fN9&lveF*zEe|WRTjYIo>tI$3XS?2O0GeJXf?gl5X
ztFmEZO*8VqUZxEcF|qRN6*0rNCDc=R-Hod={>i_oUylC?V!WldP=~29`sP#1J%+eb
zuw!ayUA*2g1cFyATo)1`$;4>|Sh51Gm1a``kvoi7YmG$SBc5<_z_{jMwK1Cn3gx<t
zQwfES;zBGJ(9=&3f(qHFIfV)cTvXt?+S}WU)llTDlm~mgb2E|oiGpTzWdJJ6BR0;}
zB|(yo#wPtQM??r{1=st#I-_4yka<-eNfkhhHQn!Pg?BrWTVvJos?x=nii8&f6d!_#
zE{glzInN;zP?S{5TZce$0+__bAt>$`WnO2)y*uvL`VVZ;Q3cW}&?3&xTU|SV3jUB{
zfKaU<NN#Px(N$<<T^*c6^U#AEIttJ98+Y#9SsjJA14ln5p|HB@<}$oo^SV?1vz+ph
zyRZI}*S-7xG~9b^-}DIQKpl~G0C@*H+baxVbvST+=(XCK$coF*0n{g}RoNjE1CIR$
zr6R|^tfu<;5FGC}Ub23n9mTY*oWFc^p<jo&Uv|}vLnQDAanSC$ZBzjSj1H6+CCXDf
zbVYtOx8!9uV*hdKci{AFT{U8?;@R9QRb_XeP!kzl81kJ$d+;-UmqI(ASW6Kwb<BTG
zDku)qZzWs>c-Bu5vVw%GqzZNZC8s?lw4FL&!e#_0q3SQwofUPNTh#RC1Ma{XuLj!w
zS5~#pC$bS|(Uy9S8F-m`(}0P87r$|}|Ip!XcfIPMP|T!1cOCSK$l?hh92`k((E?}>
zV*HTLTLvJebaOh_kPWzh|9(&F%ja~Q>>AAJAdB^SMUnm29EksZ_rH09?Gd(KkVibE
z#rW~tbObBK{iYzvc``;=<mq{Z%fccfBX23aTT_W2JLo{K)U#y7(>nB-F$Ey$R^Fj0
zG!E`}tffG26|SM(6*PFKPDYeR(hD8Mho8J`I?pd~FP_`}MDDK@6;Ik<zL%P61?;4%
zyP`d}CA<xEi;JE1T}(#^aLV^?`6xu3A37^~@5BG5*jv9GAI1p%b9^YN5n<TZMU(dT
z@&{{kIJB3%klo{>-ap&H|H3-mXkd^gvwzG?W@aY9$8M4G3p=VVl08!sE}P!$d+oz|
zxfO(6J*Qtd2|4lft5>gV(*y`vok^SJ{XKe{G&&6`Yu>)SU#}{DTTE<!=ctOlUP;bJ
zqud>3=@bSNR^IX3mh4&3j~1xnGOCtwZ-&JD_%B8?ehB(+(&$0+eEO$5k5Jk^L8d>G
zV$?-7WUtUZD-)bW;fxiXBnm<ZR-YC`^~=Yj?Y~({gQhplz?-1gUl83D75gUbuY}gX
zsP|s{A%Dj+N*655$)I!RzmDo)i+T$t!h{K#8bN<a9lELhLF>O$-+cKk-f7mCm;QpK
z7fdZ~1(j52a9bzCp9Q9nANp`b(07)^lD?lq9xt^dlC&IKU!i@cNs}g%=zd=D;_q*j
z7Gl*C4)vlBS5oe}k8^|W!C#!ek<$lmX)yHvx||v&fCCw7-BDNAMv@f>=k(#jhhj-E
ztg;wJUMr}UGI7R1`V!*TH&vlw-I+S?WU4e4Kt^hqO#RCu>ix4^*Xu1?woF=Mq-2jI
zoQZUg3aBkDPVkQ}4(a^)0iT7Cb4bsiaK@V6^wxnaHXe$kN?OvTO%dRpTXU&Dv7h>*
zz3JFLxp=^HSZ7jg70tNf7a8!mO+^Js?X14o{>wk1V;~Ppz^7g)**ivT9mW|*>#$Jt
ztQM3puv|(@qVY+PKTS(bjo>$WkNUTYXECSEMmf=BmZH_EDzuI=oPSrd*X(F(nt9YH
z7LmvXFr$<7L`y`*Do#d8SxU6jFQ(A_mi%7t;);CB@7na0(yH0cA4g9HYm<sBL1Hqo
z>S?b=1XScu(y~vOHd&3;qnn6boJ6Y`9ko*Jr1HtQg&FFl3JB1lap5F8`;<1Z-4pA@
zoFj31MRYa=)Jvq#fV+d>vJ3PlH3*|BDfV$+g}q3kA^uiyASQfXK~AmWo&WXPKEXTU
znfe(7>0X+51xS`}0=)_*aF9MKHGDaAMbC`t+oVMVg>MoAxANA<Ilp%aP_O~evL7Hm
zd4^CfS+De9N2$DucBa%lNDE0{rb5jYHALyB2Tlq2Q-XHS_PyxGk~ojb7d+fEVES)%
zj_qj93^4=^DjPlR4mbJvjSY)3u3SXBZt}S~zX(|cI+x(89*4Rc;<qOPlS^kceTO~7
zZvY@n0l5Fj*x#QB1^_pVr}`7ZNyYQ<FTayMccJ)CVKF8tX%ddwoCHfG4PX@8=@vt2
zMcSd-gx#GCnlJ@=pKL4Rkp+3%+w|jpKhEv~n13Rd3c`c*$SLRCyP^wQ1fL4%V0YK5
z&2TLUEbAJUHCpzZ95bx@QX~l)J<YnjY!urKiOgS$V?cqF<jrOJ^2UVs@C1Lt0gH@L
zMdnAElSlvvlv?w`6G;7H3MR$3;(R0tZxI8qyVnd$LMgD$z{zk#MEQ)0THFJVEjiTE
zM-e$%fGF{RJ*mOTjpO!>zOrtjbIUKr;+-egwjE_seVzTYSu}lQ66UNJz}{$2#7A!H
z9D5RIC)wOvBi-<JJ2Y=OUEgKX+je2by4t<*cKQN&5H@s86M5rF>8{+qSf?a(d~DOE
zk_D!gauiIKbwZ<e;qW3P>e3cx|9N2WlD0Jc8=pG6UFR4Y$YW4Y!wWcuxG;2ZK_?@-
zcNO)X&2k&Dl{*lZ(1ork8;Ch$2&}fGk&U-Q3au~I|M<83`8FWKw%D?u!s5a~>{;BK
z6Tj;XSXc9Dzz+Myt$X_wgmYzkI3*En+FpvNb=U9Z)&7KmvtBt_;FGJhMYU4?KMxB&
zEP*<9J7Nyw^H)KqnzwE1YgZE&<7HF4N&?aaoWQwZsfnvK2n(;fqqV(_#tFk+YrrVS
z@V62RrmEgE+odf?wTJJzNjO0Is<tEL`al=M?_14Pcq{~>Q64+Pct0%8Md!5lQ@D;v
z!!rFDY0w~z+Sb72#j42Ty}H!4TWV`<$SKowp<noZIZ~zDfwWHeUIL031H43jwG2Y^
zF-}HM_!1s&RQ9QDe{SB~k2d|c56U0nsNDf#w-dVtKFuR;+72ncEv1!a+dDv@om-F|
zp+7SMUAF~<$$?LoQ0&<bGG{|j)FWJLkmIy1RZl3k1D{MBQqo=|1+y^;JxZrwVl^*q
z-?3vb?9K(dbj^RfHHeVuHZjM-Qa6yuY-B$s3`~+j)uvhZphMVDQy;?#>##BeZ2LB4
zu?{s>6qHgIcWTwD)aU?Vv%!U;Cr_R<3g5nEiwuf-{$SOUl$62AWz(l$%i1`%4X}pm
zsVd>--aNZ~cmH=4?>lw>;`s_bF_=z3GUAx7N^&U*h@y%joT;rz>YoZ_v4H3fhyoW?
zYm^N}a+f5Q6oLVCU>CbcNJS^fS)|@_E2xtf1ZOFQHEE^SH{~j`Nbqk5q@xMaW<I)l
z;*3RZK@`JV9dD?}aRU$E4YH0L(t`WGujo0#lu9UVYT9FV&;0+j<Arf?ZGQMc<SK04
z$Nc<kt_UfsIYV$nAG1qBFp$>e+Nx`{KXLe#iEv~XN4H8S6x{F0STw1z(;};j&jVC>
zOyqDDFk##k{bR4ucncsAs5&xiQ^AlyONcC`Y4~Q}5>6*^rWJYqE~l*MGa_wAe?T&k
zOE>Z=te;xG7t@>Qw`K+Rcu`jLRmx}Rc6R4tjpw+hux7jKI1lIvO~-ld(K$F|*+^M)
zq{J=W4yFJEBlR)I^(?0e8@m9bH<Tm<5Lf5|{1sD>A#WXi%9L7(2&{HkvtRZ}J5Bgl
zpuAJ5fM29;Fq7T~ZnmsFdAeQr0eyQf@kKnTG_(O|B!l}(36R4s#B|Yni7(~7Xw_aC
zZi}Z1qwR=SCy4ouFP^xrwwl+w5(pkyjmCw91d>?7ZASEm05dR$xi-mk2#UBid7D%t
z866c2{w<uA*O)jFKJ)1H0^l0eD(Pc0+*QG(M-jY${jGz(;#!f_T8?zXt=qTXNHrX!
z2mofNc)pEu1FBY7F}VtVSAxS-sstXzHobvM02Z0V_LpFpPTIz*Q~}guVzRPEjLQ2#
zI4=|fC?q^R7(9()XHU!1x=LWk2MkffQ?8Wrf{jHd<H9O`X+Pm#?5+XPh=OradM(yR
zm{$VIB_4EXET@R9nkI-9JgBIK@FsiJi7xQSBTyRO!aUz(Ny`aU7U6;WHytH)M9y7&
z**lVb3MfBHG?RER1&rt=*woC8rGrIt!goNAQAI(+I=9<0id~hDQ3y(;4h(`!57}~-
zhPv}?ZtyYLbB|W~2eaoU{v38;7)0`o)Vj~Rzki{bc8w7Vu4&sCK$u)JYSgH=PnV>I
zc8wp8cv)X@E6GRl-otkFO-n!6!C>I)gWoyOZ|eFMs!g_L8kaxiT&m+3hz;?j<Vnqb
z*3i~3|D-@@elfiFcF_7>+Itfm_M3a-t1eVAGOE8g75i~MwjkA7L{;NmyWLJ7)WUx4
z-d($FiPAGZrP8OD`R9%uZVS?gs(g#WF4*0ODr?g-P-}G2?wwX9ML)MX4Zp?wq*>=K
z0|pF84Tmrtp1%(KY7B_ic1}#Uv>Hz7Mi9`{8W3JL-AQ9?%g$G1xZ>se(-XFDUL;X+
z<o?kk;POwdy6P65LUlFpsVfnjT^c_1_|tIQrQHlbsBDT{rsv;QpS{Hq+qWny#)?-Y
zhngPov0OBcT<?4Y40Ur)BaG5gZ99DU@Z_}lwsHG2m%ZgI6ok`$=i3};Jso=dZ{f7W
zkHPZ4y`LS$@QX3T6WfSUbkf!#g%e+jeIG-OdsM<rb8O_msy<Ug{FV_|Xr^`idF7*b
zA3pTg(Zb6Hm98ps7{NRb%``5g78qRUz3*GJ@L7}vf*iUk9?w{znXnAclcZ}A@aKD~
zi%%AZVJIh;EqnLumt(Lf?W(}d+L2VmhF3$I+XhT~U~%=QPl?0w`ooE8lsUM^!QdfO
zTAsgnF)tjy9Mrr`n|&{;^o9<7ibj-gYBb<SkX^rRhIwBXhHKDM3m?hC(GazXNS93%
z$dprSRol*;JExh3S4u)NbbIGmcldhS@j`)52Fce@hR>+jjRiGK4d-->;o;-L#dEO%
z7V&Fpi^?zq+v_)M+=eC4r&8|ZM@swW)~)(NzO*4Qw2Ne20xEoSzS1iHR6g-;?z+$3
zO1o196&HxcCnQ}x{h|urmCuoos>tVTHCP}P>3_iGZqQ7Vb{tZpm(2P0Spm0UQ}cxW
zVu&g0&hLE*@Cy2X-IlkjpfO1yk?X0^g*IMQz`u#b0Q-w&C|M&nA`Ly=Pn68=_<tNy
zz~{t**p3`5Au-Q$(>?Gde%;FtaAPmbyVZD8cs1cPEad>G*RNz?X~>^i&QNF&U@#Y}
zQd%X^CakvjTr^KbJxoR`g}%iaFTP)?8+#;GCv2o^#{+~fGl@cati6WS5S6_6Tq5qD
zSm@Mn)CN+J6N}Gfba&U}H4|x@CAU~;he%WqhTNg|j`jxqldYzXsC?B$3h>q)StlGz
zQaX(n;DyKdWm4IX7)G&cQ+VXckEf(WqD&;|3+p<=u06=3zFnLd{-sm-A%Z+;u6Die
zV5UdZR_qY&1vqH;v@9Cf;_py_YPKV{!$<hl`*r|#9>zkScoGg_tWpL5hSYE@MaC_#
zd<|f|NKkS{Q~_Liak4$auPw{o2+zW)_bSB5a7E6puBPEvq1Z}88kT*CUuxf`5?e?e
zU8iNAS=OSd9vtit`ZgHqrXb_@Xg!nQ;AWLi0GocoJ>&&~M3V^B;&aKgrtrEG-mkI5
z;6#?C5(mo=oK4R3d()ic0j5nLM+>e%s!F%}9lY8NzJDaT!!o?ERKG{mehMRIk|~qI
z)$9?j(X82HO2u)xZ22kSO@*&;b4wvyn?ly8Oe7vEVc<^SPgzK?NG2!J^_1Z(aSOs%
zV&PvwYn^OlWOSwS-ge!)Y>9ldXX`f|#e|~$v;#|>Z`>7^`YQ#NMGTqjsjd>Bi0NH6
zD{gOl0KJrMVv<{{atARDI>_p@nJVMEMLoW4*6jZMam=D6382?amt-a`YTr{jhDZh;
zz#-y^K8`OsCWTw#1@itmvmDp{4}@WTysVx>3EXIkoG(MW430{VUXvE3^<($Uf*+L-
zU?#p_g=>)Br;IC*AG)ndx-nCoh?UbA26sMg^|(g1cg}mjKzsy-GN3GjfgZ*0X}4o(
zLIAc5Ty8s6fXD`w&LB_a3H9QucXTj(yt;tpkrqVd#jAL@nFlCZ7a!M?#fkVG8W(ZB
zeeX^~mKs6YR=PyuTm{MiH=9C7aj5>hjWTRi=!09buDQka$nfZ3@soSX;a?g^72KL4
zmfOownc11kW=*#lrVMKJz{FJ2Z^uc1q7b|uTf-5YvSl&c+p|P9E*@NK*RvCyWq|Uc
z^P8(gg~zqww^XQ>OnukS0BVwJUakNwVcw^@B)Wu*G_I|Rdsr5)pD--@fy0U-k3X>E
zB9PqdqrWdmvGi_pP6YvwZ4h!~n*WG(nISP(IP3xrz8TK7IwA7|tecU7b8>T+-sVx@
z4iCOnHVWcs`@)|t8ApHmR+fZMP6JvQ-=tkS%SEyYAzRrglU>xPvl_4UEU<)dBJyDI
zt{mUpl;hOxoLK5a#-JX-Hru-Wy{R|W+{yAJN7-O`3oFjY<Ot{az>odgCDn_Kq_8E?
z`lS#!-wW?sAKXKXyMTpr%?D~8wZYiJMVRgOF2eib`S(6w%P0Y-!R=CK3L_$8{8O!d
zSecB=%WgW&#C6iMcw+p=Dy)U5>bd4isUPaT?WrsZKV&;UpW`$hSvc#kpD*x-D7<K#
zUWX=&;A>6G9NKKeu9jS1De8L6l+!Aldc|dDDM#V$<J^~7JA^P&lr@`lt?xCe$1d@H
z29tn07*bu7SC?E!-4$0V&=Yy5aO9-x0QBDK%7|g@p{_n4FR-ooBt3cHB>|O&QP7WP
z0!iYz$&S))Zh3Q#g&Vf?*k9G$W!p=BkF)kL)!>m+g>?9qmfoLc)M);l^A%S&+)i|^
z&GIICXR1QcZ?0uo`})E3h8n7!tTo^o3i6^3(_`&A3>6i3|A=%7#B>M*?l<V$_koiK
z#)-{c;+rO3F~)#7cc&pO9vFOf;(<ZElKUN++olp<6Tf&;yRFl@p6QoYZSx2ofG~4=
zgkP$oU>c+*QswI3=|jgdui~|z-j9_cBejtp;b%ePMrFr?uP;DTv{~L|xAA_{>-|Os
z^|V)`h7AK#-B|i$y0%1tsM&nZ@%b50?6;0ele?mKbR|PGh@WpAsqLqHy)Mf(Cw<H!
zkx>aL{-&%ZN)iDKr}0-Q4p=sY^4DJ`PvW94cdx1vCrCi@5W_7^pOR37oLk^5x(B2V
z`|rK3Fv#fj5@fNJ-n7F2A%+(ggIW3fJlk0)@yDd&^hmVxPyb-;F?*#u7X0w~>Gq4m
zQcDnLaUco^vYaWb9!1V~pgl`Z&cqr?wO!H~a7t@pgkOMJM;K;XQD(jo_1A<elN=%2
z-#W6G+?C#vX)jnyO!2Oap(yr@&srW0@rTx_TTTakd?p=VGBgW9Py>%Z!`5r;j_cke
z;q!T0MphKR+<c>mfM~=utzk5<-4)%NfOR6X6S;z_PaJ<thSj}lx=bJ5-}CNLp4GPM
z3<byITl?#z6v?c<*GHQQ{7WD=_fqe{HEYYOs)VTMJABO*k7XCIgu-z{C}6;hB}OL!
z*Va~loGP>iQKjNgbHqDCd^?dpa&b<3_<mtOosugkSTpW+#=GMP`p|}auXOcm`oz1!
zA3FZ@WzDrU3tI;ECf>~WP`$w;^!MaDFQFy$gayi1c2!abA0ICDVo8sEZ_SmFRfN8}
z<nEFZg}GGriLa$`r(h4d4;`L{keCTXT0#If?#iTHx}7_Bu50C;b_@_u^d)&^CdT63
z=-is5x9t)Hunvd}Qrs~J*PSul=6;EndDp6Ci!67T|Gs|x1vHwK7O(}#2&J2q(TQe4
zkxpC__%hR_8L7T%ZW|51V#4%2IJfz|4GjzeryISPrQUwsQ9wa3wifkn4e$xl?dsmC
zOdR0N(ORLEYkV^D1{IDNyBzGbY3uo-`AW0@VQL^kXhGiWZ2Pup_9d%FI*(v92km^%
z!##HUB!3Q8sp?5PId!ZO!qfu|wQ$<UIP9-zr8uT$5-4(7P#9Tpc`=ZG!Je|BEdS(X
z&VNMePsqrPalGX=|D~O?LSd23)EPX(y`-chYd4?`E{T%Yx?5U3n=W1&gI~r!6;4l&
z?%jL;6~7&k!?#Y|_K{}e>Q##)DZ^mnMQVXyW9_u&NrAs^{$XKw53|EYSHdG|Yjw(^
z%fd{EECIo$h(;iN6g;V*hN+I>jiL+J9O`kkJ4m6lQKn_A7OYP&AHn^kl^CJ(*|?i#
zBmArEjX*W7+)T?{b<C1d9xVkq@51G>Id3E|^4)<n^%<b^p|`^2<}N=m=grkTrL*e7
zZNJwYjv1F`KRVs<7#4RD&pUAMv17+X<c^alQV_x<q<{6$^Xgf=G=E)9qh^vv5O2d_
zyxzbj5yEXkdN;(<xPfaA%YYQ@@{E_xf-dSka-T{<+hLI#F`kqLt-<$0Z{Dgc?9(mO
zCC_wIxKC)!8lQ(Q?F7Fe=NHjS+PRT9H31=B{VWgBzKN-e_a=J$ap5%B4T=+w06ay*
zkJvTsd5Yn&ycYe7UG6S;zB%d-i+%Fid_KsK(wUh0OOdewNZZia9DBa@q?EH2wmYiq
zW0e!IKn0}6(SEY|RZAR!t(Ur|l|&sfvg~OUDqZdNmqN(u_ZPY`q*Oqqj4M_4-R}L#
zpE!V?YMf3exU~%o#58>NSUX>ndo+02xSc;+@@&HaEg54zVyTadW!y-Ha-`VW|5^Ou
z!`;c(fbEr~1Me3U6i7LB?N3DgL2J7%q-#<7D>JU7S5`fQV<S?9qKyTmrL$7xv_$e9
z3oX)&_eyGf!eAfKwt1s4>>6-VCFguf^y&)7)b62K<P7mzqtDWThwoSf8B60r<f<?e
z+S{Old@$1=+Ox~L!C^0!oY7%M<V4rOApy1w8OE|sKD9VB`^hIkejrLG4$RJ;3G7pT
z70a@KI+Q4Gi3JR~=1g}CaSdO8*DfrGOkb30fNu}JaafDYcw{aaT`-UJ`o5l$=q;fn
z?*~@hb5nldzG7fARR9%j>OOCNJMbXE&)^uu7iH;&_RhW~UIt-{w)Ip>Ls!O-UIt(w
zKuh|2n`r7D##CeHh9^pKMu?bhw>NMDQJ;7q($_z9H5N87H2r;}vU}g`E3J?8Fg<2Y
zo2uPjyO-;RP+M7XG6Rpy>9i}4$|Cyh;N*~7^)dvbCHhyOi<7Thn(%hmNaL48oS+Xq
zIMIm*Q4p>!@u}aSL5HVV)kT}OwVEjJ4-WbEfvug@GADye|J3_Dhb8oFx`t^Jh`|l&
zqs>KDk70sJ@fpxP?R_Zq3<}C`_bf@jyi<<dD*F+tL^t!a>eheO*Vrnp5!m0{>f!Lz
ztSlZpTT2?L{{I-^)DJ_)t(^1t>}l1^!P_k<8JRS0n|_w_nfjq^nq#<QRjO#a&8WQi
zrtH}G{rv^ZyFjEQ`Zea}=Ay1D{4xRaxW)IDzuz8-ogau=DCNQQ-dQLL2?0khwbAKf
z#%;W&4Q9ZSmo)cPHW;lX5Hj0sd^dG)WKK(~HIC6svWAn4-T0(sXlUp_WWJIT6Z4`L
zLm?ivbkb&^qMQ2mGEH0h`=s+rL`>jwpNmZWyKhq&4=jHm{TOc~`7$#z3tclm_sNg+
zb>6F#Tl#J(fR^Y~h_apze$H61KbV&VPesa$_=n~~gh~Bdnios!<Qj&7OGQ`wpzy>5
z{>*6eo(Pz$Z%%1kY-TkX@Pv!HH-gVbQ(3gze^08}c=$l+!vq!aVFzYb(X=p5#MVw3
z=ogP@Z*PCcscXVn8(AHiFW1U&4;{LY-thdE+zTOJwd`&EB%~Unr@tcelMKDUkmVK>
z$a4YF2?w_@V}{g6p^I2#45#Jhxl0?bKN0CXdGhzITTS-7)xYHYsc9W~ff#8dhFuJf
z^n1{AN2o3QQZ)@rp=kdDpyZ{3SKCHL#8)TeXQmz{8tsAkI3=wDAwzDE@Jkf|OR1Ms
z-3Q4gjL!L=V!YFb?_7m~MG)@b<p(mw3wV^sL1gbKL_kx>F)&nxbI1PLGjZskiN?m#
z>>jtiDGm%0y`=LrkDlVY?cMhzREdF-XjcgH(p4*^OzeHZQ$O^<*sVmUqN=yYpp3cR
z<FEH`+GW&itZ{-WPgufmEs){P_^y<r7*@5GSU@_Mm72}sOr_^hI0-z`PxR3U$7ar)
zXAMRN5hxsn3W-3?(A2d^+RvNnEnTqET`daFMEoVct#9hJvQ6#Hh;`fE<mVrF-jCAD
z*sx)<U58LQfWp0~l}!41jCuOV0SpFm)6leyX>77b&m6LuFv*ET?pv<|S{BlDoJvI{
z`De2Qz|rlrYOD+_yz9A{ed$*-IwZAMk<S!e{pEQ#B89NPfk^G|qZYl?F1uW77xKuO
zMnk6v!Y`{mU5NDWRo>O3iW`}W!06DZ#m22W)mM0T5c&Zjsf^DQ9cegqOQcu=7}hlY
z@78E}0}e|#PD6;rma1Pe`*Lq1+Z7jGdp;i#9#sIP_1M_W_s;mVXw&A^gALW37~`<B
zr-u#7qTntpw%+((b3{s-mH9VkN5#a7I+v8$0eMN%!z<QMY5^z_-9Qb;85o$z+|z!$
zH5Hz<JFXgP1fQ}_AHH_U*S|EZH_+a1bVGZS_|7L12Rvy1pz(k(zh3#f7W!_tJQQl`
z6=-#6e7w#HqkF0C6NeczX{Ouk*{-`Mf6QK6`)+=<w)ro=-FjDT-8v=Jp`yS(qutPR
z{#VPcv{oFRqf6D67;5`liWm7nHMdJiMWr0Uk$s(N??151yx-1Y<X455hfTL-_?niC
z9;_rd;&}JiuBG4Yrt)2*X6+x~B2=%}*vI?so?C@&i7NVW<2pOzK7%uF_SoX^2*aPx
zcHS<<6HOuhv;Nsm+C7$Kj}c*}B~kCkY)t7T)6*ChpXGM-+O2sJZ?A1S`Y!CQhWl<I
zBl0mi(t^^9GzWod-`-@8NtcS86uXIa^bI=84_$X$n1Ac^xZ3Aqm5~3<k7w6-+{*a-
z@4r{wuTJo}8&{HWX9cN-Ue;j4tcgTwTL2xJbsByeSfpJQG4&&G5-jtud&OTN0Zuf8
z@w}X$t~-n#hd@yjO{3Xw%>Fyrx)8V)_{Z_^-;Y`~-@40hwNo@9Pt!E+;g#AsWq&<>
zYzc~x$X6#b^V4P>v=|Tym$Y)B7C*tnFLxR%ydrN6*xZSP#nZQk8ojRF@>L_|CvxNS
zJi?FO2eJ}TS3vkZb8Dev?|NMhw4-&2MoU=1KJ8-xQ#y|wJJzO(wA-rp{y#yU5-;Oc
zPg3T;?Fu&tMe)$q7d-oAulR#q2qpwqIbFSUnNUWW`^(EnCG*0qYyZyw_T}M!ZrPDg
zsLjIXp(-%ARBcD$B-2S1*F>F9#!O|^`~sH&zG<0$5)@UqjIW;%64*ZR+P?j2-Y4vj
z)!m(~QK)&|bo55)S?9B-@IZ1=cbHmdz){5O%78QAF=G{r8^D^J#Ma;5T)IC3l^8KF
zQjh4q!G2Ju#wSX;N9pM3yrK@ayJTJW$nwg{OxF$La=cjhnuHb;U{$Y4@JI^Tx3UjD
zi@^)AiB%JtHESk=6!Z%WIj)O2454>0!G#8HzDR)V?lXH<tF2vA&b|;RN!v}RNH>*_
znJ&Ce1dK4QGDrYXDDiNczio1YyNeE}{KvBPX@oRQ^Z$x_>@G|Tvt8c12L=V`wd|zH
zk&tn<=MSO~22N<du;z3~<l@EZhDjU4x7YqUO?h$;lVkY5_d`wAx{Wy7W!M7q4Pjv}
z8{f*bJ7{pbw#T2DR0I0=k@k9QZ>5*Zu$cUB(JQ%~u#n&~bzC#!FKzakjnV3q6oF<c
z(Kl9FkiTu;^X7E)U+UCMw=GG=F|xQUmvj)^-F|+Kjw8ruZf`nT^V+|=Q)d3D0R(mX
z*DgrV1NezqI`AC@PdcP34wV_MD%A^wdXpqwGw+_?)T-GdN)*^Vn$9W~XE%U{Ln{f_
zAbmwM%~`}vGIr%W3MI3pq4oB<7Ozd(c}F^bh=@3FV8+I`)PpiyGPxx^1`Zs!b)k$;
zuv=(NRIl&2SI_E0ct2Kd>Bbhsfgh)G%zK#>FQMN{>aaTRS2baibiYd6+xyosV-~@+
z8MEve?ZY9Q^?fbRJc#sPG0V;&N)DjzH6G4+&*KB!4XdHih#GEu%O&b+QbqwGD|CRV
z3a{il*HBwq`(D{hNESM=nVGYw7RG$2j$6)l*nZ60-9g+bLCSnbqOs|;c|>HF1DfBZ
z#QY@M-`__E!JqC2lv@Y=yAkfv3r`8`BdIdo(&nlXhNp{!j-+$3uX%jBPs|YkrwX(-
zr61cSK==8@4i3S1%&cXrF&8;l7005d7xP1k697C-rsl+($k<^H%lT^B+A@oW*CH4E
zAtM{{*9oL#>IsMVr(=3lQ%oc$5aI_n;FbI<{Ymn#c)iT`Yjr<4Iy(0ETd<Tu%+J&e
ziK#4bvSOWDB(>~iP?0{YddjG-(s)xgY^|%QwYO9Ctk}C#Cf~eqqrc-|qWh#}097H&
zsxakNE1Mlssg9C|C{^TD?jwYurTdjk2L?)Mo5qPuVmGxnzqp<rjvv<BZgTB@WK!Ec
z)mH`#8dQ2H{|=4kB+x@m?oe-<N(0)qw+1dR=?N_jDf<@TDe6371Fq?imD=iRATVUN
zXNx8?884E)Difur6eZ~S75nV&ZcS+(`i4T;RHA_GZv}29e2HD|$XE8)jF7<_j4z1F
z>xVNxLFl4Xk@ktcLZk6BFPMF)I~+Ee0nZ*IHs^xD482xD)u5%z@a;3%G*83Oix-ob
zKJ1#Jp*vw;-(P^g2Zw`KiW;1u7d7e>pNYd4I)Yyt+u!$#U0y)2Gmz8<!fwUz#Jod<
z$F}9)r?71LexFJwf|`rOkmKUT#XnMhvwA_77(_s6jJ+N0>8TJ2d2m)~7CWB>Q4G|`
zdVchuG8U-6^<U0hbk&g5u4i*|T#7}c0)KM5lzzGrsC^mBGKJ*Fbc1`YFDW9oz14fa
z68Qiq7E2sgxx2rjBy?}NBa_4WYyPnsYGY+Zo`2C0H#9FrhmT;<<jTjZ3?~}nN{Ga(
zuRadsu#)s@`&+(P7$g)#{DvI7x3;$04J@E4#D@L{zquJhZDYL0-;o9*t#Ih?XJTSv
z#@{)9{J6N`E2KnwoCzPLLwi7Y7g8X`*)VO|erOFv$hN~*yN5Uafio`PEN!dp?#N78
z7!rl<{=n3)H?*GRt+Ttwq%ob)JKM9vKAdv)EQT`OE)@Wuq$A%eTPhZE%cq7udHQq_
zHc(B~Dq^G%{K%!pvnltB4*QcQrIa_hZ$(4aQKgo07tD*hm~}$BsB3#C?E2D`uIo*i
zK;~lL{?f`H9DoDBo1Ug#Iluh-u5vCEwZhm3Syy7h)Ep*}2#}^Ld_5bYp5H}>bHCd}
zEI;2=PI3D{2~exk9O;u0inAP68Mw<}5)meVH7XLjtn+02rE`Ag%uCb>PEl;WT`J;l
z!eljcE1b-?OKwOv#hcwzXFUXOg_E&}(=n@mzkW<sLt$vW3`-$-Y}?5;npT(hoy+5D
zI*gvkop^-Gb<-+qYineY?JbTT?a|%;0KIVzw5^9;x-_^{TFXQnCuC;iawubnOTU7e
zD8lK|IZFkjOWJ-at4BG^4lEfh!TV@I3Zf`1BK#O3VPBGzbyp$-o(R=wT}q1%(jks3
zSE^(RAb>L>m<usISE#x2UxZa6!!QM90?z63`f<zgVWA?0sxa%jjpxBj-;4%SbAGYB
zV;d<_!grS5E?GNHE{}SgLtw%;PQw1~_Vw-g+OqRW4FB=srm!4g4-E>JXXd7eW(xp@
z8D^MFT(EaE5v224%eY?QEwmf?68&&kBZ0hw1Ex@(5#2!<k1Z&}O8WRvGUj$aa$M5~
zK9C3=VDXHvOmTmOFO(4i#L=?DvPbY^7Kl(0qygkXr$}=&PLS+%cj$^?`sD4ea7ly0
zW$rZ3k&hWm;nvEyH~@I!?wKO_G{F7cGso(v(0fFKqL6UD1K|m!Z)UaEgrE2F@`4dC
znLX3VSot<gG|6EqiLL~GT-Z<O-${JH4{W=!8K7y%sE5NsV;ZYZutL!=uC}IZL@+~i
zrl{Tv4ww&$AMYh$p7sGzui9-ZIA^N4**AcnBW^zJSDgZ()Zx4DzPreM_~d1yW7VtG
z))k<2kD+SXRb0luUwoRHnz162Yc``rixy{qG~L>}0LU|6_~NX~OP`0h1lEcuC;vJf
zDU>~d?+LVHO8PO;Dg*x4^;Ma*5CbnB8oJ`{P^u_~Uf+K6jr+S!-QH<@swQ#6(Oy7A
zC9OP7#$9}hkD5)OXQ_M)`{<>a@xJ+Z0<A-lQIyNAqKh&PEjeC`#P#`Q{#71_ECsRa
zkWGY=kl2rT8Hl?kw}llF_?}w-<)R{&StAC+1PLYLE<Lm(mLvl#-7@&hnZ?Iz3%N0I
z`@OyA2$FJ!j41*6IK+>vYALtqt=YkxhBM;QlW*P1JyiUZj;5@Y`Mn%0EE3n$j-S)4
zW-V))bNWX4Cz;!+b*O6g<)xkPmBknO7O#DL5nl6@aIfaATkj2gkL+PTuVrUH9?@|i
zP3l;_Y7H|k@|*ODn)AtRSMtSa&dwiZ=561)HLR3*wug!X&$v+QbHCG%n0tRm8~od2
z@`o&!39uv0A;#4B&aK7!g?*}ijn=}R=)n{(qw*^+t*foE^6egZc8YD;wJ?HjTi+tk
zzAJgFR;f;dN?(LzRPI$<11L4^jAif&TClC|7UN{Jv|v|<F83!wwvFkC+Rz$oPht7f
zF_mYD<)^pqHGTOV|4Ej$wY^$?DyuEbF1m9kV(OhQg#rV{x-7zSh#oDHRJuyI4v+By
zqB84K$H8puOJQ72e<dN0)I2&+@0Q|*4j$DC)$d2pV5A|!G2W1zoLZbEyZ3-S6HzNP
zB?U*vCMFWI3VS}?eVl>#u}T?y<@ltpp!k%}ACeHwx3M_}Z%78mOAVP>5%*eNAx@jc
zC|rsh{bjBNg?`aYOWo<cgK0UIz!i@P1$9Idwy%Nt!(B-U@%}5{^q_<)<6o1A@1-4r
zAqTO4{q+k8*AxU)a^b6s{U%ktm*~g;Ch-ab-QUfTS|s7xBSMB1>~i*^0Dhf}>mqZS
zE14JKzqWh)`$$wmi-+LTLQNIDC-Q+x9Q$sD2|)}89TQg}zB;$C@KB(KEV!l~phP0J
zx~md*VF_3?-gL4%H+-TP5}BSXbUyCwWAfCN&6^*A%bJ>&7EEul$PZBf(q?3p-kDoc
z62Z~P*oMc1_8-ws(}fxfB1q?kG!)37aUlVTR2n0@{N5Hw-+<(BSfquJ#YGKIJO#{C
z(Txx+iVn;^B(EJttQ~f94UNQya*Gn!t_$2a!AITCijt`aDS;AxB$z%4i1vmD{cY{F
z1#rBjWq$yzK2jrMew+A3KBDFEW#*g=oB=4yD7dV(3#}rNG7#bpa2E{i6~b+$F!Cd4
zDtNRhTtssT-)ZEGLBY2mrs1Aj*r)hrUruhBGeEo0<~f=JOGV69l(D#OHuKa$gT_(V
z1H?K<MIHmP!~PD;Ni=#_*1$8heTNQ0h2`ZIp@?Y5WzhT{QwD};HSAjb!o_eVg~oNh
z1Lv1?df&cp`ev>9R9*gew@afw|N85{i_(XLsG9xZKfe;!kCpGCCOPm;Up_H^Lkr!O
z1`SUHv^FpFTz~I=!IwW=+FgVyMy!nzCR#x-$4u(R(xKf=bgcgSiOG@>FWVKyg{}Ra
zpxo=#=lU6b`2pLP?5+hGB>gq}3rhx=^%!%bNCAE&jME{%OU|KhalOvlqA^dink!-)
zhV5vmNY^+dZMivmkMndF5e~}4SMqVYg`e<dXj8k|tfD9o(egW8vqJkdraeACk>_gm
z0jkYHk@rxmZHmbtCqVh)Flo*Kb-${SDgSkk4%O@ZeqSlfy#Ky^+gL@MGv2iR;gc^s
z^c32W1au;uxj9ez^9(k8G}wX^t2TIFVTcXJPM4XPs1z$Lc$;K$)2Kfi<v5f@4nOe^
zE)w0G{XscGdYmO{(+Q%)Au@A<b2_W#a3lUgy*%-Zk3-V%@4sSpreGG-u2eNeMc*iL
zI_~Hd{e=r~@j})J0F-X9(=|28D{n8ZTb-t?&MAC}m<+<9NfPowzXd+gM5&v)<{~PV
z;?NMB=&A%hwxoy|8}!?!=WA2)TqByR5gJ}F)B%uD3P=PxU6tYC;VZMZ4;JwME{S>$
zf<XXaCQmoDmoM(np%*5^r$s%*he)hZn@a}!zw@+8^e@@z=ci|AI9%04aXFGSvh5|0
zjwIsV`qTW6>548HS<!Kk)AO=n-=(9Pb1|KgxyQa{G1m&+);RgJn=8MGkG9CDs{`t{
zX(_t4CpmnBkjkQQ#7tx}6q?Sw%OIysfBdm;9~)>EFf-NUZn@i6iVqe~i~O<b*opL<
z0h}t&x((dH2DYV1OuPdgHJKW~NuoA^nC0lVZ{I%W)vJ>Lnr~z>AM?6V(@{^A9r?BP
z!K#`5ld3)+u<HYW9cu}QPf!-KnN5e!K(|8c5HEooG@Iis2RL;a=5B_a>DTW&#bIq}
zBU~0qlGLqRJ+obsu8D46z>I5q;iye8{Ynv2<gQ+(VwUOT`HL5&4??=zW$;hI5ToCg
zU(0tnkp9h=8|P+2L<u=TNF}<qOaD3CMxEs+yc<PVylf|2c*y&M29B-NFeu1~O!-^x
z*N@d%TT60>E4=s0FO2D3cY=mJrXjfCO4i2+F<>rTJ|n0K+inb@VXn&q@Jpux2Zr-X
z;RauOK&?@wI~{vJXIP)J$O_=$MLC0$8;t#iDK3cG$M#r{o<PQIq*P!SOEPF8tE9V+
zGc@_g$c8f+7RcsHXjky|ZM;{*!{1z?a3|Vo3~mtDdowwi&g<^8YI(7k4yWnMPJOS)
zFP`Y=Y^WCH=068m0SwZmInS?rKG+C6(}i3}O|^;jh`V|7=H)dc459=CI++3i_AJd2
zC#k-P#PdMm7Eyf}>($;IfWs?K+JQ0jI!dDcXE&V{o{>_#5VNmVjl_J;@g-_GCBSF^
zS@>(FfWG{?z9Po~K{PXD=XGdJKQNL@<=Os5CC%I&G(!Jl5kf@W0r}WRjJz5>W{j~4
z1WX1@IB-W-yOZ53`u<XBu`%w~(W52fk%|uOqC#MxOvU`u>EfR$+lZC~xz@0ydrll$
zdH3>FJV~QN8VWOaq}dWLAFL^Aq}bQFx=3_M_H#SDvjW$3OoPnV0x1NA=5Sd^tB>0G
zE?WtyUoO|?Kgwqu^W3W^5`K$3N2zoulIFpB(eEelq1gq|eFJq``OE_h2lbaD>^GgO
zCZG?J#_<k5xdtMx2>#szoicLWhoq#;5-Yo&s!(h&p@<~Xp_q)AhY!sm#7z=Xuk5?1
zxo{g~>dUpYYZ^9c<dW;6xV{*gh7jwdd4o!`6i7vU%@uy5=g8kb8Yr}X>bP`G&E^SC
z8$VwTv*paLQyYKSGUhR6lt`p6nk*_%?Qr_;-?VA0SDcTZ-)Sn`!cP?UCTat+tF#kJ
zyN@(!&=EmP%zH1FfbF{j=-a0~;^nwO3Zv&g;6-?WaaX2PD!FiB5*?`<97I#&8cp)C
zn+XUXLST}FSeXc-r*^tAbJeT<{_y8jAM>Swd~^y_yn%DBD;8&5Nqz!CXCif2F>dN*
z_Z&dfBxDCJ&4+@0gdUdUXZov|jx$sBrT0UHT<;8m@Lmbd*yIsRB9pOKaME|`2*E{2
zBL*!M<D^V3*|2)ri)ekRO}G!nl5ntX*HRjU_b~M@Hi+$R+)2zrWbAJrE4u%Ka>SUA
zfZ7YS`8YgNrRw0pgR>CW5VaaX0}Q0$)w4vZACL-@ys}_PNFkVCKZR-E(vCtBFiu5b
z;~1?n`}cgO@V=(*v7$^YUAu9*_6kK9S?DR~NYKUi>c~)PV$S|Nz4o<o(HbJ|S6Xm%
zMEpt~U!iF#3+v|7Ky4R>wd5J(F@BVBd@#9uDB5rk;IT>I2flJdeGp<ck&TZxEJV&t
zE>m{Tyg`Eo32w2ipD5j2zE9t>iX|7Cg3T(bK;4el^W27a*z?@&5h?qG@hDkLBtqT)
z{9>~@JK9ke%rG|Sqk*8{>QBcf1nRVS>cdA(d6EZm2S6o?$93`3(!l@&u}RqsAj$`_
zF{jZ-g4}iowso2zm=*zJXQc!TT(rF>DmYh3P+fxvHjGu+;S;im$aeIXFdwwOyHX|}
ztgWfASMTfZQX~hgR|QDV%})clB02r#8RK#uRi@9&4He3Vd<rK4i-;65_$->@O4Ve!
zHqbz2Fm&Cf8X|3B{#o+_&QDI|dv5zNLPx=r%)!r!hPslsyrC_M9GN&uoF`ILxhuyg
zjA~H)?je3J<?QUe+qZ8Q@g$*y6--ksJpd{#j4@RJk-jkFAnOcbx3B9-5s0spz5vYI
zLfYaaSxA*&tnttC8YRwh_v5eLQNM)Dl|zJ4M&B-kEHkF_C@L=(r6e!L%X76h7&~h1
zPtVQ$ow)iqaxkwqcR$3rxF~b2xGp*=O|(b|L_-%MuVCGg_MLy%ktT#bQSa6V%*bW3
z?6&Vy4k$d260uIf`X`gscU7_-$Gz(rMJGqPEg^F;QNf8DE$!~zy4P3i>&k*k)$e`B
zHxvj)i;p(yICo7m#W<fNOu_DnYGnf+G3Gd6t~A3z1ySjcWn2ZRlwmXT5ztaci-#A;
z$ZVu<km*0p4Z}=I6EYw&&TC7L{hZF@R9jw)Ha`hz*qU1MGsv;8srIP?u*7c!$auYv
z)hOg=)DudSk~~tL)1yt#XHvcE2c;1ZgMQl~PjtE<&^kKzR2a=#MgN-w9ZK;4qF-Pz
zWXL{IdOG}S4lm>Fyn0Q|xBr~`P(3amh1!kMT2a72Q(s{t0do4*g%0WBC74MeLW4M)
zDV!`6P2`=p{B{l@yop-V7ow)fOK~peuktcH4|EQ!Arn-_Vh7hAj=+)t&UeD*Gl|g@
zO1lO5hqF$1g?25K_=A8k$0^;V0`Ve_@aD<+V;rAnhscQaAVaHbU!wZRdp*jTjP^n(
zr4ivmt0WS{s7o-@?F2c9?#gLss6E`%B;Vzq3XW|&cU$^xpmesWqAV^^AlTmLS8R_c
zAJt=}$eSXGK=v&p7haU6lCXg^W5`U#K(ha~v&DZxMCZEfo_o4(>S{;bNG=Mv2M|l;
zIIM)n;fTZoGOugmAw6Z!+0juj{zSb#@5v|BL}m6CZLpKTsC7h_hLKQySaI!qQDl=c
z#+50KQKb8gdiO$GZk<g3z=T3H8mN>JpJG(md~E4VZ0X?zcLV=p!wP$M?K+zL6iq}Q
z#&PQ?rGN`I+LipL4uNTCLUIEoc-xH)7wT`kaJE3#^7K9&{g03+IDYeW=bYpj_T;^0
zoF_>0{)|oY)YFK7YhK<>-azt#cH&?OSyj?aDO?r0*6zMXAmzg*nqE#(wI?3)))9#|
zpg-?h^ZFT}7CDn&3ke(^ohvW+M>s&h?%p$>=GY>E;|!RE4>eUQzDO1hhh{K#MPc;t
zHgVaOL)O;*lfKk*p4}(VIaaU1*FvJZ9SzG?gaJf+VRNh)k54kTiF!-CS-7|54XKE6
z;_N(nb4jIBU{vbJ>EDjx>H`lIU-!7fkXwrH1JcSI$UsF#Pcmi=zvoUIiR2@r8c7DC
zdW1@uM5TZg_mZ>gD?Vg`FUfQw0ITt;M~tx&^=8nWZrvMr7DQ6v7K<-LJW-+eNMW1d
z%*pbueyuPET%R}elBfcJ?qs;5^U|e#StEr)eQ<Z)`|rW+O<M(?aS~}_z>My{i$n&w
zx$4ZBGj(;xV!$Y^`Aur|=3HioX!uJZKAT<kj`lnPl`W#=8>jsux7bummHQsROT-y^
zQ6aM)zgwoSh}3VkP6h{Cl;BUXH;3lqctoQ^h;2ZR-a5j_k*1?R8X5i8iX0ik(6&?k
z(hnaJc}HP6`S{dRI2+>*g)&95rc}?a5eE)X!Gf@jZE*o+<+*=V`dNHQVYR5uERqRb
zWUbrQ5SXRVRt}=qt_5F5Luj9#J>P`i*g)VXYYbp`@3Hm5TTNa!X)XE7^XGHlH2O&S
z*bOrH^F!HaADyKjF&mG)n{1|fPVM)sv!wHCMM!SzLV7b~8sZ^)G{jC*4ZDZjhPwV7
zB)vlT;`EG+2`FJ|^v1D)rC%1<;I-vnPe^-3HbGvq<0G6q&)NsckZ~%J8wW8|&lOg0
zU(II1T+LY?7yiv6uGdzO#)!VF^-eR@|1~@@KNBxfF7h*%9pe}`N%g%4vWUSP)VOtG
z<2Iso0k772<yRUn0e;=$ExB_t!UA~OXZ!XE9lJ$lPzx%_@tODr)KR4a<|e^_II*Az
zg`#W;T@DoCE|b=ERdTf+$fg0K@is>H4M53YK+TIPEu?uw6mMxUlpRagAS#6(>p+1+
z&CeqOumB^l1hXD*iqeXwp?Jj&CLPZv^x(v(KiX9N@_AXNqWWxW-OU7<RDxw%LlV@8
z#P#0+6)fD{-4Vpa@{r*D1XbXk3z#I;j|^SwakH<R(N31MwMpS7QZG!eenBw#YUbLi
z#UfBgtMfi(tdkqX7D~s`qH{Z1A_IQqR7e98P&tk9LX=Q?>0~ha`LlR^k+u<}h(C-F
z;R5OZN7_PL3Hd|jtqerEPwgFXi&H2%B}4ZW9-56mUHsr!N1+|;Q~N=yzc59O8?|_x
z_fLd9I)j?f0Lcc@!sDN$pu{eiGO_UhbvEF=hUh>`y^k`wDkmq$F`n@?dyXvae8%$#
z@nC3J*uB89bG(p!ou-cev53=&Ws)e%6~B%Bio+pL`alRpHX_O)^WR}r-TE6ta(Wu0
ztEI4u&23vHda6hl<OKhyt%zADIz$wDOPEcsRV6-IsFrvX31n?Y11X88A^n+=c&0^p
z=IY`JGO2<>_&|017`jDHv17M0|8DJ;(Ek(ed^STbejmlw$6TV?(qFT8K+C@EX>cHL
zOn7V!!?y0#rMHm4{bgd8m~}EKg5*r@y`*xY-Ql&XE9wdl=q^}}qKHkF2sgnl@KzZp
zVm8ULp{u+AwCnb8f*!K(i3A6d<K~E~d`*lrl|)r1_V(>Dz-NyE>4ODi^=qbhHxX%|
zN5m{2sgty=k`S0+_#`}Pbv%FlcL{|WZjJrd*m@on-D=6&IVjuzzc%TT0DhzsW1f$k
zUhgLCM8+}roQVL8V!BYTV@o@~m606y`_;XFUcX*SRWgGP`k%e)E=NO;4*{}ExJRuj
z5ENkwi{dsl_io1<D8?mn-9Azodrew@mdfe&YN<S<%WwwQ=8KAWTyvsyQ5OX?98hX7
zbm$@!6-3}iDtYsUDm-<>P*O%eBBgeGUfU|Lm0(WAyU4bLe&5UcR?Hr5^{*?{94LC2
zfEH2$waJ2dF@R=8s`PcB(dY?C5uwq!{l<ophE1UgTexQ455IJ2FZFVITYF5Ln9#Ig
zQyOsb@v*nY)(-<=m5wjjT2PpI1(z;fl)1U0GA?65!6~~P|H|_yQkkRScs`VuBW~?=
zb5oAMNMMcQQ7h=y?JGsG00Rum5N&!!4lOR=TezH8r7;GB|J%o=iku?|MoBNSOsj@1
zc)aP^iTHQnOC?dwSUhheCo^XNj>o$;=jOj5l94&M=V`ijtZ}2{jLb!#N0~u0A`Yx}
zsb>2Qj#xK*6r23*O_HWvbdH~i5aT2F_Kzb#kbX-wL}@~7ov;)xU%uS5)-v4D3ik5^
z09e6Sh^XdS7%Oz6B+X%CbP=kcwlKxk)^_7Mw-qarWp+Fagxz{pi2=KKMyOA3vTAEC
ziOM^y7TfYm<XEqQP8_n3!UbG?X~pdp_?734*hLrLfMh+rvLcu<ErKfGO6}Sd!<vzq
zRzhjxNRBrq-uFzG_$h9Z;=e}gFO1e_&5Cwa9cj+~e{{VET+jRa|NpXAR*I~UB0>Wv
zTgs@2%1VWXXpn3YWtEXt*((&4k<82zLUu~Fk`-lUhW_{Kot*Re-M)Xf@Auq3=Nw1x
z_v`h1Uf1J#T#xH<35D@DXjNUtMCQ)%&E}LjN4wg$=w|w9jA1|+dDXNe%MuTtr8hTo
zWyncWhMlSU1{2hGxr_HH9n)C;1wN*$!OVh@FIts}%w~LODAqm#vfn=J45Dg!Zs@ag
z?9qb(COf!DSFT-ayE442+1W{$c#x?3OACwqyFPEd=;r7_+0$1O|01Y0&gaWX_KJQY
ze;zkb_Hw~i4x<D;QJ}lBgGkT`;&Q*{W>F4kGy^6#{i>P#pEKYpHsGiZ8Z=pY?9idx
z+-Y22b@%H+1q6*>vSP&@<`6mcht-2mp6)!iC3KeHOOo`kl$&t914H|*!0GrFq3sCE
zR|E{h_K)pXHc00j=7+$jOqirhJm&A+y}Q0~6{Tt@A_`&T=!D%tw#7p3X@{DZF<WgX
zrv?v1&f_Wc7W&IBxJ}pP;95%k0i@y)>XFhhO}>nKEUj=JtyJ*OU%gtt^Wqq8^oz?O
z|A}BJvM}s_A{ebymU{cBP*u~O{aNP9WT@=wRaXz&9@MN&n>J2UMC=OxXIxdVRcP5@
zhF6}2m|nA6h&i$sVNB|9l4c|k7N6Y|r~q3}S*S(<JcM+mT75+dB%QVIT36ASN!}7;
zGHC&zlYGP5U`Ne-%DG6}0}gFi6B#Uf9Ju2u_8u#)LS~Zqf|v7W%6A_}<YT$go9-8>
z(9(cIH1#5wfRI27*&#{`c@k5mgfm+O$JkB|MW;>=6SABAa+dxpy|WRNfbA@uwwc0Y
zNcT_F->`;1r{{DoEeeBDJb{(C*=Nbyo+IQrp!~a1o6itIj3BX&)pxQqaP5Q_E_KhL
zZtLLCAc4Ij+C~|=U^C%3cXoKAOHC=Cq3?%*&^eJ8`sv%=mE8`A!^5IrfJTT}N$1*C
zO&T_ADBp)35@keg$Ck<^3r>xY=yH%XH9e*`8cb*FH9c5?gdfDvhOZ@xuiSU(-7#m-
zhvP>Gzy!w~E=^|QBzM6&WBZkxyoGw;L${+yZ&?(-R~*`12XIU>k53sV!(ZqX#9E|P
zTfSPlE(Sl_L$qbq(5rs><9di<i}!$m6YxW?^!~=@SxeOQuCADB_!s6ZLho~sT2`iW
z4T#-g&zR%`+R5-#dlh8(=tvu@1lIxB8POQ$)6UfNnA#$V8oZ&c{qgx{6Ii)&nQZ$a
zO#u3QsEj01sz@ReO*Hv3Zi%9WZ<tyj|J%gm<IxY8DM+MCTW2`Y{Ql1Z;5-O+-S3~D
z28kUW+5B|Dw0W}F1MeZzR?S?tN*C?v`6c&xI=kZ6aRJ475~g@HfR5umz_JPtL1ki(
zDttiKxoGZW0TaLJ+F|bh?Z3i7jk@Tm)+Qw-$&^<9Y#@@#>xMnV;4e^*4dsZ}Xs-%t
z*j2Eg&-v&JYlBgiqZ6~)phLYNVJf{jIVjrn@k|T!V8?aQoQ&iX4+YNH46MM`$hg4$
zUd7+n3>EU7x@rh?&BJO=W{f!4AVkcffqEFqgR5nOF6PwBU%Vv|jKAn*-ZW5FK~qR-
zXlcdJ;1Jqq!wULiCjj(1kWf;;i6S94iqG073rH1-vf;(|NfXA87rTl)j#c=UEhD!!
zX`-w${P#;&L<TRNMb4z26+%2QO-CKA>*VC5A`r5BiEfOa1`7I7HW*3s7@u~PDpf+h
zaZfOOiY>X6dWAw<heDjxm%(-PNO0(YPZo<{?h-u!Bg>X9O~+|aqd^1jpA+vFP+RnP
z|5HS-@)&>`jVzrG=(cQWKrJR1<%0+)2pEq)&xK#HW5ECL^a4leNx=k*aRxKX+Ip2f
zCLyv#$1Faq@}S4=9vOi&kq;38vrOlp85SLNWW}CFV@866ngMEk%3m)(nffsvz1fR<
z#(24_m(pWqz3Mh>xQA5eeQQUb)H8%ePwF%|xGK6~)kqS7n59s24@-t^$eX>@cJ<cN
zr%zK(F5e-*iFy$t)uvq!b;WG%4CqE#*CaSi{2r#$8ndk|*B(H?8ncCBwFKGLu!34=
zyHkEGfzc95`LPi`7cWf|`Lt|YmS-a3Z=_^Trk%pU1jT(U9X~k=V$~>Um$n>d5GC0t
z?hwhvLA!g|$ZV9jq4<F9p=Tm_3#$0oa%p42`(cQ<lOrc<gj>kcH~pVN*zmhvTuu<V
zh3tG12gC4OxSo!G!{~e}jFHr%>~LS7FcA&=MtX<CO?b$cF8*5>AOv|ZeA=WEcH+x(
z?nu9W7ggsZr77oxlQvek;YH`kw6dk*WuEbt&6~y2O2qqvR{97zNDzCyqL12_FJGjT
z4_2?pJsZ(x?SoWp`Z+LjVY>8;*<a9M_@Q!(E<pIo7)gZ+6-4;Vy|CYvE4Y>0K~R*1
zd3%%ik&`cpM2So8+&!50CRU-4mLUgc!(06OC0p<zneMrR|Bk3~tYj+z<~z}J$?vG@
z+f}}rhEayjmIf}J{7>{!Pc78~6u#;n@A7>RFS74FxiRpZ_9d$wF4cp6N8T9?8}ah;
z2ZSUdT7(ngZ$2*L5bU8wOr6g6&yXcwl;FrHRR)(dyV_hkGUl>+?wn|XvJdg@cvyY?
ztKiwgsHpkId1RhjoLZB{!~XzNW~E4~rJxL6ekCzB28!hPKY`SipHbY`jV)`Yn63Rc
z80h{t0=|Kg70&{dh=oWQ`N|64F)p!CiBfAn`cQS&jHlVzk$k86i-?Wz4PxFrIZsk5
z31G^d_|ft0MxJ=~V-ihCQGc=hq<PlLHBXqyk@=9YD4~mKVgH_U<1;}@=+Uq!jw)uK
zFD?5<>Wcv`lee;-5P1}<tMiT;qmq<ydq%!Q<M=q2lbcwRZzLXSlLJO^BE;f>_jUQh
z@iIz<$1SIiZK?V^e4>t0dr~S-6?XMlkE!ed?hRZJNkxm@FjxAULI0vWD`D`*>AT42
z+riuOI75aEr0(=P%Q(BfD}O{V8_?yB%dg2<bq00YPIUGL9v`2sh8TttyzXW5idM1~
zZW5*=FFOA-oC-wSvz>B`#R-FDA2J4UvMx)gGtdK1aSbGho|TmIwDAtpL_5u}zI5y*
zm+^g}%SJUX&-Y!%<$wbpgiHG6TeE;E;Q2OqMl|ime4s|Vx<1o;=<)mHSJNA^)PmYk
z#-J18f$So6k~%<z5jC`B#8{$PiXSs4(NR%jR1|-Mk_sSmPrnVSvC2r>2MO*WVliMa
z1p%9s0%YJDYra=2%d<gO>KG=Q)rczvXI@ebg|%}|j5P5`ljR1THEWx-An?NZwj)}5
zlBXnuPG!&2^9)ck-RwBJz|f+XRO##c4QMtQDJwnLEH#-j^|xq#A%0|C%Zlqr6{hys
zUMlCgbaDFIhpiyJVf5=4$tHK9qCl9c)Lmf^Sk3OJP;Pj%oZt`s%bN~MvBim*s?21a
zw(yX&@kUDl28ZzC+bP*;l{hJPa=_#H=gH;t>qH_4k>957l{wd@eE;YnYBSa1LkMBx
zmhNQ}LnI?rsL+b~FYxDw5R<OaJCQ4*EDH|^2#|dfvb}1dB<UBGSs}#XmR_$o_kp52
z&y}MiAQBIxRvr$}K7S{<F0JGV3Vnq$#=S(7Llj)Hl}B7-R7dwx5-D7o8W0ov7UlrL
zDZ{SaaV{-p&XBKe!fWHrNI|Fis*x@aSedCk>)Gl7gG|nh7Dpw<KqK5gY4#R9Hn2rc
zR%v7bLEg^ZPk7zKTMCQH8a3NS3}pxQF7JoIYtcjz(aMH#kxN3?JV-^<7DI`SdCXc)
zE;cH@=JryZu6gt{j#ulAvf%5qz2I|ktmAjhy?K|$U2dJOhw7+47^&nHL0Ygn3oTsp
z@iz;X#aJ+zLWc2@EdF*i0H8aAa`@f%y5z~cEJA^$b^^*gG>l-ya?iwf5(KE2*=1s&
zL!9W(1l5hz-}|m;G~!vc`nq|PZm9+6Dqn6ou5?>ZIqQX~U;|d}z@bVbJ9o@^P>*F8
zCBYH8(PDo^#5zM}@zUsUnA}c!YSB}w?|kn0F6sRc!B9Oq%P`goX|TbRw-$`Xgd~-W
z%3%M=@KzSgoruj?3_r7GL=(lf&|z=q5f&m<;#c;(HsoKa?YK6&rO-_LLZ!%r@M4b(
zi0KVAOi7|)AVVLj#px5%fW|^+l(8U{jPdc=)PE((na{UFM8nL-rhWPGYunFXzdj>U
z|D?SnO8i}9Uz?uQ58fnF*6gCKvW7-h_vw0}p~8dPYQ>oHtER{^V00Z$Pgdr@VP2l<
z%*&LzL;RU(!${m!UE&c!+)7+vM#d56OW}7LzeFuVfR=O&owuQG-e{n&=xS-4u3fR^
zOXpLx6F>YmQnKd;n3swMSJh<f(-SYmADG=NpCbi7+_-e^RT1pje)g48dVtIsuK_0i
z%SP5S(l8v=t@LJFBJy{ZW*L^H+A$H^`Lr@K<1udBx&?Vv_e*z{B(qLehTgoHnwxV&
zw$-qR%Yip6_7b8n7;IeTSr~tato#%=eA0Ww`t>s(XaSgc+}4tIX{CvP!ICqv$_|ud
z*Wm~g{Ltl3{z*k!1HKoU41u!v&W-bWD+^m7i%!4nM2Zo~lkm&a(U|}4Sqey$LjnV3
zo`iC~6m=b?B`!=iBEIyc_0rGKKoABMero)+HN#G=I^%a11rTTuIpy`}ejEW^u}hU!
zy-L%?sra~^4o)63Pq$Dm9Sn$0?@DEHh<r<zb-hJi)icxrGWbTVKiJv%;heyUOi+kF
zIRpnxpd|ytmvW+%$q!1w7hs&whX#_9kNG%R>p98_w0u`mQ(M6Ka0)r4eMAXqYrBe?
zS~WV@Wj{MagsUe=N^|3J*6h>N73mP;@4D<F7>N}VjMJ8a?`0IrkkYPFavMVYMizBa
zt&VqYCAGGgii?a!RlJND1X&Eg8V6Zz=)$yu`Oz1`><L>Z|HxHNVLREV=4Z+`v4vC0
zAV{!TYrcuDqPQfYeiKVHsqfLtrOw3?SIkc-)^6Mh5Lr5yzSaJLEzuFs4YVFGAc7V#
z9s7VMQzSh@ZG}(WqZ>d|rn6?9pehIb({n9qc8}tPf1EdvqDe`tbI5w9aE3ITJ$_6i
z(Wxj#%Ke@<jgi+C>wC!6<=0g|gc`E{k6xA07pfvFu#Cv5r4gd|jr)J<lK<&~X6idy
zmULV8->;~c^$0_(50sXLHnarvup=76JUBffMQJbYyCT6aVLCgx2a<A&o6Me}J^J>R
zZ-GDO9BR4NGE;qin+2A({hj&_$lRz?uf1p6on1~C-v1F7t)FoC=-W)MuAat&56Asl
z-LS|$)qX<9Bf7npbRV*+=|RJ!o<9<Izr8u^(zVMU{d;JCwd?Wn*_6lIv}>-OePdP=
z@#krP+4#YO2Mc%i{B?Qz-kh)0k<O^vUsTyu?2h32#GCymb**Hq3E_<Yl__(Wd=uXi
zUFi*Ek=~c-H4jQ}#tVULk^Mf<#cS5BoAIazOs$Ly0R?zz7hm2c#I9`wg$^Mw7}p<;
z=x*-Qn=g|{@sME1-2`7UXqI?)3fT4I36o?TZe&{;oPOIc6h*D~=K21`maR7;AeKS<
zq5l3SRHeUkcAy%Me@C<#Q8hL;w#K4rS!<2Pcz9@fwL{|&l2W0_fRKbO=0Hatz5YP1
z>W}<GH!>hi#Gy?@?!P~-OY)RvE$ej9S1M*zWZ{u&vps2-A}7pv)E}9=2n*yGh4#cZ
zP%Ji!{T_oOrPZrp@-X>`sn)$K^?frb`9en@^St%+=!>$pKXAr<!%*H$(QeR&mpLh(
z8oP6r+eL<N*bocn=Se?WmOE<FVnyA3#rp^~+b|kxE#kR35rWdhXQpc8^mmWvoWmaT
zr&!n1r;g>~${6wBDNsNm+crp1d74j6#-ILrQW-^aC0eutH}gwv0s8piQP~|^9a-yY
zT(?1kI9m0%oE<Td{#wjpvQOi*y7?8<sABuh%D6@mIz`|4z&ayzK@K3X>FGpVGph$c
zZiLdzN4K&1kyKpNQ>HCcQV~6_i@}eVJ&R*$ljyf?drj->`}a4Suk0qCINYt#U#Fq5
zf{Qiwt$1cNmY=Hb4BEKth<uMn9Inr(+^oZ-o~BfL^wC-94%!+lXeW^jC1O2YTsbo0
zN?KYvyoLs3wPDgvd+3c5Idi2r0y$D4V@n|;9&7!;M7A66UfGQCApiH_@%v&>R9L3>
znoRGZgd)~4jB=|lzIu;PJ&G2n$Itf`7=;};zoNPlW4r6GMj|4JPT3ne>(dhxJs95-
zt(*)`NPZ0Zi0a!(Hm+2ymme?uEQq1cMr^qTlP<yGGxS|NF~6j$FNi{e5tROdCK|q%
z8QXprww!~sNY8%~M50kY&h_KK;yKc9<}jxwo(^@T5T)n!_T6=3EU!dXW%_U!E;gto
zhjMV*+<Exi3bH^@lvO%yh^2+22Kf5!XB<@a$O?}jgAgX|+p`j|F;>?1N31RrF_}oq
z5rpNrMMX7r=L2g&&WJBKL^4G~mNjYCitFOVyK+43e}rc@EPm9SbLN2}#HDPe4SUdW
zpKvx+tO7~EF`Qb}!d}p}S7$|eu3gIZ1ht2m@InoMH_Jwjlb#g?>28|-yO~YdBp@rI
z68;%oXy(}^)@ssZVuZ9j3xn(3{#}<`4b^i)-??(NYVEDH$eCjj95yFX5!w!eyp1Ul
ztQvRaO#1DB@B&iI63aW$0J_5#jf8==rA-1kKhb;t;lmw}jCGCZVrkD8#7(FL-{hdf
zp>?K|c0-VmakD>%^x4EfZYHAyvg9-mamP?5Wr&vhAH>6dNd<XGr<}fpkV^b#GUE$l
zG<^E>FVl*!hm8YQq}DmW5V#7PY>tjuWW(&bG~wWXk~{omfUk=}N47Xi?=e;8TM-@n
z&Z)DLJoI}w?6BQYzX@dogWLU{$}rlm<p&q}RMRrIu%Ku2T22waf(|z2-~pXg_7+u@
zd=E8g!C6{bu71h((2Zg!l>1evP~x%7cDY?1ISP=}aGQZN)?_m0=txY{MCt|+>JeeY
zzTdYqUSb4h52#j|=HZeUd#_AH_xkngLjdoTlfa<%raneUx>Cavu<yl2>Z|%d-hpqw
zQKPns{2o$efS?2xq0Bn>3m6}ZURIQ4v?0QG?;cmBWd$WN<8R&N7yO^N9yPk%kVb@A
z97OdwYyvWZ(pxhkv)%9Ps1!d;7)hT;ET=FP+gYRC-Tv}OM1Ik7pX!k)5kqP1h*&}B
zb)>!S4{1gsV#yRoC1UmCk3aMB@*w<&@y21VWh51mV?w7L>wH9y!L+xo7043+x+PR6
zn3gCM($5!Jj&16Sh7%^i4x!yt+?V3`)DOE=&)%h>p`lO5q9hyMXLPLkSD}p&(nrzx
z!~38g!MrUPPRSlCn~FrJM>>?LW2(JJeo0Uv+&sd0E2hNiE{k3Wh_h{NmsawP6ph^+
zT~*M=tyvYc0Wo4u(n7|NqWd^HZd}o*vs;AW-D+Am=@f6OcjjkZw#;+d8!oE$rED-|
z{bg39TN#x6fzPdo5Ml_4Ib|RCsDDxmJk<EULrJI0o;%=sy-NnaU~ox9b+2rG{z$gY
z(-?k|afbV7npQ8r_;l`yuA4X4GV8M%6L*acT%<nS;;e#P)TDH&VgieQ|9J4^_bwv*
zhE*EuL40)e>qI`2Ftr2^86T8p0RY}CwqI!(V(TmoF1m;m^Mh0GV!&T`wo;S#9lRB#
z_4>`58(}k%1|x21X#@<{?jE-zp!Z93_(4I+PQRS{WrLYd+B!g0h$+7E9Yt-z#EjU+
zrY2r+v%9giyEqNdblBmras#PX3T`s!g)S3k{lk!bh>2fUK%3ZxZC|(@<v|PG^0u`_
z5=#I0nbiYgl?mrgiX>^1@&<5V%Ly(LF*={kUVsmpMd@rx=1FT*T3ZA?03hwAUKblZ
z>bh0Bk7m8x!lIoX^)z|a*SV8jIn32m#Rb`2=NkH^ceaTxhAu2BnTM)l%i)smWqXU$
z=83#jBmuG!Ib;Gi_!dnZV1+?=dQKpdm13wt$dT|Yd^7re8J+;0jlEb-L9;1Ks|2{=
zy}|J1_J(hVqk-!FKDit*osq$lt&t6;SWRrxzI}MopfZYwW8c2PxZCQpMw-T^?BqU^
zGnf0nbm>xZX(lzUA-ySy0cIQemcC5Ct}N4xVJnF2U^YqxN9`cQ#X*m%j_n1#KX*EL
z6|;U<uU_RlzkK<!<U6yFij1`R&vy<QeMJin+3!*HL-Jke)K5djuqmQOk_TkN32gNl
z8LED0r;=>5WaUa$=Lpz@t@7%2H(S2-A6YiIU_jCM?nF$4(4vEo&e>oSa)*J`bZ`Kz
z|Bq-cj|68BqKu6k*5keYe!o334>WG;tiGI}HR8ACjZR+L3Zjk@N!QYpQKwu2?^V;n
zcp{3S2@S9Pq!`4i=Ydr9j5-ib2n^E%Qpqphy$251;(Qf<>f=&65xEE25)H)9<n76K
zJ;Kw=C~1diKuEpMZ<0{u@Bf3Olb9$iiq!CvNPb7UeV6G1l(|<hX~1Qf#=%hvW$XNJ
zG-&!Tpaap=g4am1q?hT5wui}}q6-$;{hrL$AZBUh*tYvO^b94qY-L_otYA|e$S#G*
zR%QN2^%9e)*7^}BO{&)0-`3UA+PZbRGYA{UY9pXf1+_1-1=_a_r2;Q19FfaCJ5a^f
zKY5D8dhRHPljS@^fq5NTHC7^bT)ui$+FNqJ2taRm*`^N-sdq^oqSyp=ZgC8IkJV2e
z$If(aAn(zK`sv5*ZNHx#b}y^=9kR~-I*@>2^!M`ZxQiB1CWJ$39v68;%^u&(q<Jfl
zcU9D>Q{7m!lA`aJcr+*n_J-@3n0H)OdNGO<a<IbeDi(km-l+<B1buATwyh0-*&YeS
zVai>u+eY*VL6d#J@FE@pq8AuFMg}U;RooO>@ne$oU*A3>s{}Xznh!oey35{e+6A3L
zhckZNB;fr#wC2v^nfS~ei&$n9={M}!Sk-tmh?<fKl>+Tun%a5@38|zu5mke#{;kuq
z$D-qZojH7TCZTirrp{P<Z-)|>`arnL$fs5|Hq-Q-6OS%zW?-;Kl|E6RWYslV7!tw|
z(-l>2h?yztR2f0A4ue(rELLlP?%nxSl^{3a${`j9ijL&mOsY5WawKbohm}#@Kltwj
zqSzZa@cLgkRC0m7PN?_m7w%HD(w<NBZ9+Q~G15+Ig(r6=QG_?AwiY-FVM)8?dcz)`
z$#mFZ_Xsie%svQuRUOUke~;$R)zX({h+3t0O`r)PZNz}&3QABN`kJGLCrV%;e@E}$
zU9Yqn<YkiNjz<C^BNxifQt?m|XJ_}lB5lM(20^Km`)2o!O?dE+_}4J|K1plh<5S0_
zjm0eXWw(U&5jBdXZ4W(T)U~1#Va-z(cs1tGJ=y5+3$s%WZKqCmXPSTe6C8CKkE8R#
zpkDgBx3>|zP~;ZQxPl9xJ|k`cGUvmk7R*Gfm<nd%^;OcIX$E$bR@RgWuoHbsBhUNU
z|38sOW6-4XN~9JOXKxthXNQfSzRO)oHP{cQsN)QniRmH@mQl&Iw6)v8<(Gh_KC9QP
ziKaZ0jsumZ?4aX-U9{Q3hRq$2EYb`UIq$idNWsMaJWb1bLrRJ{Q$53(Xo?`<QcGQB
z*J1SiBc}Ths@X&JB<9Q&q<><@yLOyd03cej&oy73VX$A8qZjAQMW`nQHAwGLM_4PT
z>2z{M^$m~`ktuJKCxO??m?Msy`)DG?+^rnt!oR>VaW(by8<|&yHw;aHNb^LZL0fr|
zbdvRFBWU?U*Z1i0Sl_i1Q+7)~PFh18f<T0^a&}6trKNSL!fK+F$9Y1dSk#E9h>pFC
zokw21CmRoJHcmZf63heoRWLCy_r(@_aS|4PLWt;gRzk~D@DE|c6MDz&9)KuHi~+{~
zBQ1v#P=wbBG0lJheURwsoEw`V)d-UM=`ZVuv<XGg5GFSsILbv$-0;qc)~1Ya%ZPB&
z@6Yi<Cswg4Lw@`F)xV7fl4Q_=m_|#;eY{kp%*wD7Pc>d|eGKHgbG%?8E&zG;-wRMi
zE&!U@7k_F9CBnDs^!%H|S&u{a8l|!lNVfZlVJeN7j;$Kcy#lt7m;U_OGs`fLNXN;~
zEG4wj944{^9n3@8DW`*BMLGN<@eBzrdh}{ZE_RozNgc5|O%6dq3e!#So@m-biEKoS
z726-X^9QJ@8or%zvGI%;uDx7#2q@@Grxf|RZU27V1L)aQ2wpsXA|uUIlM>=gfg_3Z
ziX&>2kUwA4gNX{SdA-!%=%l4+NUjCVf$XcZ7ky0m<#B^sKU(b1d4^N9uua=8l@j#9
zVv}I|@eD(VzB_94^?MjM?B&I6V$vrKsY^F$dx{gEw1t?G`WRd!3k=ac$Q~}*Cq+(O
z)?RSC#pU}9DTWC9*L1gXO4_{8P&GN&TybDP292|ciKEe?fk-&pG81togqrrt-!`#j
zI=D%-NRw?p?s7yJs6@okSm*}ZkCdZRQE#Yj@&SgCxG1rfQ$QfC|6&WDJE1MbLWN&H
z=aF2+u?fgmBbPR)IN{28F=ul_L=DqDJWpX+wiW*z*QZ}U$p-nBitT<G4<#ns0j9+F
zpmXlg4Ui<m#fub^sDvBU75zsrzq=`eMNaIPH1{+dcKm~jaKxy#g$C3Pc;AbDp{fGV
zX#O`LOS!jU+*aeQZwt5?Tr6^_HKWjysggTmg(MSmC6J!!vbuR?`}}1BNYc%$_Jpk}
z)dJ8Xsk`jL5`&V?qJE(ChG3p-Q*BoOl;0n(qpVM!o4c1(%9&WRb?d0{Q;P8CqbHPs
zICtrUfr?okTgnW<L|oc}y9v3$+jbYxJfR-?1O0^Xljd5sZfW%%`BY|crNK+ENY^5p
z--arUBPxrqJ?JU(wfQO_+iC94L^zLS9o8XwvdEc#dSNr<So8~3Y$Wf->_V^l)Nt47
znhh1^g#+2<&a4mtDLsGM(&@O+#ZXD^He$pG=?ink3h@+?>d5qAJlzyHByl^SL(&52
zu&M~Ru3XBz#H<&SP^zic2ra^3P72AtY*&2z)VvBnJ$&QF*Cij4vuk2vvb4iG^tvUq
z132%SwyLMcw@w<GS_>5gy?=Wo7-q&ufdq}=t9#FF+ex}fjvPO_2cz<*lvAuznd5<8
zX-)jw$5w4*eAEg6Zm}>a5ohQvyI?QZJE%%2^KN#5Sy~x$^Wx7`l}DK|b{6y{69lsZ
zrIRQ2@8AD6;)~Ff3bV_cgR?B$RByr0wg19t7)n^|zi>L}sU{jD!NJIi^Yuw&CTUuv
zgwE0xA=GaECy6$KtZ=V4YUWw5i=g9p`YrVFFO5BvM{9c6s8Nrn70sMAE0%~YaVcr;
zFIfbtk~)Rkyd3GMx?9Q+EmGZ0T0WZf>WvE(iI`YY^0p-ded{xD;0g*J4^?fKAHU;0
zdq)`-Ag7^#cx#(nt5Ku;k~pmA4nmfnBM=)3^!2<-+D2W0DSgc4fan?G$Q=7=oQFpq
z8$vTk&$1*@IFQdX>Gu%oWHTcXTtaZ;ZZqE#f~hd2nB$>q&&)0ZN21~tg3?p%9UL4y
zr?c-~ekI@gFPJFozhD-G84`vIw>8QrdaVr&2Qjncr|$#@EL1-Dz_j~1<WVRh-`O5@
zF?6l0ruJ;M*b$^f&!1nGU@(9-d+OaMzrUDDQ-`aq0@{J0CNxp5-o_OlHRHNV2_k)l
z$6#mVj@~OODc>gi0Z}4i;XS||7X%kUVMx}5QytD*`*w;EYiFM>r6*iJ3we%gm*~>%
zgAP-Mq6Ho~<B<mrGiI~aZQO{g^ykDf7Q==OTm7Ph$tHZ9#lWqE39*icty4Wu4$ATb
zj1hz>khCY2fob3R7WbwD4OOd~xi@GDrya_ro;k}^Ah-<KadGODDYv0=B|ZA)R|rx^
z@?`u+yUg0YEkv^Ys&L595ylyJ%nU@7a)yFT+`7DbTvF3VJ`_2yj53h5t-?qk;?;TV
zv44L8fW*XzN@TYz&2~&nupiqF{)+52>XhGv9CJ)-DLJE15Oq?eH?w=P&QfLuey~e~
z86U$u8YIB1MvI6{_Tx~`Y&7v$DrVLpA9&3weR~JbT2GR!6Qt)#Y!MS!p)1fD$i%EE
z0_5zvd)bL_e&q)Kv8baMgOF8S`$RZ8v!W8C4+s*!l@i<&Vv^<aXwt>TQg;g(&vX|T
zbDywUK?l?R=k!H(qBrbA?^_C0{t#U&4p0VqT~Y5c^=mc)MddzZFa{7UC&~uXf!FYA
z$6zHRlUb@NTI!c4Po9*#(X?ZQg;QmuMHUG`0AE{CjwuciTrWj}jsl@^DxJ6vDEDO2
z*w?q4)4s#R8`&c%-bfxE9`$o+rgy=zMB2*(Dk+JYl6~O~(Y4ZR6mhx`CA&Oe{a?#q
z3C!wlpvxn_+0=iqV}`=y_3r)t5aA|K^Y-_8v>@eI+8SERvfEhIVo#vC;YH8#A+^;v
zn?L-wDnUpI+V}E}Ykn+tM!bp=4B8i&?{g^q^K0Z!(i}uZ_?fr6u`!XsDxJh8tQIxI
zZP!y6fQX|Vb(}uJO!%O&s8RG=wYrRl46+UB+cG1w6^jo21pw?QsM?6TAK*dR5#1Ku
z03bqEF30h@MF>IE=p*t_m;+ivs4Mh@eLa7$jM*$Zc3&YYdVRg?yCINYQ_)ttZ(_<!
zbd%7YJO)Uo`{!XjBTxOsanH?NUAN2P9)`*Fn>V*eH>}r%WkMFB*2fd)a+NYlbT7;j
z<WU5P!Y1v?6@zqVOgZT6!DR9ZKaHs_Pl4ihEKgj%EY$ULBW+q|vYLn|d}`c~)Khdy
zq{a68_tzvJ4z;Z5z6Z`f=E+-3yLPSP+;>l(>XZm2WLY&o;g1T8c3Nu=n0#|cf-uqq
zB_Z}gZDB7t#x~$y5nu8DI{Xv&(eisyI_KOaG>T`MBp%wAv9uAecA0EdYzR4U<J>t5
ze#8kN;%BPEJsc{A8|agS{!f1{xxkVk2!@_9wbF8LL+6r^HjFzfC8?3c-v>j!=`%Nd
z75Q5ApafL#Stpk=Nw#$vk!SOT>Ndk3H9EQ|B*aHX#BfRg!rNFcelQJb6%|=#zLpa7
zAOinoZjKnn(d1i8fpmN2tnB16GQ}pHLQzl_(vEFJJobSx^v_SYC~#E5VbH;e{|4mS
zqDK_-4GNqvU6fWJXy=KiKd*((lSJ;fHa~wndXKxRyPJ;#C}RYWMm$vM2}=JAvYHK=
zj-?1mNozg3{y8+FM`nNZ4Vl2(yUn_--jWnG5pnM6F#F4RDYS}bpIiaO3snOY*)M+x
ziB}ddAXBFDwXs%HJO;sI%AtpKm#BCjX%^0idl*X?bu6AQ4Oty6(lA2!Kh0>!>9_og
zQ0_(iPmLTFCDD;EJq%9ShA*aH<Ig3wOQdge4l~F2ldB58CZ0S=E5Kjc4X9Ks%2=St
z;GEYPE|EX-aQ{JrvS7Jpd21`#Gw3tOw48KsRLt1Pw<r1|Ssf1Ym>CblRMnF{x)jV{
z+_y$=krtD-_HfK(eKVcD3HIiSez9=EOfSjQ3O)T{bLXB(xaO81W^fYs#VN?mnp&f8
zDUT9#uqD+jMbjNr51yge^1Mc?vic8#Ifml=o~rY*YW@0Geyh)4d;j(83|`(kqlLl2
zAIF%>O-%BlxZqC?6Y^W;=~%ZDFPAOavRAKpEU?)Rv=b3njRy!Lw-ci51V~np%?#C>
z^l+BVRp(~n4Io6LYy*MHZVwqa#2IQa9bvp+9cf$Tf}EvI-VyUx1WxPMOWWW0FDlR1
zZ__42emhmX$CxpeVT82~?x||x!tb14S&4Sw0;vG*IfVP6Nw+7i{V(E8^puxkwwnJ`
z)b4QP?NyCAU}qd<st}&*uZ>E5pJotG7QW@`1R{}CA|zOP&Pp7y24_@N{Nk926fvlH
z+ZoGjx;Rg;_TC`NNHJ@*Dm;8A2-$~<xyEx*Q;8}pnxndbfkY*u+21cSULpZrNpMP`
z38IQXv_xhz1)3zcgM!4fNv?~Y8oBs3XJ}c6;lOTuN08mn6_ZQA?dVcc^S0iBB98&V
z5keA(gRJ{0&JF5CK|0B8IJ$DnN8zdo`*g|G2?Z1emKOCbTCLr>bw8i?4b9{`6#CPk
z|29B(+Fp;^z55M)kSr|amM3r-=<<zdU@~U0kl&IwS6{t4K$Hk#;zavd&#LaSK609;
zPVKPBmttyVX|WrWK9P2t3`kITH_uyF^Sn$qq8hr44w%~BHi0o;(JG=llgU&r5vUJ9
zSAwN!8f$C&Y8aKTP~k&iQ#p=$t)ckET2$b@BwYxPZmnu6t+faw{5jKDr5-W?Zc$`S
zbY92Xclo&*h~XpMW0qrmIRB9=>xG0?2Dc+x!fn~3&^tn*?Y@`3mOm>nq~Rd7xNl$P
z9aYAO4-;=`og}==k_^u86%kQJM(iw=&qfa)Hf-$<7FQ9jxagKYp696}QCh+Tg)2Bm
zKMB#`)TigGs0e2nk_=4>pBP+%vnNKhg{dO3mD~D{)VYxvt-vg#LYar7?V(s(jvc#8
zTIw=FM5wIsKoGz9auXJ|SmpV(2mNO9--`f1>ySemap*jQpC!&an>cs2qsB831J~%~
z<y9j1M;AHgN1@nEMCw<qR!yZGCr^`&!*v%eTquT+#I%28R7pSjZ;;FrFJ8)0FM?vK
z15ykTw}TO7_%&@Mf@<_Qv&+b0HBJOYu_}M19I2p2ljd){_j{9m`!-{=0fhSgq>&mT
z!P!CYES9X=BHF?ZjzZS;iB%OpOei~{7#deoYzNBJ2OYy9#Ccs=`y|>233-!El;nL<
zL2LWJYL@%*ET2hSB7`ZtOr2)ds&^^3YQ3(M*u<~2-B>m-r%UsboJ~h%J>H3Fb!UH=
z|I5SBrPI_`tAy%|I-Y+4BE20zl=;iPB909oCg+e}8u5H-#)4)`yGeEL9ukW&xE9z#
zg}_Yh!=-?JkvtWDyVz4~rkP(c>->>2Up8ZbRMzMs=@&PjaD=|{X^)8=2b`1W-lInE
zUxo|h0(1!;d`ZPN?cZyaODGiG7P8RgstAH>G|yB<y(4__?Cio2UP-nR-AIRZyLazK
zALyg1Rhdzi4rD1us9c{RW29J#y*u@3MS6OA=p=gc^0|;9Mi}!=D@n4UcTSqrvqH|Z
zidQfDrJzKWrTu9lQHCPeX<;R^imcU(!^%VcU6B34QmJ7?4f5M<q(W{FyNzeMiOFO+
z0wp}7@(IrJwVZ7g4rKH`HZn4M*UQQOzcAVCf)Nv_qT=J(ZzJj6o1{dVQ~z(4`FwdP
zX_qb;rDBvI?qjqzR&0W0Po8!LHIO)qmsqDzCV=jrAL^j!hd`2uyj;*mm<)+Qjf>xs
z%8(<vd}&c?BsZPtxs#tT=*zJ<IErXetAlE2v9S_ys`MV5gPK9iI6Ue&9p6R*eF*ay
z#{A(agD85@ca9B<0+z^{DX7%l{8&$4BPGT~8ZP2@$%h-t<u`syA6y1|rFqCp_jDsO
z+d4`o!kaGez->L+T1gx8%(ti2i1r&ct0~!SWW@51K-x0wE$gHZ6|JdMl@PO}^qs74
z#juk|0TJsgPr&xrlYPr4SRpTf^g8C@e<&mhru{S~RbM4krBnf^?4`ZawIl%rJ!$V>
zC=G@Ou2-Q4Vfclbu!@E6rfZlE#r*kl%We9mrde8F3k!wU{w!&>Y18EM3loQtwjt+t
zNk(~}Qcak7GoF)(SuxdaWA*yx_5}#WuGkz`Y|DtDnnXROOmwZJ45ZKGj)#+CTlMeN
z2pf)KiSZF{6we%jRk|pv#h=|DJ7DJO1`MpM95einqzeitkEVCK|G}yBd&H;hPu$8L
zCWS18nT&o&4;Ho<;f<BJQvuDd!f-slgGh?<wvE`g^Z8R}^4o=nWe?mi)tLUmk<+Ii
z6_q6YRWEw{e72j7XFS|agz%kNL0RvMvzLFt5wz+wC=i+)6NS9^Q(1i{6^r$Z$Jnvf
zVIq6L%}3==wLY};RYvUPtauSDB<-ilrYupBaKHdIFi7z0RQ6&w#;%EwvcV9qarzU_
zf{F#eGMXqV>lBE<-ma?pE5Z(5=q9vu1c-*{EdIS|(O9^@T5kIIrqH3VLGGtG{r(tJ
zcjWC0CD5Y9`C@s>pC8YDH`4UX*lP^{oEukFR)R+VgmySVbBHU^lE5J&J6tefA0ik2
zLQr@MIf$+m74~q7QQ>6erMl>plFGJ|MKc)qGq;I&C+pIsQ}w?)`qUlNy{v8AAB9DK
zEmo7VlWjC7_5apXQIp+(^6v<41V(q!h`Iw6d86B>P@F-UQ7_i1^YoZx%fRnfM(bKh
zZ45roND2(}_2&(>HZ>iN!geJ*W^*f%q9LU5;e!T^_}f{P(HX|C*W+BLtY2LEb~=sz
zH)`BS)IUfC^aGW42CkJzppp%#zDpEfGBq9=8v3tV!nHCEVtC}2swI?2HwUPAj^{4w
zm@c7}7zbiXL;OCeVkg*lc^FEH`}L&w*Eatz`LPH9?Hw5P(HwhY>pses{$WzUi*pT4
zD3Nro7jp>+T%o*m#q$T!xr10q8C59_d4BsjGqg`+LbPM5Q~}a@CTwxEY->kHN9Vz-
z^%eTt6(A*5@#+_%T4f|5^rIjGDqJ>Q4gI6xvRy43e}y=txw?J_?TLhh1b<|bGTjt1
zK{^-|0iSt1;E^j)3qeF};D-mZLQfoq@aCO^G*^Z;WfHnY(T%|0J6RZQ`6K$kI3|xo
zR?GKNTC5-F(L|M|mCZYlUdXx@iQ81DR$)vJ$Q}k^hrys}hyE}1(Ns;Agh-4+pjfk}
zX;l#@ev1?rNX6ET#1Ku6qlse%jUzp59K+Av*rlYMe)!N?LI7OW5TMYxqs-SV*Y55J
zn~~e<4$%1MDb%!5KInQ;Yip3!kz-S!VO4gRCjg`EMZd+Lb%L0OD#`$tR&CNUcb9yV
zWSdT8xZfXE26}=*Qyu8o%2cCxyn#s5mX)d<oJ8#ab(wbKhGBXD(WfM@!hv4GSIcUT
z>QBplaT;+=)|xF9ahQ>974#&D*~znjZ-UTHW6{@z7RR}xgy+{lmSMbJh$=Vxl%;vl
zQZkf6OGx&Y2vcC&hEpsCr*z~+9E-+SeADG6MV}XiBC1_0A;4+K46@^PYu=dZJ+Gv*
zW4l+bw=fILl&sMm(Wb6cj#TbzS;m0LNJPg!4ZyO&3m29r4A%YA@)!CHF+-_-YH2?h
z0+CzNSiP^`yc_eukZaefS5z+Ki*lJDBq3VD#rS3H?t~fC(A2z8-AL~hEmF~K)0q~P
z+TOiwAbg~X);AS*Kj;&yboZ6r`(kmJ+zZ{UwDklkNPmFK$a2PUhp==DQXh*UUYibG
zOL@T%inl$o$?F~PXAiA0>cxRAc9pfge{Wg6d_&qD;Do893Y}{nhNBXX`Za?V;&RxN
zR!Z-@^>u2r&&9u=6yL_5f6eOE+u`-ZCSz$IRE{DkmKSM_1mtdAeCrAHQ*JM_5=qQ2
zyqwSY2GDgSmu#rAgs(G-RZwULR_Vgi8?<S2^;vHem}CE(I6W(1Nfyb_cd`y|G-cPi
zH}>+xg>gRCk)pD@tX+5d{ezY#pCSLK$-+g8^4N5Ad|zi7MZrSP>9;?og~&mRkDB=P
zn0w*-w+b(Fhi_gvdQy+MXTN{bEcqM$&CmwdE7>-ZsYV8(M6FMG(&x+vb+bHJx!r-k
zKW(!fIB*yYICDM*{E>&;*UT3B3dRRSMTh13?b>#&$;+Z5>YF2l2I;ufe<VtXmMYh%
zp%S%o;PvOgK;y*VHeF3gY%A%l;lxv-qG{E<`8ll_C~)|CC^&Q<@omXu%x}0l=gM>=
z6aeK!f9BnjglmEiN-nsE$AYW`uo8h4CuR%}%*-od)22PrXaMm5U9+$#I$p7%613Fg
z#HM>y_OC&cNEjLjVLccvEy{?QPM^dW2e^82UuPooET>{K>p_E(3e<=QX9_k|Oajrd
z7UApKB(mwoV{>jd{rmiQvOLUsB~SLxV~)-VP>#6~jX(iHYYS^n$BMHO6n>ufjZh+A
zOEZi|`1Zt`c7mQs=|eb~;G(N57FBIZf@6KaR26^s|2BjsYDl3fl8WjrRRP5EXMkGd
zfUc9DEtYW#VG}W`{3o@0w{EhLN{mXynte$XS6LuQA6OnDr=~+!xR49pocYvvR0NFC
zQscG<6m<Xj{+w)sLf9caM!2F3mH9#W4WrUn&>D$!wXduIqz|*5h;1d6_{5VXHMnh;
z&G?#vfkbXG_2;)qge0@$GoZ9ZOOJR?40S+``$Vz`7#%`9zP2J{a#NK~G3UN|@ZP<9
z|EAqQ7**+%muhOKBQ;+ISVH0;VGLQoZJOw5$4dbb<8;7|iJ`;6gm_xU(0Yju3Q|B5
zfx<Ks^d#9S16>c@)3t);5_o*si@>-6fhC7U&x#x<<dE%I2MY3P*E->RepI+MF?mA7
zd&ZZaev(>m8e)ovAB|<=34Phafdi<H#X{)V&(A2MrTZ|^z=DAl0`ad08=FJ^(NE%E
zqQ*J)$oK01ac2Hr%{BF3=5Cqs*pA~Va;FDf+O=LGkDOtqRoBK&NsLU}PhUd<i5wp0
zkT0<#2m?Gybz}I{2pjs?GEXBMEN*o^WDE4E+kha`ADw|8LM?ZPc{FH^tgxEPOjHWd
z@d!BJ9vz+qRx2u%d{bKlIQ3NJ?zlz)rlnc{h-~SjqsDzS)BH4KFv(Ar0*ccuM!rEG
z$MV7D_{dW#h-R10uuRzpMb%ycR@wy0Q-fvOgFIe5Nhui>{b?dK&dzqRO6S_|mcR;h
zv)zcQMm$~w7SSLOYZ>aQ-&e=;icEH`o7u{O3(Rut?ZTc>iPw0*bBQGii(PgWV_41U
z*oTGk&Y^yB1<HfBh|m?Ct`sfMcfONbM+^f{Wr`PnojSQjyrBIM@iGD`jPHl-EtJS_
zFglL(*2H>75+qqEq(H<RqIU1e_PZ8(y}eeIw9rHgo<#{FjeE{9hV=jsj&WBFqYiAZ
z<e`@a!sLQ)G&01E6F-m(t^q>rp=%}eN0?&teiJ)FaZw5RAS5_?&^vg|$#Nn46CX*O
z{m`b$b|D!rCx!G66K`ca=VYQbDE*~)d@-S?YbAWN=x)TV+4`guuiFB#bdib^xY?p5
z5KV84|B_+GG7l^!WT4aK#xj7yUKvWV%+bF;)}y->F-BK{jj7Cwrp80A>_fR7%ad!+
zd2$pS=vX(Ch!#}yw^+5hSG1qx?bMmH;f1#oZ&HS!O^ur=LmKhcGo|$^O^bT%oTa)U
zWHsR%&JEL*=}@lC?I*>*XPkx*Q&qtO)y7>eshlvk0tD||n8N{XZ9P%i2_PUuX5_#6
zdTMkP!JPx0tEl3lzVD1DbB@XUzDRTUQ>6hcA{!<VZsmI_X^A2>LUJpom1^YWfwoxO
z%A_*%NXogBOGiMgOGQCR9P$BPOmIkYb=h?AZ5mXO4Jvqj14>cX2owI{I;k>R*Xr+T
zw?FZ8A0|0AwjZ}bsDRvEjp)%Zk~>-)TL<+Ofm9mu1any!CWb1ExwwXM@4O(i{j=v)
z8ZaOv-GwMETtuVa0kn1p5o9v>R5}R23o4XX9-7hNlWH9s3v}qWwBjLrq?O22)y?x$
z6GuYn^}Q##82G0`pU$Aql7E6v5#9GvLA05d46D(T+VB<q28$ENtDXpREq3bTv(OQU
zY6x!P=^=}jqFEG$I@tdaeup&t!o~fV>vn_3sYG;PD&r3LPR|O$L$-SjVDW^00f9gi
zwN}zq5%yb-VvxGYqr7M}1gSD7No*P$3VI$QHrG|E@_h6cK&BpH6SGiAvb@1q8*w6S
zqYW)L3=v*JM8HBZ<yt)ejSM5J$fC#J1E3kHZFft104i109{4RwFVS7?2BF#v2zgSf
z$#IpX8Gi{C;fD0ocsSB`Eaetk&xaKdF*roFEG+0+BS_uv4@t*P$6ILa8y)3N4}>Zk
zibe<m%&LmP3i{sq1_u2puxwSdL^<X!?l3t3$hgcTGiT$cQw8OHh_sPp`Z%luu5YNh
zE`b6m#5)~}a0LtqeF+<;YXxzuD(_*?xQU(K9FXH}oeZ-eC>n#`vsx&TSA$Gc*)Dv$
zZgf<XC)58jOJsZph%6#fK9u--W{+k3XrSTg!m!^sXf<@7@o3sQ$#8_JI^jddgZ{dJ
zB<#*G6>*oeVbSW&HTYS3gmiYQNeS`}k+>A;+uq`1TcYY+qbq8nKZ83m`W#2{MOI<E
znU}juWEQAMllFJ|2f00-A*(aJJGL5m5B_KeIZ6Qe*nUVsQ!1nLgqIg%O3d-@<aa{D
zk&RLzwa5wP99>_B&b2um&91lg5K0RQOVAuFy$=l3!_9KFBH9bX$>dq6$x#@LJen%9
z%>a+wabhEP%=eMB0Ib4bV&wp-j3;P8XvjLvT^F@y4<J*AMG;nHhV%r-&t~ATx#%*b
zI|4&PF=?0V$6T4_^9fGOv5<hZn1`*a>Wy>|DL%RB>DGXcR=?hiE<WMg4IWu|C*88P
z52X*vnB_=O_mQ}aCD{nw$4M0eaY>c_NrX3vfpopnWjv0KiSXS_WOwL_8>VKeFja7y
z=)xWF2^9<%sm5Apk^SrD83uk|2JP#usv68UJEb+XV*Cus>SPBz8&Mr}`D81fZ-C6j
z$;IZLhRZ@zPm+(A(HuXU-9e1IDd1IZLC9xsQikkTB{?3_!#I?#%<~Lla(tbxP;a!n
zhSP(%d^znXS4Gu&ILD&YzwA$Au2ls!Xp2=CeUx4++o>q+Mal?Sy^G-dil&#yZ{b^g
zr*}|}dHo9^8mD>i*7uTZBs$7(T2kf83hX_Gh;r@RdH@K;7gNQ)_SCYiWg?4OL=Ke9
z@^)Qu`)A;nw_RH=eG}N^o5jW8pjw(`DwKD(sBN!NO{?0`^VVfk`?}faEbhC!h4s(-
zH?<aNIfd@%yFh2{N|(?b^CNp{?2VjXb$(FO^youHMGMaTbbej7n$N6JFa54RMyGjd
z#@KliJ`HeoZUk8r=WObEvuXK?6^A+UPN(KGe3fR5|Jg!J*3}Gj{kx)bbA|dCy5H&e
zWIaY7HXd#2-Qjt0PYTsqwQDzfH+k{m#W7xc2m9u?Rb!X+=!Y%vJJ<2QLb!0HmT+J<
zA8#_4q=ln~!r9Tn+~d@h=Jl&puYLhF?CLFBsw)Q$A3jgL^8xDb@^KR#9q+~9z-)rD
z7V0D%@mIn}KLw9n;FcMAXXwvb_to(V>}+Lc{hBU|mo7C&K(XZo&4)16KY4ZkhygI^
zvE|nwKArrsJj&;Fadn+X_$j9#mVBRouYKp?eR_Y^Q(uz(?%hhlp$<|Bw=Mhkx1(Ja
zLaDW)SJ`QWVB%`q)xdor(b0N6e!OZ<m3m1&;*J?B;dD;*Zz^9`XGhlN`QI{Q+)DnY
zBKy`-AodqiJ}F8@M#cdQ*iqK;ABL?#?ylCTQ!VbRk`|U;j8>`_%IfQDYS&)97e2w2
zS%6NHpH)^O;a@uuuDdolJVg7H?eD?b$+a3cUe1(!muu0nu~k?I+01Wyx1h5b84<;U
zmy~?Z#oh<}KYM>GD%#2B2o`13glSs@@HW007PFc^9fey%0?j!pb-g>hqO<qz)rV|)
z_Rghy_gNpy(^NMHdEY{-q;FvQW7-}5)R_4*K3mHU5TaaJ+b_T3larGh6jiENF@??S
zo}*9Atj)h$?$FFG7ZTwDZJ62(8Z3pEcuH5FX}U#MkA3pazQViAdGVsfrwOY9PVFh~
z$)Edp(wlEJ+L5Aa*S;JY7uOnj$Vx`YvZ-ij)(h$Lfuh-hiNMFt?aGmaTp3z2Z+ec#
zw;x73F?;r0k@pa$H+1OGMqQ>>Zr!>ysj(>;_yaxw&Ghvvqh$ZsF5Zqbp2OiX&raFX
zOl>92k=xF3ad9`Zvg(zsmsU<IE%48yTO9MY-RLu+Fmvb^s$Q8oZQ5!!pk9ny{J00z
z(cMGiY62q`6H1m$gSMN$XOBUn#*LqBs8wdQ<wDqv9N789IYs~YJ~^|TX!H~XM)e<i
z0mCc4;p^hz&`R}Ht=x&vl`BK=HtIRI5d$P=VP&2>LjcyUefsU=<ulH!-ek;WIYQU2
zRp%#L4j4cKu~NDEkIMp2*OOo0NAE!cT#NEaZA?=g|Ne|(yF$}kx_WhCQc_PCIAa1Q
z|BVH1rjMU@L@rPTJDo+5i-woeLMzr(eM!{T{v$@z4-O7icdhoknHnOkr&CH!GAFn`
zBlMi@vgh3r4(tUiPu{!uRRKwFKII8(qslQJss3i~zI_R>hoP{OPrc*cl_jY#<UGt#
zyMFyDFs_}kN|9gH?HnCj-)h&Y)#|}|^7n-;T=mPTp2THFr%e#z+1c7EHa0f1)>)Vs
zu_uDxqbNd4ZSFCr0Torml%c(QXRJrd?1!#hRxLwnGp^Q+(xECl6-{-<H*MbBLdP#B
zvsN&ln#;jWUD)<igXEhq<P|kkKa9V0+)hYlD7qp4J8{~Zx^$}+a5Fm2d<=YK)c5t4
zuD3SFI1d@3A)Zi;y3MS`)NRUSDDDB9JJm*P)fbKCGl%T~8#Zkk?zn6F_Iad^&O<EZ
zTlKDf1aa-dq`)SWCkKukS%lc_RDTHuJ4_Xe`l<sKW5<#Gq<i7md1{#|M0~{zI6etk
zC|SSnnLf2^+l6~RKGm6$vMLXdA@*#3<5{rk0a)=_>ncXJnYrC9+nf)m-EC$Np}6F$
z)=sq5PPvKaL&ef>TKjb04jnqUxVx9b1$DzUjOT}7`IgOOO+U!TTm07bn>OVVPS@P~
zXYr_hC{0ilhH!G^g*)5WRL610;Y>hh>T#WmRU5Ww+qODgIf*8Ove(^L9zWihfhT3k
zlqoqEFI`&PtDWi*OpKu#$%V}L*lvM*;m1a59)E(PHxJPjM@RD_#C*N^49^|pg&bj^
ziCM0O0I@zCQ8>Uw6u4@1Lp{R^>ea3L6eAXbdst<c#MxilF-S9P_i;Xfi<{egE?58Z
zMpNHCZp@jS7&Y5W^%3|>{7DXM4!kL}d%u}8J1LRKGSoaGkA+$4&gUxv85U54TJODm
z^JX&KpPb=Wd3l#(Q+`05oZ}AHCdeYOL&Q=};e1q>GUb#14Pd}BV18QM09Axn{l&kW
zam2im*bygY@ngo0Evtxn_AVy-$N4Q`m3aPbMxM~%2llnItBu?A5|D?4v=*NzR)t`x
z?7TQMRKr}0azk(b&0!4*R$5wGmFHDMMp4IHOE#_@+E^OgUlla1yX+RDsf#bTb({NB
zi<7P>g9i`pJiM>26K#7K>Ymwoc}sv}OQ!Yh+t+W>q<P`tb>IeD?XKGJExE4~e0%hB
zI`h@5{&_j7uj2OSPSzeOCRMoZX((oc2GwNbAYuxNRoy{7e}8KZfS&a3ad(2F8v>~H
zU1#`ic>$bNPpLbqx(l{d^WJ;R0El~8m2VeO$%xIT71GiI9)y=Xp~Z`%c1A`9@eIo<
zkW_yjGEcta%(oz~Tyn0FiHZ7LL@)D%<v2T~j@279D}0<v@TO=Qnk8IMNh!-Hh~Lzy
z5f6&uP`EE%whYBcS=wenlo6S;+qG}sUhzoEBCZ*A?6_#dhWcXjyl1jJznFPQgAOpb
z2i2zzVa8Z~@b>M^7_C&p>z+^BXGh?`fi*;k5HK2-aV^I9wMjxDuvisg@H`LR#}#%l
zC2dnu3nidL+qG-!78Vw|`1ma2JM847RewB?`^g^?3@Zd_x-%-OJOx!Us!SB9Y5_E6
zrfoeG8L3bmA(vwzis>o>YkIo<tJkknGBPw^78mT?*}U-c=Z4OKefw5~9bSkwDi?yo
zcIePWXU_DH>3kw>%B4#&1BC$20YoKE%~O}WgBbvM!@Ph0YM7+;kmtg{OY+T|2hJWb
zT|+73PI6D2UeJJGP7-J^HMF5m6YgdX9S?IeGX+KVs;ygVthOvaa^%Ru{Cjoy-pQyT
z-)N5Cuz7PR_cVu))N8;1HM%w1ckQYuzwPa#hWLpx_G3n??oBsSZnmAfU9GI=CR0i-
zW#K>s`+eZY{cxI8l6g|bNaF;2_5L^=si~Tlk>Zx11=Vqm!htYRZAlNHiTXUrOkdwU
zxyYS)bGUokwr!UrFZ!qi3zeQez4T5j9%TxVdY(~8y|pwBnlO7lnUt+Z#gqcmlwDA;
z9NOOX_>LVqyz5XXH+Kq`$*~1o{%@eW)vHzo@z`9y(-C#?^&RzY7Qwf+re+21m0*k=
zL%1RSS7P_<*?Hgqdrs<dmR3164*d1G-mU=~?A^C9BLVY$EPZn0#*MKG-H~RDm0vCM
zSEJZ@vjE!xB^fCa#J5B4(Q@U=x%&7R9OfWZZrE^>^)JbDOk_l+bq4p>Zqz6gE^J8E
zY6iRa?hWPhxIxD$K~zjGs1X(~U%v0?Q6ig+gLJ&}#Fj9!9%HTpH@r<y3eb42-l9c|
zHFuyHRDgZOAHa^RuV?b3M~ySLypHAuf-29F!@a$|?K9uKd9w`8p=q;b+f(P_9pX1_
z+67p#$f&3xUcYFd-8?g=sP#ohC#UARy5(v8Xtf`!M$ctW{_b77f|H7WH$*<S=g3<$
zv#vCM=`K1SDcXsNr#cFsAP}{ApFYuPJ<>A_edwN~6hL{n@p`{8|27FM*WjY%>`;%>
zD@Prf^=YiM_^Al@9GM9Bd;p8+0HguxUyrxeSzQ@*;=Ge5XYYH`-22Iy?%W7$lR_MQ
zJiX(u6xZbKc-z0xcs{aBN%}iI7oQ8m@+O?Bpm}P&dzX*Qw|Ir#tBiipf%|k89gg8;
zjE{eV`}gl3ICyaW>C@fk;tBKZ+qb6Ox}CaT)4n6A)58Nbb0r2F)Oq;l`6jD2Y*<1Y
zkG<-RQCH5}w5cJZpXE6z6-}suRKIRD?B(Rt(9zLR{mPh0nYXyK=WgGw!<a_pDpk%B
z_mE^i$)1&0P)l8XKCvK~U`GG1R@wTbG$T~25W$s*tKm*f%PGWnEO#w*SjY!(YuF*4
zXG%x@vROsG?v5;55*wbbF5||9P$@SvG^|NA!3e9qTJ`Eo=IH4WoZI1{@xCx`u!ED+
zPFTQi6vXl+D5P{cb<(<X=T6Egd9LyC6L)6LfgCENKCV3GReE$ap5IDPE;@0YX3ffA
zGdOSc>e@;+#zo(FNgtjB7H7_qF;qaqMND}Yk==A!wyXpyq(V?Xlhm^AvIKl_2dp<L
z1A;no{WuRC2xV7!j26Dc6m|=tq8Ulo>DH|gF)0KX$niM9?+R^bT7#CvwEaDM^w1`#
zXm#|~zyWL?#bZ=-wA3zin-w7;J9X;Rv_*?ucfQjwTgXW@Z&UZ&rAw9hu)TnW3`?zA
zvt|)^FB_MhMf?Ek{{26|@g_fhtgW*;=K2xS3JDHsvuDrdwY#7#B_=v9SUJhPU?(&I
zt5r3*L^m@s7M?uWm9EX|b?ZWiA&l6#5I^M+DXQAE>-0AKTc6FQJ$p9cbzfxC`2EM{
z{V_z$hBv)P%Yt{ll43fWcdOOizk`DLa(x_h9T)&Qd~$<iCA(IOeU_!fWy9QDN=r+D
z8uGD;SLI}&;~+1uRs(Eo7A^b$&$XoH!NrSTJ*o>l0sTl~;jArQxw0Q9@&jXOZj87O
zU13*}Btn-jzeTS}08(mds?k6H1oIA`jMFXolXZ3+a~wOi1(@T*v&(hJJpX)~vQD6v
ztnbxboA>5TmVf48|Ji^MdFa3ZReZgZM~=FDxr)3r#wEABs9w2pGL_!@pWml+3;f*(
z`gZ2;pPje%m{WDagaZGp*+pL_IE)z4YxL+Qyth|(6Pl~3sb#-<wV2PJHG7mC<rDso
z_D^jGr~#DbEc^^0#NfY1T3XtX6>~m5tHRf4roIFf?+VC?ArG=xJX5*87BfjS=6yDI
zH8QG&cd&U8pz}aNLhPigJjD;-+A0M!L)^JRNSZ97&j5E|-a~5XQ)2e?^z<ZL*TZ)=
zgetIiF=wmrb!N4iO?s49B;_4rN;F~XkHo;@9yIk9F0++lo!)RS{KZ;-zIOTYS^NX%
zbM4CLSIa{NU<MntXG?MBJL}NKUF*U_+&wwN5CQF#)<uSR9X)+kh-t!|nFmj-TUQU&
zakIm;8>)~mS{MB&DssnNhzXoB3JKi}x-av3b;TlB?JQZUOvK1%79?_t>!EXJ=yvJS
zfHbFJUXikAsdP9x1%CexSZlt2bW)ARjkmNe!WPtxG=V-{QP>gUg1KwQ>m!H$0^T24
z5xZyj%7l6I${=wf^ty5#6ViT9>z=wiVI{aa7`C(OMH!9$Mz+|Kzlj5nDyDKX-5GBm
z_9An~?VEvgCuP!bJG*(?wl$$SJj~<C(xRsfOK)>NSHZyl(eFUwpk27bZilSeAn*&g
zHOQL=9i{4H{3pMjiA=)0>4Gg=8keIavYZb-BZZb#E?v4L$4F7YybBT%j0tV;*`_3H
z@;8!w?S>7PVT91lzpyg#aRH?;U~dthFBvOV3Gshm#+}>T*hPyfpsO+nzcG?TWzcrd
zo6BE@b`JPvB-k5il_Wx(fh&P!DNOHYACH}cC4v;S<j(c<4|+ER<upEkW9WGp2iiuT
z{X@yn3g2lF-yGt7Dd-w8Zu5>EkMx@M_0_F%6y-C2elw=_l!0sPO@mb*s_^<?TiaxW
zISZ$|x#{3{)hhEYtf85%dX7x-7_aIziZ}&b^SX5wu``Ert9hTpaTe4|{Jd^GPffXr
zA7%i9<>o~kX!BKz_L3L^5IC7P71i$~xW(4x&|%Z9l;}?)Q4}&kX0XAK=fJl0-DXW*
z?oP#1jzqQnG(9)=-VD2EN*B$o$p|MWce?1A*So`~EeS7#E(=9u8v7<`#yf4`-Ja9y
zLwG^*Fb_kTv|H$xF_w7(gaL|2v*wh;b?@_jX15)Eu?2Av9y_z5i88HgYJAx99wHyE
zb;)Jy*kKczmb3kG5?bVo<C}5{8o4D+SBQV_eKX40yBE^ZtUYgR?py~fZ$E*{v^XXq
zqg6yR9J6NHk92lcYuk2T-&Uo^=VhkpW%1+dJj|Rr&j8e(^97`>L{#`LXM;(Onp6rO
zu9-e@;od`h*U`{?v$=Zz(gV^v^X{ox?GecZfx1&JT&RE>uvV9;o(o={4)|_@H_LLG
z4K8Sz)}su>qP1b8y+7XPSTyQ9xjcdoGw?A^KcxD3%;;9k`nd~*nFR=)aOdc8sqX}$
zX-<B=TKDeV@!iRWa9Z<2jur$P>|y+jEhoV26Ufczv<5z{n)*39Cg#%D<pN{lq%>Gk
zr}#?G@OUfgL{J~|Buq{3%)C<z7wlLQm+cQ3N!Pv4bL<1Kje2YN#vpTBr%7e`8}<7T
zc)j7CeHyuYdg?nrE2}{F`!UH8d+hsbt4~8ASdzI$hKD%?AH=op-V8zwV5`o3klVh`
zN4CILyze%W6be4NpT2tQ)^@ea+A*xwFlRa?5u*t{TZUU$lp`|59@)kk7{4OU=(MU4
zp4Rc&NpTs^wbq4y?g87c;mZ_qW|xU9k9rT4n|c3T@$YxV3m*VB9S*qkTK-EqYCm8$
zsQORdDG=n;vBmQfQ+rH$TK>G7vEqX2<J8i|C2-0KD~&QtO&BVXsGbfSIutIYXkizk
zR6QYLolpB~^7UoRwYPmw)qf*@>8S1F;{)iq67ZPdV|}GV$BqpHK3=-(%D{9}#j1C&
zAptCaghg%LHVGE!>a}Zcx=#c%0ONs*W0u1Jlv5}ig6IDFkvV&m0ugfDFY_jlWpQsE
zHSKO%R6w&9ynuEv<T(7TIqa7gbFSZ$ho2&yQy<=OsI6_SGY>|z0~vvWI&S?oY!h@w
zE~)QkX67PT=A`!4oXm{!hYufq&)~KT1fCKEVN9u535?PIh&zLEt2S<23g88|oQbvG
zSI5ceL4&LrB#nW;mWPJw-v3;s>z>QB6q?=A#`BByKr5T9q!OxD0QPXhr{l<q_QuAs
z#tU~mqzxZQ2~<9sYP2%Yy(>KS!X6yQl-x=R-QanAxU4S6ND)$j!>N>0FpitgFk&wo
zn|*f@6z@mD$mn_(@%vt(_&)N>;7n@l5(??wU3GNIN<%m!cau5Xj;ouSy-!cC6_bvq
z-tD(><HnQ%tf}(vAE`zHjhpnApW7g7U0_noL&~DaojV8nT+>WSIzi;5EZcc!lHAti
z(F3=u$@E{tsJ)?7zNXf_N-nW(PZ;X?mUN}op15w1QLDDyH1XSzqo4>W9Q+rJ*FK#i
zr8AUA^X}anQ%+hA8nmv{hom#3&IF8j{*kxc!IJ|r6#zKI(HGCmY};U9@^-|YZX1>^
zwO(LRKd#G$*b~17<LImfy?cSiHRWH6g#9S6k^WU=P$YyfE#&>v-P<q5qYyB^TS0XM
zw}|rPY19v9yQ#Hy-v7<>tfX}6+_~4tk*9CH*;BK9`$G+~j?w2E_zMcS)vfE7FUO8{
z9SRg!U3nmBW7UtYwZ-_UHxS1*ZW45JSlqVY;EJddL&-#C^iMv45mBEvgQ85mD3=Ov
zA-N=DT11Em)8`a2&T(18DR5+K>+(5#(w4Va54tw4g%{w|4Dw=T^+G_~hHGsN4t7iZ
z+^nDnf(Tui%P8E`eq0SwS$2ND^Sj7pOOq<4SG4i=@i~{AtT4Q<1Yt<XD>NH%eBf>i
zb-wN8(2N`ATf5DgPg+!hNWV~z4fg4m0L`$PI0@x^p-0)`qBZDsnkn2~&TAi9iw@Xj
zmQbXs-?~G!VRm23TNxMW8{|M@0nC{0ilpd<+43Y}EqKwkRjb&^v)s3AwtMvD^$uPN
zBu-yghwXsj>eZKf$46{jv!(*n>Q73{lo_U1kb~5vg>n{2Ma%q)yts~uJ><vvcXx1f
ze9EThWNJG_A+Mc-XC>QyvYx{@<Z@x@di5&9f9mzxK6+Pl^w3;?J-z5IpN9_}y2kU0
zj_rPDhSiXoEjkJ#gVd!(rUT`_r?<CG`}TDdxwYu7%twz-N}UAswa@zcx=NARJ@@R)
zI$xUW=;;2~v~i<#_Zb(m7Ruu9))Ane8b`jwCf3AWGfHgW%1hl?v*twz5geEn@j|GU
zyUrL)0em`Rj!)NA&C8RUY*@eEg>HgSwTfAry15h&Odi#ut7pVxrW@A?@Ms^>_MbxI
z5!z|10Cl|mij{^)+|68<v8@LyAo?V>ouH%YKGe3e>8(>63dqIN^HTP)EHfSen(4}s
z`3rQ<OtR_SyCY`w-P^QnJBfX2VC!<g{b@fGsbWfgJ+4l}W%<YBgBkJqz^aiY9*mP5
z*_yAroMt(Xv)9x113;=3Z=B?+I2M6LQYetAfz6miJ>LcTRT3u}ppcCl9T&W5#(Ub4
zmBl+TB0VY|^F~T7PIt)QGpD+tl6?Jo4Qd-3NB5ENG^XCguQ~dbnlVNU2Z6dODy!D6
z<+k#(rrjS4R9R#j{PDZ@>@DdBJ^IJKjQ=+B3fF(h7m69Z`!udo=u$>)Egs5;50Bf{
zq2<fr0o9w=UFMS?2<Lefj6&tSHZJ(Y)wcV{w?fDq`GPXA@mjWsx;Gi2ojZB?k(I=V
ziuC2$I3BJ{|4no4#njZN$DYhRhi1MaZK_Z$<Ffq6=aIZ`kBqb}2+(X2PB{pnxp&Gn
zjz`#&33`0%S#}Xe-Dywz6#*2zBivMv_thh56-WZ&X<h_UJRezOd;YZ1-|L-(5e#qe
zM7JezIEa%}R=+6n3(V!@{lTTL20n}&Oc}WLo3W{B$kC(I7rdm2U^ITh1XpUzpHo@o
zUUBGfn>KZuH*Kn*bm+xnNI+1YGo>rpizZJ4pf|0K@{}KGB`m~@WJIpbK0KVx?NLFU
zTr!uEX5Ps1AIvRu@24;9l9AC9N4Y!3X=(POryNWFsrhE#UEjq^%V=iWF&laBpEB8G
z+gG^1)cPk9#*3#^m!}MIU6GSWGvfTpy*{ftVr4xjXU}J?a?6ueiXXf%-;gk8$+HF(
zCeqZ*q3I~)wJel=cydPk{o~I*td~;PTg|@b&SfiBU_%iTBPe3|(3%pUYoBfoC%y!+
zy<&!i(dIqPBJ*v2OorT>ymuj;3k@Veb>UOzIIUPyO7HL=%@Ue|M8&jxIW8>qz&5`l
z>9j3e!Pqn^QjljGp^51{T!lGptVWJ(jQ+9KeN*3u$$`bcTHbm|Y`b*rS|75Q(d1{B
zcD4JkFB%g1R=b)5!iQb>4dv(SXVdZ86I3@#reFAh(99w4S%6m{VnV&!_oJzWjcDem
zS`!?0@udGrZ}C-Dd=p^KlONRvW=;>?<lQ7TCdLKhjQ5{EV*uL#!Ltz(7wgXyN85>R
zG*f)nd>RrkU@P;tk5^i;8mi(GL%D4~!U#n%F_rOvR+Q$=oAcB<c-yH`Z{e5r9Xo3J
z_Vh2bT(QRX`0yvS-Z3cJ$e<ZC#Uk#xpHwKSr}yW~y}HKgPc=kW*oc-PdRrhgI&l5&
zT`fG3`+>JG;amb2bSc4}pR5zMo0%)?%?_s_=$Cv}rOT^ZyEnx2X7r)YTXq1KGM7B2
z3a_0CTVZ{5^@a^2c25}V;bC=n+SIAujR&vG4q5sM;Mf1K!JTd4;mtokj9&l)zogm%
z;SmKq_HPTJWZ!+R+}AhGWWA4f`$2gM#l$Q)GSbD|DPhXgJKosPes@~J#2@Zqe3zKb
zL*h>`cFKt@s~C0YFg|1cpZiFh)DLihGeAgSq2MUk^x;3)qPo-PEZoJ?wnOiWe`Xdp
zFF`Ky*C8X|?WAo9WRO|3l%Y0dlJDPdz+vu1I$<`eERE34!vq)hjxR?bQDer?6U(Wl
zwBQs5tQI03h>44Pii9(4_>zfwaMbV_l?$Njo;`nV)$t3&fqFC9VMnK6?!Menr)D;r
zHV1Aa!ZXRgS9OIR-pbxxDX(51A6<!A<!!xZi<?_q@1tX8g+4#Ml3CFTG^V6=OtIiF
z7j~WnhQnSL@9Nn?DwQ-V&9wL4Wi$5}wkdhdulv)u<H1!*&Ulc;yjHMZQKo4czcp?r
zi22lLhb#k&|J<Nz(A+t>x#a|+j~J1|ZAj^Ju$2EiF|8V0#rh6j>Hv&q&c>>XI*`Ut
z%ZKd{1W4CZxhs)h&i0!9=7Xw3f-Pj4t@4Bf2UYEwI*NrRJ0`fhH_uGuNyYxawZ&@l
zfp~hg+THyv3lk0>?k$<%3k9Gw!8bP1pFMzqX-Rdk9AW;O_nvAns*(b7I<lhnvJ17A
zUHJdVIuo#**6;oAOouX+GGwMm#=?;dB}Fn-N*PmzlZ=^;DME!PG#D~fL}f^nP?V_1
zRLGD_nI%LeLjTVSzw^DW|9hS59EZH`e)oQ!^{jQT`@WZ-eZa81v&$T!SKMs&EIn^9
z{U_Muo;!DrJK}*kUQMdd^;5}$gTL9=_S;<grIteILQbp(DlpMCguC?o==CtsJIsvx
z6Uc9}y_MQi7f)aqfr?(A^C$auqhlKR_luU#yZ`AZ@)uuMT%pAT+%as}ut{^~Yy!@#
zS-@Lw0LOSj7X|yql@W*bND`(V+?4ny&8|4AgH_m73A?~^ifh%&*8wDSa)mPGHvX%7
zhhB)xqtYjUW7DaNFa7-GOCz&KN=kM%cA5bPeP9xtXZES932+`Kwrvh+<|^jj{iS5V
z3E94VyJgQ2TlFVe>yr)LF@UDIP<7AdwcrNee@S=b9v!$hn4ZnHXmRh*$%SVZc{?LB
zQx(a$dajh{Bux<^CY&-<6N)EaV`xt~rXpi4wKA<89rf}?J2=dVbBUY=#O56wY#(d_
z2DRniCASsYDv_T~m4Z`0icr5?XJuFI>({Ow$*pL*aLBfxgoGC>>f7XyiZPh{#*J<B
zb*m|WxQZw1M{<wrIav>Qb|Wn9V#?aiGV7~;wne7Nu;9`QmElK15TNK7cHyqwyR+dQ
zl8Vq48Q}O2oFR&-HlUIOE@00+%{s~%9o}mdsgk*=DKsixSQc(X3t5zy<2QvmMx*@}
z%DKcl4IV#x_RJl(l(2r#An)boTE(9Q%9o(svy=sP52|C<Xj|J<l7w+nA4ANv8Xt6*
zskYI^nCkm+7^=>{v?e$>*u0!GF5aKOJpJ&zDF?`RxsslxTS^=EO6xDG#<^iJAsmd3
zryeqyXZwyF_V?$2?xYryMbE^<@Xl!|{(+b%9qw&Eabo_F$=vjH+JVP=?7ryYKKbr|
zYKpY*XzeLaq)AT+|9rgq<xFtxSX{UIE-$OEJbC}#e`No!`yYwmIv++4?_kO;qniki
zyiVvl-v>*VE~Pr`^E<ByEVpjN*=IUwQY(S+sRKbo_Q7(?%FdPy!+^tbdD+vo^ZMMk
zxs$PGU1VzkBZ=G63xRSVXfo?1EIQhkQ#OnPyc^jI1J}%R=X(%sQg7U-2X19<Wu;1B
z2|IL1jhK1w{(Wayq_oGjobMMnarW#kF!whgsOanQO_N2B_)SZ85xp%J#E*pZps$Mv
zk)4WO7Lvh4EDapBb#Xp`LiWy06*vB`&G>pkdyVyis(`JgJbvty-)9>oS7XX1R*Za+
z-lK;|+@2FnGgaH9;(K@gSW(yoRg|mPPVJd`Hit9&BS_2z_5-{g5&>_zPhxE`e&btv
z4pCxXz+9X0vu4d&@9C2(9bw=~7|E$z*P(YOcn*0<OGdepN_>$TJlF$^eNRgC^K4(z
z$8;JO7|2JLE^EP5lY7Ufs#UTdQtus@1{Y*9_|%jNZtl#-=bBTv9ywk?zOldV?#=Ci
z0cPf%5*}C1p2XIL2D5!V5bJw$Bnyel+SWGu_{e$l3|CbA_&2QZ^=rNHc@g2T!ZoSr
zC2dv|g6bM!g^bwcIJJUi_SM~e8o`87GE2R;Y_Zw>UKl@H4tK~*^i0SGpKzWtrydiV
zz%d3+``l&R$=Xyw8FXG|3?F>RQb$8zR{{9}hgZWK-jSo*k&+eU_~gmI>4J55<s0IU
z3Tf`lgTr^W`!H|Hd(i#uC<f{))=o5%`C!L5S9R>1S5xr`>0W4AkOQsl?7YB0Xwp_8
zrmRXx=!LM=uj{$8DkuYjv(HA2nK`qQ0L7Rd<a5g<r32`cy$zXOwgAItGa$IG;RiBb
z{A)8hV_)pq!sJH7&$DIkt;Ikf=-=KYr6=?11;AIThG+J$oqyM%_vb?+38B64N??hE
zvwew8(6Y{>S*8cQauTD&g(<lQd*_&0FP}euf-!muyr3t~o~@C;^v^$SX)>|>Pc1HL
zBZ&g-NnmNy^e?BDvcyuMNyO5_BEys6J7p^yYH4}11{*+ExCk!gPte&hs~3Ni?`Pd6
z{J<Ub8$iiV|G_)pQ<wi+<Lu|i5o|cg{G}K2JISqoDU=hG)IwMYLJkr5o*#V##@CNF
z-vE~}EC2(h<Mp;Lp53i`_Y0^9!=NaJzAN8DDxHp{&Sv-n+<UT#)GJri05mj4w78p{
zO*g@5cNg=&IhB^9ryx<gfCOvq=a<9DXIg`u(avv7pN-B<*W#9JHhQ(Mv|lZlJ2wT4
zE;Q0{(c61O_-q<Gg40rx3M4=QLHAQU9*D(N=^zuq)9G7IC|qCWH{8Oax;&{<DY;Tx
zLVAVKl22b<ScmW;V^NQRAGXEbuuJ@%G(u6>!GmitB}>W4i64-0DylS_xAFrXl-?Le
zpnRgUb8BEyL08Y;ys5!zRXj;<Z!dK=!UGIG5bl>&R&t1QXP&_fUtE-5o!{ZE)MhlD
ze0AaOC{lVwl@yI2j@;I4oS9WqI8iLRw%iFgh3RbA=Myxj%Ga@|<eE?Y`(a|Blgb!v
zx{R8$AKbfFv+MWQ*Ele|IZIsFM`rnx2TI@rB@wUkxwg?jG{}L}Se%sq)n)NeGd_9#
z+>85IRcX<Gc}<+#%f5c?dCDh5euCWn0{M~>6fyLuV{8?0jdTLgS}B4dA_&Z`JmIfy
zH(sdEiKrcabl&m)EnvjC<37wV`_`~}Ovi+Oq~?u<1q+S8E^dx#T;4HsLK~a)wzYgw
z2X+9(<f_-NUmMTr3W6e^N%T<^8YUNkuZmd1ZJ^!AJpc$-6qwlCk<fp9xp*={%K8|W
zQ4-W5=NcB$qZZs6>A6#l9mA7Pdi^W`X-Xm5v4)NqIE&Nmo8RIFlAitddcCkgqeg2v
zoz8=GiF~;1=ccHr4lBwFd&#oJrmgH~WiD$oPy%{*KO~TDF_um?qyR*Auf32xoF@R@
zT|7g$P=sCPJ(d2X#^?LFxvB-YA9=kHKdWQkMKsiSUwF9f^rkcnb7oIQ%bj|Ly{hP`
zSuBgc;~?f7sh^O^Sj9W-hcdE(+%F|Py@nzTV7`|b<SYt$IIJ01(0nW_njH2X-fA(Z
z^oR#kDyjTcVFysooQG#X7Ul_hh$?6uQ+2@y&dF$R!4oJZ?ij~FWkr+jO$Ap~dK^VK
zyAR|Oge80DPCBrXJIs3}g&?D@p;5)d<A?q$va{XC_TJcQ(zR1ELYHGngwWJa3_4Q1
zQm1xpotNU4bZj=w6paE2#H0~IMs>uWtUp)Th<w=V%o$_m`uV~NF@rckt-HDA7B~fs
zUjD>oT162cX7P7Bhh&(j1aG&83HC*wK1G)r<T8kLSHEQqL7Qmk%$L6XyS7RhQmkW0
zZ=4^2P(lwdrR&TWuy$}5%o^{8B%OJURrQwTK^_@6Zd_vek9`oB*nD#Gv9Ej_JEgM`
zq<&`^ni3?)Wh5n{W+Rvji{=S8Ktn}SPWeyaorWKzCtY)-%>BuUtbVqRd7CZf<3_i?
z-qSoyd*a#<{piM-rgKhmpPaFaO3k_YAa42^cIl-BO`TT$=*h{`nK|h8_FB`XO(SGV
zR}}L@j+=>e2%UlTT<;!jP6v9**uShxUuFjA4uwomPay%ykbq7Lw3f$$1q<XU(=d=W
z6WBY#!WJ)Y?A?_7^P3ucs=y@^UWDW?uAg1NLD;#B<^+=`e0_&&bhCDXE?tH!f6}a3
zonyz-+`$PII!r2{ynI!zfnfk@%s{I{7s9LfX$WfI$uJBc9Q>su(Uy@)45-(uKLLIa
z$!;t|03-y%NOL?sr4}{y+aSB~jU>K+ig}fiE*gMJhCRfCl*jXgaF<SLvYGmVcdLhX
zFa=I?R_LsjCM&+juNd}U)mT{v`qR@AY9k9ttx<4?e`d|9L+Q1#rOCwE4ndU(d4dCS
z17WT|4VSTs!SgLX_>-LELk_@)_9DswDWEBhsJaG>8r6VCh^L=l9hj`1C?D@G>jRy_
zXvrrXIj0d&3p<75r2cwyHfL25+NXp_uJG81&oo9fgr^r1(-o%>cXXvf)+HKeWo3D;
zUtddDgUpsmxq0*WqmnDRj4Q6+purm2_fH53r{=$^i#{rp#Rp;x4DAA(1OgbOADX`~
z3nu_MjiB@a-lVZCVFy-GN{ppZk1t$R@DyQ6K_E}zcwZxh7p+g)8<eUzZ}o&NlASQe
zi5q<Tjk^#oC7eOopr@Tm(o(-kllACug0^lIj-a&Lj~waDK{~p_RpIPf?%A(8Rdx(N
zuAR6wAzd^K73Ikl1A&p`MzP{ACF;~}VpxqN%V=3?CxWkh#N|YtS~Q*|&r(A9|1w6P
zQ9k52fDRzIUd6FGP)B3H;K6^f4n=$*8ODxTQW%1<N5yWU>(9)i_2d{;_0)#B!z{vD
zPd}pW-(GA>>XA;>3;}s3bHy5~YRA~x+ow_Kb(@`Ml(hV-2HkT>b|ptR5t7+6r>IpM
zHx99bbkMbALSY&4Bo^FGF(ok|0(G%BT7_ZO$JSmjgE(Pz)h~V2#}ZYtC+a*>bD=iT
zT7J8DlbZw8K1~8u=ZWA)^5HzZR1T735WW>pmVlVQY|$Cth<X+eXGP(WvUZXte%}0!
zBu{b}6x?pNkF~2TGXMN5KnfG+E?zw2d;+gVfiRm%5ZK%;SSmrIyd};QA3X&*?rb_o
zO)Zz$QzIs`ub8fDf6zb(q)hldQ$c=PVCTSnEm&?c<#KxZf$XlP_~3gYz;dR(G2HC{
z(;x7pH`wXt8rMF<<1#M!Kw7iLf5(9X?KnE0zI{7(#$(^}OSJf+N3&a+2*S)#o$(G3
zXCoQQWIWXj?;q&&CHoL{f%8BA{MDyVqEC=RXDXHD_^F7%q3P5WQWmF&>V2s4Xb?fc
zhkHZc!YlCni{Ui2g+VObFQE+{IB?)omm#STW4r=2jvP4>MrQcRH}c&3M`zXO-q2_z
zFVGN-m-1ukx#lFhQEaNa^&n}O_4)hnhEN|3kLbc<tJk7${oJ;LHkwxUzxN$E;^+P>
zy;dRB9_@w9xGte@{Qpw7@eqLoppk9K50*k$WLd@DbEue;<InYm3BQ4zG|IJJb7M`~
z2|1;Bbom4Z24>Y&QyYl!q+Ll)8Uc0-&4jpru7%i7u3%+a>73<sxU=j56uJRCNVFpK
z{>Oh<Vu@HTaV^VuAGtHgwn^W<eRW#2mq+=3uiFdIA3e#>=OFa}<Q&&#Gv`iS$Foa&
zGTl}1<^MQ^gub&cI7ec<IT%zExZN)tYh7Lq9E!Wu_-DbFnr1?VdwXu{)~y<apAJP_
z$t17R$)#iraLc2D(aj~lFg*0eez<$zb?e?DnW{-g*0v<)&K(G0Te1L9(#Ry;n$5>=
zlek!`kUaPHndTH{J#g!GCF3@a{<dGY;ReY7!X9FA)>vEnj`91PTxpY`r@Oo40nNNq
z5GWL(PKdM2?pLAdQY!-<f})@n667X@(g7kus}DE-OSt&eBR5hr9IK8cB#1VFc4q6M
z9VZZQpe0y)@1A2Pr*q99PRu*J^?&|ai0Z&5mhW*zoXa42#S`riVw4x-)q8q@r8EC^
zBbeZ&<+EYi3SX1jGG^4N6y_7O&)!<1b+GNZ7o!~hZ~^GW<#GVq`hjjsmo2Mq&6=K;
zIswyX`7Pm&X}eIPo?7w47!htkkE2Id-l|%qiq``&_u$n_qi+JWtUt7!fQ{X6%!q||
zo^#b;7N@g*a)w4|JH$GrS%>BV*H0#)RXm+9_zY>@#{3i108eB=vLb2FcE(J|pNzT@
zRrseae*g15*S$W1I?j&*Ka5@{R24nWViH(Y#d_4J3qc)5Nu4BL5n#ldbA4CG$`~_7
zucAxB;p7Xn?B*nVs%RRsr-7`!IVMw=G_rOea|Spxfx|+7%!kDNLRMCNtT4s0WEKB=
zCGld7A;X4UWZRSMt;SJ+o_Kv@V`I8U>97bOyFYl|@-^ElpE^b&J`7b6@%8N4Q&Mf>
zSO5NHPpWRz?iu~4yg#i>8c2$#i0h-5KvwflNipSWp0GzQUB5nPL&MIYM~>8ixX(_!
zK#EJ+a>!x0-%~E-<LA!}+(5@<y$dS=noKBdwK^dt9Xizd-H&1`L2IqWX8?pti|6&j
z#9sZGVmjz?dth8Y<CPv1y6aCKKR&af*uHOT$6Svr9vgZeRocpmCtfrR>oCvP|5&HQ
zxWy)io~0kEE7DV<#<Q@rtVx&5lRn|^6BA*9aA1jm4sXoo2vht5r|R4nnOboDg=iot
z^Ua&WY?CAauEann@{c0UHRSQsuV3Gjs!0frz)&2spouMnqfcrQ4rKqS%P9Rh<@0y@
zM3?nCd*GD?q$pazjdBTkJnulaJ_ut>9UU8c9?x)~^&mr`Hu-i`t6O)gE6MkSXm=Kh
zH(1=qw7tT}#<J9Q&cBp^+-4sFJu^s_y-_=GchI6mf6MNIOY{_-`Whf80L(11hI3Iv
z=fvaBjV(YUxxLWV=;USL9S2x17?aSwZ+$R@ItqVIov%aw@iabxHwSuNCx99Uh3(px
zK@B#f33Csqe+?%JsJ;s5mH=-MKtBR_Os0L>*ucf>cEzSlEKfxQBvHiuSq(%QvckD*
z+e1QJ;Bk=}dC|+*{Zf*=->;>2G5UPQ2V{U3Zr-`GW2`im1dT$c><91TQk0qesM|26
zahU<`^EIANP$ihQMjlALR$C!iil^_#iFKej*UM1q369W1^LC?`Vd<@9B{{PcQyVl2
zeRpyHZ#|!o?8qPdp-YYq5$@(<YFxjbMt%BBMzyJNMHn-1$n5&OB!TN@V6Y|nXD&w8
zEol_gucDf%+pb+30|Rv+jn{XMG$mbeDj5Gl(kfhY>`uRj*d$Qfyxo>MlB;pdOX=@0
z)`=Y;v~3nWMN*1n|Ik_{L;kb9JGu)5kyEo~%osQMF+V(+$4vl<jO&8VjrEN{<^ohL
zp+3vbks6&)N$=T{9iIX<CG#{(#}6#gVeC0-Wt?*D#~zJ|*(A;l^FCzmnHe)Kp)Mu=
z(YP*<X6bVX{4;1%3qK*X8ap9cQx{>R_z}D@w-0^yd^hPOd6Y9+Z0LuEGV*iBUk#G3
z>-)6hfo9Hp&r!er@faaf6h453Q!}i#LU!EtmbK4a&en-RiF2P;5#|m(Agcm_gNZ<`
zJ9VnfJxsw#VSVrUFHJfA!<{BqR746`NS%n}za<Y=_YVBB)?6n<6{D3u`^uw2-Y^NR
zLC-sPw^M+_y|<l?6eN7E*)j{@&fs&G6a_!vCU_bgG`>p@&y(g#D`VraE=$N+Fer0J
zv+|u$2iqCQ2<2ZVp*;KEjfF}I(oBxl;H?|X+psz8d#A%J=Zvmlz365ElKVsNdZC6`
zUXtSwyp@S&H7Uqu1Wy*P&Ce-;zAJA5K#iXzPfSgvhVr)fOFZjMqY$KThqs~K^21i+
z7*U~(0Z!NN!*OG>aCeWjTQz|_d4cm?QHV|bppCtCT==Ys;gp(G$Dtg8A8a4#k%Znm
zv#=v_dR2};;nS0Rno$_Nn4rDoDv^+8fC}uEZDq^qDkJywJr5r61Qp1DH8rFg*R5N(
zNcVHHkjS*riiuRK<B^fWhg-P0=~GPA0L_xr3@|15?aDE~dLPVC8asUgsWqALTqxhd
z^&+d4>Q1tpe*MmleN{>|L(<F*ww^wL?k{JCJE8+oSm0;`7sx(7rsRKc$jEoyi{Zkt
zM5_`Qg3ryL`n%w3+Ux!9Z~j79$xgJSiUWZdj6Gt~P-#S)Pn&jN?l_vFm2G>QQOhRx
zH^9Uz9lmlm5JtMGwMixfd!r@Lv09V;2M(lynUhM}?%q{ujE|Wbwsl>@9wHott#v*x
zuNlyZDbP1?qU=W&pC}4u-CipF4Ajas3qQ?W!zvA@z?S0-@fT;bNIW1H90o3<Hj(0P
zX&>rh$&x&eH*kXT;yCLuFhwVfyNy&%ibR-so1!`)!vpwyN<nG_Cz7jF3+;3a6=fBk
zNlZz#-RU$R73n$FU(ez446;n45p6tRA7et`Ux78o;;)}{QGpc1`62YFl~7V#{E^lB
z3?hjctpOTr>qM#y`Xl53s2@Uy#iLDnL^m(LnsbOJTPq>w_H8*2c3e9W9j(fqdRpx*
zpgud&^2e%~Y4x{~nC-|ZJ>%y4SWS`bFQU0P*g3UH)agA9VC%59gefiw3#hx)zFe=E
zPv(uNE!|xB;a;s8)1NgmHm+>Lbn7Nv*!Pc<H%Wy+jI`u%MH|h`G%rMKmm;PuL6)3K
z+rf)qn4kFJoYbyD!jss@9U?{#qJ2dicuJoNi!Yww<-~XTXA5sEoI=PG%+TxskCe0^
zr*viU>n-MqT~ySy%13h!=wFWj1S`2G#=JGBie8N?4_4_KlX>;3rkRV(%(&ZXTXfR5
z@jh=c#aE&G_OZCw9R8`G%)^)$c3^;=`6ua+L4{NypHIUb!g=acZw`}Ze09mW$k{h?
z9|^MONp$e4?0N3}{CV><_WZ-NX$_)BI;3sqDO0K;DCZV~!9FG84WRZGOoHPzpCf|S
zS=6FOz(gW!MH*hsx-{<&WM&$yGCf!nC$zrtai~X(F&{m;p@`NqHdVA-_N95lw49%H
zzF%1#8o5sEaBQfJ`tYDo^`@&MHD{#w=-SwHy6CE*xmU+!-RfBLwlB8~o?M4OYdT!>
zyp_xOE`P5MwzLVk>Aa-kNk-CD+eHqmRKBk`x}hNC#QbS5A5B<vedg&84g4tWw|#9a
z3!fB3D;GGMNhWGp)D3Xs@;PTj&ne*@gj9Zt?wv^10kUo{IrTsL-z^DoH3w`}#yJJP
zPW&fylFxIraUyEq?A+NyMY`TL5gmse7(Qf(O+l|G=5t)&Aw~Rr`GAnYoKeM-j{g&u
zM{pBu^(YYpry6b$cIer0(O3Hn`o$4en?>Gv>BT&ydiUs<vGJ;q?&zx}W@V5P!@C{;
z$m89sCG7Wf@twd;Jo_hQVztmo=`X|6YS4yR&F^p3-iBZkD*zRRz>0@tD{?yPnS)**
z&v$qUbfKv=<=C-fo(v0dopvR)N;Nc<o+nOpmjPTi&(&*Y)T+~*hi9~szkk|b^*~Gf
zriq(1w6(QUXOT*H(N%+@9P3fONN`Hi0q(U3Z_Gd&65-)=W<f0yNk#RM(blbwUt!!W
zXL=hv*S{2LD9C2ha~}(i`x{{d%f@ufW~K=-zu~kKCcrmh^b}!$qy8YrpG=mT3nCo+
zzNvDU>0CjJ&rLdCg*~+X+k%2SbvO?Oa?(mK7=3bNQbjlh8~*fFe7n4kkBbwjy@-t{
zMCGKEgA;O+tU>_0mHQ^!x2I8p>V9ih3z|4AV~3*|uJ?|$7F&W+Zkzqm7BA-Yrh~3~
zrf@>!@v%)}Hh`wv45;zXKd)SdB=b3gIQ_B@j%+F1I&c-7F>13#&0qQWwlMJ(=gr7g
zGW^R+K-q$LB>10qZQ3M}sfO~gsAvF)R%WM}qin>yD0^OZ(cvC&ur#*LnmN-vrscvT
z+;Z`FI6c3;^t7HXA`XM&gtNs&6UpjPQP^B&at#QCtdd)5c^s*#2$)46^L<!v$1orE
zum0JQ<ltH@4pA`#|2!yj$~C7*e*#@X@a)eA#L0wnA*|o@oBIvR8#U~zXE)mHNSidX
zTE68=N0B0vuQkZ!h+@Q?f98XZkJgg>bm*LUjFuEWaADjZ$E3xvOI1e2H)ztN!}8v@
zaQPp$!?3y(fS5qt4pWbMA!i5454cY3iXTg}jcJJsWY`$GAEE})vf-u(aN2_RwjaBE
z1lT(SkAGqdROk>lZrbz$a3O0{8vC@gfHgLPhHgD$g26*rD~Y+3$>tyS7UUdS%YNQR
z{@9ot1f9sZndwIYXf98%+x(R9+#vm{8FAm7RpUHsmLG7Gq-Ibt1mi!u?SIqxZFtF5
z$0a@;@WlJ^UM*}f+fo(R<)D*E$l*05GFa-<*At;L4NA$%7BEol?Pii)-$ak{lgGvz
z8HmHi*`}G-?IPPqCf<6W&$<B34aei-wR9tyOb~RYYsZcqv$wS;qtH+L_%^4bkYtpP
z7&=DA`RT*)Dgu;NEksgC-Z9W_b66yaem-aH;F&XLo=&rF$f}_GzfQwSlg3INzFaly
z%~Q_N)e8R75^8v-dfGiAFZQQyldpKL{-JRRT|hO*yc||aAl#zU`*b8yx>l`PnYxl5
zCpzXYv7tTGm`KpnBmsA&4kuE_(wP#9UziZ#f>qbbS#jr1L$*~JO&NQu7SUEBZ_T?w
zdC7Y=d9k8)+S@YTZPfHcQiQfRlP}nr-zd4~@To~%M3KU9qPS($&J+||bCdCs4X>h0
z;tC2qB6fvVzZC*r7#TUbj$~}7-R1#Xkb=BkBZG@TgQ7G?4cDM07IW^K6?a3AuaC)Q
zy~qiPC+9vu>}bCJhy8#h6ANxnA#2du$^q;TQnu}W1yEi1N#nG7zCUj$^?fbRc24^d
z*`Pt_it)nWyFNgzVJB0HwcExm?B{?&v7`-zj!8#VI{&0+-nX6Aqn?8hEQaZDrITw<
zD7`4;R12v$zkgU+G3v|*@_NmVm#ppBg0YCT!kX8w-%7vuK;*G?jjXM^MIQcg16(Ve
zrekn=85&0j2@bA|%y`Da>=>U6=J+}m6=ri1hoBZ~|HE7&58y)T!-*tKE4JHtTq?^Z
z_z!Cb2}cXjp%@_0)QoK<jE3w!nGo+B&$&PI@!FD=I0=9r>XE~@e}Sc<2I__zYBuj`
zRsKV?DLKP(!u1%^q)(}BHKA+Z<&RI-wp1-d`DR-VyMDv8t^SY=2#dioP*F}tK`6v5
z%6Ci0f1h7cr+KAT$Z2Ed;E-^27MQcmbf+piLPEm1YgXl)6vG^U(%*sPeQ-h3)8o5b
ztywQ<V$f4c5tNMQvRZVXHf%@QU#eQyt-ET``n6G66JA~2YM<5KfZIpLHR6P@D%bMZ
zBjN}bmZhlfDX%xdzM!=+3HBJJ@9U;%dfp@+>!u|-!BxQa(e8Z)bmB(wNVxXhTtqzS
zylC9;u9?<EZ&)5}yLTVuI&$aVN3LEpQnnkphj-gIZ^erKsGCL|9%-?urA32^xcGPj
z_xy{7UpfX}HupfBScFVj-|yf7gKphYFm=y-cZ;ET@TOD?Nr+ghqZhckb+|EOcgR~{
z>{IhzFm|1$0&Ob4n|BvMKa89(dO&*%UymyV-w&T^NrNT+q{BO>(KbHxoTU$^c;c{H
z6SqX$0!O>~9R0gYR<@A!h*74X?;JE+$Hl6L`+h=_6EybNu`CqMy4v;R*oY{H&Aaht
ze`vyxhBR}ho~`$do8<ihUQ<LlJ9%_20CCT7gUL)9w#qom5?yz6dN0oTVx#{9f1%M2
zr)&aGlthn~X!icU@EP+Y`;@X2k2lx?9e;e^Eo;*4WccMc?_?Xe%=$yKi^5&qTe4uI
zzsBvaYpMleWI<Q|%DY=<7S3?y^lB30lspJ#h1EN1Wk;#P=ACY4dVc0Biw4cLuq2&K
z8JK$V#aD}~tRwS6KvKPjgW3g#5fl(^>n9j#($7~Tn;w07rOf1e!6_!x0Y0en{GEhn
z0WXv+s3Xi_JbrNnux<EB<l0sS9&osTy#l8ET@0SK%Xp!c>s?kA0n+ZwZiL6^i&|T~
zwHdVg)sP~Cb9lL+>m6Fyk|Qi>genceaY1?GgJq_yS(`u(7>=oq&yA2itd%ZJ#djgd
zATXpj)mylet%1E&qr?gBHHt&_8%}IQGC+S%Ggtifqc;1aV3MX3)d-fkyNp%=R5To#
zc{H-<8C>)5{O)p5QDqzRYu}*1pswY=xKZ(k$4(LM%rsj?#p{{XpHa|OwYCIDB*iRr
z3jxl+0j<_78gC-pd$Y(wElx27R<i4VAiu!L-D!@MyUEzGZzuS3S|88H)|_cfi&v3&
zKHE9xoK1m|Ej`8kWAFCtDYlFH-y4q^Glr6M5E?XT)PjmnhWkV3)i&h&2)p}sr&lmf
zy!TazpLcJthui_t!A%+#j4K{%r$x)s6pE7P@#F<_a7X_*aT4*8oz+H^ZiP&(tJ0{^
zHru<eUcPjpA7Yk>eGRB_<juwK%KiACTK1>D6h8P18D_kiLR9+mvLDQ^q~S;-YJ<n7
zT)L3LzJ|MQO5UgI%P9+^H=N^sASE|9T*xYtltc{#K-2Gp|I+r*Q>ajp>G)lgceR-s
z-O!>507gvQNDv6~Lf{J923q3~nn@i)_o-4z1g|2T#6Y(+nR9&)9?Ubh7AFLQd2^Pi
z&LXLTmJk&;i}-V;v;U{i?WeXggd$FFAOk0k^<|!)M!8hW+-$RP?E;@-aOBXT9h?3;
zztS)1EQd?DOMTJ?{q_^Iv5~e98ql0B@8-)>%*>X6H17C;@I(Kt)8cUpkC9;u2xvN>
z`Yo2s=zFomrXm{TLcuc~u=*Vhcp+n9$fH@%b=#F#Jeh>^dhblgu1OOJcvM^KrfGsH
zjsDsg){3UDq2`{w8%8Hr*Ps2JyH`J+Q#j1_Ds3ayv9LxxSBH1tR(`OvJrezF8kwa<
zp=1YGT4i2cZ1d=@anJSRy9nE??f1PSLwXE(QDBg!bDO$0c+;VK2}!;$_0spxF49Z;
z8IeYXzp_^KB;gMLsYNbq)&Y{kp&l3d1@C43N*P5&kRumKw&U#Tmg^pErBgx&^?>bT
zzzbjB5NO8R9D@c-u(%NmcotE5A|ZjB-bc~Y^ajWL?RVitW)$g>$FTJzqKxZKJS@jM
zMuI1p+4!(6iE)}8pQF+I^RgR7644rwKJ2gykREi$whz3FE!7&TqGqHt_AkGv$j{ov
z<D6bicm!}QW<N?=&s%o5wx$&xdk#SGz)Ffj^YwcYZcDG5kLv7eD5k{ITc%a(HTAMN
zDqb_&S?E-yn<~~p7teX0xK9Tw*~ZLbI*V#G{$63wVo$iF)v?;Qeft<2DNo+#^%j*7
zC*5Ek1`t|MyZX@qSM3*Uy?N8uP<jGElODprb2GT|57LFmhmf{|6YO79oYfJc1=l48
zVnCz$lI>K)1n4b8Px01^#p6O)#vMpxK5%!+2Nl^Lz`p^dp`y3_;UadFe)7pjLGkfD
z$*(1OQL5K^OqnE9o~*=w=k##9T@xHne0l`Iq?&h~F=H~VTyZu%wl`$FUov6OLhYeq
zedz5ZFnJ5fBU~(+c=|RmdGz=D)F~zMGzJfLIGx&0Ye-JfF}4_@Q?2wMRs${;k-OfF
z$=1&wd+>lX9VtrcDUJItH<F25pH5PJgr7X_9vcbw#PCbW0nHnqeKTwZ6)ovHJ&)Ql
z0}6x^8rm%zu>SVOfJZ&)Cko?N6uDkrUY5=4sw<)R;_H9u3GpjlZG%S8OOirbIkE!0
z`7#tTt&ua3B}dF2RqPQJ4gn~iJY_K`mgnj(B7GO^#9;BqNdnm6RFdXv9{dvi2s!25
zMab#GODV|4{0~!po!c&InwgK1nt-!*6w$-jyEJ)3wFT#!3FHyx>Xm@OLKku^l1L>5
zNJeN*Lz@w-?Cq%zD{DcWjmUr2?YCx$1@*X<)Fy3A+bdQFNE%FjHioifUP+IBa()d?
z0oqdxcSJe?Wy0UB8rn)dp)dBe>vG>-T{L0h`c+eVQ(be&<6Us5^kim>3fd}tmzCBw
zJ%9Cc#I*%`cJ1oq=CG~F`Hr7Ydz5`_O?@b;y7-e-gG@#$NP{ee7+2^q^o6*tFr;J+
z1_+!#b%6<no^Cj*!xer%n{)qr@vV_~aG7yobegT+5%$N4VSpsjk$P)fG$0tYXi*n(
zL&OhhYcP#RC34qY3*=Xk9t<#U)_1Y>i)Wf80Bss`V%8N6G<Yuym`?7TW?vp?PL-;|
zd&izD$0<i(|0fl>XKv8HThp5gzP(lCupvXD3@w_-Qfenu_PPGeC<VEY+tKZ!7w@q-
zxY=7jt==QA{IA0v=4<=@7;0?tlMBYl$Qvpzy@8!BuMS3Mrlm!U)m#~EDQ<|IF&$qx
z8t$mg$#wyI^CwT9w9;(<skHQH(6a^|_Faz;2YQR#jqAHhGI^JHiW1wbmo@8rXeHau
z9fGzPB!~&e^Lf2JSEu$Nj>*tGMS&c*mhKL10F*n9Z-3HlH=o5>SoZB(wAP%kUQ0|y
z{)t?~B(2LY=8<rwViQR&VUF>ws|Upyx>#XE94Z-Cg;cB!$zFQRWRu4j3s7+P{@hz#
zT`Mpkpw0K|4)D+5RA~&B_Y6KVRPI&(vgG_J)3U`bKc&=O)g+*sdlS}Rs@>NKMI
z-ZrDQZ}(Z01c#DlS+u{44bfz#J((E}p{Fl};cxO>r_waB!WU)a@7i;mqhnBRSEfJS
z9=%I9DcW}V?40%EcE2hG4%psr+21%isAyiD*O$ytb``CyfuxGQ9<AUbx9c0z^BTLe
zI!O+`JmFv%f(~<$9edUwkm6kW1eS^DQGo(B1IV%(d(6&jEVNn(V{pjo;Lg>o|8mDT
zkLlH|+*W5Uo)FY?P3#%X4XK)b5lI=FGS?*@uzj^Fs?5Y;&!+MR(Orgo4!+E-tyFr!
zwiV}r!i*vAcN(PW+&!*WIx8nScx*<O;UrqpuBE$LRqxcD%a<-$e;qu)_e7FjIxT=z
z&^(kBiJG(~=B+z34_qv2(55xfVCUxEE%(uSgus_Vqg;%vR83lB@8lX{7T5KFvK8B(
z7<=r8YQXM4T!3T8QYgT@9|+frR0^C;z9Tx}O_sz$k(UR%aCUnFYqP|U!C;c!9!zre
zlJ)-n>1-jP4htWKiq?I-)01tQ6+3O4^^)Z2_bYm5S=r*5z<pSEE(U>i-R<vR7wFJ{
zDW(ADLmIS_<jH=S+*sN%f}x2NLO8eF=lylD$cWy~AF3{y4L<2?W628qce9m66V9)X
zl(~v2$yOGpmTH~o<I<>+mO!P$E@pF$baP&kEkWm0f^N+F*-rDH@m)3!bfc+<Rknu4
zm!>`4yrll=9eD7JUFXcuGJ)rV$JN&2vAp$NH?GSzsh%6C8q#T>#3}Zghj+T=A!BK4
z$i&Fs+78gRsN&VNz2U?Z0j)e?WUNi@7rKzC$A`5W$ZqUt*kg!&dVyKdE}_&>B?Xnc
zXD<70y>Xem>Eq;;y$205Evo39Rxu>nSm;_;YvMNLPs~rYdFsn4zb@C?yS`o6JqNOH
zT%RoCD(GN{iDuy`WRPMay#CH!+#<JM^MH48FK&9^*jR72^6lGk_X?%HnPFlVp9It|
zepF)hK-xRBd0m^+@MG%7BlB@Oya{J&07bZivCPMqw<&@ijUzg1JjeYB<>EIBiV{Bg
zpJ~%1K@?%r$(i?UXSO823onJjF1{}FHU<BA@W531JUroxq2;uHm*@#8(ggrVY!v;z
zc5W-reaH-I<4W_zN$k%~neZgq018&d;FBj#obbv9zusbJr=K~O!va90iry*L$PSI_
zN&yKzA<m)7Df*{alK6^`g`QrskNIh8xGXwDV$_=VJ^c0#p<0|sx}aXIMh)Lm6Lg{T
zI$sh4CowNI9dLdvz|^)}2ZN}eq%|)ZX6!_zVjs1a^%{BP`usy>x-Qe%Y(9j<no>a~
zov7YK{$E5CR5q@8rwjz$hn+g4`Bp2)<a(X35pmdQwFJ6>eV+jdWG=q1HVQtvDa`*<
z3lsg0v+b|$HED?qJ0|W~)Y6Ha{fyK4X=c<<b1$2fRQMyZL*_Gg$sTY^S5YY5E@DvU
zdu{yDXgCbNs}aR|=A9sdM&}#hy>YS?N08T!(VA1mP>c8VUc{+rLq$7p!v?plB*?NP
zSFKr76<{6EwDm$ub2pokZ}k)^0`o!);MeEEWxRLJBQa}sZd?cU`Xl}R_Klk~c}g4o
zqV%AJUO}@kwWO^go0;5gEtGrLK4~pCMbCdIiSX~2-R`q1erU5&gla2nrEFtPfZu#w
z@Viwo;mmC+Q<LF6V8;r9hCaOo-XTAKmd}BMH~8%_fjD8??%aG~>h$Tg0BP4_A<(8x
zhe(qbbL)&+P~$VvSjJ_$mY0`TRWMHySB>`>vu_=!Lk;7-2=Di91!oK)v%Y9CGK@Oj
zH&-U-oPLjo_b|s6-6omt=<bxj_UW-^IX^qL)~TsB!-ieCeA(uVU$jMQ>;%jU!FHmW
zZF4cRnNH)gwcEBYpY0~(A=spOOYZ3jkp(b5ixfHPf-PlBBac-xQDHjw(29`<egdDU
z56Sv12Tpj=dxhP!JgunLOYFOB8`x!A$A>RxZ77U1ZUM=~*LMOdHF#{2g>Siymi=?1
zG1|CC(2HoOh`sIMaS88_96e815t46Mb0#Z%T6mF0vlqSLS=dmP`#rP14F+Pn$zHHX
z)#~j$R~P76fbOIaJM4d|)vQ^6mI+3s8s#=6p^zLn$J<h!NPh=7H5&~{!zrO|Z}?&T
zFMhth8u40c6%R2wpvhvEDA4Jb{9y5JN`(O^%7W}daL>+n61X{}rWW4+ZFc0Z{E;&_
z5EXG9c7yJ>ettMbkRzz0%z<5za{m0P>c3<(dDCZ9f(7bg#}nL58-&LS4`-<`!wTQL
z>EPUh)ua>g?>W6SqBd#rH$n*Z+G6rXmVrofM3iUdKY{TB0*bp{c`*+e{K$w|6=ZL!
z>uc%l`F=QaSbnt5;f7QwLC0n~JKIciVj+^tTQAtfy^3D23l2&O{-Z2N-4%MaiHm00
zB)SHS7*lR%LAhhm-C-~?{JzUhMc<gk$j!(j%pq8?lZ6*tI$;7IR*!u)7^i-o4n%Mt
zm-Q7E6oT@GK#ReRr@n#t)kP<Wwwq`jGzQzn>OWg3rf)JP#Pu;(d*pK<`0U(A)}wob
z_syus>1!jBa{Y`>+pf?*sKal714buJu(1A;q|Iomc?RJ(Bflkwm2KB=H>0605VJ3B
zMDz7|FM5yIs>5?-$5&A#X@=i<BC=ax8*)7%6_Kz{eSeomtmAbYoS0{Q(~%O>+#-7E
zm-iE<b$ZJLoxhk2+(v9VBX2H(b5lQ_csG8wflY}$H%0`8cmoKxh6@xe8_7@9z_v>}
zq9U}L{!YdEUEU+w=Qe!|3~X-CX2yk8IYgI(J=lT55U29B`r&Zcv9-QYF+3p`4~`1m
zr7c&Gl*AvM-n>~e9;pg;CaOQ){jslSWxbvyZc>jQx5R~M0IYg3HigXYEJ^7xG0<d(
z(empD7}DN3PU25-m$G*!Zg@SevDU}Ox82OPqzZR=;;o)<gPSLU+$}~DM*=eYI0dPA
z>P`O}fj^y~gWay2hQ9`wJ<2)bWNfVZZ+I8SyqI-!70r?F9j5x+1da_Z-5nI9V`q3b
zCnt^32KyK6BpF_p3o~55`Jva7?<cRUa49Guk(@QP`>l+Oj6;2wd2p^-RBunK#Xe-J
zF<8lq{X5ut8*+Wu57HfCX=&YY$3guKAmNne(x8#TS42j%e5VviUzFuZfAL)A1K`b}
zj$&Y7<=r#-1o`HrDTdf<n7JZmd`hjTKMRW$Un%uWY4k~Cig*xow=A26nawH@EW^q?
z6jun5%KL8e?OM6MuX6Z;+wME%ygJay`v%<*wJIuU8IQyFpK7T!pxv^>D}8qiYo|TK
zu1Qr94-LwI@zz+3iRrg<q@-kI9K84W;{n)J7O(D-EOdC-4$>0De6t7V&xeh7>9F51
z?geKujIpSjg}&*tF|mdY9W-OSoyYBHdo|$WduNyRq9y_V2fg`=uRgT7w$14+S6&#c
z_<ri<3iPnyGcs9mjXEyQ!Pn46bo>#G?RxiWgcI2XGV`F)Tj**z?7SBZu~;do5Jvd5
z3Q3N3;ZYW01UJyDNNjW1f@`cENXqg1Wxtg#e*qRASlL`Q5#utuWe?+FiKtK9mB^ZW
zXj+qBw{^c*xwwwvk9kp+Q=BY}n{$fSuZw(nX36JvRJ|&i!_v>dkN<s8#p+cR?(6!1
zx?kSiUqfG}ie80IUkd=<M%QkFyQ)N-SVhV*_D${gb0ld`^FKI@E3hi>kFeQVGliCE
z#Kl+&nGFHK@NW^#jeq$!5m1Ue&(&kBhg#XqW(>&dGUyok^Y^k>HP0z_X^f&W@4gYH
z`=~d6a|G+R+}CFB!Em~lP*~OI1kRK(vV|Q}xu9;?chl%F1>Hnx<%PS%=@!p8eZ^IT
z!4X3J;Ua=hlES56cN$a*+PBNvXun7uC}@Lc2X5V3gCTU{6)8v#xs$zblz!CfyVWag
zpQyiG%PBMC9<l_yb!MWl3qIN5MiH`|F^<=-UbTA@%X5|_LD5`Zzpry@wRrzAk3`S!
zMrnEC>QBDY@`B0zF(RY^)9J(wn$2FXsw*&ku5_Bzn%Gy<BCszVbcl}T1r0PbCb_w(
z5-~YEPhRv8x`3m!DR;!Aal}LgCymvEm;2b`OLJ;u1{~F9LZa}!3x5P1pq~_e@eU^x
zEVwhbe@gxdyy$&0Dr$Ocle<InA&b#iMT|BmqGCP1x?vFR&6ZJmPc6k9B=U8U{lS{`
zb81t;3#pic-$|@N;q!>GgTMw0UCp%)#1%zFf)}q(mzhVJ1(RJ|s({;u5tOF$9D+_i
z<}OE%Z+tilcrvK;GR!J`TC#p3E3tZ);F{egyI>g!n0^|PpbpCq9OHL_Ps(Zd`xWgf
z<k?!aR>905kgZ{xZ#Y-+4y_{k2@&H+o%8K0)jCkM1weOgIE{5eC%K#^ku}RF+*<0>
zQK^&|_W2GaaIS4XDYcqyFboY%J&6K6>)gtU{u7f|)PZJ))o`fUrU?f}G++Ls<gQsC
z;^^ZQ<((m@TfJ-9pg~p0;R$<woaqFH$)?fFP5x2Z6ysPrAdMUA>1^vbYQcedxE}?d
z%*K<RzN(i_LF&a~5;=`>j&f<are1x?9qt%gwg$)X{@C2u%#66^v*H)oZs-vm8?wUe
zPWNPObie!!Y>2H9lX*jQj#=0pg-O9nikN)+CwQkr$j31^{}v7*vmiv5OX+VPbgz>)
zWv197l4Nj9rQn&h7I0!~fq}>pu*sKR5O0{uMtu6})xn!5A(%RzoH=I4`YW7~o=BW4
zP2+iTqoYhOQ&-Jpk|Msnt@ZU)!KF2>J)hXzuDGCz0`Tku%A{JzUXw~^YXA1o0?GjG
zRWx<drI#5oIrvqX0bCf0`$4XAu(l>W=@g#~AWm#|=X)!aWj$WuvQaR|Fz;^jLB8PX
zc4FoQ4auT<k4Ij^+Fg(~CeW+KUC0C%DAkA?sk0bEL%&LY;a=OkO}<&+Xe2vT^pvM7
z|58YIt?~wES3D!+Z5^A(aZ;Vjv`gHcilt-hf;YA_4M|ha6Yw4LUI;6i)KyXS?XsLP
zrx6Xa#?S$RdQ?q`bckuekcOc5CNvAFp9H1@E`$&e6}Npbb&A~6r{7EIZao1h)-4qv
zknqqxjtK3?y4UyH;O4`BT{r;+92JQsq`vj4OY7&@0lVV2-3(H9xovr~eoHmsE}11F
z%Glm}{<K(fi5zU?_hs=5EdVdC)vJ8*L$%irIUs{HVNuxb@DRZjwQf~~w5i*YPtP^V
zp`qE@jeiF_5HY&*j=OPG_!*FDswrYI1f3=F{RGH}e_r|y=xPKh5*i%3N&0eXkCSJ(
zxus&wg=J<64J;Y@5<<(Ma!Wy>(O5C~4BUfkbFjAJ_d<L0dRz@JpeFDie&-|F8w|g<
z<OIxqS7<sQ_=0bz71mU8s&HJias>%NJ*$89(GI#qB;L+sr$RIa-!j1cS#<-R0DsHA
z&ALuq!jT12y$BaiQP8(-TUGdGYuElo8Q(=e-#Uakbo{BgDgf!DJEK21q;-q*$-BuL
zar#F>n-A(`X=!PS!iT%3-msw$Ye3DrDh)~lymoEBlp{J-z2Od{o|`NP5y7h!iZ9M;
z>c>6;H3B%pBZz9&-MZV*LqJYhkB&K4#e~RLnmxa)31zy)itj~(h1vw*m5N}{E;oUl
z&X4I^bt?VzDHWEHh&`rEnIaQR@Ty6z`1!?T*xvrlA(*<Q(MlcD$>t5K$E|Q*U#O<e
zAPHQ#Vr^|v0bx~Z+ERzoUW{D}_Wulg?P6g<85cHhm)uTLEiqW8K`#6pECxi^$6OWz
zD9R*B`=AC3w(HvN+i}g|p*2(k(jHA2y)?@Hc|mv<SQ22XQ_Yw;a0-?)lu%f!(*NYN
zgj_4y%enLCi)Akd%~+a{P-Y_+zVCXY=$X{gbodo<X%PkpBYP{J@bkDL3_!&yZ^B7B
znT$iG<ydbvig{b{uLZ?_Lu?`1#I@|ZSm9Ka+=OCXDt-D#qRA7klOvN$#@+b%K0jZG
z^Dg-p*MlGo1S98olz@j+T3Az<?W&i2V2uYSw$uuv?haqwm2D)hBh+8+s5)9t9|hmy
zcK+WYF#}8TE(PTy_^pK)Ar9s=#06f)n<Z@DzBPdBQOzC`Sp7I})Vw2QO|JO9IQYgJ
ziw5?=*+{90e3FVPDV#rea`g}b65&?*DoE_5rWDGY>3O39vU`A{4iA&_zxFmUaf&FK
zhgCxvvT*N{+)Z+<BARHekmv&62?bYdD<}sPPg3`WT_+y?<PlR>6d&y6zaUZv83*Y<
z6SW7<_0qSZhv(c^XX!Q;y$A_Jz-NFDu&{w360oLZLXLoXl2bPx%&u4q_7b$T4hW*I
zzS|dyjmasVD$>bzK_xPOei1fUf`!0-ZB4B=ahm$U0Ajy0Kg0^SK)9EDo^EX|Z$F@%
zweY2D{b0>aaC6pCm}yi63$?a+2XX5Yi!vef<AotCUTGJRtkC6?Osh4;;FKv|+~Wxm
zU*1R}`unBD>0gBorx}DG0~UI=mnpVXYx%l>Q0w3l!qV3&e^^~k{4GJ`O|jS^hxG$@
zWeM4*5}qQD(;%n^`uCT0w;iv5!FZ%?-DQlCPEWt@#}AD8Zh|}wIv^=spFU)6r}L^V
z5e&Dq_@;Q>bHr6q1Q?`2VU=*7shu~g4O6ybr&;Nu*h>8aQC|Os*)mSWQ;CTk3-6xu
zSl0Uf$ZS4J1RCq6eI9)qtQ&?IA!P945t_Y?jq~C3)KJ7(9F$_T_BK3AaA34Z_<_MC
z;`bAqpWB8Ytx~u>;XB@J_$I9)VcO~Fm|M1CEp-)|X^$=zRHj}SkQ2Ex-_eAXwI|ZU
z=`IVfH6|y@Oxi2H^w4^MyP$!_YLtr#2*QEhqGfalgvR#v-Cgf9|9+?oIjot{Txq-L
z#f}_u;}%8@$DoD6P}+8SCF#)OrAu9Sn!49(_E}!`(d9(eBk+L{BSwVZcVxOV0dd?*
zyh?I)lfGJKkR?pQCi?9+lJI;2mHo10Um9y4$95gUSqT=WtADi{j0nR(KRSAA<hj3S
z3>#0j%IkuHLrK?&2<ETq;kqp1b2uO!<|q0t?f+<yMRFI_-H>}K(~-q#z~_*&e-1j)
zCFxpOm>y;-z#i$6jgJW$x-TNaTV$>TP2F}6Y4p`5U$xpaX2DF4ad2pOpfF52#c!f%
zMPL;DE1)JliADvtLI+gc#XuR;`(GwY*Oxd%sFtC`Zw=6atbxO}VcO-R@5)=J#b4(^
zd*`d^Zn->#w5bW-!m4mSsSG_VQ$1Dd0FHX`Mu8*Oq30t^gvJ(^iPMh}<hQ7DJ0?ap
z0b1Q}`<NzfWllKXHUtd29#@B5@CK`L2Ds^vaF|yv4&gCdJy`4QeIAcZj_HU}K6dWB
zc~V*59--024PG`0#ICUqXq%2HOtpI}bswD1w|9<gK>zD-&pmn#5(NM?C%dG`<4u}2
zZA(KJczf#4ks85k$x(vGexD|!8HUwifkaevc)8-+mk(k<EIxWuOA|?vX>QQjEHNx+
zc7O}wP=dLh`&C-9x_sSsag6T~AOl9SnfE3~Iv1l3J0-V3FKV6JNN;h=VgRhR`ubMW
zAG4^RCRhCEih#^6x0lV5r)9q_f!Uv=bUYuK7>B0ht4qPPYuBW=I~ig9oRZz`E0g@G
zJ9^C#j8f73eiXR-bNG+}Z24)3LE5mPyqEhEEgUSn=<>t%eL8+E9Hd^!D-cJh$3|hL
z!>I<GvC0#FRZ8%+)Vys8zT~RrBsJX_p?L4!y-;c=nq)B_7xyoJ*S$7^Vw4_&JxXvS
z7!Gi7XvB^2#v%ZmAe&bIc>L<#{6vfC^@3yetc{f17wFRp?FSW=02!R(QgT`+%$^{W
zKmZV?ux!jcQHvx%8;@WW6vvdMHJwZ6u=nWEPGa|xw`4R;Q`59F|NQY|rEf>$#)P<<
zH2+Oud~q{{S&8*$E$ZNi>N-|80Y8}$)(Y5+uQnJN3x`)2!b~*E3&_q$s52P6CwWqQ
z=?}EilP?G8is=6M@#ErTY1rW38NT40vf8k65XlT*(Li|433BJfHuhP19f6F=jVwou
z7{pIZF>C_P2CK5FbTxs+<`l2WV|_}85j;`tL=8PCp7!>|1>u|F!83ti4GTrcLvlL7
z7KGo^dRKRti7M>J;J}!nwtJodcGYM%a?eIE+I-sfV&4JHV$1gJHBXQCXNl~i5h25&
zcJH1LJoHrH<tcLZ2|*b+LMUP)puw2AIv$O3IC3*zd<$Cu-uIL#k!=V2i-!XsP6rY~
z8^FLGa~?I5o5LN?S$bH&BC%T_ZqmN{3%i95nxmyDB8pf!Bv6=7eSZP5BGTy$dfn<|
zUX(VH2$BE{f`=e1L>dGR8}6S%yd#@Kn0H9sg>^zq6h$W6ByP^*7Bpxt@G(55BcE1)
zL5hvML}n;Xk^yH{Z=%SQ8I}wPk`V?PG7m;5S0tn`7|oeyhkeMw9{0CLn^1p74Qv%{
zE8CyqwGP~`N~IxejMfP5$+4vMiG%<VoENOMLp|1v23Ql3H)+4%l4M$g7{;Z9vbtmd
zCp}X!R}yLuOhNs$>ByT{ZBj+bwjeSJ<$);r475Z%(GkSVefZ|d6HlJfq3=d)C^5&B
zpXFJK^BNNwggHVC5=)M6rG*;^+77Pj!s7k?3J);1uy7^^5Ea}nA_x&$bZ~Sktde(D
z3Z$dmFhG($BRnywtbFP=_eGhTlwEBuo&SFLJvIW5GaKMTM)!;ATdT(>`|arlc!Kag
ziYTxq4%?hd-Xjtpil~9m&sc3w)~ZWBQVm|G`QX9(pxhZT3gPzCq|ZjT02|KM@^2}B
zUitd4mF$KtoUnxis>mN%ANNl>&#Pq!)Pq`srDj^Eh+J3}82Fx|kmII93RYoPu5SyV
z`-y2Vb$E2pm8zMZzvuy_aTVwI#Vv#}8?9K|p>J!xekh}|!>qOwb=3Z|N0*>j`Lb^-
z#R||P!BN&nwlP4A7!_j}4lOZ2&rEJ@_@}9pCV8^^!s&x7W-<!{$~_5pPsu{fx!n<U
zH{lL?nGDKc!E5V_7cVM$dPYs^*H2k3^Eq@<!)x1h_!H)aG^+XM!~gqllnzsj#PNaX
zCyr>QUAAqwgO{Z`Moj7Sc8zQ*oqKaEMOn!Dllu+^op;WosSGKDR3XIFs0-Yb1SS{x
z@FQ(0-A&I=xyVsv!4ahEzmfYHtsU%Z2v+P&R-K%>IxOT0m;i8q?b-p;BDERI`D;di
zJR$pGPdA;!;zq~c%d?`Q??;c8>grKzo>hWIa^W`DKGQdSmv_1WtQ<8A_&bd~7!~zL
zmnZXLO`3`eak%ykHaxlGv6Cl7AV@fJc60OPi=>`EKj7_0u>Cp=6q(F+mU?(|U|>~6
zRP3T*AXOgPOa>y@Tj*;|6%yChTL2fG8p}gzvos-z_M&1PGqK<LwyhO2`^T~rxpS(;
z{FUH#08t8MLskB;^5^+icmgRj6IxYihDgBg)ypC79Lcz(1_V~&6s-@Jt8No82y*1y
zmsgvJkvQ+0!6_t;E8~+EtCMq}L)&6(-lmOhHu9a{DiPI$`wt#WM(`2d*GfJs+%EH2
zhmcnG+k5<C1s0zq@>G9P(Wq*x=?odFsu%flTv;AX?GAw$p*^YC^q*O--Pw!j>8ZaE
zY0;NpwBx;6P(?U3QXO!+YhxACkol*4(7&VEL^3Y+*@+wdt6G2Ta|kBKK3K0c)zkuw
zjJZm0IQ_Uy>1$4r77S$picZJP5DM32?zgq^39n`AZF0OaI<#5Zl=_EEu1ZRHZF`IC
zDY?LEmW^xi3>tywi@u6KsNBNxubgY-zcT8E<ZKcRKiYC($BqIFQ}zrbpMry|B5=N&
zn^oh^mCGi_nsY;HB@h5F_#a9u05ZN7$W1#*irl>jLIM1tIDWC4D=n9Vib7HfQzQQs
z?rP@ygJ69Kblay-AD93x=+Cv9H<0&NKU<X*=KTU=n<0&BY0=u!*b4NaE7Z2=4OF{o
zS1dBr<q225qC+^zo)#3S0U^SKZizVN(&fu43VVc-Ng7=eAt@1LqQ{VC2J#6uIXwUe
z#2k)bWT3EDCldo?z!R+ykuI)p%MwyBr)#lm*DjGd<fHpbE!?~OQ!N>($!T3{A5HA=
zx8@Py@=*EP;%C8h#Flmoe)Z3b>&i&UV^?*qN1y*sWg!zEY!)O*cI&l;h4brWGls9y
zjjAusHCu{64XxLIV@<_V{>`b!56Kj>hE&%|iDC3wC<~HCNWY9piz*5a+!5SPpkmS#
zk(+}bYYetBFP;tdoDSkC@rHiQ6O3BdjPLT7LP)B?u?DSXEgXW2Tj=52EKHp<LSpth
zOAQn`4Yo>!<g8;C>1rsW?<%>4(Hp{_S!Cz;mz7ib(m4x@O-e1E_eFA0ivqHu-$N;-
zO^3NyfSzn7Zr1R2f#-<;lu|hVq!SvL)FLIaU<RO`Yv3}7@hSvc;VbfH+l*%teRL^y
z1Thl-Cq@i{sxZ6A{`Nr*H*`vCH*8P?0Mzjf;=-f5`z_y^S4l5uQ%m?moDJag4aprO
z0gW2C^cA!?1b|OKIN)Ea01cN_?Tb7i5Sl;Mj4f$fG1gRtGun3TpYLd1e4W#_lKX<$
z0GM0$85G&#6+ty^LC&M|i(wx5QuGvJ8UZ4$1+bx8IvoNAHK#wP!axXh5M4|O<sEz5
zayvSLR8ZNFOr~=%0gqjCv1qJ%7DROJqDAQdA4`rRG7C?L4X~(ew6BZ2iEYFM5L-oP
zwt&Kx2mnG1r>atoR#zA+jF`b&=<7E2T1+u!LntCb5g7YdC#!P9Ed4j)zyTM69K@mq
z0(H#%A70=-Kw1rROZnz39fpO4;jIW>B>fb)d7@W))MR)_k5xF2i4!<yH*6})=0X9n
zU)eE!zyGh6@1uT|@<b|`2sJO?SR`z|hJ@>5GN(28EwEeaak9Su*#c)1xj2;k;_Jwv
zFg9)kw~OFGx3oW_E!x?5RX&Xm2e7Lnj;K)GNU@5SAo6`*nr>paBHlH(TWhH2)I@(O
z{&UDPIW7c<7b8=0B0%jux1Y)_suND@b8pv$^()`Ec=l<`J62#2ORG_GHW>yQQFr~%
zk$CfpxGOHM2laZ}cI|3FFt%OGd0@YBe3!MX8t|E#a1kKz3MH%Zk1F5Nym*8q1I3Am
z(%vT6MBMu(iY%P&*a6@7RI0WjeXVd&r5h%&9{5+(h1PANN7MQYi;1a6mLyA+BvA_F
z$dO>~lPGx@`$r2uj1aA1Ax8Z@FIqMHJq-6Uw=Trq^-w5^^GUGQ6;Hr`NfaKHG*0_#
zl6cTOR2cDWF?a5<N~y!I5R|i=|B8fyAqGBt*=%DV2~k<%ZzdDeelKzMSmY<+l+cGu
z<HD$5Y-XN?7bm$mOaiVA66nzSm38R+N(z|V3>T8?bb9On@i>RD=<3pp7HrBcSj^Zk
zSW!Ij)j>K`Nof9cNjBGtnMNo&AQ|gNEiz`N2OlF0R)QSGsKe@-e<at@J|f%!E};=4
zz7N>ySiQRonOL})p!wni%hU*6LLx_A{YoXZtO9pT7%6%GQpD0)G+w{9k0cnxTg8;k
zJr~KGoPNz<Z2q}0mb(fG$h${pn^IYXFJxb*Kq$v9Vzn3r{StgiB#l1FfhE-iYgXi5
zR#yAq0bwA}31YA(W>wfcI<2<kYeZ^KvZ+zILEGpbbxl3O%A&^x?W^<pef)M50;p%G
zR?;gCGz9yhJ*?yZd$t&VeH#5484$e!Ei7f~q5oZM<Xyq7%znK&f#(gb<z>i1)luq6
zG{p?fu_d`XmEm8E&TfptnL$0B2JGiv*;&y2{`IwkXI|RenoM6sK~W@*!IgbRH~RG#
z7?4v#L1=LvPH(Hg%l~^_-vX|@kkABk5NoUPomz*|N*7uy=Cw@1ZetU~r&PXr4#p_}
z`%al7OEKHFOP8Y;R|x5O_wMG>u>hoSoj%=u-hO`C&bu;{$b|*J17*O0QoJKU+r_X)
zoVF`BqQ-hAEK_(&`K-uSrcmx&iHZLD$edf$=`!pS4s{z`Q{qNY2gzOLFsG$71zP{$
zaJ@f^%<Uv@<YJ~GurWG$fap;!$rP00(1HV425}9qU5CqZnr^x1_=n+d^(Du_$(g>D
zg7<5UR<zUTerXIQhS<uD;B;0|2nqM%XKeeQ9ckYc=#H~lHYHJ*w7TusIj*kj2zf53
z$%F&(i+U$F;cOx~lRQuSv58=3?!D@zNSek})OxgS#e$f?sr&1ZUe1isBMBG`gdrlx
z7Ky(O%y4}M(N_ii68NOj4GoKkAR~SFTr=f#2X#`>wDU;DGK)Sy<@Wt`7tM2|Hz5P)
zoK}3T4$#~12)Kj@w!zf|mWLoR^uOB}8A&u~!(EWQ%4kE7nxXIRLTq9C%U)$mlH*1`
zuiO<HK_hK^@KX5x%B12vnR+Zl{@bKKeOLdv=*r=wd=|ihaLFW$+`m7hc|#7Q0I}kr
zjc-nai$Z8Ff&p?8E1@L*enrReNI|O5$u=S#f-I_aWKMQ?bwOaD{O&_Lc(YoiFS>kH
z{mb~xf#KGH*C!L{>89zHVxiZB0$i0OnP7uU?7@Jxa&D{I_P1yvk|x?0iUNQH6W8j}
zl;a6x#FejTAG!t3PgA`WPlQQ=%K*Zxy=X&NvOsL54!u)+gMZCrU%YJD(Eo`*Ik=j5
z^QfMooa$WqZ25nzOn_(~ps4?rM)m_SIa|GEjVup{LqS2|41?YvBu+ekXbgF>$f{K)
zfW15H4X@3#QIUthaZ(XG=ko~1xUWTb)a8ul+H$0bl9WwBhOiENg~eI7ay4t|GWJ$<
z50$mP_`cJFEF0+lJMgq=Q=N{xJaL)CPWI$tk~#-ZEkR^3rU?aF)Ufz{uZt`0C4T<6
z=lb`GXEE>8#;n_%n`S%It-k%ZMm8=(-c3Jf=^LPJarO31n*(RFvTt7Rwfbc0ZO7Gr
z>uH!-*}Zc+bT+7=uEz29TQ7YozYx<h@MhiPyE@&-SXgrA`sokld0#rdAKJh0;H#v1
zp5Zw9gpz4?G&HmY2(_Iy?Pb-;-1r<=2Vcv|E+L<$R;-~@T~VrA8(B2bnT}QLw+D}Z
z9v}o;1yF?bn0B?XYIlW~X%lx^oC{$v@@HFhazLI*KlA$5RHj<HxP()^MHAMB283X7
zPmOV!xoo>?%U@I!z23~%jDl?Pm|SBAz0EB#H*#~wLMO?FVV8rRaR<df5F-HPl_o^2
zTw**21cwsC-cPOdpa1W_y|i2x)6RtTZhv)%ijp!}KPkL+fM)aNM{sl1qf!n44L3Ra
z`Ffx4^>lfk&|Q>fcR}%T@TJ(dFLWI(&uBuY@x15jblyZA+;bC>ck$KkH{?349bsEM
zFDyL5i81udmxjr#zENI(RbG&ib+%~%HJX;Tb~HQYO+v!sp7<WjNh<PTXGGKD(EfAn
zgRXFM!NWAUZm917jjzL=={p=f3@UvkBV#ZK(bmAgQ3ma~<C{X~ZZNdGN%O9R5-dcu
zJLe7w^>iE&I$xbCpB&T;v=@id%_Mp~nL%Ch{^5hymU?Zl%7MLdlZruS2nwoPnptQ$
z=gc)!Ob6^nril|Bl-zAPLwdU1&cb7(nATj=KM$kZ9dfo}WcuTrJW7J8KR=aVfE|sO
zhBlUO6Bj?NPPx_O&ll3ofncy-2IjFjeEt0?gzr?H%mII!?Bgq(DcXT<a41#ei44?S
zVN+uWKXnH?$AgtihzPtNO}8b5lK#7h$}goLB+7<4f^SLZA-2d|;^+(0Bxtg$wuLSp
z`1E*l;m7D)h64)DnMEd0dqfRov8@n2FiC&oGY>8owZb6~t{UyJ8<*Ec<{2Gk|9fB+
z_6-EQ<?A`QaZ#+`;`F9-BCUe;Yv@#~{6W*{K`0~f^73B0wqu(%X4EM4n!kGgz6&fX
z%E~AJXv1E;y1v}6C{HRnhOorqaB-Jqh1pP$PH!V4SE~89)TcI`WPPM4SdNT_5GSJ!
zM~$Pt)oj-6Fdx`td+oH={rXJ+9=wZAbh2Ahqq(7|in{_&Rt;29TF98Np7;~-pBuOD
z0PR5c9|C|`quuZC<m)th_EB;@O%@&~tjWi8GoZ^KUvI43nw*>r|IF3es`5K2L0WV`
z<gT%2;#d4wx|@g-_6$CA4r8X}F#)#Bn>$xezE#i+3x}Wq0iu~7&fIUY@QJ#1<;qZs
zp#6qMyq|TZ2V|LVu(^t9d~VN$02=|74D;Iblqq^6_@ofdU>rb>!Xp?G056zx;1Fkj
z<6U)?kI6u8qlmj%un#A>F~R08rU6~ZoaH-LzS0U{>DwR@M$)+X!-@o_B}<p;W0Xfo
zwe?ojs%;6*$IJ866NHAOfYuCKJNKh>+|+;mG3l0m{rU*D)n>Hk82N;<z8Y`VR6OTJ
zxU!kPJc{cD%Ff!@nM-R+j4A3g?q-Te(c*Mpc}uf~P?J%i4#PU>TZu_~x*$DKBbB_l
zmGNXjw@ga09x!F7AVYGY8y{hZMf1(&f%LsfP2CfTS7)bldBsa+gUq8Et<$o9-jfBD
zm#df-VcX#^9bNVw{@qsGsIT81=H(S<c8qbwQt{KrkKObnGV#hhM9v{&6MZ*syiIKv
zyl2nPEO*X`V9GWQ5PESP*@s&5ZTZ=x@UEF34^e`7d@stt-84XF%-FGRWL(h@j@&ze
zJS(A|lpRN*!nx9;VnDZHJo789P$Ajy|8<6~4Um!O->x|Q3<p>rwUbI3$WCGkar|)#
z-(bj@r6tiK6YWgD6V;SKeDcRswXvt>hH;ChOfs&#Y9(|r>DO4uK-u__hS80GRnWax
zFU*s+(I=8PEBp9Q>^SuS5eVUJ8z*EO<1fVBw_bAxh<HnuEt6=7k}cTJZ@5Im&|&)L
z*lIx`A<^V`v+_>aaMIrD^C4-}_@S~J7FU7lXp8{>y@5jrxSucUL&_LP3p#=*U^86n
z>*2BSR#bN<F*|f~<Dwj3VfR3)Dd!yes}wKA`W&PXH|oZ*$0n_k=Y;ZjloNA!O6bdv
zqQ9|x&##RaI*jTF--ZI56bxrQJX5mjC+X#Mx~bd?&9qCu*0FWI#9B&3mZ%URi;rR3
zP+0f^3I<gVv3eX2v3siyiZa!#sNvKy&be21r{_)OWYqj}Q|@PrYyg!Q$`!DZ-cE^U
z&z_yQtgz?T^72D2X}&cg1`hd`6PM^P5<$oo=)s>#?4#|5JbLu#EyLZ|{*hdwb^{G1
z^BAGgm7`l06FvJDiyt`QrqPp3#t#d2lJ6Q3zk;9Q`-pr-;t5nAJ<c?zy?%avvcX9#
z7rxKiky;s}IkJO+)AsY(5+cRGgKFS&N@D;KhIWpIFZZdW#Gd6_#}bc^sHwwjd{QOX
zU?g3YYen$d8<sh>U?#&taEA8h9_8fNb0+Co`;e{5nDOc%7jE1bNijBZdB1Y^mX-h1
z`~QA1a>%_+<i;$Pj&i!&VMEA99ZW;Pnun$qROiKDD=S~oMN>7tC4aBi{Mqy8``Jy0
z=ylFpv}h{?vwAhFrc4GEcHl=*5aE-5kgRhew4Qp+$L$+bhaEg#3`clY8+!As<YnEL
zn;o~b#7*u!zC9~LlR@KDYu3ynU0D)a|JP4t2GRPH`cUpVh#BIJ_{v0O8?lKkT#~<`
zD?2fIdtepCI{Io{599NJ%t5-v$)VOG<MGeZgeoRI7#QQCtE1DmsAT20TYFW%XFVbN
z=RowWaxI%=p-R6VmsaFuRQ_|}|NF%#jnaL#5ms*4I_TR)hC*N{ps~Kq+uLH5Vc2(g
zZ+AF5dq7wAKyxgq7kDc+fm5^t*=BH@w70W%{wA}RtWI318QasT^Uy+v)xW-B%7;wk
zWl(!5TjOUxG{$(bms2v^ZlXttJp=7CZ{4~T8uaUDO|y?DBy8gi<eNime*>N7ckD(J
zI7RzkI)u19!E81sWg&eD;4T5(wB<yry0VI*ShahfQiJ#Un8*YCXF;_fbt)y4#aj~+
z5-5J+{<+aYzJ@U@X518yVwryb^*Y6R8p`{DH~N0wh>J-WSyq8}qlud_7|V|RbzPoq
z#5T$e=zr|-DeI4pKOGQr1O0v1v&8|A7C)=Qu{x3p^1uDDmpFrP;KiFaQEW~psI%R)
zT!F{eMje|@n#-C-vE7CE(C<s$eB2C2C9n8l2aK_1r7GW{#tESKzW<M}H-XDBZ{N5t
zJJ~5ak>XaC82eCJqO!G<EuopRhU`j~EE(IajJ-vrMV7`cl`JU+CHs=7k+OwIO7Hh<
zG4s6d|MT9T=P}0QzOVbbe!uf~p2u;V$9ZtbfQ{g&`#B{|7EsPjph2?Qo|oK_`nGl}
zTe0;ze|Q|+jSWUMujR4@SY9*xpA}zT8kPR9&Hv{&2h`EpCi9X|qOVvH!z%VwtAjXu
z54NwgWH6#d4OADjjH533eDl}$VH^s>67@gn+ePS-YMceo3u#q$Gn}6RI7m*7#EwjM
zRNRGX??{BLO-#Jj)z>~koz|=(c&pCJ9-thhj(36kvKTeWBl`@&C7Ggu6b+Pf4L&G7
zvS;)XZdgz7EkE7$Zjruosu+hHIy6EJ&5RpFPf0X*oYfDNXEv65-afl90buYR20qS(
z^*&<jZck^=`qF2ToiYR$pP$df;!jZYw@=THCDs<AtXhZ=h>f;zp&iQBOk_qZe2JJ>
z_I=KPztDqoIXI1Dv$qe<6!HpilZh=KNk!OPBWJTuZOA@QyZ=6={3nzUY=8Di2IoPE
zL1=wYr5MKl9{BQSK9Q{cZ?CmwZ)WmvK1x!?n}Hk*rj5*Q&YX8urXdGUqmCifO6FjL
z-s;mf-2K(f*1fR80RNMK-LAQ!WK}lhS9w_$U~|!nWaaN&-G>nV)d;jYO7hx*Eb-#&
zvL8}>r}Lj4(d+m>I}quxWG7x;3L)Ep7Ov`g)yn=t@+r<_19A^JmKe`DY}nQRqu{%;
zb1rc?2bP8>C>7q{h|R>w;gpdQ@FYN0{*Zib(wE&6qX&dq&(i*R|NbbofGw-Hr(yY*
z@KXOnG?$GfqDBlRY7~O2YT-bLyy9cAPT56S-r%W(P*BNP0-lfp-O{H25&kd-H*tt<
z>aIz_ri~E!N1;2|dy`}}il2!<bYps|gAy<?=;tNbj~~D1*|OX&l7*$&LwfL=iR6QZ
z6_=zOBzwytZ%vaS?L40vg)`H_0>J}j0%JWft6O$WzHQNU*wTth!o0BHgVVq_R(CbD
z^l|nZEt?u4eI^qFTykG!WrcI0C|0dw7ZE8vP|>#N>0%ieRZUTSRVA0bo7W4JB1bqv
zF_|z^1BT?kJO6$La9uPSt-h@*euz{#5}hXUDrcw#8y;M&CuAFm+z5C*c^>@Lx)uZy
zc|Es+%eA6dV#MoO@Gp8;^Z$DS=R7@s8j5jZKraC72x{3Mb4t)79>LxtoZV1uJk^O+
z#jd>a8u4z60ZWcSrZ~?a?CckjHFea_tXwriOfFFYNOl0IGvIbQLY8V(t7dQR^UTsu
z->xDr$6o~ftybg42dQwQU|0|(|64&77UqQfIeFQe7l+9X?obDQT|Z5VK-x))o$?6q
zZ7TiT0x@iXe45Z*xKrnre?6iLC1Va`c^^4oozdmRcYxv3vkQi!TDr{x1s4i|vrHfJ
za#DpBU_~)fh-F!RV?Se&NiA;wGj3RF%DF-D#%7&C2<s2vFeK3~+}f12C`T1K-K^~F
z0^?VnJk^Eh$4dI$Ois3duDWOZBd<Q7gz=YKR1Fo6Sve?;yd0FNEYnf5=~~jLTi1`)
z?^Q`<k4(`_BfUDoVj;oq1U%v>#|cM5=KJ^Epnd>|erCre^Uk?ttR^+q#QKPgcGu)L
zpVJ`-?;f8voSZd`f}HdoyoQ7}0z@)rUYo6NAxcDb-lxy$%Efi4dNu}?Lo9})*%-2c
z5DV;ngiI(BVOu27)z4z`!x`AX6LhVXMmQP@suZJ-O9Rx^YS5spjw(-VOw0k4!>EuY
z^pl2Lj2N-}{|NmjFd*IBs^7=s_Yd1g?ZIwUXYSnhtFP8je7k8nm)#GqJkHB}o(t_d
z4(jQyuHm$;jAf580L}YnkSjVse4mXoB}g4A^ruzkp8D(LNk1Nt6cvIF2_T_a_Le5*
z7*It~q|^C55hC(rSI5|*+w*wtek2Mw@kRm98KsP8bG1N;C*3&IBwD&mzA5?{*(}5k
zG?_7xG>Y{+DH$zok)R5FDx!#2y0qeQt0SoT4BEBJSmlYXmXs&w<o(Gz)~s1m37(+z
z<!jGWEqit-Ptu)0qIG53G3xjf%B{1;4V4e7ipArlW=!00@eG7noU!cS0irl)z)>Po
zOX}v*mnvO^|3p|9@)a_j>?*}TIF@oe*J&O^vD188#WQl)@h`MTF`Intn(D;a{rmQb
zCNr0MW<lWJwG`Eke?R>dS~McpHfd$B*-Mqlk&;CPRB;3btYG)0P_YTjMOuUy%)l{@
z>VVi6#8H#~_VHMd*Plm?<vr)$ev80glxdw40&nJWJfJq3NK)_Jznc<p<KLSd3gI@2
zIfOjk&{N!37c5io5ItD2YrLZ0m!Bdmq99_^z}xR>90P@hcrv5p>E5<o+5PeQ*nvub
zTe2j3@y=>@gUuV$kYPcV-tF_*N=itZpOGaRa3}ZgKIyje=SP<UNm3btSajt}tpWKh
zeK||{AZLD;N#|@qt2_frNQTth8nXFN<Kow2Dlj9(@_^(YBv0L<ztPSvfStZd6?W|&
zaEe96YBxz*jFyY?&7gPhdujK$Epj+<D}OF4%X5ro)$h<@82q9P!*%BvUQvuISq>W;
ztSXHPS|fyHSYC!VXk6$8L8;?-2w4~T?RHW++sv7j6pLy<QyX{WASo!a4nv&ekO--D
zNvQw!H+Ezj^sYF1EYG3a97e}=>dGIVv?noQbJY)0IeolNiHkcTBO`yV=eFV`OJN&5
z?p_4#iC7W`lD}(SU#569+FTKBRFg0!CrDR+%_sLf32f$yVA(cA@^@Z8OlU%9fWlFN
zkt+pK;oGQboagu8homB>k14gHY*0P~kjXo61RC7yg-B8k&K&KMg7u)(n-<PiOg0X-
zDt}jBBgn`W#-1@O+)=a9lBb~M{!C)ma#sb;?11F&B!vRB@JMzvfZdJzb7#+vBJGp!
zDoxd#qJzLHOJC2wN=2elUE6jNYR6gr7Mc3G!_0AO^>ZmLi(+~L;DE*G(L3Fb>gOYa
zkAfLl@JXODUdb0o^?T8t3GL$<AYZvfkLeq*BoE_hHQQFcvz0cJz#~r3<$38IUe*z^
z(#b2D@2sRuY4-2KJn#Nm4KSnCvgOPf%io=Sg;p*Uy=B=qR82y2Ci|lkj()UCtqAHT
zg&63q(Y$s1u`4>AF)J?UN<1S|b6@q3jMiF+_Jk4~LKEPiTJkYQ{E3%iwLNQ!&#W&#
zv$gbp{N`xq_lc-7M$OjJx4n%{D^kq;_<#NV@`G{OUyo~l)pBPDro28(ht|r<$%$aw
zG`YSE=>bPMh2ail^R~yQn%d!HO<9F{!zW_w(IcFwasp7Q@JKrEoTymt#8^I=)1C!w
z#U@}|{N^5P+~ZDab~#XCGDr$1=;iy=%e|;!DL%&{%N@%qM%6ojZPv1NLoP+7MJ98E
z160#x3jzeK>~<#w4FRvHmJDa__<Vi!rd`GdXJ%${7w+8oE53f+cTBcL@b`b-0?J}C
zzs^mYHYM}!1bj>8QSh|igRRR8N&yPTL0`+$<1xQ+;omfRphL!u!<@huef+pph=t#D
zyfwf(=?_V!4;Lv%gk|7rPWG(KOkHXdo>wxB#q4_da~x-<<GaveCVvSx?G-bzhs;6N
z{)nb7px-pS*+W7?w7zOrQhb*J4$r_YO(HUYlR%Thh9|H8xhwP8G+LUP!36P{2n%7k
z_knzAa=Zij39d&L*Q0+2ZklBf492l|S>z^3FJHXS;rmL>RQif<oJ(FV_z&H#5u}o+
z$n`(=q5~_6c3>B<7Yj2p)yI4#Am+Sh@i|gcEl3z-vDcnHokVBEIjJS7u3EMp^Be7B
zONXs5e`Adi%a)zH%BTd&Pn!woJp^AHD09zB5pRoA&~1Vngtpv$AhT{f`?2%wx|aBR
zFhcuAW5;ghFp72nHNYnsm)4U96kopoYu~<_1)J+)BC%lA+;>xr-#Hn-n^yY2|JQE}
zvOUW#BOsGmk2{#~7{#K=KD{-44}1V}ycwy%8_B0TW3vUwe=Q}&ijt_hUUje3S5&kl
zK{i}H074M?%WDp-`o(S1$rd3nQqxO*EC$4GvCUrHbH~rev4b|7gA@f!jxtw6ne?RF
zY?Qyz{=L7I$Di`s;$=Q-)?p~w!=$CJW}apvfv3LqU){v8no<<&?^$S0L39@$M=%f;
z(XK+`T3X!7mB7VO@!tFVm7l?C(WG<k<}kgF1h9zWze}A#!dho!c@R}pV^HRwgsT<c
zi+wvRvFu{vOT-s>GY!26NpIeNhAx#Ymg_fm@^%uHL9a=l0I#HOmLs8{V2RCnayV3O
zB2-Iy_AK_y3(JaP!x_Nih;RZBr1}O1Bh;DM*<tX_28M>WE6{S_{V7b?B6-*ge-tQr
zA!$?h{*6j+ipklp{v}1j<<tg-7&&`$i(aYN&86XEFUjC8RGH?a9(T#Iv|6>=ceL*i
z@>CKReVVynY#8+O$f!>j{eEDLU{==dM@2<NlC%<YlH#ZHIVKmmo7tSQGoHnqO|g<s
zF1p$+t|$ml!VV04yDlH*iWH$mzxl!Jt-9qkiRBMEJq6JR74{2{g6ok$M{~`s#l02F
z%3_&I0cDE>X-;}<m)Fm}zI`A2{m8SGB_0mr#%X4K{N<Z`Ixmy6AHYg;AjVt#y6A9t
z7gAa#npm<NN^>Kx)r|T|bNunKDS5UvlCvKD<v8(52jU5pPGu31s=?Ae)8*+xKMiWD
z?&W8`$+!6ojYF`~)p1aDsI4oPFXMi_bv0DyU?R{AMlbtp+-OCAq~6o-u^xC`En_@^
zmbSKIaPD3(>@2O<<#nh@EjX<f<i7flbL#0%PJtlSGlA|13q49d?L)T6a%AnmOsa!{
z{0$hf7#3L~k#b38LIBH9a8Ex}9<L`~Tqh+e5N!n<r1~xmXMpB5fg`E0an-|{eU4Cj
z1GWICb(wg^l2)2k95H(4q*G(v-A~ZywP|Y7?w_gGS$A3;Sg<qsLZPP+`!X(iB<W)r
zF7UyHgDiwLL%oh2N2n<hLxq!M4n-M#_+dEOox|w8a#_!6U1H(i7?QK*dh{&=I!qp(
zq#yCZ$7BO6G(vU5(fE`yBfV$2K)S~75MpB3mR8|xi&8!eoHY%Xv4qJpLYF$-Dx<R$
z-|CQ{sghu~YSo#LZ~irQ*G!D`NN36xR?%_D=b7|-hOVw7;1lVMiUCNZn3YL$Qz?J#
z;ygXft$`t*t@tzwc=6Hm@`dCqozZu3wXqpg9o;qa)~vRXCZ3fW0UE@+?bL;B8O7lF
z{*YpUN1b;PW5#CxRi0~xT%!!>g#>B;z~%SSdQXX0N{<VJvAs(xq7MO7<uh6EM8O@p
zM7kWulEOd1oTu>hd2MZvdJ?PCI|%)C%_Y_2+uI`0SyRsW`L^aH9}7<5PDh=&HL_*7
z381L*0~#Z!T_>+BUf8;?n~6G;tOEUj&GB`E4e!Jp`>p&6Eg}iRxfMHQHVLDggm~!J
zEe&m%i}6rk1zu&k!P!>_WU+NByLW?;S?<A<2K>8|IpkUDgt-?&FkSh1X>}z=fTzvY
zzxDwcjstL==5)oY$5l=M1N+11jNa;+qp%-~YIq@^J9xsS&fMdpw|^^71T4<7&1T5m
z5o!+9U?y8}&%ouUU>8fDtk=gkAk!`IdX~XP=J0^GrAwY1q3j}58<6+0!T$NaxELw}
z(jd9w!G??l@db}wNV~JSnhKZIB45mT9ETC{%pLWsxoA5yTedW4*>dmd(F5<q#FxW|
zUOj>-90W0+yzI4}xiwK4sc;$q16j_^2W~t$)qPr%U6+|fl8o-58Mc}%c&{D-ocb~&
zn&hpkdH;DY*8a#py0ZDcvWO7y85jP0Gml>QP5+vVpYuRjTzbcD(l9{|YQxE{YmUnF
zHo5X7)dHJ}bD>}}0kae34|2iiQ>S+73eWuU^JiNId6uF9mTd%20=rb}a&rWg6z=Mf
zECI`LwCnd|Ow^sn=LCdr-Fmj0K7Y>DroN_i`VJdQ&wR|ld|w8sF1T~SOYw^S8{SE}
ztI^`>y3%{dr`xT16;c5verNzlprN6WO4)Qb%&NV)HTP6@mT<-liLc&~T-0Rgc8IO_
zUCs)Z%2{7$B!8#`s0J{^koGvWZW7cv+p|YsNk;kk*9(}%iP8LsQoKVDQ>kG4YAFGI
zsS<8{c4%vkh{+z<bJah=KQE;Qz(*8+K7)?7&{O<i7y?*1#Y+zY%gLDK+He@DgVD1@
zlIvIRSb9@zg}MWx09ADu(7E&iln=WY)EbEkjOy24C#Z$~Iq7=m>T_E~X)j*tB0<q*
z*4>OJXXaJgv$Y~yi7%06Gh_ooAkUvSZ@ca!rVa2Ut<j$l>FbOpjCn=Rpqjvd{E%+a
z-Mc5_OV%`#ublK%R5VQ3kyn7NM!kCX;7+=gE_Bklx}&QqhN)yscCer3sf6DXR7o8V
zZ2I}x?Q<TVK1wS_Agi$SK+|m0C<udVQCXB85+)Ar@JF1UW@;JR!XdGy@xMuwNfM^6
zLFf%E%kH_+x1oYmux^?qUAbZk-xvE@22-5ps661(wW8cowu;PCbWxnt>{UH;i3GwU
zaT`R-z&884<=mJ+WF>L{b2G{g;L`6&ST>_$Vk7y9t~pSku=}im0A0*D{7Zw42#r{(
z{w{6)is{(pRmI<0fa*4t;`uX!ymvh!Ymju<?#RT4r8!CYPsu-g>x0Nh5spY_I#n#2
zHiA=gSYl<xw*mV5Wp7X3*EL73p2E=u81tpC&5fSj)D^i!Kiqf1X^C|C?~*D+2)wy=
zMKW0SVJ7{K<8|UZeY%;Nvej%*b;ZP}Tg->m94Q^m0ZmU(hqPH<@8@l+_J6GuP8PP3
zHN)O6RzPFrUqyAkN!N)k{XH??sXMZ=q6!Dgn%aNzu_+g)IgY8w)B+mNWI9$bc>q9N
zb4eL$)w1GFxs%2GL~P36(eex3M_|S_YAJOh>1dSg;5!t9QX9$xJ##qzJ0uF)xyycD
zw0AJOA9d9Zq=GmSb}{e2<U_#!JnQl|=O13t&cGs}FYixY{&&2t;-$WU6p2TymPwPa
zt)sRs99WTZdkw~-H<Z)@62eS)NL}&yppMulPnOo0HI2(pu!@WC&2Ux=sRyuIaCU2<
zn3&(gY41)RxrcV{_@5VS8qbs)8Q}~0IBGUq-f#W-^|+W$tpMyQuGb<?N(AGg4{LN^
zL%a$eeQ_hCoCB4z=nc%R$(P8Tw7>o5%`YRPp{PnhU>OBYVWtKm2_+61s8nX)rKO>-
z;sf|+fO{dp%*aF#zL-_!iW0Jl<R5WcTXT59yQKF2^X7*s73I*NHKeOK?q@crqxj(u
z;~f9_chkzddo5>^|JMHtTgR}L5^%!h-r?P^3*wB2|7iO2K3oZq2S+t54~`=F|M$VA
zW)<YU3?&n3*>YD^#d18{W14$Wx_|qU+CLxI)T!kU40G-OePH>yhyJez)_)?AP)${T
zP}D~Isw<W)&YVBLcgxD+R!cQ2GEl{5H_wOZKS#-dKsiRLKO?2S&ND<M@Dqqr0)oxa
zB9#0n#vghtTQ@1Y97fU`TnbepPj&CcY>btc+57JM(0p9G|C7Wq^upQ-ar8kvZs8To
zOCK6;MnG8q`_Ozf0bgjI4<b1E?%L(zUdI1=HEKs|QR)SWy90bme%x5HFR0E=(}v~Y
zApT(9moN8dq5vDfg5pH^E))k0EpRAOmPat|Qnjde?NR~uum3{gGn&$>%RfU^ow@Wq
zbh-}tE;IzkkLKLA0ox{F-@&Q5%*Bz_2(vW)#>OdC$@DS78F=D7ILDQJ)lHnDPn^JT
z|0KtZhyoA3l(j+ps|m>Et4p=gVqev*FY$IkZSi)<cjD3^7*^xkMhIL0dW}Et;a?-k
zZ@u?4G<lexo__lG>C*=kE;QX$tL&ZP+lN6>h@O;l1=Qp<=@<)g8(}&ejTcbvwbE~<
zByXcPYaExV)+PP7APW4_w_$01`rD0NflwmmsRl=5f~aea@>am+kW3#V1RcUxWe|Oc
zwj{EA2|>V6`3Moe3=oe%)zM+Xgr@hlel&xvI1H|l%{K?dQG=r48XL|I6zqfme{WN_
zSkCS9Q~%D7On6pb*(MSJSH@5D$7PVKJ|z4u@E_0wdOk}c-rbm`3&2H2iMht{nVJNb
zbs#b)K^jGGb&lyrKuoq#<9o@olJP2>@3lRiQs!FIyxV*Ka#W{e58komNM##;@(h}=
z6z889s8^$S7BqTxDG__QJvRY38A+T@r77gZw8VN+pdsoQmMHmB!<OW^c&ez#;2_>w
zv_3RcM6uXIGPgN|XD6kj=n+@FS~zcB($@}otd(p=D{fs;%zYKLz@#|ONldkhprmqm
zKGItTjD)^~R7^V4_R3-IhtS5lod6a9<ooos%8;i3sS-iYSmj#_MQDPwQ(r%4ae5s#
z0FoC?80^Uad6uAanwpxr+S>k=kCwzy#b;wSoI<$pL&bRXJvW%Gn?^!)l7S;^F;{fl
zS7Ot>&te@+s`brtrEox~fS}4P)`6<snMdv_72L=*32HyT5JY-HJ%B0HZ@mMeip{TY
zWHedgIATVT1m)~s{A_z<<jjrZ4jOWd$(cmio^<~JoEy=HB*Q{<5yIr68nxNE(|f_D
zOxAa=l|PCG@2*|8pjGcNK?nio+;y`CDoAp=9~r&Ks!6?e>IzkXNJ^FZ2OJ~MU-l!?
zY*EHo5{YRx*}TO&{U<+TxmwHx<qAbDMCT~Q|7$*8biyZ)CoO#c>}l!lPz`2f2o5%S
z^p+GWix}?yOzI(ZFoq<OfEFS%t~*+fjA|6JnQgj28D=cDJN`dDcU)uhbTQ4!Y<qL{
zjQdBrz&x*M92g)3EnYD~N!Hg?EwPr&7?b1IF>qZXggD^w#bIE6nhiX<C4Vkk4UMA;
zYfZxAGE6cSl5Ai2{_ec#mtheC`vPmozS0Genv4}I^Ys1{0GjGE7jRo(2%;Hsr(T;G
z5)CYEk9@~R_s&WKpJK&}D|ZD530;5*H{W`a=&0|EG42ARLV6`Ltwd@qA}Z#133y03
zS_puH1gZU{`>HUAyKE_pJEeP7sv-a#tKNnSm89z=CWt5#M2G~M`G*{4`20RU`8#oi
zMbcymLBH(fnOpMXeaQ*hcFe6AZGt!`m?NddK8-l%IS4h&Uf%@6!73M#hX`<WJF&=Q
zbCaxtd{1*5%#Xz{iR&`a%38yr=u5jdYIW{d?n0l|i*wSF;&oMZaCf@Pz<7FcbwJ-`
z%ERsZG!^a#QwkCMi>`aY^a;x|?i~(C-7CPa)ZDH`qKtG&2^xH>-BW{!pYF%~xXb)G
zglOZ>xrYmN#(z&n-%sH!(lj}+08Dlt5!zu?pB`F%FCNraOlcc)q{-f%Tn$1`b?{|T
zJIFb(pPB3k0==Xqy66po(Ibd&v=!_zD*7;F_ulmLy>JMzn|mRDCsHVj22IbpR%ijz
zT5J{W?Q@d>=F3nXU~Zucs9^%kxhyCA(C0@pkGiG=nMT-#tAm%%y}q~CO#4;$87X{p
zA=1vux5~?TCH_*;mslA~vqt@V>rYTq6^L9QA4`fz91GOJ;8KuBVO*I+<}xmO+J&Nz
z*K_V1+R7Fa4_L&P&$7+b!SpAD0YC}dFC-t*-h=J4KBb#Bm$Llj>VSCCx}a56YCwuk
zqb|ND2n7^V)0`HRzoNs&9`Eb<Y5Mku!vwd2X$U<fyfiIjmVcS0qOU$enY-&IrgL#p
z$jANa$&wUMGkAG58%hSd>E87=O*2+Z)be=X^NKQd5$k3mc?kW4#8Ol(dNVdBigjZ#
zPH#Q6;*b#RrABSjUrYfCa~(@&^mqL0x6PQ?7hay(QCHW6#i!OCGv-#u1LZ;uaLG@n
z<BCVI<%xXzVIL0D0obf{t@=Ol*np~$a9?r`gZ`j=F;6Ape}1yy*6umgB?(DxzhJ?F
zQMmh3-N?|;765GSy5`oHcbx!(d<$=bd2g@&qm3ZjdJnukImAM1>5?VQ_SLM`9ceZt
zjGVdSV^^G|F+;z0?M;Ix9mIujzPo#ozqDcOd!O;(@Zrv)BlO-Qy#|D>?)y=NO`v=!
z`E(BP#-1(EA)m2&Z8GDCqgJA);zgzl(9*{B4QICV9M6(3LQD)u-Fwn!+U7(_j8Jm#
zx`|CriQ5VgE)fIXS!!l>t?|)f8s-t%4uQ6zHd^*#Y5C?=3I?KAxC=#+FWN-Lxp0;n
z_HWc?bwx#NQj5_A<yFPC&+Be+T)I-PQzp-iZK;+d`uSI8c|LV7N*&P+UEq!Qo+=eL
zbaR`5<2gNco<F~Q`(c*)rr~WKvl~~us<zbN+a(%0Cy?BSeroEtB5(VV@A;d1#35eo
zoHj8|e+%)C6E5lD!*AQKik}#f{g?gZQP~J<^`>r4B$P_lwf812VL*FtOO{Ro5F3@q
z;e3Z2s?GAZpo}y4BX#iNXG_=bol=qNs}d_j2#RkbfRQ_N1omRk$(>6!?EM{N%|z5P
zxlcgwcS7Z;M5(m9*3`H{%GYc9wQVcrs=9TiPEQ4-<Z#Nrb{|iLe}3)urt)e$Ns<=h
z*?4LAxtpn}Jsh{MWhZ3AN_S41Wu5Y_h63y`9D80iU1XcwbNu?xuO$c5|Gskk>W;pP
z-t0m(Kd1lPJE4EKpY-+dJQO=yPtHL~Y4k0#vAH!1L;GyAa>WH$+1}2ZE9l(kDdhoJ
zlzk%3z;f^kCfckvk)5#T^mrTxr&0R__)7l6QxKsK`VTAeMgSI}J@0Zx0178(Z>T2P
zFv6ieZJJn&S#b@(t8M85J~=m?9hko?xQ}7eP^e~rd>85mIYdM(LPeSk>;`Ww`<;sG
z2%kMLr3itu1SVaPf&sA=`QI;sDwp0IM14q=djW+ru^U!<G{Tp$K>VK*YIh5x^>~bL
zxQd!~28@avTnqDFm?Gu84rR&XA4@8F7fup#bYNZ)-{jF>zi$P-521SmTcMexPqy@M
z*s?M2_$QIfj?1k7>OR^8vsWu8Qq_OE8Fl!u#=;*Zo^8jfLG;>udqgJlntUnr6NkS@
z{=9dN(qH;LZ#W({!dt#Ya>dXy0p9ufG&CQQ?aB(#TC+maH2=Fnw{Bz5`Q4@LmK>O;
zkc)y?f~^4kzaLmSj6`KDxD{o8@wf%QJ39+e&b9vOS&&f!a;_7)9GFpS0v9aX1m6Db
zs`Y<g_n~23svs6cX!?8Z6G`h4v|M=Ncaxf=^{4Ih367C2DyrB}AY3`BxPg+MqTR82
zS35*A|NTEN#!cg<eY>fhznIcvln^NBoy@IadB}3|ujO(X>jhY+aPKg3*N5y?<vYh$
zAJq9i4T`{JO$3dD@hV>V=IPRjqra6H{}}F?-hOkh^X+syICT8pB)Zq!4wsvKUw-?>
z^24884R=ldt=WWYJrXWIs6I+J+MrQm>+J)z0?k5375~V7{B7ZpBUuw|$L5_$pEKw5
zr*k)R7f!yFdv5fheV>*MbPStxzBj@94+vc`iwY`4b|E7^XlrZ0xLI_^IFE=)0cG|O
zZtL|fUjVQeg)7D!RQ^{W5T`gf$=HLcJf_urt<0gPlf`Eml_Q(38A7yeJ9n-l{vG6I
zi+_w@CHMXc(E$LuOtK@6ryNVX;!{%_u|ozUo-l-!p!fB{;v%s*<{-n0eJxwtgc;fZ
zrbFdNIPgy<Ru|VQa&)xyIzU(3)auG;cA)7G@^D<zV+S&yRYpL@Eq-Hr+^|!5|2Ms#
zdZNFffSOuVkcVa6P$u5nADz6JG%<l5ci~{%H%Ozb?I6h0R+u*Wo_`N3BmSTmk8Gke
z+w_I?)UH#fT13S)ST0jFS_Ktz=ROeV_1~u7zJ2U+W`8C7;a?ru)Ox2!abO%nlV_ec
z7FnWe3IDp9*$pnMdt3G$)CYqgddseIQj2#OgKawvwBJs93=>%#-^EJb$bMzlwryU<
zKMJZ42nVr7Hx`TM1}3d|+ephYe|H+voG^~v;0HdU0@~H{8xpZ0!-wa&ts<Prn0J|v
zgX-1vJMx+ByLXFIsrc^F<#Fuz@r(G~ii4ih)XyXuTcjlKdkXJOk^_qv@woQ({|d@o
z6{)%3`|k8TbN+PgzSJ1|4^m&4jmyM5ne!RMv)11_b|P~q$ho?S^FcSq*v`j|(Y(U(
zCrtm!`QwCfoeP0%e6j6s(FO<mU{Png)&IbOuKJO`I6XmdYP)6ACIi1+`g>QeuFMJ8
znj^#@@2^Sjn~m{TgiYQ=V)Rmm8ZC>7j`0NNez9KLvw=EEbL{6&G=Ht9110|Q<*HQn
zmt<B9YcO)a7x2?OxQdL@j%SP-tByG^U=Y@HGzocuhfJg4FQoTz$Hw=EIUr)AZ@hEf
zuV2xC;>ght-QO{C_0H75PK`o8F40Dc@C)*&wx>5U;hfrAOycMP-1G%;uyn`s4h7uF
zV{q`zTC_+WcoQrbPd(cg`QXfbQ69{G$={&(s;p4RXV95=`WX;H9&pm1OP;b2A*HrG
z{s&+^<Uq}#Umi_-RLeE_;8s$DP028x+tS2agbi^SHnEZ-6BOZnf`c2<$dbqxt)vj^
zSEDJ}V^wBIL}RDj9Lm2xzYjRmmzJQjw<dy%Y{bX%>}*=>6i3H&>y1_#$kTCN8~ZL6
zy`v6JYq&tF=G`iyoDzd%L`&kn{RjES5)bGv8J`vAi2(x4s@NUSYTX>O)zJ@Q0!v;|
z#)<I>T4Reh2Itb53^xk5|5^gzP_I3y`VvL47yvvmLW#ctt*1<>6wh{m@S)$ejXnR6
zVcPr|HP~gIpcN^Xc^*~TG?LOY^zCf)xZ?j7M6(HHu$R2$+PADl8Gri3h>gD<ZQKiF
zRpJ|fy^^;Fry27?hD@B;>POM1F~A4`sBiUO#u7jz0DzP1G#gh(!RhoO!NBcD(b6JX
zOfl5Z;iNhA;bDw#LHN{fXKYsedq<~&(BP&S54PZ4s{;y6H+Od#j+FWAbLnh#>wh?v
z#M}@c+bZz$Y|ZK<4`L)i_29jX$2pv4*Xr#(`VbTnI3T^();V|XoD9XH|6vUw7rp%y
z;O=98{dEbU&N0rw9$Tfp>AU1ZTMFwbIPp?>Orb9fhl(pK*W>6%#FU!m3dM^&>3QVQ
zSoPj!Cc9Cz)Ow8iWbC;2tPuGKd2Gy&5yMKJoOq4^a?AsZhfJtK4zAV6aKN(9HA$N>
z7`^uEipn7}kf8vOc<ov-0h8y8dKXjf4!U(@Tr>^84NwW)%PVH3(4iW$;#(t}5Z&g@
zyU4*U?~G4DyS8l=@uIg+UW(g#EMcNi(kP?pR0BcL(VyN-y4JxSTO9Qh``%9RY^f1?
zp~+Wo-KtL(KZuO<7$pMtM7l;F(5b|Ax7TgMa?qy*U$>9zpn1*<iwQAW9I|b7r>k0A
zYl;;Idu|_tWl<@vVaPKRnZ&h;NAps8RKGPBBQPF+lV=koQoG*iAkT^8{E`G^w2Zdl
z*40=&;%6v+&S%X#>dTCbi<d4{fm|%WfZUBtAfrWZ?5{Y1s;F8GfKhSB<$Pw}saM44
zkmn%AWMaDE;-Zqsh&8Eg7%QpW>zD#d+TAlqdj~%RuzwpoS}U;r!%BCo)kifu9mssa
zb?Q~Oxc&(vnRd;0F+CC2MP$c>q_S7U08+$C2%6mAKWi=aeLUlacoG7@qONPHFpqpa
zU5R40g}+Jy1rDa88-XEpFqK3Ab5Y!~$?j?;g5xgSZ8XAo<&ObmVLW5t(FB&W8?^fX
z=3Wp)#vPwp4f&s<+2Or*U$?fRlv?B{w~F9}DernVt5~U##F#RkRcyjL?`lAs4>>Nb
zmk|U0p^6Zn`gZNsFl8F$!deQe=M)PvGmS34wgVh#bh^w%dRL9f5#v=(rrRWA%wu^;
z7}~&K8aB0e)kZuJJRbH0zgkb^Y1(9noMP+{al;(2qCluKv1t(HZ+3R}b8g}TOy3b;
zP1JGdZ#Q|QOm-`K*>2=uazdtriB41dWq&6puU@~t2Fc1+5*3%egXJzqxB&WNuUM16
zG6hm9$b+T0+S|JthlfH;!Z3&_cr|Iax<U<LiGm~W9m$J@Zsi4w77fFV5=gjnosWt}
z$IFPf@Ud$s#=lR_z|`ZQw#v&(ahm#p)y98lH}Ve~t+*cGlFW=eW$5Zi<^!r3kK5U8
z;0Bp{@Z?D=9-p|6$%IzviA{~)Zsbu|vpZt`m^(f>gb?vkV-JWehvZI-Y;}2XtliZu
zt!uAbxzg#w7;HkN$>7@q5<jdw8a!J}Hj{Pl`pui``1aiC8Ub#p2QlLjx8-Tms?hS&
zta<Z5L_PSL_U}FS*~F<qE4D)m)bbxrkz;oiD=wSBAFQP=-!4@0jyzVBUQ(<L7po{F
z?5lVq?#DVw7r8iG7}w!>#-j7)PhSWWmX>qy431CFsp{5Zmcs~$031G$Pfx|Z9LS^w
z-V+)DZVGem7i>cmc+<nK1J?OXPO*OK3?H6oJuvo7P9KH_Y$I^g@_I94xb_+_9487K
zF#{s}tS5m_;MNru7K$CLxV4Ly^%+lEGp9U@S*otS(eF$Cbvob1MTMc=mchIT+1IiG
z22YqEV|m0f8O2-expU{@;$vP|2cU<6)a`vcV*f!ahFC5e_U^q76F4q>9h7t%=8Wt$
z-1Y%faop)S`-6>aJKdUjZ~#J^;gl!X+14kcvc-b^%(~6^ebT9*>3I)CPK^J_r6;z3
zKMhI4;Jv8dH+^{;v$Qd~6}cI*I_0`}va&~E#l3)*G!|B=Tv@YTTAi9Tw|y7eXVbX*
zn@VyiUM+9kyJx?on9EQ<`x7Yf_CFq;P0eLeYsTXC3WhQfsrl6Y9W5G6n?AiNzPm|p
zDdC1XcUrRu5M~Q$f*cZnj;U)(4>s8og<j%HN!7Llb3&R9+Qv+#)mHW%f5u6><8f~y
z0l#O|h5Eyjc8pBpU$6q|@@K&&3CH}KY;Z2h^gDxSTZ%a^ncfio?=)qK0!qYG7lqn)
zoj5cG1OzBDTA0e$n^^yxo)#<!e;qk$l#`1~P4*zU=7`UoJ?M#c{^-f2u7O?9U>03<
z#Xb9WC2AR*?z9eUYV!1xF*WnC4}-BqV{wmo(w>t1$cJwjGjBN1K5pr!>XL<s@xFKh
z0uOs(BZ}v>3_%rNbxx6lPrf8bQwanX{#{T4wxmaeS<A?zNN-HBRob_IzxVQD*y%*z
zcDKxH4T0*am^dZmbMkI_H@u4X%LYD)=`g}6KRB{wO#MZ;n$<GJDn4F8*C0nGZ(42A
zq)CeRt%WObbxTSgLZ#IJ7ikc;01#a4KUd)fEJoLz9ipqsjbgr?SHgl4_^oZrTiNB=
zT)7nduT7@57w<3g9d>n}w$8A51`7$tr~BD6PD&uOJ<Efrfm6)<9U9og$w-31{U~yz
z7GSh*mo7Z*i7FDrmtgGbsSn~-8dFXn&mSSo!rzMn9uU9g(G3|Wvy<8#-%Xoamm7@A
zB1fWD>2$K0?>D$A4v9l<aSYx@@8tav%)sJstEg+-gzbCx)(&u+vLz_#4{FOM9>W+K
z*ucK#_QZSlnvjp})EweK#@Za_ZS6-&kzhB!Ik*FMrmMs3X5bON*%W92jjy>3+N2i3
z$g7ef)1FZmSTB77_?{rr&e5ZH^_tlPz4WtAMx)No&+^%Jql!W#=(IE_#umg+({C=l
zrDtd7eQKf5|0cOdCf6o&WMaR+`_+3P0YUfNO$1l1DN1dZzDbSs)Ee_41&Al)l{L7S
z_fS=`H~}NWAAFWti1aZip^%v0(4z^3@dnhG5677P$y_v{IEn^Z{`K1^agrb0uS=Jx
z8`ZbOZsi1>Lhg4Aw)Ff;60w9uRH&R2$p$AhvJhT3I`#I=o7=!B7Wapz#6Qu=(M1aa
zFd34#gPvIj&Pf!Nc>JR!v0qts!ttwi3!cWA^XF@4ai~4#$*UhBkgq~hhfGHl&6It+
z*uN<IdAP&h#S)c+Fnt+J;Zqr;Y89F^!|ic>2@Lo+k5(mKz50SPJEX;2t_@O7f559A
zy?bAk6YoilE|lYyNj^BKDyc?J<y>-4DuyH+OvOT2wJSIrN?<1cjIqQ;$_tZ|gaRhU
ztYe`m3MeOITW7R05@_-g;>e+SDvLR7$B?x%jN-_LrHd%)WK3jWD@{#Xo!4u0M3dq_
zyow3OS9fT|z!*#@Vm!XvPtCO5-u>UYKcz+jI}6fxFoQiIJ8CK7AdVDf0~v{D&&a>T
z8Y<=d%=Fj`3AMH;9}rn=Oor{&$gM|z(zxPE3MTK7(IwoXRz@qDDIZYdh8^y9mlR$y
z?!d!`4t3l!%V6t~BVB3Hw&+sbE_mcrcQ>~N_J%by#va-kR8aL5sC`n@C#J!s<YYlt
zNcEFEt$y<!Lo-Zve-w{C=>=86xsHFG>86A^#>EDkKly0(?ErU0td#>-P?Ut^fBN|G
z!_&p-8B|y@;+{Q}Hh!b&9nj}v81MuRfV)l6?;jFEV_;AvN9(a6#I)ZwX6kECv#uWC
zR&DuG^Tj?`Ej9JH=~Mlt*4Rh)kG?Rsk$He^CM2JnK{6MV*#+3gW_RDqNT&*7<c*hq
zo#R(*>kdX~y=PVph4##$kxTdYomZKX%emF=WsHBiO%tE@_m&(4E$W27dGA+Wyf7qn
zRJ=EyWVeqP=rDTe=XT34bp3HUL^C?yPtSQtabYh}6)PsBQ<4^*EY$BtF9l^(=k13}
z9meFBu){I}rDt9`eiF`mc>)=+%krjI(@FyW$?7mp^&&f4rN0t-!v!~>ghFz?rhapu
zoXPX7lw4CoqtVm%i|_$vJ^7sq2lm1+$9q0a!I{6`Py|aNJyb>g!qQ2(kih|-6x9^+
z*utwAdLRMp^!0ISuFa%THM0m1Z5ae(ku3)Gl39i<^fA6OYRs5FPpDQCiMcDb_k<Zg
zMvvaaoL5y`tMB;N?%P95EZ2Z;U=ZKHB;)cbkXG?ZzQi%ntJGk+q22U|4yQ;}#`X+&
z0rI1fOV`3i`}${@?%c*`McC&8COVmLY#~#Z(FGf=BC&{Tw<k0A`E!M$-tTR2ozNwY
zKswyCZ3B)Ufsaa91O%B}NVZIUSWUqj)?mdLI-Te!Ynbe#?sJWeJB$QQ?{1gckpDr^
zY41mJ!$^mDj|Ne(NB-!aZewlTNQ~T<bwOqvRnSwU3JJ<F?ulxmRkp{&2M?a(OV~77
z1{W>;7@WL_P0=lHPmtCcBlj!D#-{iK+-6KzVA1o7RlFNNx}H<$3}`UxJqLDH@)w8`
zJ}Mp>AnV=A?_Z*}eW;<D7-eDcVw+@(q&JO{K?0gi>S)lu6Z$%<7kj8#lOhlJHSLt{
zh&ICeEwy}fokI!cBStiiYSWuJf++!alT&LbRz0*7{Te07>eiNBmdCuHr(qRdTbGCf
z9CI`BZ4OZP*QG4Kfdj~$zzQqll;M&kf*v$UhT*pCw{#!{>n(p{aTDh&+umyz+t@j2
zWu?3Ybxj7Wa%=+yVD$}9>3x%3m7_PFfECj_nadosuozSmNgiL|CY)O%1;-k&b`}}l
zbHE9`oMP~~ppQ_<x>qa*^jorG#Q=J7bi-RKej5v&-_K-5u8C_g3TD&;7U#xJ!d~2J
zw|R4;I{ztLK39TAyh@Xd%+Ya*en^s-66>*?dO)LEwQ4Q4V061-gMJWES^w!#00;>0
zz{4}DeUkZlwhV+hnpz05TSckUW<WE{f-Q0*hAg^09N<XzWF)-@F7w??EmvH5Me^sD
zkB^1!;T}&zN3~=U{k5p3EIa+k9WG8}v3$twYd)z6dRaVY_V#qOZjw>cXXO|GT<gM>
z$H;_z?A>d$-J|Kay;ugMo}SZM5yTfJe{Eo()9(CN85xy0$^+kIWrg&7cpdR;!$ys)
zU*x|?EEosT7Ik@P=!Q4FqE9p$U~z%dcF#wak+|ctFYUhlI-EY{W*Pl!YW`SZzW>3b
zqVe15Kj_qav}x$9YDZrr=d@CFiTE+OlE$rXIuVVQuP`HN$)NGH&X?BM{7~lVPtrE^
zaqOeCg<**L5}mK5msP8F?I9*6&~I@}I}r_rz45;XDPoJO8<P<VTI4p>`@2oQ`O`Ok
zU!mn4vHb7oTN~}}{^{|<cF)&PV)W3YnkP=2Kt|aye0JX2S|%CAmYom;%875jZ~{)2
z`qzz?%lKPjk>$O+cTLHA^;$nBQMZj<@hv)d5eQD$<O}Cg`$vB#;pp=Efi<BjfJT(T
z4!zBd?MCN>Y$<a~Cm9d27&2l+ZI}lE(VzeGkF*Lxwb!Qop)aO9^cAf`*XGiHq-5?9
zCJ15e?sYP98TN3-C6kP*OD9a2;N+LT^2fItVgvM~7A+q0&MbP{f-hi5D?zXJW8ab8
z1x3G!=#UICaN*O?DGLP}B4Qv9s!qAM9vb5%E6}3jr^v#hO3%Rq5*BfkY2Mqh-=*_;
z^$64aBk55QmHPU`EX&s*8$Ux(lPsO(1qC2+)oCw4xDLI)PC?)ot;Nn3uk{2F&J+ab
zOO`B&v00JzI%E7JlRql70GHMmfYDL8)L;QgAspo&wOHJ@fud!|@HM7Z&X}PON+hP@
zfnaZ84u5r^z-kCeESMuhJrqNxev6-3-_fQcPFc4hTIt@Gk|3-VK0+rvT{(xTPQ+>*
zy}05<0u_%bzARq`m`jBS*DiBs)X#2xpP245=1mPR6hcXBS{dkszgl(fmC4c_^Xqo|
zaeAF^`k&f9Wc6Ca6~*#Qp@Tz_QKzj&KG*6^#ooX^^1_?N-MV!e!>lJta}7l>KXU!F
zKh(03<&}Mja<K-Qt9DL>V8={l8gMr09Fsc*^`InmA_dNB9Wfmj7K^V<{VnU8IDP!C
zL%|*Nk&v9;#jqbws-wk6S%co@F3xsQ<F!Dr%tm0pCDaYyPrq^3upKd{py-0;V+s_u
zqU48%RS}u2-#(KGUv_(?4NbMadNuh#oV9;P0Irub_Fm)w%VSS6UMD<eW${0Mw=_=-
z_1SD*=J+kwZRAF}BTNk<;SFN2O5Q0Pk&L6`PD_cur`~q?$shg{HWxW|KPo<{9YP#U
z8(-J@hM?*g3Kqo#Rsn5-4-Ed!u+w-xD3bNb2$++FmRS_wLDBsiM{oY)4~t$+9KbaS
z-kwhXfSOgcKfd$Uh(h^LnHg*E>l+vl@ce>N`%#@xRc<#NGULg|>KnpyW)cpW=(S*X
zon5Y&dIEQkC_dw?hnG6oX({6c+6w6IJ1!*ju>0%OYS7gQ0Fm=Z-5^sF5)*4+nxmJ)
zsqMs)laa})PM13RudwcVJoYGjpiF^vz5os}6^h&<H*7(d9s5sA>}fW}-eFQ4>rQOS
zW%7%hcj^OlvzAi{iWg@F<Ib9MR_ducYEnPw>w~{3^j}ySJkyDeYz8l14*=TYF==<L
z?lbqK&Gb)OU}6&-v-QOq$GZcK4I+^SPQ@}^@gl%Z13_l`W`w(kX2<1kt@~VDH_Kz`
z{RHrk*03V0m?clbCPQsJ1OAPCt03((%$qZDbu$FZ6iQmX91yR%SZ6L@3}Je7z!2LJ
zc~@#C2gT|@8iO|lCNJZFzQh`-rBDMle9IAJ6M5@}d2^pFhx9wGiP1gnbIs_WpU0QZ
z-&bUDUGE2UXiE>V^}tTIu1-%h3R`wS*hu6a8Kej5sg|!E*!KGmnU^wL%4N==A$+m!
z$PHvt9h?-+kO_zlajQ-4;Xe!fZRpSl8k%ODSo!d-k;M=?lf=l@X<H&RxrMItwQJW*
zs0zFibnAtTTGZ%5|0=&F&!0MZ*s#(mM#wU19bL%N=H_EM?tS*`(v9z@qhA6q{L{UN
zgwCnQd`?ZxdWVyelN)sy*tApBAXDep1Vg<SJNNV)`gDN@*e_`0F-s<qH<VyRJSMk{
z7#fH3%SG_ayEzA=1)RmR85i|B<Z@VS*Mp4Izu%0eVp7mK<=nv)9wTI@D3`{KAD_T@
z3hr({>X@3~B?CN*#-+v5tC`heu4%x{VAqHKB`*w`YORdweAE#&$`y!A87D1s+CY-d
ztSpHQ8x0Z&4qFxHl9BuRHYD-i@!g2$q?g~x`%T+~<Y77Qdv<!|)xqydx`E$*{5lM8
zpM?M(7d_oPyriy88;@(L3=Tp;W_Yc4`}S3!Pqy!yB1Lo1eBg?qS;WcbAYs)N;GMwB
zGv@qUPoGO#VtGvAuKUD*C$fBFa2?Q^ChyLo+B@}RT9IYavqNJSZ?EyN&|2L+H`~nL
zv+ME|4=)#neA_pu_aJYbkmAnm`uA^v%EjC5M{ahu)c-;=BO+t&@|{Mv!U3v3P$XGz
zUej~af1{BOQp&5`M?-pbo%BXfs|piJc}~Q=Q^x*&UX^m^&YDe|HXXAlCPKL}CMhNT
zfaJ|LZVctQQ0%<2-a4pvB?m;<;a~H*lC^i-a~mkW?+S!RA&ZL8OV%!<TW+yNX%A{x
zw<d!u?3WvkS^5i;F^Gt&?n6K%!V_CJYNTUv4HMT142!m(UwK1)b9}9-t3{o_e$Ie+
zHu$=>jw#FqhH;BSUM*_)Ft`pK0j>_-`GdAy7?wPtR`}tgj;4HLL;N0PvZf^C9)&%b
zwLR41Rc7WCHX=aX@h6$XqVu3RT96rpw8>pRG`XLNRwC<B1{WyVPtN{=XXqj8F*_~W
zc&O0+PD@GV)Eu}5kC5O~R~}=m6LL9X1qDE3`;MRT@>XH+HvTwP5{)R^^d<w5Tjuut
z`H1~i0Z0iEPjrIl3<`3i!%b$C4&D#5!z!=B!D7nQ^wi3!k%%zX7Hb~s$P2SPe);9E
zuUpI?<~hIg>+<;F!wf{2AuWbDB;UG~2+0IB0j>0aw&LJdzMq|(tjk^J$%D3zq|lp(
zM~je+rF0@=MByG~SQmlf5`Q~w{Gc6j)|hF*N)(ZYCj_4+7ucxoQECZoNYi|e-OqoY
zGpDQm^&>X|sSf5^{Mt(Ku3`h;yEd9KKxl*r56v;jw{c$e&X+WgpB`_gJ8<BI%dhMw
z1AFV<Wk`?|g(3ln`BIoa(|pZ_v~QCczusOSIPa^r^Wb%emPhXIuZ7+CFtjPp8F3CI
z?%Kbz`MdPEZnJ>0?Dni(OKJ`;J$C=}p3(Qs9{PuWJmZ;UgDDo@q&keNhyZ%ed9neD
zspj|kt^2PfrX|qTn1)o!+LpoQoloxK@!HQwv^LE-X|<cc0lXJ^_)t!(CE6i*Mn;+;
zc@E4En}#w*(b!R4>`bEWUe~wKPDZ9JME2lsa8qh^51HSKoK8iir%g`b+^9W}N$D#k
zaVkg^(5}{j?;Nw-Nn!0xoK|NtetY~wK97MUgQ9vvWH$EOWUTF{Lo2E65%dbly>kYP
z@hC1l88*wwsh$4yt4D8sD)f0yoB}Fqn2ec+)8KVmHIL5txu&$4Kh|aBo}RS;rq^A#
zFwiT(9J9VQ(vam6-$O1Z<vR}0^`vpdcg()N?GEeA1B7_O08B2r*?=W(yFaHRFtYka
zg{^)Z@oRu-1iG#ilP1fTf4APRT7udEs;aVT-?b-5enaw6kap0Qx$0?CJ01Akh5Xdn
zV`24-nB2vYH@_}pqMp;^8#n5J9QZx?G9(X{PIjLR`(%1BjFuM^^^E?VlW=&_pY?0M
zetzRNznT_svmbS5H~s5B_W9-KkI*3hma+|x%0?I3v}!dm=*XM=sO(2!uFgwm+YZfH
z8bBE{y7BviI_gnw<J<UO;ix>0p|T<)KIAAfH}9iAUOR%E#ME{RYT>l5aC-oFTFC?$
zy&Q7qw%D?LjEgJzF(EA$_-%e}*^}c}k1*Kvj?0E&A<<UXvwrQ`mk4-5zDW0gO*_wR
zVEYP#>;sE3lNu+PHx?woBH0qIFb(Ci^LY~Zq&%%L+v;gP8`E`k?;mXAl;X!e?`Yc(
znopqUrCH0h#eZ>-%baL3$e9Mbn!CgCS#o0D9u^99hYy#htgV&;^})@u$t{ZJ=r2{g
z*hGHg{*F0u0&L8-YpjoV4>&u78A~13+D`4&C_Hl8h}Qe&BMftzkgVUDmE71b#B-Q(
z8(bE#MF8c~{I4yWHLLl!dGqG_TRZ;ReBAEE#dF?2Iom#bvb}v1i1fAdU006r8^rhq
zJag7jT2@kE8mf_Rhs+jz2cs@X6F%0o3TFeA;&k=F1=pmwjHXjN^WxHI<KdhHLGv9Z
z$B`h_rFjwa6CJ7==VDRp-exFc^2bJ9aW$FfyFrE=s<%v<PKA=tFKV>4wq5eaM`0V%
zW;z1agmqqCPxH{muT~A2&$X%1FJ|95KkHa2RqA|1M2hzs2ga~gq2BS%XSzxT7si2a
z7iJXf^7LFUlE}3JZUrx*4s7Iy2ftOsyni3{VF*BjS#lg8dsGz8eAtDlFiD}?9~emw
z>neO8EW4^y@#OS+Coe2|yC>`kt;FVUaVOU^o}hmqvyEzn{kVOziqMz}$>TmtAr7}h
z?!TEFwf3tCOZCUQ1#Li2o7@`|g8W()xy@|Iu8Kht6fy08pEAXZ=1?IZWU`S$%2%4Q
zp?^S`6rNF2rd4DQ!QQ-Sicd>aD~Yi4Ycnfd+0olqoJ^TfP+Rf}vpt=cLXY*>6kcG_
zR#~ex-F9i==kVZVS9bKE#b}gsF@p9D9eOvNdlY!G4h2(nMaDO0QTeJw7@XA|j6y$r
zuA--|0wHB@{cCu7)T&jL+!rqNo{zgxaMBa&xI;c&Ij1O#>*DiJSCP2_GAG-m^8rry
zODN>nVQTxp8eSYBwW?f2o%B8B$iyAY<*^tYXhy%6s+0IuFtf%W!sg!nZUKgqJ2k$?
zTuG8YpN=(ELG+JfMi97<oc>$?9Z2R-eJC>N+6oh8^S8mO>oX7Pn7%jlE6pe#J$jT)
zA8o?9JxjEm162;Mzn}PXQq~ARIa2l)(gv==b%;Y>6zH`<6n6AfP6Xw*^CK9Q0-rP+
z1ZTD(0%)BAgRq<G+*=TV($}&+z`Ko}c1wp~JYeLZ|73^3ffK7NZgKXSX1(}o;<r7j
z4?dFSZ-G$s1}^vbn7L7vH-v<7gzmi>7FSECw9I^}D9EA|1#W5(gKl(B8+4%<4T|>l
zZhc_D(ja@aTSTk>tc4AVrcvjOfBd0oU$=`&CF7}BL}PU(l*aMx=g4<tmSNz&UAs2W
zeN_o<sX)A`c4tN$e2?HxBa5O+1kbK}QE&H{<SgR9Al;fw6(2;KcWQ3{2=xhz<#d%C
zYq1Q8p-levn;l^vrN8Gxv&N0vyH;MD;?u|L#pM>X=`}n>oY)MnqZ%-oI3TUc(djy*
z6OBhj(?QZ7NxR=Eb4FcpaOi!tTG8ltDg;qLobX(<=+vWH1Lr)8sf7-sFLP?y*;Lgw
zksb~L(W9CXMMi*|*F9PnN1WWsnx2OKP$vHI-@L18TTeX?=99dP3S49+UI|)DV44kH
z7XaqUAVMJ&2Tj&qjqc3Gc|QpECi}PZ7A!EZIz(oFMa(n~@7TN7p#NN>;?Fi`^XUev
zOL`UfCO6lh(>ieLn&tzi*QgzPG&-iYexqj1#!c<j(Lu&r>0Qpg8~LZ|`1a1DVzx$4
znKi5PEVQ$J541HkVUlVPJXNjMQtQ|_{R;N`T4Nli?oQuvCil?v)7)E`p@|1!kA&P#
zegA5b?{Mv$$-fV0X7qT6xvUI}T<w1IhkpKsWnb~rf2m`ghG;cqjEO2SF>xAif9?8p
z(Ss0Zdp3KzJ`(vz(5qMya((|OVdp6i2A^erWv7Mj$TaH12<GT5B%axx1yp%i_jYU+
z#xovGqHUWJZg9dU2Qs^q3okzE?42%S@AMys?QmbMU%x~7j44^}gUa*7f^Xky?a)Gm
zml8On1|v#-S<p@5<%K`#STo^mR86CHP8!hM#DUx0c+}1((axJvlfDa2J#yrY<wy5L
zZP>oOA&MhCUy7y-Aj}VQ-;p6ng{|J=AxF}AusDisb=o_4-i98-5I<R;%lQUjzc%S4
zz)#SrE8@*;Ywz2jJ#*rC#DBTGIH5`B?kch`%OA|*#28%X^qkS7>rs|(-@m`hw*vZ}
z8v5}tju^TQ_VhHOZAGtcTYWWJA8csq0rm_7MuIjJN+{{=^}&pfk+I^`F^aeUMOeNO
z7yJ=(WKeucNi%L-Gu&M}pM4AE=G)<@Q2HW}pnqr(xFT?QX*K_nJ=};D$xtv;Din0m
zZsb2n^XzO1OQHfrDW`SsEK(jA%=W$zl@|L-Odd{-^LdQR!8XmI+eO7ngPEeBC+Yj^
zuh*k~XY8*uR;nDa^%5tJ{;o#2xS8FYMp6K%7XH17uYN9?2~O?;xf87N#Va8s`0%yS
z*aUI7mUJ2z+Tq!7hcH09WBa#lvz%H&@$LN<fz0)R>o6=kRt)LpT}wq8Q4Lp*lZ{(T
zHBC2l6+)TQZ@<YDn~g8J*3C*uJU{XGx^?Sz2U*K_A5F)1RUELRmIHzvo_N+;MSiUb
z<k96qACsR6DMKb35B%E2;)@Xg#3=927bWpW70Q{TXg>Sok89AAp_fd*i%5iUCA}uq
zARfA$HO3l6G6~nc@Y^CQ82nH~{t0uJ=EWhLLu_xmr+{szK^YU(;1=|}UZUe30UTLw
zwX)z+odB#fa?w7#lnQa{?XZ2XUHU6!(VSl+mY{DC>=adTJ7!RFWbeZ7!>#cJQk!zX
zO+S#+KZx@n$gjPScyTLBdc2=7e0U886^VF3hPrXpQl?0;K~sX@ZBhzQdjlAEL^7Cy
zqMDRhM_)f;<W$B+Fzi!DV2l7a)6JlzQ)p_D+~kkVo54Y9BLNiJz-;8mp%gt7=o3N`
zyBzQhB<E7Ur0Tvz;wVyR838PnCsSjDYvV+(OAna|BeFP%MQr<$gGtdCTiw6c%$*4@
z!7w!8*)M`TR8qJvYnT+zi6@7s0BKw;Sj!;t08Dh(p)@kvavrF0+&TA7xP)z{(~EB)
z8i8^QohfxPFpsYA=G1>zp@W2TbwXXD-eL^HDpGg^vL+eRZ=s*hpp|kM@zjzQT+nQx
zcu><1VWvg!`*1Tl*f#3cb6$AU+{#xsNad=bv>LFa0S8UzyPl+;ZipsQX6EP`b4%8u
zC&&Ck^enXVYLVqK-)1;cI?<oe)g)3x0>edoKJ{9kF1S3CA-^gU4t1BT1IgDIhQy?4
z5xT*_GZWklX+|CNPVycQ8UUK!mt~ygYUHvBmwO_W)PKiL*=6WdnKs;wY>z;;98;#_
zk-5*KOIxNhGr0PUWja0XuQT(gEnjeh$DN#^B|Hk_xRGDQbJyshz%8H}nu&%jTW*r#
zURSp&H>VhM>TKGM&%XI5tq#DnPo=+*;pufSh^x=}vTM&C^e9dHF8HSomhyQRL@M5K
zjoI3^NBV}40ylwDJ+`vMQ}M2)b!yI2A6g6pAwYm}TS=M`VCY~D7?auN(v>UK81>n}
zzQcFan8>Yu`TXW#W6I4T9OMvdYe{Ei<Tp+kmq8Nst}LeE1#!yQg61@3Tta4@{&n`B
z%eCXrJN|y2QH}d9ft)kvK$S6Z1OpIZ_m0D%i=0)FIkSL2I}4XnQkA_5AF#8W0I`E!
z4VU5Yy=U#Jez5QZ9rg`Y7Uk7I;yM#-JcDm4M>%n|7BU*pyV9?|{7Yw<*(kaxlZ*w0
zM3+ElTlIG|3e8zYH4vLIqQ0&ujL^T-BISP)w^Dogmp3w|*?W017K6-Cy^#8<Q*@pq
z+CtQyiJ}!0O!lxtV~W5lhy~rUZ?_YfD?G4a&z_fwjCpu6$E4RN*uvydQLoFiX68Th
zy}tqFCOm!G8jN}k$}7AF)*?`ZWUZNGn0+b8E}(Ok2o=j-9%-;C$H@ffitLtfxei3<
ztcTI2AJ;aKdlNAZ_uH*>Gx+86rK(XyDqggT80Oh;XB+z1U$oREE}=4Skjyk0%l?6m
z_u%xsFSbwSB$lo|y~!&%ia7P`M*moZCW^Q%L+!j9XHwc%qH~liMI>_i(`EoSyJ#&_
z5T!RHb&@H&2D4{UX^5(jBx~?|uNTl>^AKYN2M0?@h*jEpCUP@qpzHXMVLqSflbCq6
z5ypB6B~>)%k(#ZpRvTRB$cf2NNIbQ0_q*SPl9o}^$j&4)J-d(|pg_0Exh&#5x`;as
z97|>|GuK5M2mYy^F#Oq9W)W@V2R3!^x_&Su#GAxI_)l<UzX`8^Mw^Ukb&H{FaKQk(
zIx+t+fl)fhTpf|0FdcU6_~8f0I3-Qe_{ze82XJ`%GWjnSB$TqjlM?@uq8dn#8YhcV
z9=cWYo-x3mD3z0nPtSVTNX|K2jC9V9B973g+kV+>(igB!n@2|u$JSpcfe#0rcd@`k
zQ`s{T6R9A4+n~znJuYTRck%8!F&Q4r2gYX_2bFMl!F6Wh=x`YipshrOh{LrNSgm*j
zN<IYzOZ*nHi`Ee-4nm)*w?rL@KGPc=O-<6<`MHQxrx2iJ@-Mw8T3>+oM|B>5Le#T&
zAAA>KnFN3@#l+MgZjz$l%G}OrTMv>UDWfmgTUvUR6u)2DclnpY35(#bWD2a<j~}~9
zFvIh66_AAYLg^EcDwA^)uzKT@{uGf{#TqvdYuu@=aB|7$9zxgXS~AHe9$6_hKD@jC
zP=_F%KsVOfsyZLhN=&26K`K6X_h%kY^)bzDm$AW?kawABkza0ll#P>PP~tN5fv8e9
z%c5>{$>w2gq{L6(w?<<xSylLK=zYPX!sl@QAm(|_HG)A$95zNOCtqDLq_Y)W(YX2A
zNIc?6S?%tnM?2RhJ6liB89<zEX*C;<I@Nh#IAhF&`{N`HKb)JHxt6aPNbuIr1>w?2
z1d3}f-2+0jk$Tz{K4E&^mvMjnR#%$xJN-a|AbMByAa)L?x%uO0!cACyjgI5N(bG)<
zXS^9J*JgkT*p1y6#+Lp#eYu$zbC5uhHXO)JwxJONoVtks?@!idNs|qWPVxkf;IKcc
zsYcFmw7(T;WKW-p2`MY<w*B~HCbf<(UisqRtVw$Sk*xECQ+2306ZpQc_#5a3V-DPu
z7ne73NqR_td$#T%QN~V(Ym>syaN~U(sV06u3cg1>^=d9k&S9`F8ls46YyJGR__#G_
z(+u{TP2K|>ehd-DK6FZ=oWUZ;aQhv^9dc!?Kv7R4XOQS%v2}6bXICeak$ma9qw!wr
z(4hk@Kyn;I*NKz3PWSE&X=PdqH%1tdA?>JBO+K!jL>5QHs|GjFcixMtlADk>aIggt
z@g&}(F_0mlu0baTU>_9EW|3gLk4zena46>!3&IlREaYG<cr{J?Ib&reFMGgn>gAdu
zHlv)7L6_3*!O(ijtw6R43Q<Vfy2of6OiDyPa76^9EChI+O4NsR+^#=2H_lo#lY$5U
zane_J@y;C^R}X@SG-mag^CTW%-i_ooI5hMH{3){|O$h+r{x1PbXe@3**<x)F%D+ra
zSX)bWCWplbdXz>SP)$r}-lD}+I=O^%k$VIY#>T45Q5Xhq#WE6^9z6+@QeWAlG1I{*
z0A1H*Pz>Z9t3{vCwQD_|8|BAZKo`=`b?ABnNNc4e9sjk1^d~~Cw<VK6wj&cgNu)kN
zFQc|U$jv3EtaWR+U4<5)7d3JU>}^t_N7Jl%vwgGPy{pGJC4twWXAzxR!pUgvn6%{~
z<Ec7i$R&+pBa6KNBe+(LEzNYGq_I`o#7U5ylM_UGD2;^hbyYakLK11asRo_29u;zJ
zVv!s)<U3ZRd_8BR)#O<WhRLwRePsT!-?eUipZq(D+<GKA98oW+QzZ#QyG4>+`n1s`
z^R(A88moD1?eauIM@QljwClEGUni-@-Z@WaTrK%XCQJxthuyWP7SXQ?-!P!%!bOYL
z9X$B**RKboqBaAkPQl{f!C$}Ezv*wh6>&U(MHNJ5!-@8CScx%Q)bI0o<X6Fu=aB)m
z8n{dVetAzRjgWVWP8s<3D*2^B+Q59C{EPBV8);P_wWH(|Atz*_H!^(+5pUS|*gFfB
zMHHOS2oCn`Sn*YYwmu`^8WP~F{&4}{7NRAqx6@`}UN-pdMS^@R$joYWuZo|Se+Jdx
z&PpJm5Lt06{AccDG_aci?^?5V?JA1<n+FZqwRhZx4s|}nlSINc2nJpxhMh*moVea!
z_4B<74H4D1M*F+k-(Q=$qmsf|noV4!PWHaLMDWE8X20<C<{_L=yx7x}RlynZ#~Z?W
zGMQlt$<Koe%TI?6Y4`I@bqd4&!#y_)ze0}0?HI;2C`wsv)l~vE0C+i+{W-5cy)+zh
zl@wOLVUHfETstq@J&WEwsRjDV1Xh>+n*V-6&{mEAAm!U1?ddaji`022ns49h8uK@G
z`8*E&1tx|P7q_n?a@F8BTy;E&#F5u-rC-38LCaMXSgD?^q}4=TR#z<D{e!b(`S<Ob
z@8`$u{|_IUy`R-Sl_9Ruo&<JYNulhzL<^2k#nL2R)&o;|l|l7~fR~rHbr2z2Y{4F<
z^c6N0yN+rKwU0~RM&x~+ebVOwh5h+t%YQ%Uf@I|g#b99!O9lO435*Xmp-x(NP=?_~
zOq7<2w{Pom<aan*c3%X^JNEa#I(BbFREk1n)MCq8^lh77#Qoyqxz+6Azdteb^D}U8
z8I=pTbCI-PY0$Xw8bXQac2I$oN#nxiv#Ku1P5>k)at&s&mENjqQilksGjh%L3Q?e4
zE7Kh$K%QL=FB?5HmEPlnFm(xW>`Ny{?2=uy8~fj0kNUv$!EgVPkvge0vB2SZ2arf9
zUJwVOhvzIdg)=Ds!UZ%Dw4aq8x6-lv!>ml_BPVe>Ur^Gr{N8Rb`i(tXUBOCj<2!$@
zsMCBnDyQ?=UpHv}M-DhNL?#8&Y|#kTNtyjNVZt^JJufIS?rG0bfvMFM`y!{g3_+^i
zzI0g!`2-h~)K6=!D<j3_^|_sx)gEij+?R`H8}$8YX3~MVOEm`D8nu3e?F^riSQ5t_
z|MOVG(q8|6*ALuzZO{iEc3>HvuOAArqVbr%tQLRp+<57$Uz8ZTV8~s5HS+xX`iu#^
zfSr;2QBZPF!TU`wl00nxL-6Y8|AGb1(CO24t5ZM6oZ@&tG4p=Ck{{oXR&QUrgm!W2
z)0$pML7y>v^5g^S%BC#qjKT>a!Ovi<qR^2*SJ}FX|G>vKW3=j4yl51Kb0s9hg(wij
zj}HbGQv&!Sn#;C7+4-NP95#{-pTQoX%wKc-cz303hYr=rzt^5V-G|D!SF3iVn+^%h
zX8w95h31f5*BF(?OZwhf3Q9cv!S&!W>z1-!a`$b!qpb^MeJEa-<Fd$R_LX~b4e8x_
zYJVl3sC@Ta`py5Wbla}nWIk#YxqD@$ymJVb#im$8v?!g@9QGocxf93!zB{q>(`7PW
z!XlyNn%0!n&daJ=&nfYW(|5A4(nq6z4&2KL*4-iM)PL7kR;a8mQE`>6ug5#e*OwS8
zXJpIU>#yV6UPSMpNJnJ5A;zU2;IRUA`8-0G;>C1;V;_I=##(s4>^heEjW1|rX&E5#
zH%w<4Y_OsI!-Ry&qBf5l_@97ibr!A36;WLAW~I%IQtFF-6(b|)x!Z1Q<v#8~Uc8#V
z14Xr>Es*lzvbes%%%*acDtY7&ig%<@PHt}L-7SqVK|UdTwd_YH4V1PdC78UWNuaiE
zh|5;fQn-?R+>DVacMtq$4Vbm3_*(@?@CUx3OT@rTMxjHoMqa!oH>2k*tZ5&7`TBLZ
zr)O;SgJlQFDEQ?r(f}Gb00X|d!-mKO^Rk^MmN*Zd;elAX_I*?o<5WLd$pBK;-<$pS
zLA02L^8%el7io(n3YV@H(KjFg$jltVH!5_&yCQ)(<ceTR<R2AZRe!FO7c`G3av%w&
z`=3p)3qKtnc6?=pl6VUsz>L!6XC_xGn1gcruiwYu2{ribwEv!iuKP%m`B2fXC?@13
zNP5<C_uCRUE~Ur<aJ`Be)LX1$bP(CDNhu4nOLIw%<!4D~oygkv)LSoijaOI#>tziV
z36_(YXDD|r$cLc{BDvH|W-jT{Lofe(0d_S}!gLaphsgyd6V89Q=S-nXgY`V}?6Q;C
z_J=?zfNF{fr5J~Ac=?(L8{kTn2XrI-)a1(HWk;x7h5iYu?t`q0(l+Y0%&xt}cd;@R
zM4d=qNx>0jHHS*fop~o{k<hUC#jtcS$uB`)#iiuT`SUM%kA#^Zr=s$5g551!8p$Fs
zHH1uqKN&(gCEcVbY(g*ne23ECM7V)+d1YK$Uqun<g14kxaglNMslG%A+2>GV6vZHo
zy5tH!x~~72+V$;ox%{%%GcmMo$e?v0^M9^UlEPlin#;HOCVp-i4DD0!`l^O$R#5qk
zsd$48`k_vjei(mwH*j)pzx0xl1ySx>bkv{Pu4$-cg%D@ZmW?CB?fz*yZEyIT$8~0Z
zYJIoOqKAKd9T~aXX~y2;;jMPqsQwt{qaN3=@%)-YZt6I=pNU;r;PL0{SH3gyU%SPL
zHA(I=NB>*jKb@K5Z=8Jc+~mj!qcP^uO``fgfbOK}#HuD%l+`;I+#1*i*m#7x=y6=}
z<D7aJ+4}lfgrq)#a^yAc@sV>m5&IV0?B@`(1b=#ZZO%SlU3-3pchk_ACr=!dDKE!P
znzWyJD;*PC|38$y2UL}J)9$_17!yl;6I*Pkaoam6ilUfe*>=T-1+al&2P}Y;#1=Iw
zZYzjj#ex)3RFD=;1XMslz=EABB1#bvgzq<-BIf<hS?jRYlP5;l-1ooCTyxFLHEZXf
zzP&BBQi5gX3bYOTKK+<<IgiPTN>y1&c=-_e{<>bN@4R{$@>xM}TYHrDIe|0Io;^E-
zRAA!c<p+<VuZjGcT`ss*YTfgY@Jc$@7M+)%S7*Hgu75YA>CCU?TjVR^W9ln!zX^Ps
zT1oI3)$4MKni_x>bLvI}TRmEKIjxI9u}VF$tPNzttnUUra~|-(>YzzSjp)|7qBW0K
zCquvwfP}`pu2w6Hp4hLTcJ^?#?^WMtl~a59ElDkq@3f26%Wzk!R<B;|Sa7`EB4AzJ
z8MXX-Pn|X`4Gk_ITVo@mAg?+t)yhRDr<}QX<ErNQ&UHljT1CP(4;C-^+_b4vQv+`G
z9!mzVLC-41G=-_dTM{rCcYbXgL+ugj-EV@-X)$&onb2&`*Ta{&8aO%1-i3U6|460Q
zi5=8&#++6EKD=)?v-shC&Fn(cxmk3f!_FNdut++GlzV4F)wqtk^rx7LCV9nI_6EX=
zGzN;rYT3Wut+YkeEuK&+gtxAHpw^_;xE~j|wotS7*S{atX1C06TC&XPSIq}FH=01^
z+`2GvMg{|!$6-6wN@`@r5(;P|GKCPzv1cNiSMjy&V$P*U>3&Y1maKVVo1u(8zH68p
z<vM1vXa{Qf>xwF&DE*eb+vTKGfe}0pEf4Mnmuy1uV1?lMeN)EOcG)%b{mk!%eR!sf
zCV!?)zv=p1`={Q;rp>B;lk@n52}5%B4taB>BiYz-d08#ida{0h+)~VJt+npi@9sA7
zssVM@z9+|!vMRFe1BwWfU#)IfF=9=3@zpoMOORa%!_^JKvZ&N44BfM!q|Z0>{lh=9
zHNR!pwC?)?5P~|UBY)KGR`UA7MEU58*eg|l>EC})9n^`WQqL<5ofG+9D|9|jL9yDF
ze=6Fegqnz=`ZOR$_tCk+?0s^xW<AwB{t-pvVING{l6?S{Ff*TH$9_Vl(*j1Za?||J
z*dy_7jl84z<;QDo(K$EQMkKT%I%g!#Fd0uFiZ(rjN)?DHc5wDNcd|xjNG$%nT5;|y
zDq>wS4zD$SbLHhux4<S726aG>c^PEh)Yb>uOgC<Q6>TkrqUlizux$(Oj<TjUF}eEO
z!0c5XPu)eN&|)o49o~-^M(Va6zEM!%+a|~l-%nl2Q@QqAFw*hE^>&?$Q!k$@&zk5o
z1_+cNQKxa!IrSpW(xo~7yCJXr-F|O~6oVd8Io1i|2PP&B->lqPdCy0lq{^lxlNfzN
z@@|Uepepa4;lz`uWVF;?IGX81#w$B0B7)!XAJaneV{c)uX4oL5zcc^_+!I17<3M=n
zq9=|e%oz}mZv4h`g*;x<qeQaB6qL9dls<2Kh7d!YiiT$tS<O$SFIt30A>T2JkM_(!
z+uWXs<5o#V=4nhl8>gn0NT2<@4P(yo9G(^GXmn<3_rzBfhpv7eQeP*?q`MAtu#1_+
z>slvHo3RMv*VDhD;>a56{PcmEF(J13Q$WctcMj=o@bT`h(zZyzC+)_QreX@>D9O1X
zV*(00?i}=!Y!qI`*i<v@Qk9pwXJUG!LD?#;^>6=n3&g>lBHN(Hxl4dMs-3MEt%fGm
zh1iecc|+nea!z^JSvr=NJUs<ZmHpF#DjPa-8N)JzGLLPyx;vu9A_m8))WizuVu6T6
z8~D~yoNgO8v}jIQnn~Ho^nrCW(&4Da9K^_{4S8BT!KZw)?DV~!bA$V{6fq35Jc#2?
z^DMVD!)N5Py!u<(u9>;&cfL)r{suY`d0PxhJ77>_xmB|-ggr1jM9WBT5<L@P0`}5n
zT)Rgt-Cq2sn7L~o@ALKz?#^GCWaYm{*;SAEVp$IZvxCQJR~mu-`X8Qc$RhRP!KHQ!
zM6h#?UDiq18`bYz;lSg;!Gn|MuAM7qN6c7{&`ZVXhfeXRdZ{kI#)I9LA{0<}T5Uzd
zX78!pnR96zjZsu{<Q73c72{f_4>r5i>+<c!j&C=(qjM=2=;xD~x7kPTcgTS`R;seM
zTMQOL$n^-7$sBD~{qwyZt;Gotjw0lFWQ_9;!!uKFj_nh=mqc7T<|GzamQVkTpUrLE
zM@#oK=zXO<8@A?EGP1Ny8dch!wv;Q}#HH+^v?lRfPH?tw%OPu)im|JGOQEogqoqq@
z6-nYmF_E=2C{k_*S-G2+9xXq_#_LmBqoRW20<Y}%e)UtkY#$2ER|UpRRFhpaK-V?5
zRZhQo*EYtL9x2O47s_@gRRRMn*QM?rmG|N0F2B247&>c%7mlSmMK2BCM0QEpH8H52
zr<K95=Z@=de)@3F?a408;XwLbKi&SH&E@6?=B`mu`D2wA-Q^U?S#Jl;c}?<}VQkyU
zsq|&Le6C_Az;bEI0*<o>gC&MMT7GrerTg=@yLq0k?Qi*nk8>**tG|%PG5K&|3xl9_
zsN%ZiYVHv<(lgud=)6PCR%=$sd9{yN2G(T4;H(n`*~xt<z*z2qk!xd@d9*mKkCUs(
zr&lxR(w%UZx`!o+I5w)QK5DL}p0(K<ti`xPy1kGxHIC^A(L7<=w8SwPRD`VF;dt;9
z!lfBv3oJc7mE!4=KaAZlWA5t27T+D8*-R#{wsNv~`+GgDC=t={y{k2<&8m$h-xs~~
zthqvaOWu8}S-Mk61&)}FQk^<2;q3SS`kkIq(2#gGj{E@2uQ*Yrj~`*BH0C_Yr9%+A
zrg5iilX2FCBbT4p^}xgS@c+T|<)y-;*%|j2>>AJRCXf>?e38GV(hZNF0PeJY_Io|A
zN(&?q4yMoyx<iAHus6*Qf1~hE+K1#MYm>!Uq(O5xEapOTRJIs;9mJ5iFg-?~`ycW3
zaVW<JJlKUuE&GL7gK@-L2}CQ%^tL!GKr3R_BsLwwc_{ZVZd5ymz2Tp7Rs?a~45v-o
zhpvb|f)JK>w<=n1;$2^JkRLFRDuK(_X7#<vS$B?Jw<{W(baM=_C=(lqaQ#o@CwjJ+
zYcsibaOSa|*h^JBZe`Az2aB>$!^eLqvXc-VJ17?r`puEyPoG}%2dsFRWo{9|ps8EC
zc+5QR)pl)4KcvS#anA9C>v3e>4vv5;x_?W~O}@^Rym+I$t2Zpsn^7sdoYFPNdSl7E
zZZ*&Qtb`H)tPx(zC@r$3$snICNyi*@8(hzHgG`Ph+usx3@|IjYR0o?KE||8t<xP1N
z#nL=f>UKl&rZG6j+sOO$=_n@nx<R<tK&g16#WQX1*46yPh-~-(Er)`;1@w%WEu*>O
z0dJ5eF7q?4<tL&iX;7}Sa`w{mBQREb0Pe$him_^a{HR{RonLhTUaVe74dA6Po!g@a
z7~@z+&F@O;g4n}oG^}XDzUUSR$X!Iu$b;3%b8Em)+|to`X(L)_e&XB;cHsb()}5+B
zv7i*9yaPDQ;2NNqU)8PVubQD_c3XZ`KAO%+8l;Q5>Udmo{m)1jyY|38?^uNncak~0
z&TTQraD^4%1DmyiL8HbsK&xWXqe`^)p?=Mkn7uq`QH4jJx25fVQojrFZ*^N}V2$QY
z6oE(2P3^vK+fL1&?oaAD)4x5r!B|e3Ws*2K?Cv5EOb@(v_1DZ>oA<S#fR``-4lKl6
z?L{~Dl5*CkkZHV{z3K>$dyCf0`Z&+(uOzR4DIkQwnqO}EYkvw!3ZObUgiB*)abI=d
zGdM%%A~f@!Dt^qg*N@YUZ8VWm#OAY@SX<2U6gtf=thwH3;0ag=!58A?s=gu~npRA_
ziD-pE?PUe(NW+T7?Vr8eRcT8+W_5qwmPMErqp<b1e{ZV#F;4cY?lm?__iH&547qv(
zqv94whRKY?<^!e)eQ_qHU$5RM-R4(c2aH8?N=?Lk3T&WKllxx?yp7{13yi4#3iGYG
z5)1N)X-?u%C4%`suSBy(J^BP|KEiZ?&7Gg?Eh5R9MQ$$UVWWX5F4z$>Sd6+1`>g)*
zzFB!WW(EA)eD*?;D?yMc7t;36?yQn12Hwk@8IDH5HWptj#DSV8R(B2c@bf`_+8}?o
zKp2OGaykCH5MdFY7ozTv+z{(G__fbR!d*`XPwhng_w1D#cUbX|Z*6NDyz$Y$HXgbc
zE3t@@iUCW+jzW|M@Vw1y-uco!)%S3z<#&TKkLc$`b^N>%Fym>Uias?D><9DB6^E*+
zDV9q<)r#Y@(A(z{Q&Kqbn9+4)<@N%?n$OW#--_px5PymcD&Xr-0FuW2m=5kbc(rBB
zED7n>yuMF(bGGJzy?^5KHW^;dj~!x;uw%=9^M4buYo#k5SK?~|Sg)E!;=WC#q4d{+
zc#ESJLz~`{*XJX(RBAj(AN~+6;RW%^shW1|e!-6w2G(Co?`=dK9qMv`0DfjUn_Dj*
z4Xb0t>1$DQ?G9;Gct}N(4%MKxnCX2`MCR2CL9LExMFtS8{n6m_Q8G=KG`;)2F(XE4
zR^n~3G4OIKQ97zL-bO<*k8IFJ1nz$=MP~I<IQO|k0E&^QSPBxXO54n%TLR$C&8cyy
zoBa7#McnYIVUV7AW6OKM#NO12(>J`3glVF1x4W3NnwQ>xa;qvcq3}+}NsOB|Z5k?7
zCKYCae6?)a7eBG85<HvMiRU;>oj%=+is%QH-i+mt1OFXmD-B!q?+OI(b){OKnZQ;{
zh>NK<0#y_Wlc$1bfm5j~oJwZLJp8#{+r1dflhW+CspdP)t9ESXJPxmU*o{qHL1)@-
z;@VCsEs)x4%uLInb~fVHtG@i7*Yr{|b<o*bDS<FFOR!lEp<aw!kHqP-o7LRn*5|6P
zhkv?2D_A3Pui<9PctHyqFX-%>Rnv7Wy;A8q?vL&dDNJ+`^zKIM1R^t~s4I(#yEznP
zF<`hs?~3=9Cv<MKy&^bOw04z9=<cX?G2&#DMvNw*p3ha_U7UXYxbv&3UI=$lt6xRT
zV%qT{Cn61~8$*-%6sRd>H1j*GB5KuJ_}>$;xu%Oc@?M{?A`1@DgH}PeK*I@oonVw>
zr_WwCt>#;uORn}&!*eWwh%7@q<)v<cjN9tRiHO03&G_Oco>r~HSC#AF+IQYB0@0PW
zStk#~12L(&Xiss}s+#7~zfAK=W{0dESJeW5Vw(v3H%DPWFa6*GwesYYZWRCB?y#ke
z8nx&z?*7d1syn>Zf<RQL2;?QC@PDlOHiuvIp`iu7I2dJJYq+YeH((+XqyZYf2(uZy
z`c`>qT)9}c?lp~I{ekB~GAlg)Hfz>7l^q*TMeUL#q0w+4>2ioy&8x9nT9q`DX_I91
z&*hR(h+!dWb*bRDn~W&;pvff_;o;sjFUhufl|$!LYGiWtChEWYO*1j0+rBX^QLG39
z7-vu@+51YB)H%O0DYU{T{;@;qe)FzA{p6b|ki)d2F1X+|(fA1Sh><zvjTQ{b4av`g
z3hbhi{LQ@Un-_69zc3+#b#71kI_WI&pgyZ@c2~H$h>=U#4cH-vZL!YduR#O~o0Gxk
z_!&{5Fq@u*!<00BVJ)-&sdj2J4FZ=x9;{;?JFa@g_#ty<x72jG5y9t;?$%lwBF7)A
z(Dbm_fTdqKzQ5qP2w21$be6%MXU?6IN+0WVSMzj2)Z3(#5^z1Gugwi^t#HggB;D+p
z<M`3auhY#oYfoAtZwoQeyz)bFD6?&9FU>=l&<)6%R?pzW&l1h;9dnY5kalDJx5KHk
zJRqMZAxza*#H<;{l_Ne{Z(er!u`UsVy_g~hlrAbQ2;_^)9_eBPzF5(R=9N3@O=`O*
zlDAPzmUHI@4OHcE3e2ed5T^18(e)R8Bd7AZJNx;74aNOol_l6yMvou(c&{uWEX9|<
zs_O7~pR8D_+XNY|1aN?s-g6Ev*fmsFem@b_9EhpvuX8OcJ=@HV1Yb7Xb}Rg6XB7@y
zc4I^1VS||5IQ5Hb^s(~7o9<Y4HRqBmG+gyUP?dccByo~Qt?8&|4y^<d{?qsF+b0Ll
zb*TLsq(Z`#(GrW%Rgn%a70Y;yX*Hj2{pX*{A?*5MAvgi<OvmVgbJs+5Rj=g#s_OB4
zWYjiOj;*1<ydbYsYG}${5;`NE?83Z`(=|cC2g}N5oU~-;u#cHETWE3)>JwADFG5z&
z0$Zp>VQ?jHtNCruZdKxs;XamxcgtA~akHo&jZsrHFc5lnG&YE}tZDh)7FRytktEK`
zNytNuQ>R~`lZrVi*HN14E7l04Vf4jf?5}hj=X8W6rnc1;hVm2#OJxN-JH<%}0y))$
z%+C8NA24EYBKSls5$EOs52{xBg}$G)6KY#yoE-`#fANUlR-THk|4bg$b|0sr&DxYz
zD)p2zduibEf`POxeHU5tHz}%qU=qxeBu5s3n=SaCdZAczu+9`CBUd0m%{MxvUG-B_
zp_#c!2jKrs5>TtuuU@^9b)YvOnrmLuftngFs=QR$n^rhv&lZxn`gXv43O68Ze9hb8
z->nk1I2Vqj<Vzj}K6erOHC7$+{^1h6vJWqHOe35n$gdgZ`FE?jh=s9L3&9zvT8MP#
zI^RWcv!>Ss=ZET7-Kah?SqSQ7iQq;4*Gc8E?!{K}3?0~ajQSV9>AX&bhj%WV{{OHR
z3(Tt4BD88PVrfO%d%!90?t57bEME%z%NL7rzG^WV{la2sumA9t&g^dN*<5uhWrA_v
zc5!#P5IeXBN!xBU{FC#J9hC=g>7=lt(A-6|%jrGe*hJ%|1xpBwgXcL{^XBVzuDbbu
z!`3ddf)164N^>%VXBkaE@c(l%TGy#u#QW{5?g*n8WNS4^GL$n8F4(EpS-7E^=XPmk
z)!pB439Og_nRA;=pJ!g)#9{BlNW2Ma`@hzsL)FqZGOk(=4}Cq1lf(agJsRz<T95ua
z38o}jx3S3tGMu;)t)YYKX{|Gb6f>-OI9j)=dau(<FU05u!Npla9D9L&RcbV6w1u_k
zbYPB}uBxUrLaK<(*D8Wih3*)MYQTK*Diw=C1XjHmHE{6KZ>kpKcp!^m4$UJ;5<}y%
z=us~q(bf|9?x7>F`5aBgQe)8~KE_u49!oN2abziUZ1Ylu=vJu*K3UhXi^sb8k`F?=
z^h8bTnKYwnpH+j1D}}skGEu*uYD)Kg+5nL@q_obQJ=?toba5`MUOw$hoQ)LDMi9hd
z0tARM8UW-f?pZyycQnL*{Tc|`)M0+*p+Dp<F$9iu0zsQvnV6W^vj5^qJHd{^Kz?!N
z!zv+}^Z0mU`<#7n>1O1JyY(qgeVIoxVTd4G-1-+cIlKB0E=)u+g66$wNh`EnbH9Ts
z??&qtsVDYKxM^g5cps{uH8*Sc?y7~~v-Y3wN1fokyBxNeqVXhvbnzru)S4vEk*<}i
zH(c#=`N?3W%UNqORtS@*@*t9j{-2IT@Fjqg!vFkfU*A~vYDVz~$-@58p!aFpVJy2W
zLyxwGjDAH=j_Y?Rx>mh5E@XOF>?<-M-jGbAI4YpwgiD|h*(M7nIc8{ulanaTa4Iv8
z$rmW3$4q5|qWeIE{+Cm%#>V7>U5AkcHc8V}m?mgDmVW3-HvZJ%HM1DpI+;Y``jfIX
zM>c$S6U6EvKi3W)BV>Uj^WpRtUBclFct|Ll8@Dzr|8U^c3gS+V;qYZ*p^`LC03W(<
zm*_&mMcnonS~5270rRcJXi4_f#4bg)M64Baaj%&hFQL()Ebl<La9lln`WOW)JXt1b
zslA31RHOn7E<`$gN_I*kukUO7gnCu3qjMj~^^nq6Gs16DswIQcskZ7uDM3{8*-Ize
zv{I};O1`!SAoe!$q3$N-ACo0kV|=8CU_P1?Jgu*N+vQwR7<t7Q$@QS*KeS$3n>(|7
zt7{1qrpv_2SpE(D>j5R{l+}+=r6i>gq%>#U^GyTds9$o2pw56CxJ3{dkJL@+o3-J$
zcqW6QQjpOA0R{9y$pw)yK|^vb8hVgbA}`aAxV@NfJ_N_v2gazp#d}8lb(QogHug&V
zM*&%UMB&({ttnv>4i6^OgT$GHsN($+DEn<=|N6WIkm;_i;%!UNM2t&^U-J&<uJ0ll
zM*|WoAq2R3@BV3GIC=8!TfMLLq&CPn?UQ6pN()XI#E?agr-zkY8Uh45`|nzfNvW_a
zZnC46KO+n%8U4h;mqS8It>Z$;x(MDhBLP<YZy(3q)B{PHU{2OS>e$o%(us|d7v_Ug
zVCj+fnii8SjPcGvFaGRl95P5KS5Be_*}wJavp6bIz&2}!FFTrbz0P&sPXo^>b_I-Z
z!!mS~!WmfcS384YsMAfvH6~EcLvgDPtMehF?9*3@Z8qFj0W8JQ6CKs~i{2`2D0m5-
zC5sGEU9`_$@(b`;MjUjtWSWYL<D5Js8!-LRXYOLAPTB~}s&~2+Ues+d$0c=7AFERG
zO2xPPX|h?GqLto3sp@@SYb#ac0jxMsomFLY#+nl?dfNN27AHtj1ySuF!GdqW!vV>2
z_9%<_j&QRT-zc7yUm!zBu|_k!SEr8$kee|RU=17+_RT^uNy?ytKsVJMW>~RPb`7H%
zR#3XVcW_rpky_+Tboyv00Cv`r8fP{};A3kSGW<u=-@!5#xF)aQ5GYmn$#@n&?(uO{
zr9Ab@%Kd0TB)htV&KY-nO47MLoIio)LIgm=TLnI;qlh?QbY<l{i7VtOT0i|TQh@j<
zE!z{BJr$>ENcEJJe!y1mQC67?syhbzVaGB&B=Mh$s$AJ^J2IlXSj7-{*}#XN8TN=*
za>sPrbM{Z$mNKl*r{E#&dP~Cm?_&dCC*E2Yk@*@LuOVp&!-_T~zoLasol0KG+vsYR
z*P%S3`;%1gJ-|Ir&mFH&oI57`rLYiUIwjx`KfvU*?Xk6;Uk4@|OHe{ptGi$fv+xWL
z`*^`zM07pcUS0R@h5M=-Esbb<$#_!;pDwv}kYC`FM%8yCB3XjOrNgDlz%3H+J2xwk
z9`8i`<%wq#54^cRlCx8B>cGh?Xr}cpJ)G+W$>3qbHCbF`P`cX^_4>j7waA&+!k*{=
zFogmYH$(7qVde8{$1W)DZ(N3xm8yp~7?{`o*T2J`KV7dQjtPa^J>07HDl0^uGDwyd
z$A^+*+DjHT(N0sBQt_}Ro+hgrDFRKQnfaio16rCCJ&EGtStVtKeoj8Nr8z5{>`x{e
zI~J$8MN%?T8t*__>#AF1pU{GM^RM-lp6i2BF~!kBB?ARhGmr0z6@$8zlP6iH@Vo|{
zNWBY5FA-ua8C)6&`T==Vv}p(*-}La;5uGy;-;NV5LLQjek>k}+!<~_HyV=<3!}Y)L
zmiyl86{T2POVWZs=WNWJ>h)HHb;jr)X|7KY)sp@$USp4EY3h<ji)gc6UiRr}`A}5=
z`O^%_(u}eSX_X64Uxmg`n2jG!G)W&Nop&v?k^#ID>q&utB%USFD1{PaN-x}m(F&A`
z2uZStofWbts^hM4l3k@P$}#Qf7W@IdqDH>#p3R#Sz4DKK<!-VEls(<D%{YAdracHt
zDGV+l_(7a{;{0BpU%_Mx$a7MIHBTLiNc5GA-Bo<-_lET0lb`w3(gXd249tEMgiruY
zPD?`zlY-s^X9vI>`q&s#V;1w06s%)bV0rPawwS)>EwqXr=Q<7)#LM^jh7`n)RjH|5
zDKQBin0*5{I>n~8;#>fZB|OP*G6i|ua5EadvZ#RPllD^=md=h0oJ8rJ7xAp@q%dpZ
zua|h1^Xdzc%?|qTV*8S#pSdgNM4>G1{Xxf$Pyd<Ev%5lbu^;A149+;%pif9&8j{6I
z8|2$zZY;G>vn?3w_fDv%i@7bjQIyJh8WPG$_L)MFC}_ff?khIT(p`M#$gw60(@^3s
zjeQJyv8_iuPQ-o9LyI$~PmiV_W&4CWrcM00t?BTyCO$_SssvI<0oX0r*h_1!e_rgF
zy#Qwsc3uS9EsWRAQ!k{Q+!oMomF;smouZW}fS24-$whhU`2pnz>^jiXDDPzQ2Nq?C
z7L&44$L`9(&Tt*~C<P=@$T+<?lsxPiTG)Y~_G^f~Uc*9-$WZuiMeTli*-L}61%Yt_
z(nuoZVYL=<U*>GDk+M7Y9p~6|s3CPgWGK&Fd{49^GpWHP_Tq;nS*}Ng%Z$Nh{LZts
zAy|Swp*IY|%i=N|JV?!@V@zWqgxsmib7=OoZeR`;st<MV(;rAJ4(@OH;khfwg-oQN
zCxHY18j>~pb+E!8yCykESpK=k9ptvdlp9blYc#gz^YxbJt8)jqGRHb*zyoecK4lc?
zw{_xf(-9o-z|6C3au^Srr0>W*4!05G*;^$WMeAfi4?m~B>$W&&rWbe;k%19i6cOF+
zxjdE*U9b1P`dUqcbE@D2(EK#-<l4@CtSF&O#dsO@ut$qfmq2u>-TT);)8Bv2Aet8L
zZP6B!?j|4`Gp<tW7pLxNi9o>^zZK{IZH~7cuNJsYB;k2A4kIrQ+*uV?@;EoixUXV(
zuSXmXp~D4^*p?M%=%BM(r=_CnZ3IlDQp+cCD2dtEB*#pB-c*h_=OjfWUwAO19e1ct
ztbYFO()5Ah(X=V&F}rmz*h=!)mc|A=3K-LU>WmrYq#USADl&gh*ZsJdmyq>~4%_&h
z!7Zd=?R;#9t$Y?i&iD0_oPLxmN#<A;Pe=%3@^;B#6D}yZ)MS06x|fKuQQ&)c;yGAH
zD`4sg7L&22V|5?h9@SdWG#A12SYnvKq@uY18cA4^Z=$Tj+$Xnj^&N<qksni6ao<@6
zES6`KmwLL}rn}mLv+^E2Ez^}-5tD6D@*qz0N2!Ss5`xyqZgRF`vPs@YNfe=sX14-=
zZnWsf!8VIy@+mf55V%<zR;sf~3Ih^b-FzF@=;?<W0~;=KF2D68bW72ka;G*<uNPi7
ztG8uG+MvjO3;%S#<o<(|rB#%>qpM-pwyFJ3Z42Af#;T!_TE}S2J>AHkzVB?_ZOeoE
ze?53zT5>(@aNxYw`43(@UhSP+Xcua7Dl#Kq%i_JGqvF}O^t4Qdt4c*ykL>K};lJ1R
z+WD2j86{F98P`lx)Xvx8jE{4G@3xat$%RhU>rwG9`Uiu~aJZb$funa#Ui@5U+x_<2
z#(wFksV77YjV5Q|?@4ubLaj2CQ0Em*9VlIjbX~LFb7rE}kzuG?OEKJ+iPJ}^s?UUb
zpNbF|lX`k0HCLbPtJPtOkSr;i(X+e3O|l_sZ8P-qk*;`>md7O`w*8CQagf(tR5Fqi
zxkGoeQi{^<!p~3h8rPEWVJv;}Xpx*v@-y+H;SD@J7Ex!$aC>xn!3U%qV1N-NMqU&W
zcGcIPLoBri?a<e+Uyp0sNa5U&nSHOg6jp22#NM@LROCHc3z!?8mZ|rjl^Goue-A9Y
zZr1(#_oJ95t)7bTT~EJueNCM~wMy&!Ey?5TI6gMavf)EzBsD#y7D<6f9>VY^P|j;Z
zD~e2;Ck~i~8=MM4ZdlZF>S>qV;aJ$lF_X^C?6<&?chbrK;eroqe8Whc`s|rY+EC;5
z??x<`yC&(Wy;G+%kkLjUPsPI9G|`e8Po~i1EZL14_eifr^=Ga%W#p46E1Sx+csfmF
zx;(;6J-T(vI`2_P?d&UxdlShNwaX6s&r7cBd*L2E>xNq*5*8qlV)v;qxs!?$6<-d9
za2=#ce@EP-vf-HIHiM=ISuK*$<vhd;Bvi)jZD^|^DH6Ca`Uh1!&VjdF-K0xx=aQ>-
zg=gvHzlc--Bkj47ug<?-VrJ$=A<uLQMC^NNDvD}6%(J7z0{XA6$W^!)Fo5?ghB@!u
zJMh~MIswgIjaDd9|D+%^Ix=z;TE|S5O`@vnuiA~so?Z=@-9wL|-8kE1tVY`F*QEyy
z|8u_DcWhc|*Y(cAn=;CNIwor&lWkw28m2!P&2t9X?xpvGl-Lm(J{FT2Vj|c*5*4ed
znCh9+LQBh+lEzoqgX0kug#pzs5YZ%JPerP@{U~&r$C0`|-kv8=yB|KB#_c3$*}85k
zu$LD^keS^s!dqi0Z~S|2^+#0v%a!e?sxupXYoYLuXv!E`{w3OwDY%OPy72Uy=*nn!
z<vl(ip20qzlcs$6G0vsit7bj3Y_;h<OM;0%13{F9<f4q3H+K58Iw&~U>ySNSD^LSV
z#*7=+|L6bpF&k=pj5flt%n(0LD&{nYEt4KJnH~-k+R<p{%+YFuV*CJAZiBjg`7jX)
zJ8FE26`Vj4`Z@ex?@i390~NO7N|EI8qN(06@;}?Q&eF)Oy3e5f3iKxLLNFBK@w@i#
zKNZOZW)4rMy&hqLac%pU&;WjxN|PQeoZ+ZiZ@*!g7Ohy(r(V5!&3et6$}SwBJDpM8
zJ;#nM)BEfj<blU+^?Xs4D_i)GpgWKTcj|*jpyx3ILGm*lYeg7(Q_XIP9spkH6t(E8
zX5FXuU9J4$eWTXi9*={r_;R_*AY@VZCJh}r=0I;BVDw^6Q`3bC=Qa!sJw^2X3ZUY`
zXe_dN-kTb<=+21wdv~ioK=c3YpZfQ&CPjz8v7reQy@`pQ>+`l^iuNu~K|}){V>+iU
z?fLVgc*a}kIU!vTK%3~#^A%BtrWhKoMuAI=5skx1ToPHrOr{XtMJr>6ZN2Z>xOjSc
zk_q)8^Oi+kYA@p?zTNZ1ZbT&Ps@_QF{*1PidoMM%=x>bH5!+Ds`hg*bY#O~Ha%qq4
zRlK!9&qVtn|8c-WMg-o4vpvOed<EWA3($+kvQiCp7hLW~)6K1i4?k`-{~W#Z`p%^h
zDPo8#zJel2-rQpxU85iSMs$r5l@x9UeBZLcs0AVJ(}xCLd`?6Yq%MwPJ+csJyF(!%
zd*6qogi1|+QvqT18GcNrOK;s%wdtB)5RtI8`W+Qa+m+`-gzRCsQ}`+w%pykTA_FFY
z<$wIrP+|HD+Jhk!;rjv>42jZa{=7M^susZQc1&)mA)<znGFQ4$Yu#}SH|RX!Go+H=
zII+dHiuxod!(lC$_l~DqLV}C?xK}xi0d3I25{;)me?ITUMuoFI?yVa-Wr#T4qtw5P
zO0ER9y%2rLY?!0Y6KWJO$>%q&@%fjK^G-ZtI10rg7iF`qlZK;o_mNuYMDB2tQ%LcT
z<_HDyO)&@i{mNO~FWniVq0y!>EOcu^^)ASRM<o1Sy;3d{mputFB3QnMmP02Mr5}I&
zv0=jiTJZTL5&vW<6s{-;9A$i1CRE^AT40q~q{=kbq}Qy4FDkRr^09om6itQ#srbq}
zO49QT)Bd(0`Kb(C`e2HTze4(Byn921OIw6P?-IGCG7C?IOW4mDX06O#EUf(TivM%D
zT;l^>VY-=c<Tz%`7`3wS&Ewgqb4|vzkK+U1b_FL$+e#=&Nc9OkqvS!`t$+Lb);(2E
zr1{9<Pex3xI;Byg!alv9O?P)UT#&i|QhhO0AQCnGS8)zsn&S32gEPU!$nm%;Ehr}Z
zNC~L?zM5Zoooj3qLKVkmm^e&Bwnlw|H2rLluR2KYY4@+4PsMO|3Sp#M$>e8C&_m~O
zpG$M(6K$IqSX2?w#S!dCB9csx!yIO-1^YUX_3S*Mt|CgcBCObp8{bP-ZaRgj&04jx
zd@;4=GbvmyPpaMu4=d*83Hp<fk}6Ba-6HSt6aCpuy3|pmUZnr!G$q&S6EII%0@*N8
z<K1aQsqvNfUG+Og7%m6%KV%q;^33|;ypOQi3u&$A_d9p_n+|wLgap#%Jyde9;ii`U
z_sQ1ElR3Ac5U-zX5)A+WXe^|qrFBARPnx?lM%3d^j7+yqo&LSoPtpFVTw}P<4yu6j
z=kH>}(-3~q{Khlb^;uWf_%OP#lgfmd>#$w(|N9#UzMZ^S;WGB0fBwnDT+Ns?$p!0s
z0t^Zi8Y4ggkw_XD^CAl6&f@HC7>mZhcyIC$c9fsqJHKEQkoiNH`6Zm+<JBx(vz{|2
z5sDjvi4ytSX{RT3h;awTWK;M+1Dd`4t*1}RkNTZ=DlZ#so&Fc&mzp7#JYz=k7T@S>
zs@9K^q38<id^{5+gKvM}jOr6$#Uy}m6A`gRPvSJdO@e~rkyECq(JI~R?%q>H3zeFu
zT_A60c$!)EGRm*`<Hs>-vKuFel5e68`A>9>ox`TB6@Z>vo3@q&AdokGnhhhLI+Z&#
zVvZOlTs5#c6L?7rEFl8Ty205vMil&tqU<DQG<h!9NB0(qn8vr8|Ihy%kt0niBY}W#
zD(|eC0zBTURv)7X5vicLc2E;j`A;QpF3h|BbuXiU-_-Yz=wCnC1>yZ!sFzITWpLx?
z$O%qF*liC+QF(@TK#lc#n=0al+!%=`r=m4m1zAx@ks!dsQvMEaLo(aL!;ZRx@$TO!
zOvmsG<AG-vS7%TEgi&-uKT(XytC?S{gNwHqlW|Kag_ZA6o<SLp#uZQ2vm<IPGG1vG
zx<>TR9`X10w^_mrt9uM6`-FW~pO7dG$;0k*uK3GTH>!j)*($k9{ISQ{wM8ZA5I?_D
zyV3D@JG2nUFO6qE4T-11KD=l!H^^!PqJX-jWOYVSlsg%fU#RFa5MUjyhc;=_q!1oY
zr4}_){`Oq>WLtWpA2mg9RyB)K-_4u)@hW-8NgZ&mOfdddYu6q?h03nzjSoqRLxWo>
z^rs;vYf%~=hLEyINR1sE)~TVQ_%V-(u8l8IUyR4l*-XA=r2NToZL91|#S&TdvNo)T
z$6m_bj{JEu8`2GwMha&-p%08)9}B8bJ~AAofE>N|=-oeMC82DT(lx3tDlyAau1v<;
zO~aQDV3feJ$HCJ8<xD-=yt#3^vWT8UQM7Cvc<0SGx%Hj*2N|EnK#2+Jra%68(5vgO
zxUK={BdX#-?76e|iW07LR4FZ})anl7ne^ne{wqA?$Vb<&U*D+WiLDx=6ykcQyd>vO
z5-b+5foEycG+y_-ZZG4InUtD$9`}u6;x0ntY+$qc1boeDg0apMzEVUrLM-wWoYiPG
z(6c{!CB!{7&xv!2pkC9TL=J($Y|3bq@u|-}uwlcmo2_Xo^`)lJ>*_G%CRJ08{<5in
z?1)!ua)jsCuV1fJfjN`0?g8!zKs)h2cTEu9cY6oe_hu4ug-<|P$l?ZKyF1#w@?mV2
zBqq}u+D=1d<8JSn{ztyu!}FwS;dkzFO?^lg<9J6RJ+nv2ta#KLN28e*ysbqaU%<B|
zRPVN}Tl=jNv`9_oNxprInu@RfIPrw9Yb#Q_6N#xqLqnBWREFNctH`CP`DN7y<L<nb
zi*fhE&&lAVo6*r(zvq{iOe#l1@FYiOteTGdPD6%VZJ50JP0EACD;{3#HrpsPB;>F6
z1P<|k>7kWbw*$JKLVEUpNx0L0Sk^lB5gZ#eeRIe&9Ks$s0!v_jZp;)z_ZMyGk|ij5
ze2W=<jAU%3XLVxLGJ?`p{pN3^2G);ysqwV=U=9zwxf;o1^>-{?7fMo@czd6*L*X>i
z731kf4b63<IN(=4V%C+FN7v*&@sXbdxF(WXJB_rQAbBpHAbA9$&fz^fZ_oV!fSUU1
z)macVj%My)l;J3{J&U5pk1yGuhhYv^<#PSdY0PO@vRN0*cECA|JJ?E*w_ji|AV__J
zviCE;Y^tw0R^zud?{!w5BmQvhYxHF4uKKCw8Y`kgdrEYX=MCO(l>>%dWcw+!ae=*^
z-IKM(V1XU)si(TIey0*0hE_RH7d9^?DyA*(B5Ib!B_-Zsex{uGQ{qWng-Q*<c!o^L
z!tBzJ7jS0G&5d5U6s{)Am(iI|*7{*>1+;-@rlL%a^uYX(14VB7wgJo)aPwL`3W;mn
z%0)<en|#<~AN-9}#z>cL@H={BEMd5}?b?;hbL|t~^AH_l%-{*$I18$%fBK61{y#%x
zVMCufm|#yLHtgkA!@zl9%!Y2-yxEeQJu7T|5Y@<CRNz2Ez$i3FT4tu)Mj-?wHDfsU
zZ3Bf?B6e!WdqUIk2ftFd%qB`0P8M#r1S=FW?a&(P7QWz%pl0~jns6=8`)b`EIPSpg
z-D!nrRrGxb@C?ljkzfiqqPBt0?CIH21-serA>J|TModi107m|S>AJx)VS<k;voLMb
zk=Ox67KAgxZMJzvCoz>kg?FMYed9iwS_yw4Pk)A7qwB-9m?lXsiUt(FE{l_P2U2w^
zLbw0kxY24;lQ(&Vg#+qq_g|heQ~WNLZ2-Nu5*A=rWD`(U&9zSl{2Zf5Yb;p@hOQq_
zTCigEzxn14ebo}I9RE>r$`{`D1|OuD5OE7KKRnD~7J_`lQrD**`|aBW`p9fUt=Z9>
zJ?+v`qecwf1;%L4!yp_MofXc(Xf>Nu9x;C>Cf=Jq0n101e3V%Tt=?rW?!p(peQ{D<
zhG-)9)EV5ZeKr~OOnUXYP!{=)Q~%c*sHWT@aY(!|yM>n3DG2-}Z5SgfahGm_RhzB(
zJ$^u&#oTi@J!>y?)6WWZq)yGv^+P1lEc<oHi76+?fVR|k8-Mqm?n7SqR+2*)1+n0}
zqmT>2A6eXBV!byjeKP10PMy}!G3tB3BCO?&t~WcV$o}qUx!99a<gJFI358<T*tM9y
z)rF*J#;TFHnRaqavjuVF-bbqu_DDqLt?f{|sWO|G0GU|L+}*PoLj1FsyuP1a2Io_P
zMOqD9ehw0I31kiW9Pzl;Kyb{z|NJvr-P<_iG&yYx5@!GycQHT|DNSTqbXVCa`i3%&
z;&=`cG1rs|=aksw<m3;-mUG@QoQwXuz$Jo!$35dAB=k`>>o!F>+djqN$MI@}h_nV3
zz4;KjM7yJ|?sO>=zyc^$gmg&&GRLRIVi7Su&46rhT4U5Qi9zbAv*wg$GvFHI`CITG
z4b=-5Ejo$eyvbzQE0++jRH=alxb=Nr?Kda|ydr>dhO2%IAniaO{ztM!2|0720gulm
z?c<7OHquliC;Iw6?KOP2o7-8KBS;!cL9+<E1(XhE883toOQkl(+Y7Zt_03UW#<Hhj
zb2*Pv;DyOls+>L*9srJck>H4a{CF-L;43I=ys-JhxA@tj=8LYBe47bNdV+YB`IOVw
zuMd&HOisqV-CrEd_Q^+qvIip`Z;4b!$7|a=HSN-=Q_CFtb~odq9_y&p>b-7mZf|~e
zP)T?af_rjAKLxkctaa-iSrg`MVYAMH>5GP*`fckEZVXzcc6i>LLqqK^VL`MD-CWM&
zW}d8l9d2U44cn`Pdzc9Jyt#5bK6EOiFAJ1-u-pVCoRN>1D$D#1p|L2x-H~0?yy;fv
zsEAm{XkkjDdW`Xd*EcfW^Ari)q46KFopLR9T^xnm3uHA$J?yBhowqv)ZyMdT_MW*M
z{ApAqYFRXX6WNxg&Q%6w|9-l%q2%4Vrfe9I(G2(~x_k%tF!Kqb^B#6}ANsE<>G?wf
z`ZOTp)~#D(h2$kU7GL`5YlVjyGIr8YqC7(pXw%b-xOPD;Q1SpmDq?g8Gk=u>z4U*f
z<%&c4WXprx=0l^-V)a?*?fMH-xpg<({;ID%daYLfhTndxTKhOQb^>(69fpJh+7q{=
zx6)qtC+Zk&Vb{-c<}9Ct6#ts6KY`n6h#Jl21(f^j-?AmLPVyGyNQpXI?u3WT6eW_>
zzl<iIo!X6?xX4m;rdKolNu|DewZtI<bA1NSPw;mD7r5`adwFx!l}F)4A48eux=y7&
zbZE9${)@XQy*BQiOINR5YD3LG+wAJzp~LgtI{L5U?%aOnnDY0h(O2sgz5GtI#F2jB
z6~wogz2S+87Ky-aKmBy=$B9rg@I&XYkVb%+d(?3xBH!7h^ViN@UJ}J?t}S%&?SCwY
z+JDCsfz~^#2BQcxLL(|teQKW>bft==vnki{rxDvSu`Tr*H;zlv;^d4h!2mRNw=a4g
zWS?9*gETV*!0lAQ`FFw)_933}!u;GyNzqRTVleuS;_1_;yBZf=5!GZg0@rU|z|J<F
z%lzA%hT9ECuid7tZ$b7fx*!F$*&lQ48#pce=gAY}_)kywlsC`4Vm~oA)Nw`0n>q&T
zth!G<{dRQr*u7ofMYS9w|J2jk=i`&p(VSs*4ykQYb`81G&Z0=OSaSS6P|`3?&Kq}j
zH9*Sd<%n%fCxbMEp1?c*vVZ^j244*?dkZu9V-7aLwePoz_I_APue-ynQvVcF*~IG$
z8pb(gOP3!M5x=paCkY*2fzpG?Jn+){PZkypo3xgTQWCqJyTt`oM#seDgyjRUy7%eh
z*K<+S4w<eAFEZqMzs8MsZSh^ZY87H&lQu6X?=|(UvT5%*gA%M8H0^47k-(}Q=CsQ)
z)FjRO+;@<%y;OnS%o~i%rw={X=gn*%E-HANwMS+E8;+WUFr(GK_PbkauQVvz_vJtJ
zw_yPh5M3=;5R`d%c?y{$sI$GqVUlV)89ugFM8Z$~``=q^o?gD8mg2*TYO1N?JPe;D
zsf;ET1PcgCbyJbt^9(4>T7VSU$P(cra~S}<bJ3M_3n`c%HEYF!1$)f58!}QO3YQC|
z6in{BPR5g7siHm@>Gm9OAh0iQu(^3gq#N)^e<&l<G|8K<zLE<y#nRH!jWHC~D2q=L
z%L@m;fBRvcGo8&Gd6f~RUJYt1qI96Rd!daDRPNEU=WWSP)%dD3yML07lK@v0k0%e9
zv-c=|M*?DrIyfRJYoCNJz*XNgo|*bBhc)l*+dIVSg<!?55Ny!OS?M6tR|uP`PXL_s
zlNxg7r+`%gLVU55y9Q9R7WIgzjJWVLamo@BIk2hY21epG@bN&ZX%KYdU4BqlokB$I
zcn(8T#odNn1L(n;Z~N=7d-&w>Z9mr(%Uf8r)x<Zb;|sqn{7wONdzQedS6GwH)=hr+
zp@2l2=q=nBKmua*>-&?bXSqWhcaa?I0K_RC2a{JAEAe>&bqnb!4Sw;(+16{Tzitxz
zs1Q&@fAKMN1Bc;-l-**I@4aNEZs&mmFGmKSQsNKBva<s|%B)<}eFH8=QiO1kx$`PD
z69YNUhkE+#+PYO`xN<%aywsz%;#ohz-tdp7u``R|O$qW;nw6;7##2Np3;)=wa48uv
zYSdV@<V%>4@}mFn_l!STOwB~gowyyLRwfSm{;iGm%9TeXO<UtDr5?5_Vk6p0-sGG!
zIS(vwqC}jS!86ors80Y6K2f4#M;XR6m{yy{cNjcpOuzSJf(Pn?P7=(V1<I>WU~>>h
zpGXzmew&P1R)mUON@~CRN1P$nJ_%eWih;z5XJG?01g!&#OTq>y&tNfLr3790Ho@!(
z4!7A4Kg=(nQtfcs&<UQymc5<~qu0}KBCPM8qtyFp5~?R~d&hX_#S&n^*^nBDkD)5S
zTnL*=?yU3ZZxyy<!4k^JF12ipq1j*=b=uHt7f997mM-q!tP5ri`RJQZiXPWk6=pbF
zjN;z*J<1`M)Hg(Jpy=5Nv)szSF+>#4YS*nVs)(F7sD5C<<BzP~63(WqAbFWWXyTh3
zT7hk35g0_#OErbuav_D6`gfDRPUt%M*hXOi@npiMTRsRldGkP@K1_8Yk!upnU>{{P
zYKoLGP5-@i-Er<e0<%V;yDPB)USUVLXUeVUXcaDeAD0sR-CO9~6VS<CQ;!U<C9Ww`
zs+%Pj|80{mOfMn8HX%6X1BRxPccw<n*V$RYK*lM65A7Qbnsq;ci<kDLmcF?UE}Q~F
z7VuF^SPKCf{g}?=B~df=UfKf(#=T4aBjqT_?IRC9fO#gLK$3*n?wX)4ATV$mP()~!
zeZA*C{6|H-fuoSwB(#P;UOi;U5V>puScsAqSwvs%i3x~vcXw~vrp?lV*8LYBfq_W0
z`TdIxQVb9Kx5;-3mj+F{PaVy*N~BkRHLARG!`2&NB)J%S$Iv~}puzTfzAGT*$Y(cg
z*6d_{MqH0!!;E|P?%jR%rPp<m3ADSzrKqQ(brbpC&aw2}DCiaDKfgJj<Jtm^E~G$>
z)dhF#KbW8^a0fI(;Cj&K!ccvElomP@=e^<#+%SejR`1)F=#VV@8a}x@!f0;UP)0Mj
zB%GG{b9ziV;{2`<6?C7aX}2Z5Xl*X{e(e4tYEe$Js9lBEj?VjN?-V|I3WX6DLa<<y
z-2P~Y?x>s%jEkIXp4hGLm-s>!X1E=0#d(ew$hJE*=a-TO4<2lm)xi;&@#BjlUf$zr
zBg4#%r~1y(W6yifUcw}uc4Ie#=)^p@dX#fo&DIJ3IdN?acw<zUqxKYZ&qfXNCT|b3
zHvdv`#=Tr*H1b;GBOtfEspF9=i#;6I_S-Kvhd(_F6VM}HRwzAQWljgF^g}s6Ev)#D
zHUBxWRV82=zEkR4<Nz5)*8GNwe~;1@ep2%_@iMpXV1**Z7vDJi(#+2pXGK&vhPVKy
zDuzvX6DGTmQ7*q9ry4!_4%VCzkiDUOoAoiC1H@Omx;|K7@FgjVH?4XPev!u&2~3Im
z#s5<yvfu0+b`tPthzOomu{-Bu2`X^n)0?(valq^xzFKU@llA%IN!@jM{1BLmI%v^R
z^L2K1@btZ70-4*!XoH1=o-)3gT*%F9-e0^CxA~XP5aR7`LbMY4CSi5|S^||Jai}<7
zZ1!A8szjo!i|p9a!{1Z!z+W({I_k=F#0^PL2vZ|aU$UrZtXYt}kfM<RYv0T<npIXZ
zu?yOcdGFslW*RYRYz8nVVNQNgQ8E>l+U_P4XA$`fnC<u91v09t&EUBwjw9Tqv-K>A
z6@lZ2kTLTE;#ROQB(cYP4x1}%%ae64cl7jJ#Vp3qx7Qj7^QJt5E{m`T1dR*azlT4k
z2xu{BCh>u^;H^e^TzaYrG}THrO)53i(#wYLYi*tq+FJ#l&eBG~`*GKhmCWk^XGoxT
zc1}<L!T*I)w=dV#+2V6{Orifh)d&)jPIh@t2_MssuBN_tp*Hhi&LsTW2?};bIyyRb
zr>xE^ZuGcuQ=toY{LzpjluRAtRNPble6;HaFg%C2MNowL1g!<4DwsgM#Ew9!oYYDu
zCnqxQ(K8KbtAPqC4bvlH>ng0+9&c}Nr3H+GbO(D0kyi7o@6hU#&wnnyE@fDRfXB<>
zIZpM6TIok&!fAp@KIm3KOUmh!p#}abgAhz;w(!afjCLWihChCSr+vjuXlZu-V(H~1
zDK7Fjqc3m-z?k~_^*J(P3oSnvhHD-ns^<B(nnSe)K5Je9^)F{xq+vFXp%--=M&e`X
z2el|T?CwrcR@@ff{QP{KhvXck9Vb{1F?Qjmc9fAj@`f25W|MpM-_~ePl}!XCHa`4i
zHHX6Lb6%Y+VG&48)@5WkG#hO4y3AnyWd89l1czZXT7IH%EsENMquEe5<UGb659i=F
z%xI*ge}6ILfCX2waY_-tIsqQ4Gd`jJrgk;g7xVe#=g*hS<t6!?);h1KJa4a1{r2<J
zLtZAt{^NYCt*q1+E?9e|Z{E1^L{>5`%rG0aMvUrlCr%t3(TPLJuuRi@W+i4)FVFOH
z6&8(t4)qBj(ivzs*PCBt8ZeJ7Zf%)Z?VZ>4C)qM4A^h?;*RD71F?}2}32ie&s87%t
zU(QQY`4oGvB}+DvnXpD|tf3_0Vp<Pcc?24xgGx|??$Z11+qLtw$(JxdR2<`BRCV@M
z|B8w?PaRg{+UoVD%4_`7q$J&hB}6#xXKYd6w)iOGYy~aLo1y;xyigcDn#Z>7*s)LP
zK;>!6=8G8lB11*Up>-m;ul?vhRd~)8pELK0)GHe|ZtU*>A6wpJldG$dSGojm<7Gk=
z44mt*4CA`>n?}72CP_ix*ijACrVIgS&@brf)z;P&!3-ecRmIDBJN%#R`|@RC|Ngtf
zcmpOJB}o2Gdq8e@-5)7NJO}x*@T2?fR{HDwC-41PJ*>RcI>uEeGIPbFOXFGfDahp~
zQnnoFIPA;AvCgVSQIJCa6J=ou1t?#*Wd%Yd-o6(CO7zM?CYU36ZXK1xL~eDCw<A+}
z@ZgWf%kimW)!5L^qeexx`7$kwM`>7{ROh5;WM#Q}IpJT%`88_L;MTnm;uN!zhN{6U
zpE<J_z~a0+LD2HL51GPohZ=)Us^FE+|A0?Vi22X{+BW&jLJZ%TpO-f#;VGfISCR%?
z!Yg~FW3{Fbk&SdBQgDq0cb>v|+<f$CHrk8pGE2GM;`c{7(PE@NAr+Y%IRYj_9D|O-
zf7U4Ka1A$UHHTm)Ei22Sz^No>AhG?MqaxLCAWH{;>qp(vgUl@(H)c{t7klp^d8}FZ
zC5`s(P@6DD!UKlQ3z%|pj{7aSJazwdZ=a3Js4Cg0+?=2mpI<PdNk3GPj~`|js*_WB
zy;v$VHzO&#r{`hHla)rZXOC4g1Vk!_hT4oL3XoDG6+<)b5n^kvPU1){T3(iG5cuKa
z$L>9Qj*|n~{q>iJKw)K2W2d?t!^9Igdf#mf8kbSM4KqfL8s$x`3*3(63BBTk`_R}#
zRGUu`CU+xfyIxGW)g%^UHL9wrcsfX?ofy4)+~H|7#D&tHVDVmk0^5D_#*NMiu3zrx
z@HdMrUR6I+`*OK_IP#`Vn<`ZTn+i!^oR$0#nrV!a@gZ|0(MbIU4H9@h1K|r9LE^3K
zrmZ~%Z*;>EumNVB=B7h4^J!*^ON|p*FE^M_4pbtgwTZrSXG=)Uc=pkw|CCsT1Fee3
zufX~g^$ZoI<t2djkxz-eNip1xo&9Basj%uGi)J+r28Vc}3jqp|^luV9>?{!7y?7E!
z?d;R}ScyHjO_HWMF~sv>W@>7|%=f^(bq|p_&m^NiLt-bg(5{2&;8^!peL@uRu-K?O
zK=6Hg_e#g5=;T<W>kS$th0J>ii;{fjE}27@NH~Gt4o8Zkg<g8&&Vzdscgea165ew}
zCxF<a`x*kg3lSSRgA$OD<52L%WOA$IJM~mkfbN<q;@tnfV;A1B@osnbb9{`~Rcc9k
zLp4#KdGo=8;@|Cm8#O8$1FSwlUHfTaP-Cx_^u40w@6VD1xpANEZVEN6>Q2uXvo^I;
zUP?}e46afN9MI;6MgGbvmim6F8#+tElfIn?O@<C3jsb#C+f={VRBC+3`CXtahLgPd
z#XsNnn(03rv5j>n+K>kqCYH&OKFl-hzW|sRN`3ObqaI>HpIHd~ASEE6pO26l_b#t6
z5rjr&$7?6|O=}McElMUWM8kxn$8kZKxTM|C$50qV6^2qVO>L=$&57dJUJr#ve`7xZ
zTIV8D79`(-B+~{V-~ptN3B>;r7C(9N#FA{$C6ny{COX596Mw4W=>R&M6Iy-BHUBdF
z&99Tce%1j;6Tkk|tdS6xN5mIVjKIH63Vugx+>iF6k4j;=<>nH1q_7SSTp0B~V%vYr
zCm09zC{@t)FlWc=H}83H`7U-hONw-CGMc;uSsdCrW9zA(8OR|+-Bfn;L28$NT&JGk
zp?`*Kk4g<n`|wZPu0Q{DxqSJuuwPuA?g`mU6p?ha)J;-#lSE;6cZR_)%~(ePOaWyx
zV}lL%o+PzOgnU2=5wpnNgoN{{$;@4Zhd&81;1v{RlQqQwYQ+uKh~6X9V^q_%??MT+
z<v)LZ-}rD5&ch&Hj~vV-ey5hju6ndQg^Sw6f|CCn!-ReVS1a20S+#QIBr0SW;-@o!
z`Cn2s3r33d80w8+*ez<%;i~jJI+8CWs$RX*b3;fPiA#AgZirxP!^pdwwS5N<p614S
zlit>Igg9B~l}@337XPWSJ6)>-S;6t#wFD#s)H&m%jC-=nJoXb-I)tjOqtBk1%6MVY
zcMqPlmXmgYlLjmrF#hMw<nmh94Y+pgn$%QD{tPv|fz!r)x$FGNB@TohRhyd7m-Ajx
zUo2+{{eWHYhDuFk89?FPZvpg=7j1Q~voMm>N1$0uD^)zSoONjkg)-iR6s?BZU7WvQ
zLGF+hfN@G@k_m;iu6T0l$b2hRfYs^ni<hiE!KsfddE>}09^HfG0q#o)JH<Pi@(&6B
z@nNeOA+b91&ff>a**IAQ*kbKz|2wF4*~PQ3`;Br%CTe?29g}JbiDcIAz=8}tM%Z4v
z_KQ@3QgUVey3Mo6C`OH?6pYyYgv3BEGafv6aE4%$B#oP-_gJP=Ee!~hq<EQ-<0iB|
zC&Dh%1LEWWZ5NCp27?7GO%q+AyS*nL`|T24H;Q#`l<=SBxJg~aDxAO?hJBx=>y>89
zRjcm8Lj>9tT5>p5>dFm~p~9fQ&e7i~g7$HAlss1SKUCb<Mn5iJ&iqNYL0?Ausl!Sa
zHhcRcLr6qkxRs9CU&?NS2F>J&<91)bN)%8$wFL6gjUlpPM>=MtYI-c-$Z&q&OGzsx
z<t`;PsrUubEUoi-K`(rQ?YecRsqJh0xQMs@NZDSqcJ2C@o15?JV^a2eKE%FMyQ-!T
z#un1kwQ$f1fXtZDqg|j=CtrE#`{$qEP=GK-%?<B?MOMXg<7_zLlKRBX@4@+PwlC+R
zN<Q<h0i}v}%OJ2;6E5`6XTML=4)*4EzwA|`VGD?xI;rT%@2mB7w2E-%BgdtX9@uR3
z5)GlZfD(07@oZK9RWE*HgO}hpVA@aJj%jDS=QOEC;c#TSBH4m*2x2;9x&a}wc-@iZ
zPf9L8VcF!4fXduauR(b>W?Wc13K}_Ly<P%{l1aL>eDLUzH*9_5PaU$D<8YYS^T0rt
z<HwJ8pMGxLQ@?JKd^*BGkY*bR_hDkEV{B-z@x}3|)GuDXTuoOCmDG|5bhR@8VZ5P}
z{3NACC%fHXM(Jrn*H-`jSayrL3_?}$CuAu*tKw<O5gt6o)4+r@-@2b3)M|hE!sW{c
zY%<F0?NO`OU1_dx>Bc?A&<z@N@PC~U{Sn{lcc#`)=OIgV^X`Q=!`g&#Rp}wf$TSLy
zp~JZY6H5u$X41sBn_!rOavJxTMf0hYD*-0YkRQsXwa}2#HKtD(JYE|v{-0)jcPWFy
zD4YglouwWUmPcyPG|8dWka18N`>}680sE()0HZzmRpS|q@OUPu=6oa-V$GU1J%Pg5
zbh72saa>bb^nG0?|AHO8xiXh6Xqin|f;sKKKpL*c>ZIr7_)xk0WMIb2rcl-EXRX2v
zctp9b<-{%q5J}wt=CzXwlkOsQ`6Lieb2*H<w_iE19I+`{ZiBYO3h5RflEK8qll??|
z+oot?bu|?JQL3R%4B0f27P=lp7f66e-;SF7Ce}NOJGaSZ_L}fY+}~&dDM55_ZBm5<
zt~%9CYn7i=Bk@sg3{Jz1A7=`#6t7sVUY&?BlQxch-3&)cjW|i0sRX$;b%$d$w3N&(
z!Rd@PbU9jKGBO<+Xs4vTcySEgNwOv*S9#N(AjB<yKcGyjzM}X+XyXJh@F7Y3^$99@
zS|hUIBP%INtv&Ybo;ck(b;sa4^fLYs)3$NrSD^esqvCG*5TTu5R_bqBT3W&yQzwdn
zI>qUbOrum3mz3u`w%<E_b*RDHqKK&RBfe=iRoQ;(&iY+XzNkC$-Zx+Ev9Z`cYRuL{
z?#BvduIqNo<)>Y3pDPEg)AQO_-^*og*V~i&Puc&Y)!m)jT5S8D0ln+juAN`9!E@JN
z`dttFUN5z9*7m^n`H$_Lip&l2KgH_bcFVOt#cIK)s(K^tacp~%)7uU0POy=iHoZeN
zVByZ8@9)3H02uk3{;W~QLmcrIG8ci5b2HY4m&|2&!$p1C!1dnWU!YN)MtclQE?w|<
z{qkv-b*F&lS~ycb&iUukV(W9sk2whlgr%$r-?h-8>#u6OM~}!h*0;Y4O5ZHvO_~N_
z5u{qiMTrLTd%%M&ooqr<tNYT4ZvC@<-DQ9{{MpcqcI6);8<dcg47i(|`gqgEc4i_x
zOGT7p$)l&DMkd<XBEuR&H^0dHN&mlPqyY`P^0+!ktDj>cBj)NhG?OPRvCI#+Eut18
z_CuueICTJZbm<~PE}aI%d>7!`0OMUSoXdfWdvJHB@b;3G$@5v^u;jv<I*R+DvfCmB
z*V5d5@;3oXKn?UIYs3{0SV(ytMjx#^S<NnxZe~YC{ZjA6pB=SVz4)h<Ut7}|@vvXl
z-UGH~T^pmgKTW#SLFQwoe9HJ-(U`sOK(OM&>4UVt&%27Tee}tWL$oUFW1?TW*P-P%
zFNYQSFbK;2s)@CE0QqOpIT=f-TRyHTgWN?NO1OLI>P|g#%3}=4NfDG6xp7hY71=ZV
zQ}fvJ1kbX)J^Vtnqd)p3a|kd250ysk{27{fN0j_zx<_wn+h{Lco?KEKyEbf(G?j~z
z1AfI-=?YwxZd}n3NuFapo^q!{h_PJL^DHmT$Vby7?CnPpSzxZiL8WMX7+>)mFG|ST
zD4h556}@XwztRmV-#rSwS&@PvgiCo*b~%aOk^}N&K%h904Wq12#<;{dm1Q`I)QPAR
zY39`!eE7R@UQnFF@(UmJ(eMyGPo^1bgOqL&8V`H>L8PZi@@z_e`svNu{Loy7KB_#1
zH}p$On_aZvvb5TB+VVsQ8S+`BMpj5>jI<s){v(F+R*SH=n@A%3gkieLw&gKFBY0(O
z9J-1OAsWbQ8pwMNqdm84D`f(|QMSEao0Y>wO>=$LSd*JrMQNZBtpp1ul#9+E<vH{(
z7mw$6O_@IZ(&HP=6#h#}kbgJ~4cheDk7~J~NHwM$>$|xTx1*=fSU_4GdX`J+c)rQ?
zNuY>Vrv3z>xS>1bs|L@GpCFI6DF?Wp#{T!2H$d^aK09r6|GjlJ&Nf}}OUol)vtv3U
zbGJtcASpNN^WjNz5yYas$OA-^fvCRdgkLThu(q?P<OQ$I^-xM*w<vUp(x()>0%vfP
zykr0_EFH`tS`yxyv>Ue8?TM0(;3&sJZ<CYcNjEuQZEuNGGyusAYDnZ*tjL?LD4c&j
zMu=+ECUhye>3&hdRGG0NBaLo*8W?azV;FTac|md;y10Fb{(BAc<5AE+dEU9GoD~~C
z#o>D<`0~aXV!1?^3Mh~!ZTSY}#c`UMBUX$NO6y@#JVrD|MCS+xc@s|C;NnxPUq?dZ
zXzH@_amg!kA9b~a!xphU(Lw&jbQHsu_NB4V(b&2<y<R_;%O@k6lr5Qj(81;TW9!~m
zp8X*YD~e1a-pHaF2<{r0oP(_r(PibC_n$<Gr|ZUL((})Cj^S_e(O>CEzUArim-Q9*
z2Z&9PzY)pqW%#IA(c#!}?c}8-8A3rG(j2KvQHoSA#GuI4TOOR{C0*nLtlB!y{rBoT
zM^zs0Fr$IZSI+ZwOCM)k4L7OGHW9uI{iUU81G%WM1l>Z6DY4{G?x(O&blD$p-K=2}
zX3QQG8Y&XBE#|EJnUHo5inXgQH(0dJX}CZMwGv6DNrCU-G7KQS>;nIN0>F>)EGsyr
zj5eox-SYQ_tv~K6|FCNu>Te|2qy0DpwAA}|UUAe7p`6pR+F!qlL7#Hp5&m^a|Gm$f
zcz*Z|=JwKukj8co-vUuZF2FH!)qtFTnoPWn!25ot6lIH^&2}woGzrs0l|uwPvEXJG
zN|w{ArcUML`tF`}`5p})RwX%?SLBt9fek$y>Vb-`y3qOsAf>lmEJ#oWI+z1dgE+L?
zHft{(6bVW@2sq8a5Bcgw>aO>L-8x9RXT^b4BUZKUz1(32#?y+Pr4@$ZK%m=VwS8pp
z#iNiTtHC3#T+fj?kAmB;g=4gR`**#8mIhp}1VS{QH)~m1F?(T?OK?EpG25FsqI6}>
ze@%QSW2{gUS$62713#sMe3s=hbiK@_?SI*;wfG)atntVkQXFNv4(YZ$d4uhz2HHHd
zICT=VGyCz1`HmsCO4scwe&^9v&w#FhO`^&tl1fK&c;Q$M_`qV<Ws+sm*bi}gV|xcF
zWydP*hp1*1Gwj11ES;D+pw^n{c)_kqnL_fi)48uWQ+UD#k6Ldo(V!pshrh?9ukY8E
zNdkn<yy`ROI`Lri^)FjL9gPoLGjk~1ZY(8qG8HTx8frZ46l?DX%BliSFJ*=Shi1%H
z_;+(EzGV_%m3H%3iAaU=z{3vd7k%2E(yISr7p3Tvoeczx8>nQm(=U$<Ck_<}O=UE~
zivH0{mR?!G={%(G+j;4$#&e%G`f7AipIASURJw@?w!9ce?YxhFXvY>0C~7nR@b-B-
zyE(u^D@sIHOn#dCB$DfAh`t`q%IEB=B_d5n9Y)Hz1<B`(4A-5V%jwznwSzJ{Hw%@Y
zTNJtI>Rs9%bDsRm)rW2D*E)8}jX9sw&$sJx)S_Ak&b5l}^eE)?8u*UZz4{Jg^)B1q
zEf#+&2&Q%5itJ<gunYm2_G0Xo&9Vt2se0@aJ<x@ZLNeE*>r%|Labuqx%*=!I5g9~M
zF<5)=Kr(FW54AWvk;{Zt%o>?PE!0QdUV}ZEMRd1l9k7F+Pbr-lSA7h0FN!&4s9N+I
zr&%Q<TFl)mnN_FqvSQR}4k(MC(pb;9C8sE?jHnQ=NY1YFx}glUkzxSy%<E|%a1&J{
zeBjzMw`!Yq2_)@v%{&7hKOL!|-m6AbABG|AjyCkKD>JLUX+4vgTdvIk82!Mvew{r-
zg7q4N??Pit#M)^tk>jXidGB<qt7ThUsm<4*(UJ)a8ap;|kz+XCs1H@-R(Bo1C^4!o
zhc8B3OG87>YFpdldY9zyH(hG9p^l)v@x}`f?TL{YX`)L1Fu~#SmhruI-d{QS6JF`^
zl12mOU3b|ba^?Q&!g*g49WucBU){p|HxB4aE&}9j>KAyt&6@l32o+<^G!j#EYITmT
zw<|}f5woYre*Dgy7}o`@!&<bC?Wb@qnoV=ocMXDO@s=8pxs3j%GtIl!eLHzAt7uNO
zlM#O<dJ*^+?|YqU`S1C7%cM-5Y~nOjB{E+zWpkBP3m5U`x2|=T{yxz$Z%^X2uO+R(
zn6n<Bj~GE1;uWyb4;J5j@8vPQ{O2d(?d=T3b3t+ql1VrSro8PH7MmpEN0`oSZ|nzN
zd2wB&LO{AbgICb5|KMT#Hm*zerJ$i;|4@Wn)<3i5k)s6_5)CFU9-4bOw1ro*d}d`W
zM2u?odSmdnh?_KhoQ&wvwfBVemxC|4`O^X>>iLgS;=&QA;!4<$s)yPdP2>Yh?)Pf@
zNM*TnY50)HOOaQO@$@&A-CKja{&MSZ=#pD_kI6pEt_Ot%;#2p4_Lf;2<gOKCdZ6eS
zDhMM=9;bM&YPXOSdFx)yY5XeIEh^cuVtq%1qb3Aia+~R#LhVo+rfkV2x-tk&r0rpm
z)Mu9J79Kj-E?3*&$pXg^^HH(8MAn7Q*V|MAUw}grL0;(870ZLt_AmP`w!0Y#A2e#F
ziXx<-x#*AO+Z})>(#tBKtyOsEh}g4q3T%2C%S(K_NMX8%42#MAH&hya+oL*-ao_ga
zMb676Vd-R;n-M*2vN0@gPr<|plh4Yz!oh2HX<);8>%toqVOZ8)u>6pcG3-E{#`=9>
zWTFH*_6phHlgy!DW{<<6k5^C}Tyom2R|2gN*WTm9bZt{0E9!3#deZKP^FbZzx9+We
z`asmVI8C3LdkX#6lY&K!Ej@+$5svo3;>k~ZUp=0*xz2Zkjwq9#s@wk=!`{d=1?7Nt
z&H+PjH2XKyY1OLndTDzNFO3by&Y#|0_famX-u#?AW4ariu7*Z($RGCpl=RR1ocZ4k
zEx6Ol$p4cg9Lj^`x#x?|dlbLZ%6~a@h@GQMCt%c+(Q-m3xluqm$y)ewie(1Gjcd_j
z3CIdeZ;(c4GHp5PS!DYINBY+C@1aLhOZqN(gT4rZIZ;-dh*r_XdyBX;ws>!Af=8uF
z+<+`qC*SIr$TUN_x0YoD+ohQt8QsP6wy62Z^e!Nm)4cM?Veijhdta|tya`$Wcddtw
zra4R0@$lJ$VG$OD6`bs|?BBMpjfcJ4I<e^`vN2bqUwUv`ewie*%+KaL>f3(-&pRT{
zgk;vd=OsJ+Pd0cc-+bcOvH9Em@_$}%0R^ji?bff91Vky*)ly!BtsRMxH&8Ft|9Z_x
zF`DnmF!V%du1@3{fUtbLa)6V`qXlznDX#irMZE$7v}M{DsN;YP=7Maw#n=riI*!sA
z6(jN|mx9Z=y(!E>MBH#}0R73zY=b@T8L`C)a6m1?&n>u_^V~6VWo2?44o=LEtg~nt
zp6uPg`S5LOK@XFI?xLbvFXSiRdN~S3vyD;<&>u7WPqXSX{n~w__V7ksd-S+PymJVt
z0&`{owA5WX6HqYi{!<RfJ=`Nlo&t=JfM`e2s=D@m^WU}FdEr;&8wK<Vmyo@D{y>q`
zl~GU{Wg~BsPu=N1>eFg*Ut)31^kwgyM_^64b8-(wr}A#LKKI(_^76Sc&C6>+k<Ga8
z_;Sa<VK(PK@m<>Vqqg#UDPT}0Li~U2oqbf4Wg5m`thO)vGPUJ+klY3pOen2TQR`G3
z9aL;}g@{qGNZmjQ5L=25&mOU3da?$AOpsCu`4CA>G!pP+DH@ut$%L|g1Y$&DkOGRh
zziVKS`m6u9`3Gk_@Xq_Z&vSoV_kCX%6RVI*y_UZ|u|tWjqyep$TLF-AKJ<*)E<qK}
z*ko&b!^|uX^F6_NBoKBbG7z%)6vt)Vy~mx0+V2oC{G2hRLl<t15~<*PkcH$b)-&&^
z^WOV+zMD_fB$a}ZUo0Yk;WX~_O)$zY=F#R)#1{a2);2vJ_;?SBA{w`Dd7~y4DVWm(
zlNM#*o|OtSlHR|Xw$N1YlqUh1ww(1bY?o!5C{G{2SB;LYJrMtMc*uhRw>-0#_bnke
zE{7ynfh-IseRI5gieJk1G$wjXY*~AxmW7yV6$v@vW^w}K=AxBNF_pDfMoN}&(c(^G
z=ok_nx1h2!H&E)RX$<!!0-T>28}M<`#1|C>k$?|1ON<oGzF~u0<O6CYXK2+BSUWVN
zvlk|Tu&PZmEa6{YmfU^nGyqFvU0!}+_29b58SGf1LeHrukE&tZBr~XOzNO&@sksWh
zXFbkC#g<>uyi<adcb&}1i1w>!u=T^qP21MFI<M@?XIrz`<JFz?@|~9ul@&C@2hFjg
z_il{}AAj}2#^lG3g6?4^USE05b|OmOU{?NI;{D)Lw;PRz6}ZCzC)#beDkob-Dayi0
zI7w`(l?^%gKO(7VOcvao^%kP$A{`sgYV~n8I?HIdsU_(2w{lcBUub&TuqD($8n_nz
zLaUvWm^~7F?j6T7FqX_w<~$n`HnWv28oA?P<Q#7#>MROlBFJbbrC&>-?mT3}37nl&
zAeQr@=eEYz`<#};EWS`pbnI^}2re%C%f;G_O)RJ%2g_;_ksAGX(Zn^ca))-tQIefk
z8B|jp7jS82{)w+odtSqhn&LPe(ce72Uuq@o*PD{ESO#&+$3dod3|2^&CDS$PK*F9<
z;JQ*-Db>FZ*o<NLagolyG1)~7dULO_!E1(8otmSV&m?lWyXQXWQ^J^W|A93fFaS>l
z>W;=7*OH1pCxk0^dP3f%J@bfI1)BH;c^XB7)Yk;GmMXj!>sWF1$ZWjJ0kl;nUb^uF
zL$Zh5b26*|8%i|IR=~76LY!Cla}%5fpGgpXBT{z5%y!jYZWxQdG;^;qL9m*wm)^v<
zxfS$uPK4GE!wGz=KPiC^b2M3NJs6iqqYn^UGtXoqoHmE)F4iH)X6W%p3RH}ZO8sOg
ziBqN0>iCv*r%#;=r>$u$14&zRgWC_B<n~7s>BKjuWJ$wEf))}-_z_+y!4V?vW#l1>
z(&~CTp|>n>l|bI%F+Z#_Rlr=khI)Js#cby>uZP0Nvi{Bohf0dsr*S<<<Z3x@U7OOq
zI1(Y8^uZpe{1`(X-1Oc!SqX{2R$PrM$|p6PP4tmvlBDQNLb!f`JKFJuS#=8EQwr|;
zy6sqc>6#<)=SS;u%(A33H#b|b%ObW7Ip^oJv}P9ZmxvvSvV_5y#fB~hNx5HPgn>S#
zH_wOe+b&b9vMf2qIKEuX(PE=$${1))qu~%VnHk2<fUvj#&rvk`uULI4F`3MwH3Zf<
zHt<%*Q7K9p;T7vtv|V%Zgr`<<xU_xy!`t_{uBW46yje<mA}#f7^a1;8E-t%1Qi=6*
z2yoF?GqwpHZ7{<!5aY-7M#_U5t`qEks+mPH?(G%fAvifa!PK+j8`u-nx@&}Zjk#Df
zE%AZjRO`PKzjD&G;-vf&{}llajc!=J>djj@_hAI1?Xaw_V{PgrCWFt}PMdkXoD4P~
zox{6uTN?IQEW*y-i!6!Pe+rq;c-;5WP2_GB{0YnRA9Hq@@Hjtwg!i~47~V&$+A?jz
z(PCO^lV8DWT-h7xZ#|N5F-CSJ@t2h$l6VZFm6cmbF$raw_-n_9@^Kx&1c#ZIwNsS~
z6xz^ol6&=S-7~@N9)ahrVE{nE-%XPFZT;oq%#XaQl<29Rv+i<2{dI059aleElBkJf
z*Yl{*#I;m<i|krTk79H6Zr`J|_a`#aF57aw=F@ltdHPo?PTN&Ravm9#8T>MKXryIC
zML=>`#8104t`zWG?$<+&_%)gVP)$j|43u7DIVeJ+R3vJ`%<V=krx|OW&E!M!W6Moa
zY9<CBIJd8{?yT%3)ZL{Lt3}r@xU}snFOiD7#|9Z|4p0xvbiICIl39&r19$d>C$=5i
z@Dq769@;x;ztqF2X1JLZdBy|jxkvdv<IX|#5j;5uMo|moh@vX>n|*BHPqdNRPcbCt
z`jWYa7-l+f?Mv6r_xK}pILvY1K~Ch(@bF?fgWjiFaXhf>>UYwPW8!1@tlTw|K{{jL
z6?HQ!#z6{r!@dD~3pWBA+&{*~_at$7L=fqU{63+Rf+>;vkN;(#2r<ppro`5g-O$QR
zu?%pWjB@{`+duQ0?PR!4U^_$G^K;}dv#6XY52o+hFtSBMIYiNZ+d5ZlK)ol4)jLF}
zEXM^3>1Gxyd;dPpf{w%`zWeO*LAQ=FB0k3S*5m3CKwRhTCFm~fa-YUTjNXhp!WOVM
zm*4C1Qg2^&&$G^k&9f9P?Rto1zX73+#;2rxo-uM%gsn)qFqyzX0&Gh|v|%h6q`oRd
zSVU;d`7xQ<EUALk*~JsrIYmgSMr_TJ{k7IglXW%5ykFpL<?k-Wdu$@$H6J6E1HYf3
zK}lB2u_U%9?;zJIk@Oq@fex)AHSRmx^Pk)0(&4T4leRkLh;7H~45KlPWls~xEm;Fe
z0S#KTYlt2fyQpM`D6K`_s(CC^#ehJ|`(SE0`(`;6JtECaMOI&wN@I-dq+h^In-%Vk
z%`)N_3xF0t8tWXqiCXO1@o~|)hfrAlnaQ)q6nyipZSq(^@D|bip6<L-yd5*IijnIz
z+kSg_;CQxEQKoi=sy_jRmbE6)_O?J=zI%sn&%SPa`L)z8QAK0PDpfk@cAIW3n6qb`
zLNy7qX#uZX;<;wi69@lwkX6#Ts&n;Tz{@P4UbbnH1>&+g<|!tK>H}yld7qY<^j)Kj
z*N$na2+QVS#=5kzh*tv-GG}Y)8O10soYXa3RlaBwZhAL#=+d7zRbW|KlD*=G+-Q0F
z#>{gEjm>xJmsJ;_+&t8*D3HMC6FP0&3i~=9ZD<wgn%;zZ&!kUIMHNXUTEoZ$+pDWe
z)4VW<qS0TGe@ke%Hl5J#hsxT%QvJq!jLcub9eeN5s2=t?qr|gYFr?&T40H}#E6@_<
zO9W&sNc#LuaJx}d75fF=UY^zluWFk5!GYzHck0A5{xh9s<0&upy5s4NJI*6^0X1Bx
zSP2Kvb>bxsF0OLIi7A#LhHDtuu^xazV@!^lS*=&vfC@CRsVheGY$@AiPz1efm6`CD
zEAEP6GIm%{*?~8Ji$p@+Liz0pP_B_t$wtIx+_CauO*sqlmOg@&AnV==71R1UkJ_|v
zK+my>U)=rFAAWoAn+sG`mC>Tif))=vH|^Mo3@VrPtfntda@IKyrzMeGsgkM1|Bx(Y
z#GVLJNz-Z~gRlp#bnAV>O}G`pDAM&hv(dME)Xsdut6>iVyDe2^olnw0GJMC8OdoH6
z=|lmCeO<;qK?{=S+*G3^i=*eMO5jU7Bi0gki9>sMqtB0<QdEDO&y$eQ`<?kctJJo@
zLkCZ<MvIi_JFXO$RZ`ZCq1V2i7`#hdNHUfxoauN=f>@JTx<9IR)8clAjJ+iypYW<q
z`tj7?B&OSOwW{o}SVR{tKBf90hdCR3$8B@Z{e`xk^$N3JdbrZWI37+5*05Fu9{60+
ziM!p*N_HhgLwB6NRe7)!X+S*f?Qa01*^jz}rq2LLrcEl0;8o=O_>#Z>x{HWRwH~D;
z?0&Ugx>(%1Ru+zti`^&9aNP_ZubQ@KeOC?g_LO?sU`G2m1lwN!hVFu9yh2)$-_q5e
z@FykMp#`8S=_P3pk67B8x?d#&99lYQ$dh;ph^K3FJLrLAO2x6S!Y{vEufirB->MtW
zNWZJ-(Dx$rCrXfjIXc?UYY(dMO(mbYr$=m@;}S;6NXIokl`=S+VKSuhPgS=PvdyPE
z#%=Rmp;aQ9ZNv&^d-fC_m;P-TU9`Mj*eJ3pVIQCOSPIU_Jp5wMMY{h{et7clW<BY^
zcmUV+yVpDScaLe^{QQMw-4Z-EKkE4gH$O7^59zW{y*~Qrj_w=m^_g$?Ve9qL@Mr)0
zYQ2`@kFMH=FOJ`K>F8&4#Boe>j@u2StCK{sVtaSwH7;K^s;I^SGH8#WX7^-&-It6=
z6(aJxSLpnI%xUP;wqUGM7ErNN`vyDnNCALuwy0+uucbB52NAc<%3bOB#PQQs(eY70
z?g+<Q$4{Gcw<&jgz|TK7K(ECxj6b>o{%?-W!IV|QV;}g<8u7+w?W9iE58j$l`_3~l
R8}*-NU;pO|pAG%@{{Wg+^oIZd

literal 149377
zcmbTecU;fw|3Ch;j&sa|j8Ji;5(@2|nHMb$?LjJ<G_}LALgketO{CHuD(xJSw4}X+
zmbUh;-~H+w<o)^Mce{OipYuMvs^{~1Uf1J#JnrLhJ#Wg(N-bZqaS26H%V|dsDNxjk
z3lz2B?C-zflcz#l8}OgsP9K*#M9q-@1!snNQPdWSc4)uSMgOjbbN7^60>rx}7;?94
zxw1<B^`+khx4!-J?@t>T@6lw+Uq3jsCY|ri$vUCZkh0n%sz#4ZjW^eY2M8X~Te#5V
z<&G7H4)5O^zDnxQdFoKGWJK}AR`U*)+Eq)cmCtJ#$Jb8?O-v3ujppq5R^PV6yP(Aa
zW0~`%>UTtiAz;o2_@nN@kk^0w@=@FhjtldD`6Oz=G5=papgLVwv;Xo*j&Sh*%Vi|>
zOXvN(jw^TT)JS8=L~nM`)urnVKL2y|aNfML@|>KUA3|mPo}VjY>#j?YE4aV=mit1=
zWZ^x*pT8T-te$M5#(eS9C28M%+sne_N^Udq>@)wE@mO!3$)4mov-GaC&X^k0l!kq0
z-g++EuxHDijeMNK>!`R(vlCUib<Qx(<hrd*ZE_c}D3cM;e|K+(TAJ0l(z_J@MnBVJ
zwc$pWnay_1cNuwSN9Km1@H#&6o;wy@%htkteIKp+oiXWL18pTUuZ@MRzIrl?*`K+!
zJNam5Wpvr&!?%X~R|kKd_;h7u!r9Wt8GUay`R>&_vTnEf;LR%Q_7B_hm>o>qg<2mx
zH>xla3tLF>Z<)IY^7m{TodopW+|G}6nd;o7k!<3fmt1QUxoi{vu2<JrmblJa@FB}-
z;<CbglNZ`fLlvTv-(E3FI6L-aj(psu8mH@>C(!6P;BVLJD>OaYl$g)v$8c{obv9@2
zoa#D7#>5nF_tB(V77MZ8TF)J$W&R<AmeqI1y!pDs<f-9$%aVJ?p3YaCZ=!iJM$<Q5
zFR%68&e%eykp`cK4<FXf4oK4?f4fk#=fNe?zf-)Da~~m2Tx_B%QE2?@lY!T3<jQ0C
z%s-0evA8(OiVc-peSQBZ$BJUjKhB+bZ=hI&JA0y`S3yflE8Y41{>3f*1>ACZ?o3Oi
z7gM}jp6qm!{keG5rdu<UB{R}N;!Z|O{nxHtJN9{{%L5;6`yJ~w&tIC)E-6-;WL#B$
z{?&3a-AC?hR8DuU(z1Q(Fw(;)Vq<ctopQbQT6Rvj$GJO=e!e~y<}w~uZ{Hc6WSriW
za6Be+xOT9&F+1lB<+w^O&%NkE9+QudWh*P~*i!{r!OwqB_WNW@>+9>k#P#e8f4Lr4
zkuMMXg5Q>f?>>M3*i*6uzWk;SF|q}FEt=nok~tZCf6$Ay#X<<fK8GQCZ$EyY!G8r6
zUh>QGFxj<ut0atkbZAR$3p`Fopm3%%Iy=0%&FIA@^ANw_T2Y#8T32sVXPVj(5ot8D
zENIWE1H%orjTsX?>5IY^P#uv9shV8tndWSFe%=zX5zXJ0Z@gzW(7K(LX4OvrV+~u#
zaUZTl><fZpnC-tl+HCcHzgnE`etHNd%~#aUqBg}`=lnh@IFQklea=#-78sNS25dce
zt$dwex@~XZVirgFOKWr&#8+uMw0yqKA{!<fK%b0$A#w5Br(K#a;y#_BB%PkjO(Vvo
zb>`XR3TJxiQ)%>h3w~>sa~W3@w&_yE{VGGXY-+Zz6dROJZm_9i&Bu<n`1W#fSVTm`
z?4=(6a;g0OmIBL9mln6S_~^*?r!WcTd9VakMr*A1U*>B4$g-{2m-n1PZIZDMs~vvI
zqHu2>gZ~mLT<(`C-#u34GF3G>R^lStKjS85T9>?Aa4psOVTDMSr$bLFyPsLwIeN3p
zbeoGs?pl@E3vmB>fzhCuu^>&X9F5*u<kz;+W%S_e<8V@y*Rf$+1+vFv5JU9lW-qyz
z-9rngO#<8J%xU<Ol#lj1*#j@0?jFN~1O|#Ztb6dg>rnTm#?d3PLM``L$#V{5_BwGZ
zD=RMwn@_og&YcEptNIIh`pB#_daujOgiuHL6Sw6HCAX5GL12i{`u|zV-8?x^!Y#Op
zl7Iex!ee7Z<IMEPzX7UYmZ~4<GChXG9QsL#VrU^@h<%rEI@gWYtX{AzROa|dPs1(u
zMHGYH^ME-SQg?Z@X6inPsdhQz67K9ti$$`ocQo;%171OsthD}8DVCO~#9~e|1lGAv
zuX=E1s&OV@z4pbew0(%0&5qw*E@Gb_EI!)j;o@(aT*F4&XVX<JikyOvDYsuEKo@CU
zs5fJP&iYj0+3*JHA)oPmmx&aYNU@ok66ce;8>!%<za*&NdR{cSXN<M_(dZ|h9C@mf
z?L2!kq5ikq5?`$lVWovi`_kJ29b!Hmq9j-Svd+sjFXXOG$_w+iIChCfg?n)B+-dpG
zAME9UI@T4+i$dp7ZVH?}1uc&c{>)sG2m3MahCW(0G<sLYK(ViorNPUa_+8Y5d$Iw-
zEw5HeNTYTPRBD(%#pXPmH=n8uDz*8t@W(Y=E3DcAY%7^rEDQH8Vw)E%ZrqS&oi$cu
z&Nim;Xdr8%XD6mKE^Eh}`krI75oMvfZ~tN@CEe9jNBWhO;xC3rSsEoM*E<KFX>$48
zTJf{YQp<DGd$as@Cs+SfQBhH2U8Ti?8uWPEJb91Z8p*r-%(Lw7b7?({POjxN&*(dX
zKrW4^T!;Q=_4%X&g!%CMG&;(C_9tGd+}6(|fj0cnp_et^U&yk*);M;p|L>G)&@Xe!
zCpu8bla%K)KDcAg9~g;5mQM9cMv9;7mqptc=rmkY+cdi|7P?-0EakzRl!u>)yKv%{
zncCwp+{H>mibjf4IKPqFcsaoKW9slIb`=*c|G%?@nu&xb<H!FjDr%SDa%%ZYpRB>B
zwM_sj>mDqmR2zRC>SU&4S(<g{WiRTz{Bxv<Vbz;!{?ACoQIuI}z%xr_=Sv<e#XjOa
z_W|+$axn{o|8M{2@$VJ1u+k1+_4qRXTdw!9r>rx*P7A4O!BM|$KLUXK;r|zknNZ`y
z-F^zs?%|*PSAJW|WEddQ{k+Ub#~ia!3NZU=&D;nf*ZhA(fJ*C_$Z!Am<vRW}B7!re
z-s1Qhq~5R~XW@a?qT|-q)+Fo3Xr&vx_1ro(oRS@YV0ZfLi;`2+ozfVHfD<8rF2im>
zL0Q?mh`(-^R=Q1*O7?gKsYHx>bR~~H{UGdUO;K^v_ujQwq<;PL`z<L<Kd*7+t^yx!
zxmfMYeWta~A2-F_1sv=qK^9$UX_h@TyxHRI)}ZlGCfoYEUUK3>h}<`Z#pf!c)JQ`0
zM_xZy9$spw?a=5n(lXIF<NVQaps1v%D%G;h-?2ZRAeb@?PMKK*+0^3IaskJl+b&j<
zeR*vO1zZ}@Z8>S5rWR2A$NmSXNKT!f9&2m!Jh<Z7?(@!5ojN54SBl*`d}|%q=!J@M
zV%}@yME81odq0bacsJG-6oj2AhzFS|_gyJ5dH&i;mOx-Q9zYxd;RdSpJV;(wOSRDd
z2m}UfCdRn$%u#x?w~BWDP4>VC0HbD^83q)k>N7W1sg2<(I!=5*8~6u%>zP)gdG?f1
z)l6@!T;0`y@y^(i8p9whhv|W!8ByE%3sMC35v;LW{rS<}kD2&*EGQKhS5Q;)WhEG@
zShzDvu4t5drk`6a{`6t`nZgHqQ@auhK4iPhJU43)2J(JKHqYksIrB)-l@)FeDRIoK
z(s6O>?i3s<S2V8L#Ah`4EVhaZIE;aYo%7>MciRX?)Hd4W5i7<bcwW~T!*b~uB9Ir)
zsOsR{m%G00Sy<qBwRNIlNiiUSQAzedK(C{L`%<nKTbP8cR0F<Jlzi+j5pL4#JXWA%
z(Ej0RQk%qdpNAr+xA=h2%y{g~LpQf<C)P|&uC@=yxu>f6fb7*;97`V`4iZYFd9l?v
z^k!zCHKC~R5zbpHzK=3a9JRM4I_2XrdKBs$kh3h^5jT*nOE&G~b?9%--{Dh0qa&|}
z;2P(uVziQL4E)lrf<dZYCrG++vQ;3d4HNwK{iA~tBX%0*nbv8xz1rzztza<sBV)6K
zk%^)+!HW1JMKR<iiT=9A>0zFwj{y&i%`%57E1VednA|@fbLhihd)Ywt%tZFs*C!r?
zb0{e*>(?acr+&J;^aFC0a8Y+-_Do)-d1}jj#{s{_sN>CD*)!7!s1{CR0?fI9e1(g|
zP!#(C*h5f)&ec9Y_fWt*kV`AA*(ZBaKcE#zk~JT(t30ttfc7C)CmTRubg5X1VM$=p
z_F^D2QLOJ+hf22k#3-&uFAflCLx9rg^;Tsv1kYNu`WhOHe*63^YVfWNMe&<Be_8b3
zymk;2Xg5$Kkf{6ea>mT`1OcHaG0vl}SSru;ELbV(StchjrPnDjmM_<#^u+|bmOVyA
zMJ2Ycuq1(B+;b9<+k)Uw*N--D%(4!g=+v1J&YtQvFN;u)Bq5DNEfaY%x}RmHkA>|2
zfVKr-`LB?ePYU(cCUvfNHoLxp>7<ZLe?Gg)jteK1l$w!6ioSh*rHIVLYS$H?X90fl
z6xU(oH&MN|g6Sc^tE0&}iZc2A$EI@?7Q!T0;z42)DuMZgeFXgk%%^X5BI8Au)pI+w
zA145-EJ43O+2($Y^F((iuK-O}VrsA)Ux6xEwof^itOM&1Js#0XSjl2b9ml?0(cx;n
z$S7qcC9>hHA5Ng0wiF38N~ha&e?WTW&bYD?A#vOGqn)Am0Er>&`=O0R_7mMH63N3!
zu}-@^MEj3A4?bbh?<jwk)VXxsuD89;Bk7Ypw%Ny@DZT4S?+tw7F3c&#Y!||Pe;%cK
z<<Splx#=cYyF_f(xKV4NuNOi?zt~@>=3H1n|1F)#$Fi1){E7IaMN8L(XmW)qaYpym
zzr3=z4ZJ)OxWTlJU*;h%&4){?W3@nlh;1=~{A3toP%H3|4!b40T%e{y#>KubQL6EO
z?K~M3K2V?CdwIY^J;iM5S-Q9L2~Z&OEG%PHU<xh;ji7{JTPq|d*~EF2Ym#~2mt14h
zN)(cbo1K{sBgIV~5`ouQoCOK1?9Y>!zTo|jVXEfr#|U&$KsvLSjQ)4GHw#%rx6G$p
zAOBE9a*lj?UM5>WB0_B<gh}CKmH0%L7YZpUOIY^kDRZsk{&l7)T10aYZmWB5lGYxI
zn<L15cgAMyBa;du<hfFObPJMs>_Gk4KwGRu7D5`lJh)h-9cvttmR4IFh<bKE3GCl{
z02_*Ly_Qv}aB5YcLyxc6NIj2==7qzdvH=f8dNX)X1}sTUsx|%yhPoLdNb2->Cm{jm
z0nHTf*Wn*2wBs3WM|NC30^p%LlZXN6UQ}7e?v(m|J!7z-=(0jq5`C6mA6>7sTakON
zF1ee%Cp<0GrbgdqvqM+BuYkGsN4vHF!Xe3@3Hb|4EfF1%&g%D$B?8A=Jh~sUpD;;V
z?!p}$B98MZNv7w<Rc77;NvS+-VLQB!wKrzFjF0pNl|GSbLtqty*OiIROb*bdlX=>*
zCkwLAG{3!TJk+}rv+xtJN{S`owuXo?wS+UY?poWdfdFhD59lPzSj0s&Op}y}(gOS$
zQT$YRl=aGQ%ijg>zfw6~kLn`E$H!;a8J*G(0NoOw#~hg%L{zM()Q_1Cu@=)Ek+CsV
z69Pu%99W<8*OqU(AM4a@Y&Tdc<uw^CZI5|szOl_+HP#NInP~)v@Q-XCk$Y=>timU0
zy1>VaPl_$2K-eK3aXmS5XLic~%{)b#w+>v|c^t$%fjmIL?iA@~N)cL(27aa;ytXax
zyrBWq+Si!=PEj3C#GS@NtUD`{+K|nxjEs!3z72rozK7O=e=;Wf3nE2rh-RV0sUPGt
zJXSA+GEfABko~R}A=ZD@k^+%F^Mi_qKhZC^-*R=m_FJN+U^x0YuJf!PreKhvp`mQp
za4}O!5LsfMK7W6HIT4VqEk8~tn~msLo6Ixr0)}PDTF#7DWrxZIiPPwamVICr$5YH2
z`sHQ@<=U%abpk9GP&wlt!cRQmbN;x}`P&suq$C<0*@hjlu1jQx5~rTai<a&?_0Qj|
zw30wEiJ`UuwL}A_N4rG2;$Cr*dN*2Fj}+Bd^CMm>C9y9rEoOdzA&4VYNCU-LRG+%u
z0yQskpo>I*qCwHSzyDa%Pm(!6r9FUA);fh(*PdX~+v`&;hmoRr0sLeWwK1aFCcw5<
zAnZcEDoro^xss0!F`z=2dw1W5r^h9RM`Z*Wl+yKDG2+a3dSQX~Mnqe5nC_jK?$!Ak
z2!^G7f0vqqNK;v%p8)qt(Qi3z#eQNA!<=pD-N|)f+7lkJDQRiK$W*E=eW=Hg!$W}<
z?{)+^n4`Y0|4{~Xmt!>x{e^e{RAgmjWV*+(JW9Hpx-us5x$6k3OoUMK^x*XsNu2<^
zyZ5ZcII~<P8Hcfb$EurL0DX)-Y#Xerl$q;{BBL#aDpaZhC9rpaS``(zd?bdP38vyo
zZKkv4GdpzMHFue)3-axik{CaM+K?&dH2K6s{GCvnzoNs(XpNMvq$=$o`>B~J->I2(
zk`kRm^Rb``YF6I^{cN}m!~-x>ks8QdiTH1rMv7T`O`>6GylX`{iv3_`RqQZ9M|SN`
zeL6J1_>0(@xr+@~4}LA(h6$4PU>3s+`|ui-hsWHV2cB_fv(P>jZ99E>GuF45+qv%+
zNim>Ug)c5N86AHJne1p=ae&0|Brvfk(o<1;SWqCuMG{=I<rbCL5L>gb)z3T-uy~K0
z*pO08T1Uj>^hmZG-3me~vg1J8bkqa@%61H*kkQcp?xTa(4UnW|ut$kVOaz0}Z~rVE
zfat<T!)p6(GJElv$8ZI+s9@1UzT<#%^kyg$?{e4c_}g^Xw&b%16{5J^Z#EaPao@;i
z#OES_df3h3Ezucy;=lRQUC@VF)(1qg8*dLA=t?XJRf*LmYtzANiHcMM#8ahkK=&Ze
zg)grO7LJS}5@<?;jRrzkTb>hX<}5(SeQupV?M!=ep~HEX)Pd3$Gg-;^fX&NghN9%y
zAya=qRg=qXMRCZdu?O7Co*s$~awxg7LYM=y+rpV%JM(1i&Xdh4*;A%mX<r{tws5<Q
zO|HKi0&<+%lh*kmOfIM-bC^IlQ?&svCEyx@`)yGlxvk7qeKK6AoFnI+|Iqw5Wgta4
zc3rsu=%9n_oppKjyiHebaiwu!1CHla2HCYTrVr1TToCs6L8p^iS^>r}sceM!VSuT-
z)Sq*p+~-4<^dTT%uuQJxq=xBGswIKGB0Vp*HRXAfls{AC!OHAXOJI#wPkJ|8XC4&m
zJQO(wEHZJ`?Y@fuc62@ThcWOE|7<*vU2o<{g5y~KbCa5VbtZ{2gyRxzyvAi_O2v5b
z1hPR3QKvzw?vkWLOUPZPamGVz_$`?+=c#e0&uVd8b~BJY=szDc3z5=_Wz^D-hRo~B
zV<@}-rt6(tq**#6k3sqLLx?nikZ?iVL%yCxmE-tK#j*W6Usn31-<=g*tvch_OhR8#
zYr<@MKenUp+0>p3gFUh75<qYmVq552&Vs7iLLVJxA<Tf>%=A<uFvO0(XVuwqy&oz^
z`EkSm$$FDx1Bt*sRp%<f&;k-m#2t2TU%*ibSZMKq#=d>5j*BbPe&`r3G|<E{BM4x`
zE%+}XpE>$H!@Alxkzj@`;0IjC>LEUUb{&`I0}RP<xU1%{)PisUW|zsn+9rbC0Wf`q
zt&IrjTK`}P#V<?9HcA#%ZuxU`n^d%{m5D&ET#4B8E)0@r?6II`vRP(E^9WfdsD+>`
z?yS+%WC}nD(=c^$r2w*rNw6bBtYcc_`Ws}#wQ5<8cJwpwVRl1vi4*@pXXMS7Z5E76
zew~U%3~`y6t^wsOL+!cjHLsinRl(8*7-ci)Dvb`%c?Mt<TBz%t<9__6{nHb@hQwQv
zniqU&G;icKDlfyY5>B|lwPlR-&e&}61&dcm1DCqFFL=KNIu1X9cmw`7*RUIaIbqhF
zD&L7x)t*yMi@U(wpU#3?NLQ}oW_XySJQo+1c@+POwTBmfPYSQT;r3yuyRtmceIIP|
z5LLLeCYT~lP8GH-KoXbJ#BdQwQu$u&>zS5P9Zw0CqLJ9nKrLGoHZOSV#m`sQ8+pu3
zi_fP_Hvgz7s@s4lOPce&EZ;C1h9JrawItDrD2w}z2y<lKY4A}3P}Mjgqn_e=@#!+=
zOmeCmRj9TJF^#Eky>@zhustbH%zlt#Rti)8q2st-GXyuNfkKdQCZDcc83qLuA-(?I
z5?8(o<m`GNLt+UKWfT_+0lom<F)uBpg0*Y0U5!1EY679Au6^*E>$OiT$7g<(=5&*~
zQD6u|H7eN&-!D=n*0KG#4kwDAam0nDH?y<~(2SLAcZ->+k)5pza>7R6sbncQOwN+q
z|D@cG^p1`W;sw$Ox3%i7O@c_9Gp_WD>iYhhW2Z*rJ2pQPI;jZngZI|}MiY0Diu(w~
zg3AsH4GAk0`Vya7>f5vF!7vGZ)Ej9!p~!^*LJH^qqIAVJ{&YSnZ>;p2_swqIE>%yR
zsSXuJJ|m@n^)@9dR!Y^F=r(OZF5=KXeFZO^b{G6|gXpJncbG(yhcANpLb#>e<WKx4
zyOr6fcaOJGajPD{z9~j<qs(##2wT5{dGP8siv&>{uu)xwMUX&@Jz}x-=6g}{K~8|K
zsqhuOSV_8_w|KKW8LE@wcNw($DE6iVNT4+#s#C*tv0nvYtCGr6+eB;zvXPJ6-f$N%
zBsxclinGC<+e7RxdNUx|+d^NSrLqepod#hBZbd}c(3|0A7NZht|Idwb08+GS;8>Qy
zBP10*6=`eW+pCqpQXh8+Z=^Q8IL?*ITBT_zbHshKuMB?;D=h&E`<d%ZHXMSCR1{O*
zW=rYrK4`PrM5K-^;7X4oBDDarwbU;nbxxXP{&pYYo@nbwBdMkiRcZ`)$2vX@R-#Ny
z$R0pEas!*a-Fkhqg4Je9btlPp#5*854FEcEK+XWk?Swl#^cu-D0a61yjif4q*5Dn$
z?8vd_ul?cbZF+HVg%C`I9fZ3Qsf}<;_!<fG*{M!th@3ni6^mpkiMMTS-+KZ0IZ(i6
zi#00|$%?`0F0Yv<uLF_S-?~yQsjbc5(jC8V64LdZWXG)<1;%{{v4w#Nx`ghPL}W~&
zD-{#R96lkT?d_0Cp}BKZFBhoyBy64b0p6SFShLscMoU=ppKqV@M)J}c*X4%+Jl!Rx
zN2MR^*}79DW|!~^YL<G4L@TSPv|$?-OR+d9o#&+7cwMF^{p~xRCAA?}o%poS_0EGI
zWs-WY3lzLyY$(<x;MA*Y1hWdERt$tmOzj{g;w)wIny7qTV1_h0hS-KV-UTn>uyEh&
zH3O)x+i7GYEMpdZfo&>z_n7{S{eQJnp1UYeOoRYgFd++M^|efksgA2GQ(vW0zqsub
zPj-eZs}B`-=ayB}WDya$u-w~OoCZ(8bm4)2-vwU^W{Iv?I@0Gr3G87S9lD4pk72R&
z#BldgS@=_De}EfFF;b?9mkmN#jAW|F<wK;H!T=Eg5RhHM%oIZm;QdIWi;EeQVv_L)
z83hjxf&(0qkdW~8A_YZpI5DWg`40kZyh*l&^vFTHLZTh)JO3#M>f3f2bm-*GuvCdr
zx8#RWCm-YwzK}Zl<(7(CB(N}I!wX0WNtWseBLXp!fHXF}JHXQ8CQ33QGLi>f4YS!U
z0h?7HE<KU2UglG(RUe@rk$wlF&Cr|wUL*VVsgJh02?rG{F8@6fgv%KeKDZ0ZgpFnc
zay6S2357FBEaHdl3rFDAMdaHLDdvr}t4rSA-DZi6Tf29o?t)=3sWTv<`B2^hSP7|T
zh9^hR5V2szel&&1H~OQAfKl|~`K<AZSd@3j5`5pv73K7?dPQGyZ+Wp2W6`oZsVemG
z<Hz{rDAVKjcjd>Ode#5_;L7bZVcQ;m0#JzQi05*(KkyyFkjh$_yqNjomc^UTO9H0Q
zS+V~v0>EE``1pccYBy0`FzkW1+dNol#BLxv6s+^$?4`~;j~)2oaq%0)W>4Fbx|o3^
z1(2Y-AfZHjqB+#hEMm7Qky%JaCj^jK(AU5&cmQe*O)n(^8vqB=v1z~U{Q-vnoVy9^
z^fe6h1@&%OEHSztlYIiUfd5Y-G_XlO@VviUgN?=}>;Hf_z##J6$a&U%cy8i35+9I0
ztE<2*3jz~Z>%R=H;E$?bNhg)gUoigb1}eu$75aZndQTcF4FTee8g&MQq5#uKx+l&H
zQ~V1FW+(T8?8ZhjD0=8kY;{D`F5EOS5=uArW+8&c2jrNImS)ra5GjBc%xM?EqrA=!
z2v%N-WwQj)3k3@l`sBKxKWD*82Btr#(aYF)WU8C7zC}<tK$BPs2_OPfQX*qu0IVtm
z!Y_Lw<-wXy<S=u`rcFXn0rnxwTrQjU?J$u!NQ+IeVAE|PqSF$f-nGuU|F%7vxJa=0
zk3yz;aOrxkSzbrdID3GV0%<&89jl0SR>mUfTALs}q<(#TtDkrWDC-4;PL#=tM1NJK
zOmwHS6zhnPRGWlu1F`Sz9kmDkq=pLU+mBoHq_%~UPKSN^$LJxCAO9sWGge}Wuzd=k
zwupTpJkwun^5CuNNCzZdXK984r#ERi@<;t3&qEqQ_>>hDz1)^GdaTSz8-Q-l&mQDx
zK}?(4BzRDJiTt<%Ju{@Yrfi-kD=AhLDmnqA4+g~~QVUWv2vzIDE}-^Bp$n*uo~6@I
zSN)4Hb(cdB{)=4jA-YPy?}ZV(CRp$A9C)Vwq$16#W<+Zfg8yh-MM40zb{CR~9vGHR
z3&M<h&(@qbozMdch%RS>EG6{$?=@(sxwA>2n1nKN_|Eh5smR<PMbP>iS~?8!>D$}k
zS(FkIeAbm25M;8MT_PXO*7JAv;4J&l*+Y~%7&fyVJjZ^I`|JQPyAL~+Kp7$xA<rAJ
z?DRfH;$e335nqc!7s9A2{kg3m6U70Bw~}Bb`$BH~VlUAhNs|?{F&VT6q5r^-_)TE&
zJGbnXeDxe^0QyY!kYE9}4+K6v1DxwC;WC{UV_N^>!V2N9!DP5Jh)he4E~QMi{Mck9
z#Srxxd;D-773@yX9g!P|Bn1Pv4A9rwxn#YL<6Z)q%(BK>Q7>!*8dth1L&@8X%ykC&
zl!a_#4%FE9-+pW(h&nQ4iN=RIeR`dG9fU^eEz5vL0Se=;5hsUa;|Q_X@bKBjF&K>y
zvv@HM89X2H9f@&41VALHhs08Xq;{?$&6>aFj2E#!0W)aiC(&_6q-atXx|8tp_{a2Q
zsx9O-QjjqH(t)BvWFUY;FS1;gQv4TDvW^nh6rC2gH|+fj7u_kmnu<F$$4Ey@Apl}u
zQy%-QYyx3j+%UlUq<WGO*?A|ca983zI0A}xwm4%Y3JBOn+f5jnMv_Ea%@Xq2#@gjv
z9+KRM%s03BPYs&Ceg1E8ThC(r-LcSJ^vTMiWRsQ>lj`q_<9erxq!8k0kiEZAz)Tak
zPIrMkb{R;q2<f5%@KaBphU`Izb3e~w$pyT4s(L`us}0%M$ElyMZ6Y{tJ$wypQ2w;X
zU%$6ibZP)2kxFlcg(Lza$*LsJRZBuBIRkCW4$7M)e5+7s3^Y2FDos$jOB;nO&YT)y
z#-4|08iU<Oy?6Z~Y*vSpYE9Zh@J;RXO61x}peLEbS9@PBqAFuZB+YjBVMk(pLV?Vq
zE}^vVhHBMdpJ@oEz8KEKWL@@@Uv?xE0$!6JJ&3M%j=)rhUl@o`*iJgq(4G``n^{a4
zA;~qX01Q>2Mm!{mCbQlcU5)vWJnvmvusFtN0?!{OduO;p^xu4vbL0uw+E^^iP*xhT
zdjauV?`xO=0Rk5Az{@qttZxG@wFl8;1jwaam*4o_>*IQ-`1Q>-1e+)+jegX?6~!Bg
z_kxNOl{iMC;lW-#qE$pEI^dh~H-6dR-x!ey5-Lt=v_Krx?o32FZL7B^*iG6_s>Wfu
zQcDDCtKR)=Udu65g;0wJ&#G*LZ`D`?t>+P$__%qIyME~P6cuC!!5s$U32xUR)9J}E
z!o%L&+2n&pw`Q_>#)-%{0g*9joWzK&s+GpAI@o6tzLR)LUJmHkkZ4E56VsXevi=<8
z0Ma5#LbaeeUXKTDNYZ}%yIU;mq8s%_8f;5|IQ^lj#17TqYUL7hGTBf$>9KsK7-j(c
zPMS2QC$eWmAkf+Z$n$RiEH5OJMv4;%A4~t6P3DeYtclHqOv__j86^r3ZEO__7ix{H
z;8qw#C&(V~sm6_Xe{_vXUICDWvF+wQkBTeC_{k0=@`|?eSc_w~u}%SWAb(P0NWK(i
zmb@}Z<V&=_5>J16G*6-x4G5#JJBhLaXKdfLBe&n?Tv~uj66HDe#LYy&e!@4&dZ)d?
zWCj0EZ*5jMB#pLvin95meNAO}V1QRD%92P~uw(qLGuv~}K0t^Tof<4V_c6l`;B2Ei
z<YmLI>Uc&x5rLK@ldz<RBC>1rJjC{$etiSTp>gzewq+KWUOPa=$+o7HgV&bj-x2uu
zU<H$Kc{?BG;r*67C%>cJjeiqA1b8$bp2!1`>t!2xIe_(InS|!K-Z_AbNUQ+>S}}ZR
zZi7mXTwHDp!cG)tk}B&_tHHab`)^j6BGnV{vk$$S%b0|DAy$sp*G!EMO&mzS0A-06
zJWiH)x5yPy!^Rkj?v*`h2|uz+P#Xy2+f`DV?~*j-Jp?bw<<ZLO`4q1e2qmGt1PYj-
zK>)qb;Xc5f^c{+!XHc!ipy@q;)9Uoa8YA)H!BmiCLI0>6=T4R0rsvzF+$ZZ-8^WZ)
zrb!8b_#DZ%iR@30AN$k3SEF(KB&jm2G(Zc>my6an!ydS+MVj^aEg&VRqm}d?VM7Rp
zBDb|W#chaaPFgw0&49Xfi76o5FaT6AR%p?BSBlve?0gprV^o<F9=CeS?H@DWk%@Mx
zPzX~sr*G{hoiZWDRWXO9u!DoQfQ=Pmp9ezZAyW5Ph{r$4Kr%!<S^@LxHrlmO%G8WU
zxP1z0NdFG7Tx>#Cd#GQl+s+s=gYlI;pcRr=1m;d}80s7mq>^EcDxDl}B%QsRyhs3G
zr1{vXmU&FJL>~;gh$d1~AJT04`o=0oVXLzbdCw)t_5+2HCxt3&*R9@|<wSh@FgW#{
z%`j*wUd_4vcfr~qgSN!ry1hw&YYm(9Tkwq0cw^GR4ke2eQ7j7SO|I0)Lb1??fF%cK
z{SS73&!yZM=L5WtVBXKZKeF>@hn|TmVF(}wp)i3!M@A1A6#M&<S;DR$0jZ#<I1q14
z%6%8we;HAANJ)mZyUvrBNDD-`(6n!V$|&x5aVwfasIz~5ublV`{~`2#DH04wOg;4f
z%sNFV1;pvYHD%n~7kh$W5bc-j!Rc?;C7NM_Oup_UJPOcD0*_P(b&0T;n6{806Qsgv
zQLMzLfgiRx`X6qrkeK{N(l?6+=_1Emr%bwZHP~Kc7^<D=D+#5`ul_yrw~%YA8I~Vi
zb?Enbt0nh@-(WZ)`Dfqe{c%hSm6TS%tavE3U(%6z(V?Ru>)U12w}fxW)N}pY^B8?>
z>ZONMb-7Hyy^+o|3oVhv5yz2_MQkFCHK}comy;$ZC(F~Ujrq>L&-r`VynBZP%<A7G
z<F8%4`my&;6)X>DgCGkFi*ZU3P)7dgc!u+#m>Y^}zIjg8z;O1DKmPBnLsnK+`??Ek
zY-~m-h_&gqvDN`TK0Y!h!h(W=IF-WXJ=^DVyaAwj>goCC)oa&`01(qG+hiYZU}tAf
zGHXzO`SN8)T}rG`MTGu=|L!9u{By<0oh0`@w;wVZ`dWIBJ_r`|3`!Qc5hVF1P-_nE
z*s+7r%s5`PJb3>CT(MTY9Fu-rNJM14(m3T7ckdnpljz0w>Mv3ujaxNkWMq`5S{g3;
z?Y9UlDyK?}<~&1keua|E@;kEXKS7B69bXIwA$KdFth3Ulr>E7^ZK8*=!oxX?_qRbF
zc#a+<Vnvqq9rNaVihO+(7^Re_MAR!bbpeO{X+6E93r$DJBcs5kS+pEWyD-#Ip{T1H
zeEIU_@_0Qdw^xX7ippg=O!nb_UrUP7k*^dg&6t{=?iYZ8N#2wPi=7&((uq)bdhFUF
z<q>~zCw0hJ$?L3{MK8+t_4S?mf-)bY9C@-u<sIebiRn3qvbE%oKT3h4y*)iW+1`|v
zo<PDlAusQVn>RN#>HRm?53c<+r=faL6J*T(kr5S`-bs#Q7HQEK6@x1<s}Vk-hd@z}
z#6sDaJG2!ukx8wMx5Qk#-MXcSN3({HH+hUva_f3#=9u^I<*)wz_gVOr$9j2{Y;A3=
zd+OEp@893ike(8u^z8J1_h(qbFlVjbx?Eq$a#B*#HEDv72wtagE3ZT(M2>01XBa5T
zQ92O&69jR~#VFthW29+%2$D&JMaw%o`Zm{pc8iN^*QJ=N96fr&d9qJR>&5x44KIYP
zJHi2CmMmL#FFQM1Q%h^9pyKRwFM0ZFL<e82OS6u&AL@9DRi-IG{~Uzw_0S(?vt<6K
z&Q9Y@C!EF5Tzg@Bd|dsO;H!WBDThC!*|s?NyxWZ%@6kBLuyJDqpK&FJu(0s#CR((3
z@lFnoqv%-h+_mob_elFJH7C$q9lahr2!{&(Q9Xxf{hYkKN^3cl_6YHyJkcK^J6G0W
zT8b-FpDjJG<I6%fAH^`)9jvT}9zA+A4iWeszS!Or70E!>C(#vX-?5`2{KTujrDwy7
zL!IM$t;aKanrmumGGM18C3b+LR(eQGheObN7=X4c$(yAiv^A+|X`EZPZml}(Z>RR>
zk{rkU{Ctw!4jep~bUJs*6darOH_8!8w0ZO9Em^WeK~j>k{1;9p<e%Pf?Uc>x{~{Ux
zaZ8Dd+rxM6!=d8hVgsmpl>8Ub(N9-MSITB4rnO8=y9ESNHs2x7k1=rW-Fu?QPe4Ug
zRh8Lr$(HQg*RKhJgn)4Kz<~n{jEuf!ny~N~Hf#w0^hr0#wC?1J6)P6`@0h(`?i=2@
zS>vrO8YuRN5M^z3B&LwulJ4cT7LiHRE~%y$FJ8dPI3jG(d;{{1z;;&Fw-pt99*K)Q
zt7L=3qlY@Hc4CUrVXk=k^y!Biq@<*h&XpZO0NcZ4uZTN!aLqnk9JTu5;~R5YGRZ^l
z)zQ)E>;|cCdvQZCP2*4d=C`Z1Z{OaI`f86O83TVErtCPjZ$G?vC9@LZluYbwW<s_M
z#$kp7mo8l*yF^q}bjg2ldh9p1IR^&FT{j%pdhJ^m4~lz+>DFJe!RK$>x>bfY9TI;?
zv|3q?LJcnv$UFAzkte&ZI{t}&K*0C7KgRrju2<0cS8>M1rIJ0G={7Qe(I?KH4L=#J
z{`A3v2fTYR&t&A$SN(wK_HF%l^Bm*<KNpFVe{>o7;VF)J?cB9X`r4_tr{kN??!wPm
zUcG)@Bi$4|i;CAyh1|SgYpa6Pu$F-#1RCBm{0`fh*(6N$Jo?}16vA`{NVsU<y?a;K
zVK^!yGt-h+_USQirA&Kktu(8fczTL;`}T7<GC@KGrfCfBAGS#aX%ljT3<?ZVnone;
z@qgQo+IL{i_9KhPV2LWWWchM$tCTASvbO|#?~p@h4KFSnY%2*;!87-B9H;13Q{&b|
zK)~iuk3W4CS;*VW-^0TL8I{B&X@z+4ii=#2daE-xt>FX7y=}AU<5SMOp`Hd26}@Qf
zO!W^9m*w3AFV%}!cibZpj<o|){EEbO?lh?>!`*c|v3|hMipb|OZy9E<eP;dKdV!DR
z3mbG^3+caO*sU5TNwPemwgLrS3|q~@(z0M!m1I?2UERR8r3|)nwliuHj4pF*=P1<^
z4e7QjC=c)lLy=6^{#Wz2r2LXScGGAy3yUw2N3gFRhoDb@Q!DK`HV7*XJ*^dBCquwM
z$LR>Qmd?&dC>sSOC2}A!Cs981<5fT0_D;L8#?=YG0UpmrY9nmzG!*w=YU49S_Ew$Q
zXJxps?$w@UiC(jRnf*U`Ud>dC!%QMJ<zP1z*)A@LWk_?9{85-sG>%r1!t65RY`ov_
z$U7fybH(x+%!Hkt-3}okHRJ(hO-=ue{3hYR4NqJH=gANHi#ePG?pDS|Btp#V*ROf^
zy1Tm@m4)6RWjfu~oX@Pj`oCN%Den6VL`X+Rl@v1Q3lQ9`w{joDRyf9;@$GWWIgDw`
zVmh79fP@3JSk;9jnWrHkXVSFLhS%KI7KSp<`PW|uu(jOW+!UavF5$2Y{<d`G$|}gL
zBorVEb7YqBl0H~C)tUq+RYdxTiiv#>1O{sE2CQxaL?Es?H%$SRz#5T;$Mz+)i|T&|
zf^7Zu{Z=$J>r9OFR^d^t0iSlI)FHEjU?@_I^RFsmZtx(FkUxl$x1L*DTA;YDP6N;f
zrBeyisrkzng&bexaeS?j|8B+MFI$X3%#{G;&uqAc`kLf4VIu}T)gX){U`LzbF17BD
znHo4`QVM^Z3@tY|x8`+WpqK{mGVhLqIijadog%%rfJ|?AUDm@q{S*ws$bs64^kr20
z{MjFB|K8W@#LxSBt~qhy1S+rmnKL0~A2ZI`2uo&7pidT1_Ai<}h&U3=8H*zv?{=-b
zg^6ovZ-0h;^$ZbYfJ3C)0V)Knn%*oweB=n{zI`9MtmGGNXJ_w3CyDTdFMt2@>Xr52
z*Arvo<3?y9HVFHzS!D6z#huP3fOFQ^Ap}3q)?GQ#zQ6y=0aziwd%H0qOL$t_C{N30
zZEX+@_KG}>nloFT-mhODGHpmx-XvgF4snxV-MUb$vM)+7!zO?SrCaNEH)H$2+YE<_
zU}0m!FS48B=>PKN%U}U_HmQlij-zq^{PU01P=|72*?ia2udqz3R;@Zlqn$ucHm}uJ
z`Jt|wL=JyZBvse>uO~NC{XQuvDQHiR0Wo>k*r<cvmT=VIBH2#oIjJgyea8+Oma-gf
zMw0WC{b4C7zkt=$y$yI~FJLDeg$#kH%#h4oJ1<!NxU_UR>h>Ud10kQtxZSzKC$gR5
z*h>nTt1GpfSyUAX`tFr0S88@(L*%d)mdwi$y?WkR_GkXhnF6{b3Pw95%MT#@Wq11e
z`xU?vl%v)8>@Hq>e`*0`@d>FA+k|}Idi2{VWae!RFMvvn(INn8F$AD~iQsXHLzqiD
zBYbAEO@cJy<HGmNq`f(n%O4+JimNFzGczM!S<N&2gF4)Zti-T(?e|iJIBI`W5v!vE
zA?H0>*p*{6Q|A{GC}VKSc?GrT$&I2CU@!cA_RBNm?c1Y85B45}+vjW6m}P3IL+<lL
zY9(PQ(4&=shh^Tarrcs7u%CtgJ`O!M35o9Tp+nX6y`Minf{Ckz7Q{x5z{Rc;hYlSA
z@)Q3MrIyIBe!Zck4mii`)w*gF$x4Bym?+$*<Q)TqERUdlysSiGCJM!^9mgj5M^uo$
z!LQ$cEq&6`+G>CwL<KS@ldXBxaqeVu)GT}T9yx0WS-;NelBI=(0wir-i{_(ZP|YmP
zpWm|=lc)xulw{Mbj-$C!H~^qvVUaMDg?-bG^FXDaUS2^Wmo8jQ6?*&oM?oW(_F|Vi
zX=U}?I`h`8TOb@^cehDJAZ{mP7rM>|HVr{OA(z_3Zz6x}nA=bmRMM}IqRY_8t$+F&
z87ogy11ieD?*%Z%xP!m#_<O+eXLgg!x1JomkU99_crEa`bpUPzTK@<VWjLIEHiN~K
zX9!qNJM_wOm;hE_C!OP1ek0(G2#u7On>F)Xh1@p_J%%5D4zfk1bx%*vMlt*6+AdSp
z%Fh&XUw2Wc?_Xa3kqOybJ>z10(P0|gkSFMIivWrucc5G|a<Gq<WFOck5%v?K+>-_e
zDMJg~;}<UkY4qIFr2{1ru~4PL!FkyX7gI;OVVm1S2^l0EIhEGP2c=j7KHVH@4bKAC
z%R^|-*FcL~%DQ9`LBX>KXnQE6JAR+<df+QTj8IsTP^df8aFo^f<>f`d-23)|8c@x@
z4Omhf>e7Khia#^rvHO4jy)YBH<2ce_bZ19pbR>E&6cF`$@k_2^gg#+?I*=VItpQQ1
zpsjCRzkWR#%B1zguxTcY_HkhR=uTK>5u~62QT+S6fGC4FY~G37K=09(CRy>}UcGDn
z{{GU%<hal9Y3!84ezad#4r%ETiZW3LDy?C0R>F?^3w#iFq|W4_F7e?2Ghly7S`30l
zJ=NkVDRG6%sQx_c_O*Zh8H`^#mgaD}6wTL<FtPGExXWAo2H)w|zwcH{c%2plLy|zh
zV?JD_rwuTYxW!~@n57^5GBwu-eFYCBn4dKtgb6BJIaFhr;$@VlBKB!HvTIazwJOav
z_F^_R0AW$hh18Z_!ni^8E1=kJry&&*rwJ{-5?*+Q=5nZE&)`pRfWFWN;q;!Zh)|AD
zjenf)OHjZz63?SMa>j&y!nWX15xf2nxS~;rS{mJ`GU{1Hq)H^RAe-TD)Z6H(i4g+)
zpiSIm#W&;tY)+!4lYv_FVwZP(czP>FJCl<Zp_83;GFCep+s@CQk=l}yY}OEgcGR!P
z3-k~OS?8MG+=<XkP55^#d?-rtj%(jf`u}A8@)>wC8Ms^TVO(u**xjQBxgYP2m^b@y
z-sqqyK)Jxl=CF`-_7kR!Sy~`}dqGa$rh<n)gI$#7IF^9#t&G-6KZ(%_;CqtVf2>@o
z0wNFlYzxh(^zqeXlbXNe<mAXjsT{G{IoattHpY8BW8;6lEnqcqIMEb}BdDi$e=@=E
z*z|ssna55s@F3Gw0J5N`PLK8>$YuIjPL3ogO?c$yLTFaay}jmQ`-vD{fkGj)AIQiT
zey0(DaD-x*JL?XR5jpX(kS|S5WDY56HHnWMOHTii9Pbl?xOsCFAjwfWnSp4v#3M-S
z`^W=1gAF|-2oY``jqW@(q=H5i%YDmc1H@tV>eU1)28h}zd}vs~A~7?W2>K>X0*i(0
zL4ZAV4;8rh+#AImU*xMHFkHuksC&D{#O5XwhU4({WBASLIcO2;NfDJXT10Lk7;cLM
zg6cS-qX5j?2?ZHP$Cym<5-On$&U6`I)%K~zn+ID8d<2AcVz}rkzf1TV>hrPL!*u}X
ze*X#?Z;-~01y1@0X)NOs`ImKJO5A*rr@>7$z}jkR)JVwTtW=@wQc7Nu-=vxpbbzA*
z2uXw;LKh?7N_Y-cdih3PoUH8WiGic`uBxi4If|kdk;8OLR;;*xYJ-KfHCeotwl+f>
zb=+wC3NraM{imG8@BA>#sbJOA3r*Lw?c29Eq`SAEacRe=g|4d!E_x_xm%vKk`<2Bj
zamef*W>2ye*eC?edKGD+maWpVe=fq0qB~?92U59c$BrIF^Wqf-78Y}hljMd=aYISx
z!np-J_Z>QHY_&4&mFeYSaxCaKtZcrGVt)}wVh~N-?L9YDa5~;SpAk2MDWngKc#MwL
zzzAsK5B&b5av0E5br+|ucGJdTNDV?udFYR}1Ihe<Fuhw^rY7r8*QYNdcFXLS<f@cY
zuYR-cu2rTh!Dj*;jzlD}87`vs6rjqWY*{vBIcb0n)Mh!QqgUVr0d~BwY9k<a&=hd?
z$*QY*ubsU;k)v{ULH~u|7k%%+0}ew&LnguVlAwq@z=@RWX%B?fCuciBiJ&7t&b?6b
zyEa@S6;sv6u<*ftY^!_cKYsi;4hNH!1_c!d-!sU=s)OP2H(WL>_Rs(0F(v;G@TjzT
z)>tTOJ7_5mRHaB4qsKXTR1dlS=x*+|HmjegDEA+90R+PpR;QOoDDOmWms@u$mxP1C
zaaq~>v9Ynh)Z1wUQ-G|K;GHfZr4aaorhx8Th_}LCJo71r-ymv|0LCrZX|?fs*K3na
zjc3;~cRg84`~I`evU4775X|Z<bo9y=48U`~Eal3Yk3dxnm5%2^w@V$H!2&9I^_n$9
zC~U{+AOl1Ecs<cE?+-ZNy0Wt$J^RL|@d=f?iCq0iztJywp`UE~Mwb~aI#F%MCMNu_
z(7}7`AOsB|-5;gnWJEaWcf!Z@N$9wT>bRd?9;w1THa12+4BmrYuQ<RSv|Bwt=jF~U
zW@G`pL*=d^w=?MfH4XKnU&l0a3da+QUQ1Kd)w{qH=$ALp3dGr@n>AF<Osb2-Fi=DK
zO<xd=ZP3y0F!H=fD|Ft4@SE1f!n~E%E-cx7?bHT4-+wg&+)Q$R+WF6yp`H-fhPOR*
zV2@c?SnNZ{NN)A=^bAHgwxipN9vK;ltQ`z{3!<RwY8WeegnzN-=jBO#{`{HEa3RHU
z8*OPyi3TzxWPz&5i&5$u8j6q+sNiZcf;cIzwzjgbKY@wcbDX*0z%HiqnpE#u0y(Rl
zBnH4q)SR!AK*uu0zf00au$R=$SF(uL&px~F_3PIIoyX77Y~?mI#GSf=2!&e1jq*Va
zzwQ2|jD<Al{9T5cyoP5U#b86S{SZGYZlTT0%p+992M_*0ebJLxhmt(AM`AHmy;ESo
z^46&hN3_wk^xv2KpHS_UVH1I#K11~+eM^WILg$w}Yi>S6NCfZ<(769!fBn_m)n$T=
zb}abDjo-<4psW~u%t!?<E0p~M{yn(}+V(+<lmX|hrYG&(U2vo3d8op!U2k(I9XEdW
z9<e-k2|vfL(CG8$&lkM3it^k6(Gk&hoDQYw>9=p^u{Gosx@!`pa3gw8Pu_gWlTkoG
zz`CpY1l_9d%i#bqhuI`Xxh?+p4n@CA&>7VDxAy7C8f*shDGpA&y=LpIOzHRF4_Er_
zDOF0+reVfz|8)+6bzdTjd1?s-%tcUcq%^9UPmGU0B(oX*VrJmWbjd1;<7gKKlMsn+
z%*wi1Hrc#<b_DO*e;ono)3#3s#s7B%j*gB|DRP0L=T5Ux#}XVS;$>xJ38KkISAg66
zVLhyMKQq}PHZ4}a?8pR={T=N+KAnMqfoz5gD2^qZC!@}0I`vvEq?XiX&Pd>M65B$X
zD9IC_T3YlonHU+#U9jf}H2l98>~Y$m5ThxiwriaiPRTMX;^N|hbmsLszn3bs?49vI
z9Oh%Uxw!#_YnUgpAkaMp9t16*5_X3Df_Lu{5KSX`j&$+D+){KzZo8FB?<qkCgQ1Q6
z*t@zqkymYjD(Fe@?-@l3dY70-?NdOf#;n<o-;t!{I`0bc?>1le+FAXS5`CoTk^>}_
zmM>rax`{Qy()=@7f6}JSzp%n}axc0ImdzxiX<>b0+0H%-;a|{%+O<<h>6<r?zFR=)
zW;+CJ8WESGf`j8R__sG5*h}U-g~ea&1D4|FzV8Fd)oK~zzC+hesk*pbfjagK|8l~^
zM(chg1YxTtSj2ixAK{O@^U-vknX>KFBR$hP%m8=6dkXTW4fc11tv!-^p>Z?0<PyKG
zVj<tid1Fm@CrbRZmg&cgnl39(skzePu^WK)XCV`&IZY%J4HwL^u<PEB5dk1P`N_H{
zj;#P|ruV(*bm?~sp+y?Pdm^H2&5vhr+whY=!UXt&6F1jk1hIm8KpRnnoBxog67pXO
z2mp$$K>J1opukXXqYn1?taLSdcaFb)9VZakIdDF=n>UF@4wy%FyXyf{bo-G!iLsj{
z#sJEX_vNt=9`o;)ywV&6h<sGvhEO6pKMwxJ;;7(pdc?`-h$U+|kEYcHBm&HA6g(e{
z_KH&m5&vDn^?>_N*iJtCG@}95Lqr{5VD&0^O`YiLB`0aHP{vx>7yC|+jgG$e{_anB
zZk;<Nn$<5ZJRyP(ni}4|dUcSfqhJesXwN6@G+l$zM8+eeo0MX-zH9@2$Mw#_0;^5X
zzg5$YQ+`cQyoh}(Gg8E<9B~$)ZyZQSDgJaWZ^1}19xI4E9ug!^(0orlDfi8#x45$i
zX(#-PFe#k*d5q?l4iJ`eUrPLio}j&goNd9??%|$KFu1cIn!<p2FC{L`aL0}mk)ML-
zstR3I3EF&fYilSDQbr<Q6JEq77f4u@zC%5j98tuHe2#yWvB%&EM?%OjYRt;~c(G}D
z_pBRn#-R(u&=@a$BEOEVqism4z*!M^9Sjg`C4`#j>gs9}5A1mW+wE`|XQp~xDzcm$
zARQb=rhxBvpFAdx3y0j4qtQq&a7%A*EK2CR+}y+9OD7<n5~vKE((F(SIldB2KNZQQ
z8pKG$70Bk=$a^*zXQ)o<IFCIYy6BY&?cxa_`JD}W4?vE+2UDHM>Mbo#3A4c(sAoB*
zViQ>28^HVxLec3&`?=E9C2N)NhCYbasyK#z8=1jp^Bvzn@G@g`a*zOpR@U6n5rKsx
z;yl52!@Z3`>>id76Umzg$jy-MP|u_D7UE?p9=v^}rKRKe10Z}D_yp`*g$S+mWYXgX
z*t1(osH-7e1p<^jw8yAkt(7ZR64Ht}FC%&u6H`E4*hB6WEaFi(M|0fS*%^WU5H=((
zvz@?8NN)0H&z^O6R}6uRAclGH1Wak^=-|YaNOdFKFvE?syQj$l%ziPPKpEp>WdwpX
z2)n#ml0IYz_;&;<R3A=I)<wxp>BXzwUbXGW1|Z3Hck(KiWho7q@3gu>*e?yEq(i=8
z6zgs{x&h3(px;go$ASztIn-x4)osALR6=9#u<CvL^zw+b^zFT;{#n>5jCW_GGcS0%
zTwH<*8HUodL0em!KS&X}{2LvYDeuqxpzC|d#!Hj|X}pc?q(62sCMM<@YZEzPT@ZFj
z6g@nTca0cArwHgte%2^AGDA);HJB>&nj7Q>PVqVnF9J9}B`7F(W&^RW^Z<%0_xd2C
zyhu+s+@KSov|}49D{H}|<u-}oqTUq!f_1MN8yn+Pcd2D9hMIc^8;9}c&6_qYSrBiZ
zU`MFA@63ny9n%MZwQ29(y=v}${{ALtx6I{$CUn2)WaP=BQS|L~y<2~EX%Gu0*Df-=
zTSp8BSk!uY(I6pKWP{gf91g9+>mOK)_=U#IasutTZ>-w3uTT#w{06l04sjtL_IhqT
zcsnG76@E<<0P_(<{JuLQBO@us+TLs8<Ky1|>UBSq5Ec>Hgp>Tw<6(J9=iR=2n-@nQ
zjg+EPOJh>?&`fn3bIGS+=Ylr|9eJkkbRSa+TD8a`#6yQ$rgy}HIpDM2W*`~J;CHl2
zKI080`-sr{Srr)T-T68Z+nx<*=G_PLC+O5|FE6jXhQ*sZFJ{k7r;|-6ZAn~)Zu{M+
z9JkSbwNYGL+(t|Vfu@I+e4CjVe)EqeiBrf%Wv~4OFZ>ge=YH$f@kG-@hyL!v3sY)_
zn9V@MLwl2=rn8p6+Jh`|N1CM*XKE5BlJ+XD5SI`S`F^MfbLlB_?Nm4(l!#L}%SFVJ
z9m2xua56~(g}|tH#}A;Mh^DXvyb`%KV4^xopIpzH8rJAcBiTU2rt3YVYSL~?EEE*T
zvVX2F<vagLa!ukQ>YmlGRb%1rT0$o_;|%zEA{Gwe^ukv(ni3X_2OPw)-{WRxW^~eB
z_6++AonqO<UV?e|49#ccqz*u@HEfV6C<+m3iJ>9sOp0W015YMP1|D?o^0UPzg!bTd
zY6DSIF*)Rf?hfc(_^M5{48zgZwL(HdgrpPk9k$%euJm*hv}Gelmtz#f&cPvH3_;87
z_H88v1v>CE>D8Xax5RJ<nJ0(C3asj(qYib~so;RD1jCv&+rVz`ut;d5bew=c6{C|K
zL%eyYV;>sK0&pb$J)FcYCnB{`TN=@oeH<W@4uP1+Vi4&`69MFRVu17Y0DhE0jU@~S
zat7(>MzkeC>VG^sdJ&Yo6TMu|-^6O9$dYz7AAfXLN21aF?Ve?*ibsh;h;T<kO)!3(
zIHx!%@6ik`A3^sO$IeoU$M*H(OnwIp5oJUSsDt*!&x=TzK?WrY0;d05_vL(&_2`3e
z^}?aOk>(05A&*T0<fihDjv1`^$jfMw^g>*RVm8Q~>~Z4gIrcPad|UX$7r3<Ls6Qkk
z=)@{QvLW0}0lzKlFZc+ZGT{fVE}@5@I|Z5AMo9R698A$g3m0;1*^<+$nu~g=oNfc0
zij^ShhG})&{`Fj%@=$!50hb5F;W7AOV1`U1=yQFKq@Un8iCljU!V19`A?bMx=v!C|
zZI?Vdev<eb6^Vv&GzCaBcoCwV6VCb9ILFv^gJ&Aypzu0w?Om_-keu&?NUa?ChOrUn
zh|tOp4OM9n6V<bwo!pu6q#w&564MQt#BTXJa<I+zS&+2P@u!u-!TlecR4zrM;(mrM
zLQcfL8U@HMGw>tlkcFM$<H|T52@vdsI#;6B#MANGw-%kxryAHKgfE~5)w4|p%}g3z
zi^nObJy@U<m<D6f0KpS~j!sg#8-zNueD8MIwCpkp5I>fN*0Co*0hNuWNG`YWq}55?
zSi&eRuEdV#7iK2dV}&n-+RS9>LVH<mYdf+m-JlpJybUJNS;B}s!9(8dX6AVD;)~B<
zk9Syj3~p-)wH$Kp^=rHpi3bjySwr$oS!#15)Tq#=kBu^|Fj?|>VZ+8(R^w3kUuD%e
z?53d?5D;(&BBH?tzwsn;swq_O$@$L%fZJQl*H%ms=2g>Gh2mM)W2%FYJ4j!}z;N%J
zrnB{zj7FSoGD&_0UR8>a*#_xQ#soX56k@7zl%JnpAKn1NNMufI!+@SqY^k_6B0@rp
z$h%3McGbqfbd`PRu`;XAu;Y1E4XZ*!#Ms`>PG7iZD%Qz9K)Ls(hlhauNRO%4K@$^`
zP(2s;7yPkfco5&ud)=|aDmWBFZ}*{duf`-1DL|TKo2Y30d0X3*Hxm;Rn@ELz6DVp&
z-j;HWud<|syFiZ%lOr>bmH$;Wm&IlBl%R-+h8{xFc;Ui@XGX<u-MqQCqbk<?4&I4i
z7n-`^nrT>JaPXFGXtH@#O&YoNm^N>|4Y|<3*m1{{tp5lp>b1Vm#?#&)O7`8pbMt0=
zJF|M^$=yvbLfYTpYa_(V<Tv&gFK$3zN&#<gxl1FYZvj$Nfs{7kKsY}kJ+JffHsKXa
z&)YxIe(FyYbrO9dRd{1b2szyN>H%8AC4o#6xKKc~pqVVX8+Z=wAy09rFT1h@+qwf@
zeaRkY?52%-bX_5Hsr(5vy=8}Og8<I~mEtc&r{mA!l|Pj&IEG7FBy3DYP{x2B!y$#^
zh~)MDow=J+q356|>US4EsD<9Cgs|xB2jYS*5rqSp5)Btg7pKY}G#l&4w@b%T&BbBB
zOV6-db(GWaTEA=7c2J{^fpZHCqnBdI(xr)9h81_Gv2G_24he-9u=470x|gi&CP({x
zZLICkpgV*bq2Z$gnHExo{+J-ymP<Ua69HuN(?oH^CO!Gq?c3#eJIu^6I&i)>cWTQU
z090NaR7-vFA|y2yDS%jS)<RrdvS<i?g!<2EZr*E^w+RyDkPR|YgQcAVy58j~>(C0-
zKR8&pCmFktcxVZHS!gRI#}htJ0a`tjaLEDzV<m6P`T$R6-)#?f_evPaMq0!D{U7jt
zo#y7|duEPM9LYJM-CAkoJ^ssIy@&f4Nqio9SXdaq3#*Dsl~q$`MPwM#yM_<6$w+Ly
ze|qw~LeLMg%tINHl{`An61aMi%25alzB#q_^q38Zi@-`k!ab$JjM1pSIR6Mue%JP{
zVV5mOKRz1`etaoT<LFjZ55I7pN+X63HgN>{)QPBx5Dq#uiMRDcH5}aDj<TF=!zEJ$
zUIhQ22nehV-E8AHE_r5WHlaI^+211<5Vf@8nW7YK7@-myh_^pIf#+dpobmX^9Co;6
zanDS@gc#&kHa1k|IvvsVe0Uqlvt_U1xEfs!I2<8;)Is3Cx_WbXtJZqhAF$gZ@J6^)
zW{J#rbl5)zifl)zDjXW!L(W(bhgmDRZD9=%OJ}~<)xKs`8airJuZ;)^3KAKVzmvQN
z*GkSJBMgRKWgS@?@Bz!95T<^;#tbxHfM*@Q?;!>Uw6hz{250e3j`lZN85c_pgwT9k
zc0o&16H05|E*2NCvC29;={ODs-anUtP6C-iTq~OW0#YBvu@S<^5DvYm`gp+H#l7`Y
z8_>P6cGIRv0DMWHfr1hS%VzjO-X0!u*U+BbcCROEg|F6@ys9b{6wM?qyf<;n2<>Cr
zw4L@zOHrDy_IY-(;mwhpI$0TLsx{NXI}d_*ZfiY1d$;K78(_F|1LGlvFll|)g@q2|
zSVWCx&e`g?Q~c|19Fs%)6v&IN)-=3VZ#w<-h@L^4>eYKwBfgovewf1gn5vHHrr(hQ
zl@aAyO@^&dzX=%~M#Nzejfr0c)8<+qs6>1!hcV)|904QE@&x<bez?Xkx&Y~07~Wxn
z(2+xj&cY66%wn-0{Ca9c^|11I%LlR@|NYumj#kcD82`q)_oEgXFwu4G&e{&Gy9`af
zEnmMrXxy>x#e)YM{32z2cB&M|cooC*;W9TjM~?v;OX7kLJ^(tPwAE1qgM-i8Wxdzn
zCDC$arz%g!e<jB>Dqf&MRKQIB5arT*S1L+HqvR%V5gbCXqH(k$8eD8Kb-<d){PD*h
z{GA|;j{zLpjOx)63P@69papnu;gr;a)EerbKZMgMXD%jPba>*Eh&T2B+Y4~$$dL}v
zZ<)j_6gaiO*Y_A-?Ric*ggQIRpXphCQIze%Yg~%*4G)j?HbG7ytsr=B&bds|PX3U`
z&>dANATm$3TU(_V?_@ZMx7Jjk=O;=_Xz{WQ!RVn7m19}C=m}mXVviGRIMnY*BQMvV
zMP{Ugn<bvfUrp>$P`wTu=idL+NI*&|kC&Y-^kkSUlV1xKRkUmrXsmXZPL^X$j2S)k
zF3Uvgh|UAno#+5bR1-h{$rb(fA6`EdziWwT9tW1=>3-jbHn~n9#*%N-K*(Rwa8lWV
zj@;S`89@Lv13SEtWg`w<+^-|BmP3?q2OKihxYdBo=-s}#WY02mROF?Z8d$DMJj2%}
zEhgoJB>?enU)HpH-7V1P=)BupI$8b`ti6^qqUKOqNCzx=nbRpp$aY;1&9iWxhoH)?
z;-+b#3kHj>qtqm;tyr?;orMV+0!Yxz3`orAsc;cNqR1lKd8(_HIPH~932=wV%hbEZ
z-D&TpC)n8Y`1g?$OeRcHFJ?y2)*B52R1p)M>@ZP|#|lN^Z3k;X>%s>VCUeWJ34?gQ
zUry@_CmUzE5jP2>WSrKB#37IxwDqFBMNHeG1YrFcL3T0mZk%&<E0g}Ap`@Jv?G{8U
zCVh1@+w9xR;Ws01_Vb7-A|^`g4`f)S2$t2<)Wn!-JU_dC6}vRz<_@olj~O^f+M+xE
zA7gI<)^qxW|9>zT+Zg-4r7V@LNC+`P2~kNYYce8Q6j_onj4ewWDMX@@P(qUIsZ>%S
z*`rjlRF;VTuk+zM%<uaBuj|+4nrmj%r}y$a=Q-y-_kEw-ek<ND#2k5^PD%Ur?R_CJ
z9C2C=j5-us<@vWED@>#6N`HHX?;iI-Jx^li^cAJC?YC(diY|0_`W*42iyGoFK0wO=
zwo;6x1(_rzla57xZ?e({ixx>jP2T9Nw`ke2WoM6xpIYg#5#5GDfF1Jc8@;_kK2Kx=
zFeDb<ZjqC?O=v8Tx^GkD94#T<Q7UN0Z`$K*kIQB8z1SlhsdGy&RSM;b`^hct=6Puj
zs>|Cd6N+BdL$vuJQG3fi3OCgv^js!vv9h6wFKL2L@ULAQKEzp?FIgY8=BC`$q`Oe4
z8Mm<Sibwb9_riI+T|H+teeJVOsD#XkovxoMXuMQ+683|XCZw0opZR_<trWO9@Hq)h
zZmf;qP2^ER(DfEY1UcCA9<XH;ih?)q-%sHzJ-VkA)%+M>)&Z9r6fL}eW)gA4f-9?E
zYG56`+r@6_R`uL;)AHq8+rrixTF|UjtHT7t5>(zfY>elnrF#x<^9%UAZIE@MhYb&}
z=dJOSQeVG(KAK9qG@xbB!wkC4Q7Y-64}o+30Zv=aW_anb<#XxX6PI7XYxa2b?86DK
z?4%7iK^|-0%IVd;*m0y5XX!ZOZFE4Mnp;|_%W}Og36Fddgn+Pgq0{r3Zr>P(ax?mq
zOrfL2&jHa~9MisKOaD7X&FhE9@5{|<RJ4}_vCtI`{8(?n?1aKghH~4Lb89)0Bnm3O
z*V|qL+`&A*dnoG>ueRq?%>(q?=DL(UAxUZ8u3g}wg;S^Q`c5MyrnWTOlsozS%1X^S
zlb43yuy4nX419jS<)3+7hkr=Jc_K<+Qs4xk3EaPb-xh9d)%|Kc?d1~zk6w@gMGvxc
zVeJ-Y9o|N}DiRG7EDQsvwh<39y1V}4_j1Xs^VL^gkp;n}z46LH0!{PZ0AQjYPJq@&
zIqo;>CCQ}fgM<rKBic7h-WXXBl)ERy=u~<3xxFF7P|O!UOo(?K)UDfSxPPXkN5X0)
zs7$1YOnI9>otK4}aWZiu8@jw-pBd9xWAh6R3dO-Skawllt=pkCB><s1ds@LM2zb73
z&#Q27MH{E?6M}AnK_5+h`0%0I49y;{Z{Jeg+TU@T{4jB`^YVohF(cS>SwKz@5XT99
znYTXQV@CLp0=p&cyJ$#@nY=^DouU(f4}SXdyY!;Bf8Vd#>zw6q^33B~mhxJaU-~+s
z=j$)FOYzf~7qfD86GJEG)l0Pv%ig>>z9R4CxxM<+)oCQ}p5WtJoSppc40XZ;mzu`K
z8v6=I5lQ=_Z|h&;GVNCzBGQs18IR#IAZp2nQPD7sawAhWFS-KwXlS}2zo;mn#OVuX
z!Kz-9XB-_G59=r3)fqvpa?Z5etoFdE6J9{pUK$UE);r36)hXHZ^wAi=j0)=2F*y18
za;6^QFM4wG5;6!UVv$EW)aTB&eS2vTHtI0g49FrTs{HNSO!A<2*kHF9>(s97A*EGh
zhjBD*Ptp?*NPqs*S9{vG`~COri(j9(pJKT5=DFe?XYJD31~#+N)*@9NAEetaYgg6X
zfz`+m`Tjn0kAz$ya@j7ML46cEC+Ro2@I!fT;bFz{nZ2f;o7!@-0WtG7FGb|!#xu&Z
zZHCJ^i@@1-$UH-lK<!;QTA8@%ekcf%ffuT)z^8YkINp6Amh(p)$%VJWlX<gz9t@f{
zJ#CU&IH5?V{A9%$G%3S|Jv{sJ#fvcd$S*iP?_g&w_)yWSbB<8@1jiESpDaK_S9dB&
z?5PD0=eAxytnyj2@p}g647PJSIIvGz+EU&39ovl`I%cRx)v~SIKHJ1OUwt)I`_aG;
ze?vwmkdmWYpE{lPQg@_`pE-yv*a#RA@ZD7HMN{?bug6Q?PEPFP{HeobZrzV=*r7KA
zhPpddoe%rDKXttuT;cdR5ANSz=pPaiQa*Uk$R}TqXpGDK+*I#fJvIFiJwg13fgkC0
z8u*Rl>^^LfoL`}5E@fqD&zN4<P{<Z#bMLcr3sTP@ub#TMxxS<wdse!8^=dip{Fhe(
zhW5$d{q931jQI{jmz%-Fybat!?s<SH6h;#Y*`~LE5JxjU+z^ojT@LwQv6t5XjSBm5
z{9e7>SH0`@2ol6Ef%tg)X(Cn7;hs;=xt90~XcO+Zul+RZXYbBeKDqo)4$`@j*LRbX
z4QT~uQ%7k~W5UusIH+C@ZRuB+UWyP7b=qz616gcGr=GY}@XU<7ZN-O0f>*JZG8t>2
zRh1a9(`EC94IA{@Nn=I-qNAhZO>pJNR;Q%x4!V>Bqbl$WW&rAtk}0RXDgD}nx12^U
zE1aNfTRR*fmP|lNxz_E~{rTEsr~nV0uPPfSi%Pm@;?izwHLCLI<~}s$-ETi9dzz0}
zSfVoOk$3%*{2#8aUKO|~s3=($hTBrjaF=WVzZY_L!c|pGPE8#rrLySjNWsG~&Cq#^
zzHDNam-Y~(r{UBP+_vHO^cr?8D7P?_g}j_qI%m$D@`|U<v=iN0tO5Bh%$?KH+})+3
z&G*Q|ZJ&kJz0lt!FYV}Yqc7)PCANhzv~CTs42z?yki;qbwlDeMvi#|ljdI;fsKu>^
zUnBfyN7)Qvn`T;FD9X<dA&Luw_<(0j>mIbuVw)XSMW82uUi3ywsuxJgy~ypMCx--^
zj_MYDZ=f`)fDIyR2eEO5P{w1*rbR;QtHAKXOvB=v``1c!2?a~6^MoKo$eQ2G1!;`%
z#DGJMmu|`4WAMkwk+!WzZ}o7#cTqZxo^g`=n{FRS<`VcBoV|jR-0<nhKPtn)B$s1r
ze$usMigaD^BK3moaLf8i8G3$mU|Q{$$K#kl5m+-_^l7C2LJ%<rw{PERn)-&<j*~Mc
zO^dbn%O7Mvz5QX0UkOEGNXIPzKV-5~DzF0x6EF7qy5*M$kk4VIp+sDTc<J^#ZXyeE
z?&37HuF!wy*LPuKx>Uud%x{(9{h!)7aPq#YJpo7n1E156eP-El>8tz6F(;f;`LY{?
zQs9+?_Fy=Co>ez_2OWm3oJx9h-0e*sMLj8YIMuptYO_7Jd>}td<u2+!N<59CMLYk3
zf%Sq;^OCq8b^^Qxjy#{;{dzh=2=D%A*7Zu>SltKYy6dX)7z{tAy+$97dxyM36ip@0
zaJ8HN{`<@i9|z4s?Rsc@yH!w-=!|X=sUw})sjmk%`~%P9P0%9(?DFVz=4uwsy`yaZ
z8my&(_}ALCM^i~*?!cj(Z!?_+A~k}9%o5xi2XwuvGK+l8GJR5Muay<xCp>$htnVTJ
zqjHMvxqAq*1L`CHSEp#nowWS=GA5(!*@2n0-*%W_7!(5ZIfMSi?dM=@5oCvzpLS6<
z-J{D~_2ml<N!`HwmXN@2QZxk>z30eF`@CPZDyn|1txbO<TDhXKil6g(xaCN1D_OVJ
z#aoBb(z4vT=acFZ?NE>D0>3E_W~eml&b>@A{E+5dhy~wv^a-x+pcx24wyQeiMjL><
zIXgLJ;`emJ=)|bx)K#2F=AgJi<yV|9zz-9W&~NQ^g9%kucSwg90NKue+6i7v;B#*9
z+0sq2PwVghJY-?p^|KPrY_C||s$*L%+YMw~_4Q7BM^USP;u(m`YT!E<Vyn)+6=Ci}
zp6WTNwYs~LZ{IvLrvxT*X>X$h@=Y5jHuUkLYwpm2;8V`)lc{oTOa8B9C>B)zY7f3F
za-#T?4(YqEtE=)m%`Y?Z%CA=t{1f-#`XNrYdo5bEXffC2z%nKhWRs^Yne*Vm0|)=b
zh|->aB!4=tvF)!nfOV<hbZ1whK#U)<?%wC+aE9(UXZdL-yb2^6Tk=S=%d6odbjT>A
z5puA4$RHi^$MJRRKihWh{=kW=X4@A}*i#gDXNQ`GG$g>wp-}kFjf5k{yp}%#T`{nn
z(CkC?-T3k|Jxab*S7*%HG5(hpA8Lo*cf5LH)Ww9RE~9&?P0LQXm7<k+c-Wq;Zqbnw
z+=f~0+iGQHxu@8yrC+C@emk4*9A#WP`?sggmj*Z+)ZVeD@=NBi%26tRU9#5P_PO+0
zsm`4NuS>po{O0SDFfZ!PyRZ3I_p>4_N<R<e<|K_W?q=$DOu2}H?P=re?2+=RaCmii
zP*D5OLx=3Cle7YiC3rS8o;rlGd|WplL$o*N?DHt{{dP6>*GY;e>TJA}Qg*{o&F_dw
z^z?+~k~zD?pE)y?5UOrb{d~{T`1tq*n1!d8KKj~D^O*Fq`J8S($(4h~xi0OsudDUq
zSw|8RS8ptZv+!a@^{1WXF=>}BjeG$0XKvJ~<wN&%KX~w<ZuJD<!AO1QpXnfGCbMTB
z0DfPU<0d+)S+n-L$iM<ukBy(kAb(Ojb<FKf9>Dv~Pil#iR<2s*3KTl{bWC*gNdyf?
zIxa<~Bo*@dv;O9Tj-C8eK>yWpRm@iU!VnnT7WJ`=9#vFSB!q#3-UfTato+;;J!Drj
z&0{Z}vrnX2cL}KdX!S{8Kj&DW71NCl!-prP-XoVdd}ZA|vT0G?q~=tMduVZXPeXO5
zj*L%G1lxUDwZm@o==F$-3eIKT#5Lh*)s0$yudNE)L(|@JrQ_~vmh&mkcQ5WZ#4XY~
zf8wsb`>(tgVub5`VIbs8@CN_2nmvSC;C_+^JtJ>bXeK0=5%-hY6j==$)8qA{BPv3?
zrSUTOG|a$Ah(gvUzTib^6nSHLuhrcR+v3Sm+vm2!PeLX|3$v+H!#K))bFV#d{FBo^
z-2vv0$XqNIJ|d@RTDucKgGNj>u1BY>HNKv?weM?1cUDs2`_<JSR?m31i%Blk;cmxj
zDNF|X%`;Drp7O}p*x2@6+Tbf`X-BWD+bAp!IN%|F{Z&&4MR4{9_O*F1IiRp0afaV0
zZ<?yd274IvUgGPK5FDd-<mAE7P`|scNY}$lr7KvN&%x?f(*7J?Rv=My{ovqe+ub-*
z+)tuT#A%j|1x0Flsf2+~czCKhh3t=jM<4AR`}vFBYa^F)k21z-Yil1Hy!u9Rawhdl
z>VU7n$=T@n3g2Um8gk>t4Xte_@=8M=K7MHwIc34dz(&ZT!>k8RkIr34KXSV&3B)8w
zA3Z<p8tr_w@Ym-9Xfr@{9(+3SQ?W)udHRshQ|LeDZ@l!HY<B?-e~S+JNY!Z>C8#Mg
zgvco+u7{$g{Y;Tkzy9EX4ZU2A@&;+7pprbKm(JVkJ{lw~VfG4MuYrNVpwlEruJkx2
z=aHJ&Nc>}&MRi$~6UU6PaVv}BX;3$9m0G>y!6Grvf?xh>(=vs^I%n|_XS%@C$ymI<
zE;dg`K<6L#so?eN>D$gefqT^Z#K>y^jcQI#kw?r6;wm}B_Ef(fC&1G&7V(S|k5IZw
zqc_sAfL>7%4Iqn^=Yxaw@%MLHB%N-AmzuGq<P}EEn`h9PVK7Kei_g8WEdkUo&>XcI
ztdQ>O=S}C8#L=8B5Sw2mR(lk3#>cxpxl|ja`1+AY)K4LCEsQv95zQuTT7FyOsP{))
z4E^qUJl)vccgK!N_IXdA&K?TG2DRa+IOk85mGddl(@VE(X(&9Lb?erJo*E8nC&qE0
zEW3@ntpbfk%e-`JKQuGAUfI0eMPE^LzTO9c97+I=dVg3pIvx?za}K}z9{FAaPl70E
zoT`?iAfcwc0k#q)r0u3u`az$?z_+KGT4?rrggz0=-@h!nzZYBGfFmh(wzfjMO1W{v
zH`!JiDIYzK{q>)h`~sIEQ;Kj=;V`7qRNC0W%4Z_jDjG#?>2(5_4Vd?t)GpzQ!Ceb2
zC|rfD_Qu7<(Jt6^p7jM<HU)~b(YSG05S$P3n6Sz;Y>O|ZdBT+iI>51w+Gt&g`$8<Y
z6iXsh7QTLdq0nwcPLD%VHjY_KaL#$bd)Mj9zS-FhqBXJ+QOw?tOQ|4PnYyZ(8qFUw
zGMHW``gHEBPj_<Ph>gXEQuGFXsHn(hp9rz7@VqPr(ukX61HOCq96fhVPYCRN5-NnN
zK;j=-e;$(w%t66oY2k~KmRnKp(L;1I?|AYNLiE5QL^Yh1E6jkDbA6pb=o!usulnVz
zR*}up2cbClOq2wS63d(b1+*&Qp(zFII|9DpC)ahpWj9PE0Ax1=R&nvz`Zf?`b@HFj
zYm!Nx!H#$lc1ue|2#+38Siiz90FjJjc9&a=R;|W)9|#E<04Oo}vy5&OysyyB-aY~(
z{3spwV&E2aO-(am_Di>TVgJFHKR&9-lblgw#z+?Ex+v+|HMtdN%OdFZ+qqM0><z(R
z&O%t}`p@P43{A0kfgRVJF;}l$8*H&&EFp4Y7g!Kl#^MIK+BY!ph`gh2-HP!+ngjr}
z1OYHQ+G6xoX56ku|Ghh45ridveRnLqly4OU3EOWH*I8CpCQly}xCqu%Hf#-ziVqQC
zXn|kAL~vYG`0UxAlt3xT$#IcKk+d$<3FJ`fW+;l?8+HIryF=L$?2VqohtI%aAzLW^
zG)>bigFoTr;eV!Iuasl~yOduhe*mG}?~<HJ9v>1C+)_j}9lVXcxZ}duo}XT_E3H7|
z)Q)F;1B|zF?C6S|eyDi-m%MrNrbo;p>>4iZ*thTX83DEK7O?7;=(hhYNhEi6GZL%e
zot?b=#8H>LJ2l6nvg~>2`Ssjx5@%P`@|8T8_Aut0Wy-0=!e-5yxqMk|{J8P5YK2J?
z@?^)q|NdL|Q6PnI2TI(_uJ4K<|80NvYl8+y4*K>@A)d{6dVenO<oU2uk>zQRsw-aR
zX1U*p4LS^F965517>7M?n@@}Bw|_1_Ii9=lxqYJJlS>P}4vqhG|Nf%WL(nWoecDZQ
z&%b-lJ?8R*2MO7Quj5}2o3{_)_*OpuQ6?+KdEdSdFETHmAdEPF*^1Rm2SU3GHHvlY
z4E9YD&7=;sYuCP0HE>NoE}iLL*0FD^?CFWArGEiY-|5_GTIQg_*c%HFlw0rbhb3@I
zjt4kIdcP>0;l9uQynp7cYju>Vsc8q==E3EOA!X@TX-22M9H^m@>V(OmO>E+$f;fD}
z@3+Wn&;HKtSCFyq^_#bE<BEfpuyeM;1@ybSZ~JzWSC2A4pHkk%7omp=D1XGx+<9c*
zCXXqTGON?U3#QqhmkC!EUlvXD?n)i^mIZI~wvyPA!CPuiMXPDCmqTKRws@plCmlR=
z$p5+r5j_dM-^!~QIv&-{nm2#;k>2#Nk4x2GS3jDZMyI>&!{O79o-MQazNpBHTE-*6
z1bNgEuFEGt=UC7WQHLpoF+{adKs8ZsimgFICLvzOW-^R^2`r^KvOB2+#HnNqZqr}~
z=5M{#y*Oh85g5w2O<AO3EfjkAa4OwmU@23uW8(i|S{zPw9TobqAMzrCR%6O$2BSt4
zon`Oe>kMugu@OR)<SG;?{uLz=+Z_{9h7IgQ(@YdBLRxs_@%g9}lLB&5yV0=m&Ryt_
z00mBt_~Q5-$>5*!8QLZY4E_d6QL8AHFy7j8WDlaq8Ejp1Qdt?kNjEp*eqkzUa`n3z
z)uG5I<?P8K@e_y&EHj+shZdt5L}~%z>4?oB-RNnND38+KpZ#P_wA`us04p)kR$Sa@
ze+;%niI9?{KLmU&)FlwQ!5&p-L~O{C6s{KSi!6a+@T>D%(0dAxLEgmfVXMwc-6FV(
zG$E!>-&?a*En)IDYOiY|I%bZr3R>=Fx}WLo9_4H>B}>a8wIWq%LhD<4_d*i}Q#;1$
zN=0{H_qX;voFeHjBg=7ea=OP}!UnHh;Zg(O2_N1YKz}F`bquLPvOrU(ha~oqX{!>;
zCBtG%@w>jr!6^cCfIHsWm;T_B4}buLD46Lq?$4XSYd9c=N_$EwOOXcBT`v;9rZ;a)
zI4T4Oen2<uEJqCF<Hbv!wM~7-q8|OvzsvFrh~~+gRObR6i5dhWP0zM}eQ2y0T~4ot
zr-60_#kMVuq56$3ex{A1|NLoH{WE779Wn!$_JGT0Q3K&2wq~Ou-{$h?B<s@=zx~V4
z`u7(R1S}yHzh3$B_3Mn{s0#!BPmi2$-)VA*^Eqf67;$F^n6<JL1AekL3BKmS#gCEi
zRNqpL@PV%_NmcD#ufj&N)e7iO+uJ$z5m^T*@zeVrw0t7e+nSX8WDjAlIehqVLtvnt
z4JXu8gc%Zgltx~4y4NtVaO&7V5f)ZewUVf6_*4&FP;hkL;bUy(;)wxxp_K>yb0oNq
zW6ME}IiARuoDKAXt>SP0N633kOr=!mem#7ws_@)>fIO0(qxV;tMr!TFW$PP!`8ss1
zT)$@uni^MgC#MsX(O;oSYEX8wL`)}5+QF5boR`s&VFe3{b7Kxe%TqO!0a_TPG@Q`L
ztnr<}!#{*=>Z^PR@{g$BfB(I6QFVQ1kN7!S3WY`4=RS8<uVB1}#plvT`-o%LpEzE(
z)oR1sSd$0H(Jof*`f&-^aIFyzCrjDpgcs=@IYBIq?t(5Cs$W|b7XB!sJlh@tfKN|i
ziJK5&)*!AJa1od-Xxr*)rfozC1_g+r(WlQu;b&4QZ(&?SSb_50-*6W_+VhznThDfb
z^eS#uq-hmI(jEf_n73)uX7K8-=S1P>lY9`wP%H!BhGqkM3CJ8Wr&dl%|8pAr%5PkN
z*z}+En**jWJ}GG1hnKm3;3e@mf}F2(W!2{oO%#ryqBe&!<?S(a+jEskiVHc8ylRC>
z<*t1pQDQvAPl~q!joYg)boITK94&V_9r3I5C6r%}>nB!Ee)eI8ze3@tW?l7L@Zc9T
zy2|?%GQ79!Zp^%G%70ZT{MI6~3_cL$`m;1C>;#EcFDgyLTNIY_7^SN}{y4Hm!^z;y
zXFhiDoVDm|QMbZ5>4=*wOwG&=ifaHKTAsA$XEG>4;dyV3e5^{;y#IbI>cNAwcnohP
zR)1Lx<aqb4zQWVK7dBG!=g;>(2r=jgX=k18HBBsFj7;EZdmp45BSL@vr^0L=EDV7P
zReP$d8zI?~|EUP;AQC3^r^^fe^U_NpDJ&g!DzCH-(>KhP>I9}$E~8T8zdWlyFAXDR
zLkHAh22JqbeC5U|7D1iwop=9{*bcY{*1TsM5!IeC75c6Pj=B}3bYs1ZyZJ;;Il?sO
zY}X)kV<{nQ8d!cORKtN?+V-3MH-AO+ZzU44vNI3=v)@Kzw0H!*G_ufy6{jf(8ctxr
zZ54$kd_N;^+&iblE`Idz@hC;{S!5?l!7Y@L(QxNl%Wmk<YsPtN@UiWx5188}W(rJR
zQ0SM6YS*OWogqI9sIr&QcG*?_mhz{eF##ls`1x$5o@=pLEgxi<m0uFaEKYdm&rEAY
zP79WYLSe3y-r7ek^;qPr+kR_G!+)iPkL{@zL{36BFNzgv;c&<lqj>;k&BU<|l4$$i
z{y|WD^yJC@VDN33b|acW>0eq}S_;>TZ6(IfZTK6L@nC+Vp<jC+r05biF@PvWaVQkA
zjYZ&QsgAsQr{BM>VftQ$rKLxOQ0{GJZ5>Hs-%g&!9)@_BCO&xd5th?o&u03uO)#=j
z`s|gJm%pR!r;H#r<g{cNsc}O=2kOgbM*PYzx!`AZyDI3X-*I$1nLj+97X%a#H|}K*
zz>WP9O@-%0=D+QS0Y1)~g&qvA7QD(#Ll_becD1y$L_Q7~aD-h<E;jUvwKA?{IrWRG
zA;Y|G6D5@9D`6u=`5791uyW83Q&7))7deF~A9hrfJm}lW{aF)~4a#THb*`OVF)FO3
zixSIHG@3is40EI15VX{}54HRsjfWltx>;*lL$O6Lt)ZmL>7}?zd!Zjzc&dv^{>>P_
zLH}70Z6!sm!+~%3DP#555W0gGS!s%{Uog|oBp*;SgvY|bfs)>9o7K=SVz$A4*0iCw
zl%#}Pd73j1!fl=m5#LW06342u81*({4TCbtCaRh~H$7uK3Hv0=i^^yu|JdU8=J5#0
z+hFpEDiK74M%fm8za3AI0m(vUweiNXs1d{9PPAi%MEwbtkH>kSnTd%UuRx{qK`&*&
zow+N$V+_L#Cqr4|uo1*a9)u#hgFN8Lp&1#4(LV;7gvlJqV1}m|C>M{kFd$}JThAwD
za~jB;?k{c=^J&-Y;o0R(r$l}SEN<jY8t+Qp{(>s5&SAb==;m0dzPeAIavFg@5w@S~
z@`zpgVt)E{u51M9?qOyy&RYssQe!-e9x=VbGlEv%phm6x>Z4irJ}8$0U$q~zM0_W5
zj$(zY!;Knx^NzwOgRbZ0<$e1pyj|d3WVm;Y0Mis76}fyxSO@`7|ADzdd3oGV$DUPB
z4UKWIE6gcKRp&E_TWDLAe|1pUp8!*C)M=PCV1I&YI4ypZ>46Zu8cwLAc)pY}_gzs|
zlKYPBIrcds{S;lv+8T-oV>`P;bsBe!q%<|0%uc7AJxC(mhT#>8;8)_C;ySTKdPd<{
zo2B2p6nX2O_SZ4qkRBO@nyR66P5}3SHUGp+Uf=wBMur7?rx?!|78b>NDBITy*UUMf
zRTk;y&m43DK~^JxnAZAm1tZ<uP7!s>Yi0_!hTM6-Cw-v{GsTHxQF!rxHdOsUQa#at
z^Q}in=LuU=9378yt0J;%)8W|PbQ^{Bn39s>t%_z=a%)&w19aPvMA^=pw}2+ARl@RE
z)o^&b(f}f_lqYm&5jn6#cvi8RAq;)Ijb3z>V8??nv6N)sNKOPK!f)Q2boqrPU#B}~
z^M%Forb-L{s;spB!3NV`$3Vq7$ZHJ4GiZQ`oMLB-rKdOmk!9KEMu(zev)R3rYJl_8
z7fG<HA>tb=grZ<jFK)FbDO(E5iCYk36K`R#P)y1`;g@=roL*V!EaDUDTknJP1Le_1
zYBf~EUSO{?Ydn-dX2^8SM&gyVXcP;|q*1@&bv^H;8vI8TeD#0fEUIWK1qvT0jbjX2
zoxY+96nUQ+iQtiE<CG|CcGx75>Z|sI$ADx@_OSXYV(0>C;PYgK$#-x@;?&c5rH~R6
zF$NDViv$rBQV7MXnjzJjQng6uQ{H`44%wb7s8I<{H9J&6Jghmtr?5*K@lF(;8scyj
zyRE^0GQUlEiK`QZ4579v(6}38G%m$$DqO7LUZ-x|Ar|HQx~~J?B`}{2D7T31?X8MS
zV**N*vtW-J<!S)6Gik+rT3!9Rx+m$Ow>S>tr?P(YAw{+kl>ca}qtfPAt`I3`rKO=<
z1x<eY=MT`}JDhK<^&U!)SzL=GtObe9ki}&pvwhuK|H)Z4Q7mwnPX^cOl8>SBRNjB|
zRL$Km;aCExt6s_rgP6!_9`u1ho0DOiOhG3Wq0(Cs+*De)Vzb5X$-m#KO`CA&-||-a
z_idvvJ4#|<&ttzw>gO_pEhoGX3YWIKShF3aEhhVnJg)FO-XJ1@ts260nye+Bd+tsP
zX2G70DE_zQpFN#&nAi|AE5ERypqL?$;utk!f6WO+cl_?*{Subcam9@|p==7lN{|8K
zOUhQNm0w69gz^p>Vfq2j@`<ZI`M@6N@%R^$63hce5RGz(5J8`*8lVkZ2;JR1X_-R*
zoi~UYj_QW&1DENYi%CC(*8<9rKN3d*&$jDHIkSyNXZ)OD-%&-RvE3<YOPdgm;Nrxu
zC5cF#oW`$O=I&lZ8GD<12;otT^_EB>XPPWKD)vaODP>C|#hK9{*vo-}b3h8EZaur~
z?E%$r-hh}qiT+l(@tZ2FFQa$bx3X-baCO)lhTbSGLV(-CNRW_O@3{P=udbRx-<uOo
z=D9p@yxeFx03A4^M$w(kU-R+BgS(KD<~2@o6MGeo6cZP-ot1IP7l-_t)}N^(zBP!G
zQm@hH^<#F?c#f5*$)AT(hA7D|tX#G8CiK7(<b}f9HzDaclHHF?edpf2Q3ALL7(f=Y
z=QG3}Q4}PADFVn3?At}rx`aO0Zv3!>8rSqQy1g(4os424t|zg_)?~0EEM6?ggdz7j
zvHCo+fhYH0<4A~PL}?m{8$v^4yj46?uCTx(4%_Pt-#6}uQQnzu;zdgQJhJkt5$6hp
z*?94f6R<$No%r?ve%=CDFnW26T#<^Z8;+Dv3DtFV&!li-*zEN8KZH;`Wzp#cge;Kj
z9&fAYatPibPr8oUFP_EoSQNVlxg0{4WIa0(e3c1(QGAF!`M<=|5%p>Wzn9PedJNNA
z_*s4*Bd^$M6){7Dd@<o5#ErJQ#5m7<Bde<Qv9|wRRlRXH4i67Ezr9*uuRhZw;Rc)M
zwq7lMpkI(x-;q1$WdcGIE2JCD21W7$@5*H6%+5y`Dq0&!2Yw;tQax_%G+f*@|0I+d
zO`Mn*d6X8daHVAgs;Db!%&!q_H<g1(%3W}#u#>54z+!W#{)FEwaTmj6a>8e&r4IU5
zim(8Z>1TS|nDrPY?&8uZq8qw1_8pA^dJX+)VX&y%YzHk+j2530^J8`XQ<pxspr#X<
zC*nw=+$S8LTp6li)afwY4Oz6}Y0Wk^Utay(3BwO?wH2R$6UAxr4|z!dJ42-VBd|Qs
z>>Qw&uzo^PVBp6XXd7`tz71i{*&fCRj(Na5S3%<N09eF#LP3?F+8Yx@Y9uKLQO`t{
z^$XZn%kxA;?$2f%j&p@%$;%2d2$aA9ND{G7k5XJxD%439a2q2Q25piuOI|cDK^%5d
zu3zuV*qiLZVk(Rs>7uyjEjc|YvvOXHRhWbfYX2L(nPiq2y+Uz7IP$EHro!Y7=+s0E
zqv`D&m!z1=ETk)Y^>O3#AD=dOKuaUt0o6SM#p?(&Kpw%5_lr;+Z6i&J>(H=pA!Yyj
zuq5Su<>z;W&4h`23V(&9TpWJ+tC*v(T!8e(ZAoJfS%=VmBnQo6W~e%wL6&BRq7Of{
zu@d=lQ<74Tk5B3nm_({zVYB8Ip9uu+r{w`9NxTvlkn8x}nqe*=BRQ<JGlcLg8<NB>
z^Mt3mA5<S%VARI+=`F~XGIyA)#UfOn7Ovckn8^~8u?M<Efn!qAVUY7EHk~;KR=J)I
zNcW4WVQVOa$ku}^Dwcd<;jrj~9UHlhWQmp^h8r(7YO*It=?oKF*7|$hI?Va6+sCFX
z)M*#gnwRgr1gKi<wh5xW+hkA`p>F0=w)0S2YMa2+`@&cqbGA4O;)3hP{Hk+Yf_MUC
zo=Ex06WP^#`SP<a>(baCVrs{7o!gO-Neu$h?C1@PYb|cAM1$w9E&6`?`0)j$kh1YY
zMr-3TToG|w>=5Y^PxqF>g7pwiQY8fdxkh`~#_pH*NrwdTftiQL`SMB%2OA-{uBlgJ
z&PwSWVvd4)qIq#SAmvR$lx?gPQqwL(y;fq@b78TR`NGdX{#MR#j_$FP!t3WM6s>>q
zvoHQjx=_GV+yi_O0cHunlMDJ$2v`s@_80`>#jF1Al^@Vk4goW&=T4w*gbF!G^cD>6
z=vhMHE7cY0wVx`)k_F(b&1Z0=WNXpP07Z=tDo^4m;l}XC+>S9MflzR>4eG5a4uv=Z
zF_1QUbY7{Ev9V^X4G5xi<Jc{*8;n>My(eJ4W^!6QD(L(7wSCDrKLa@jWj9~{s8C$~
zfo1AD`0m|1efGLEGH>6hYN7pUe+$nEbUt_C6q+D-+q&nRkoFhkhABN}NN1MpjZhLk
zGO0%2DXcRpsZ!0FGf=7RUG}t|vfg`aMHV|BwL`kOj<Cu(=_6RfH}ZwXcH@RIqAa|H
z$dFl4aVlCSrIIP3U-C<{%c9{s?u*_v5^P${q;t~og&kz8KbiM@_Bl=@y%WR3#K39A
zt3iydoQ4X$cZYTglcPVVFZ$zm($FQ$4%w_DG&fjRzV61nmF$ESOTVu*h!t||9h+Z8
zt&7vuA!~=BdV0#!m#UtPpLpo4qBR#~BiS&sb`Q{?+<D|K(_Iufmsw~BNK6J><Zjuv
z?GDc-MeiDA6>Nq(U}k$hOK7E+Uo1*2&KyF-l$^{pZF{0P1<aT1gInbT$2qjZRG1+0
z;>Dj>#O!NOMfi#sOGp$0&pfp`g42i_EX`tTt&WOds~>b=&)pOcV_{}%I9+tWO%=rd
z*s?90RXi@$nB13|_g+$vhuZwU3><>0^<_jUtxVD0VO(XFaz?Smh2w!=aGR-y6s^G!
zjC|d*_T4JjMiqRPBV=4;dL@@JjRem}Rnl9x_=J?0S_5pTi2=!MPv?myj&sTHUT^5e
zIBrm(nwyz5A9zu!;87ImLDQ*j)H@YoACS$}U~zw09j73Zu0OpkS2qF-qKZGJom$xQ
zgL+S%qJ@|~>jci#2^2?D0`ipk>A04#al^6;1+)ZF84m&ScMVW77RCsVua)yKdOLgN
zJdJh{8m8}~S(l1}qq2%6CH6BQ@`FB5&b`1ZMz+7IA?spEvcXM#2Ood`Zh>jt;vYnj
zyhR*8{w7)njR3f@FFO}ajrh*h^5&khJ3{X<B{zzQT+K-#$U4c{Xg{wSim)9&PJrOE
zJv245(UP?2)akY6J!2aDmYtrjShg$-zCe15$-D;|uyeHntan+<gMJ54w)5^h+7Eyp
z-2Oy1nA7}@a%=jA(mf;n=xHdelCOwLM3-%`uoHq7$-|g+aOErbow9u<5*?Y71+8{v
zPVKtIp4{RKs1kmFJETCq0Rfiw@f2{_T&CK`M&Q)YySj>oOz<I%-o0_{IEq{CMx)n5
zNT-BAz&g}OSgg?Z8hGdOzvGk?+2+u`h(1fAj<@O5=@=p>t!;2j*4?mk*?3?)=RqsA
z^U-v&n;HaxnTc%;CsyZLf&?iP9zH=qFIVcpO4Vwh+pK9*X}F};*HsJWC=#-V%MeY?
zNawXRT}=RySdlcfyY(D6(1Nugdq&Cwj=~{%rI^Ej&I#$eIr36S<f$kK8BgfLv9#tG
zzbV-6dUCS3bEqe5aN)MZE(7kEM(hlL*^57hgPqZ*b3tB~Z#~O}o2Cvn2}t@_Ss9KH
zQ7bTwKf?Ov+6s4HXmJwrBJq*n2>b*OJQDht^V`mHV`g2KK5*SBuJvE6e!WfNGdWLO
z)fkgCY4qsP;Th93TS6NwhV~$gb1iLc`hHX9#;rNq8<^EC-bN>IJQ<Vu{GBTq10qg5
z68um!^V;l`!LL{*ZwdY(NirUsQfpd0BK;8wGmX8EMkUP@CK_p>|B^aA)wfe&0DR2L
zEC2<8bYW4@k7ivD$n(rfi{xaD>+$v-#t?S^4Ufu!2{~mT!xQ;>+?Y6W5Z2OltP}zQ
z0%!=HiA)n<4wn#`&Sjgv7?V*}f4Y+>*e}rzKL!W$HtjCFuA1Wcd$VRC!d}SE*iEG@
zT|`bO8BxtVgaMs<vSm-u=e3Q=<qy3<vtqBlQ5M2iVJDO)v{nS`{n$;9m&<`c8)!0o
zM$A?rLUlvI5^-Y=3}DH+=VjV~exX2kdf<v&y60I{0xk3p6WU@>8HC1eeY=3hf#~8f
zO0=?@A!=&MOAOs<@ttJQx_WZM;DxjmZjs|kw~nbKd)RQ<Zk&nK{PH%i5W+>+CJ6V#
z;Nli7TgnX+%QK-mV<>cDa^3(KL1KT*!!X*@%I_b#V3cnIpd6($^XB=04`q4b%3UBA
zsTCR;I>_QK2Z+V@=zaGg#~ybk-Eq!eN*NF?VEuwi+23xfSg0WB(^kAIcFjdaSrr5N
zw=$>Z1Ve}`PQzY-zdv91i%A#8PH?&W5zlGNoaXYFPv@a3d9hHcHTa97@fH8A@AVwT
zrv?hq3l{tu$5rsgj{rR6F4TdEqpqzTlkG<1M?P_?rep`)6<n1S;|+qSWV56;&jTS{
zo?lB*-1-NI*5eP7*i(xiy(0c0tmE2J^}0cn8u(y|52qQ?U3@+e9>(=hQWF(w=s5B!
zKW<yM6Z-FMl56SR`5ifin~%o(+80`L(5DCfBfJBsT`nz`EMjN|2?VaW)dRbA8x5u<
zJ!{sGATmfH#V~!qM{{n;`hj1%i-%;cz;xUM%+5&K2&*jHZN94FldK9Z{_4bHK;BjO
z1Ht^wbxo+F(0^d-Z1&8C;Hdoaw2J(8+PXv?kHaB}z3g9EJBmaE&e03&S=!{UJS7?w
zy^WRo=YpTq(ri+HRh;Q8F~HcaDi)sH6gut6ai68(55Iv^KpHZH$Y3L_T4C*JVd7Yl
z00T*y8#G;g??s(#A#QD3@TonB5qEB>8~T@#)VralP`};+wI`Jwu=3*@=QPtrumqfs
zUW3&W%BbuYP^ESR22NWi>7{qU<bRzr!(znIg?*t*iiEqfutwdDkqQNAv_l^)R4t%a
z*GaP#VR5vP#R_=|sIuqap4=yK$}ibgP%(-kKTse*OqlVZBUcF>i(~#QJQY`LuGEbh
z2CG+{|8+JbrNM-a@KQGE0lwSv=oE1vBV}IdL}F*MbMCx(oIn$ZgK+(kqT7a-VhoM|
zQSNNno0*7#%>E1lX|zPx-8)8R&x$81WwUwMVuj+X=f5rm`VVL-hmqb5vA7GhHe1pF
zy=z%nrj*|%*yqGJid#r;Bk<NFKzqqANdHDdM4D^Ril^L)riTosVD+N@I!4><InV&*
z+8wws$KoELk%94bKH^-k!n${_UgL?|EInwi1e>>W9l%?fQ&fCQ>7*|6JV>-SC6*50
zfL#OA4$(+eA3nS~S_kfF{l<-V0_qc)=uQ`wv=wpX+?Mmu4}C$V&}86A&Xu30^u(He
zE1@23(s6+^rdM#<T&HW-uCkHH>7*@Tu(3w4TM8gPISO%A)o{1g+)bjlqIoQ+ts=XT
zkm<7b|54ZQ7T7}t6vJYMoZp0uqH%vGTYK@&K#}q-hZL$YKtqww0Xte!5IJ=)KZ=Zu
zOucs|E)MhS8*xwl2oe&_!+No#MK?zZg$C1a%Dtz6-T;@3tg@m+narG+P_NgHx=M}+
zr`i>d)vH&Fybl0*MyOY9MOgWdfS<h*wgVO-Q;7HY>&Cj5+fmpayfFv#<rsbV!JBp9
z4unybNbJb&#U{Ir8_yH#oVjT2#H0Q>v&iUY(D#0g1IBGqI8{Ry&TUe%caU_gO=nQu
zr=+L*$NZhg1jTP$09C>rL2;6we>44eJA8aXI61k$RZK|}AEI#%yqqC2!B9fTu$5;(
zDhi*<+wB&<wXn6!>tFz4uR=F+*9yMB1l3SvG&r9tiO{zo0+*xv(4h%e>IVYknQ#dK
zNzn0nu%+x`Ecqj;H0B|jo7Pf@;!Jc{m9O7%ga|kETPw{&ZUe@nrs3J={?nMfe0ISr
ztkfUH4SqxpgweQl&o(v87PqiblLQZVsE#U3nw&J2I%d!65<9G7ji@uvMSATxvg+8k
zelKB5SWEy~NnX58^EE%V(>wqF*ILTPw>8`b=_s+Vq(&zutZT8IMQ1$|)-UDmQQz|%
zpyq%P7cS_}wY7yzVs4bEu-}U8?@taB3foWcAaNyJYT1qzOL5Fyh=7txDyD`Z;;2b1
zG!8S4G+&G)0EJWUeZ)NyDCOG$C2(R5!%!GF3|lh5j_D>PW20o2XtN&+%&Zd)2GMr1
zx;PfYc+ph!J3l>twOFzhbdP`%WEG?X%H#JLlmT%hli>x_7i~N6NP{Q*38stplXdti
z$~_Z-46=vS$kzLjLr8T5{mcJeZ&lSRanZM-jtPNUxn|TUkwtS$#KSM}4tbT4!yU{u
zxUghy8CON|A=vEW-@AMFF=U&gli0h9esmMJxcn%ZoLV}Fi}Q^+V%jNzV4+UxLLE_+
z5XhDg!{y6<EB%inEC|1Qhe?s8d>#s>?02t^=WYV{Sn@0HG&@r~@gST}vyUg*Es!jf
zwN436?(<lA-CDMAxIy7~RnqVRr8iMl6ovMIkP7Aa1Aowu)&Cb_zAyl+j8SABSCf*c
z$6;ie-%iSgD<bA;F6+j}<VI4EoPW$C<)OCUyRyu%m-U(Bw>|}u;(dL)Z8c#n{??~o
zOooszCELXa3J72V(0nG+3vf=QYga~x;B7TkrZg&}?YWfludgsrBMnn0{(x~F7SL0&
z=!gzKG8FT!By&JG=`D2f^Yi_Dd`y+uG3ZC{NheW1=6}m3Df%Rn6~4UI;=N0Ht(p*^
z6?*GaEt7#kFwH6FBeQEmY-7%mG-BsHv7-hEFP%ESox=0T4~ClK7aB>z#wx5ING+3z
zzyQnuSQ60d|Ex0PNhq>si#vo=KrC{j(t$cUQwXHWCDw~$Pddc>qLcIzImIKh-N-h?
z*g%pW(lPNCQND=>aLm<?(#K<=G38!(dPd(t2Nk-aBi7bQ&dSQV%GL^-^wS2h=Aa?f
z2Fya)X2ypFB+*kmK;$BSYu0LpGK)2-EeK~C0zGtIuAxy^F#8Xt@G~lDtoV1EBFnrz
zxjKz(B1iD7SS$YI0Q*TjZL&i2#k39mDea&wSQIby_;xBTJ>Th{_3PViZ%!YHNp#95
zLNS~xAT;gR??5<j)kh$`f_~t*;&2%Ar?#G+z*A1&SAAzgNRTI`Ln5MH%9uM3W>{J6
zX7A6am#lc~EU6|&t3(uuj?@`%qjg9$S+XusRlpC8#(vU(oX(*yTHmvbFmISzLvbNC
z?1fizgwjW)b1fHX^;5(&ggf<ZO>xxsW}lN;!AAl5gec3a!=yHib6W06-;zS`9RxOe
z;Cyi4j{`oqz`jSmU_M(*(ZgoK1R)`HkAKjcgSDcfVsf6)km6MCBC&l&6&}4sNIvAe
zIZP>KYRM?+ey3imqX<4gGdz|&RSq!x8!dub{ZhBC3^rmzpDjI=hkwL#+0fZW|Hc%3
z8~=cSa4JByDU~^RbgtI6dcXd9lgZ`JH1BO;A{Hyqnka9wlb?3V{2kELRTu*c7e-1i
zgoSAKVk#Q1$re{@c}7h6wj;O#P91pr{`aB?KXPo5*YyXHQQ|sLR0bk3aa%NzHz~%k
zq)-t{tQCrs5_qV>k&to>#VGW9fF_^`Y!3<c#kd?p`r7QnI{Jy9>1qHvMO(M<TRCr}
z#9o^Hi@oS{RZ+pHGG*s!bRCQn#6{w=7*_(u#nhy*yWlDlp(wgj9t^`bfJUroi*7YN
z%LXA*rHi8Re^21#z=MKil+lE#TdYsNyLY5j%DgkMvm+I6vD?{3CRym!SX!}Eb;?sq
z4t`Y9z>uauFAc{MIQRSj%(JI_@1-z}Axvg_p*^KH(5HMC{ZT?ZZ;7F6edv(4QmdnQ
zobr`Qym;~I&w;8%WFFbWt1Blm7OHg+aB#4l*efnN^P~==%X(0R@mon+LtBpgatN$U
zVo3wjniq1uXW{%6;oYLDN}DIZxAMFN1Vx`3wctJ~QR)3<7xJ1!82thttb`~^xE|XM
z98i&omy1qo5Ud24jJ2WP!#pfCA`MwTww&?0q!+Gj|4oQY+A*ih9;+@k3|fU)5Gk!~
zZ!0Rypi`1ZKheDZ5^A0#L?-<@t$edajt8!C+$UFaKV^$Eu_~I5PQwY*X^#if<nQKF
zboSFSv``hO$qc<kYR&@FAktGxi7Y3COy9<B3t2aM!Kek+iZG++y=K?@R!2o0*)SKi
z6uHpy00-0rA<~`|*`OmD1mCM+fgq)x3w})N%N7Ns0P&Mf22f@d^h-P%O#v59Cko2W
z7(hfRG%mawB7&=xZ$>~vH}sZ4!P3r3N=kx71nqVQVN-NMlohuc4ZQC-epS$5S0l6n
zrtE*(^h=QbQ|S)chxhh@VILpYN<hped<~as;hTXqP5-a}3WMReaXx8wLgeD`MhLXb
zvK=5LfOiVUe{c=3uQHGO)REMcmAoRyjjPE7kBX>KtB#rOaD^gqDZN7x0?FZy=7`Q%
zp9T4F=%9#Fu}RND4sFT9^n3n$Sa=rq25q4&!E-PpW2eb0j{YF-qzPi)3Hl^1=#iG%
zvQUGISrA1`j=w3;9UvyGp<KFrWS#OklHLjjpLUe!cQ{_}ob*n2$ryTAyj;C2=xQz?
zR)(IUT)a)u^6uV(WAo`fEq@hPC_QKEJ0%K5h@8x5UVZ<6Vb9R0l`9FfJXLrk$E@cN
zA70?MD@y)8SZ<aq?*NJv@~z#H-ma;tGJ2uw6_FW4YYF|Ho{Co8$y%QO$aTyHvK-W=
z+`Uf|HGFB-mjp2DFEW99<;243>bGO+SlaHYXEggtdfokM{bs-RSC1HW^wi25x9yMU
zn8%yvFF0*5&g;O@YdvPKyW4N}olSnft2W)Zqw@3RjQGErYE4kh_*{BBuk=RA@@|LA
zCZrEKR)PS7%C;HgA0a+C(@h_-kv*^V5DYmvaNx1bevE({b&D8O@*+2;JM5K14*1PH
zVO1w3Cl{@|N31(A^~CJ^XI&d;XlMZZ1Rk&~#4l)swRL;W?Nyl{Ra64f>EA%muM4bH
zMKrd?Kqx#3zmzSkGHKv}$N%l;*Fr%adzjeQ=l-#6+ZqEU7`<J%PesN1%l(8Et)R?P
zJ)$@114uG)Xw<T|Psu0#Lg70Z%d?@)*z3tvW>-KLzJJX=$jRoy!Z*z<Kmf@YpKIC~
zCr+O}%<PNoNXup{Z1%m`mG9r~ZNPr}hfP#t=A1c~K{Cfv08h7XnNH)mFUW%e+`PIU
zjtl+M)K$vVB^-uNwSyj|R~l<IZQ3+yxiNshZ_CT)fK~L#vEU|m4PQM>)MpM0xi}Sq
z_D5<-`u+pJ4>-;WHqN<`mKM$!tZ+wCs4UNeT#I*o^{3p+_pjt0w0?v1+`TDxfJ!(^
zA5#Ocn7)<_YT2@-Z*QBnygSoOJ0SZR#RlWQ|Cswb3?fbTkF33^Wv>)=2}Y+k#>Ki+
ze_cfg{<Tq~tCueQ1%7FQ`(6JD8Y*S>=**kS+o9@Qh*Y0r;jnurJ3G6TD4&ffBKyZ5
z-mt;8PoKe_7uZ`HV`8YKSSPApVH-?kg($yN699gq7e5-t`=6yQ$NjQ`L0FLBhTnp#
zq;R9*+7f+#8{etle?%*UcW!#daTb!`>1Vy@i)XUm6=YW#pg>vbdpC^rm$7*e6R+@u
zE7@If9OS!C%?LGWK_}|3`1lpOPie5mCazqmhknS8hhK&8&l<NZ)>sb{Pg!Hwhe*$7
z_Ua~0c0~SRAX8Glf6ebQY}j7Nd`&R0P`{_=^z@49nDkZGZ{2E2^_bN#kcl0>8Wz3H
z``7|pD!=r4cj>YQr!vQa8*LQ46CA@fT?`*JYI+AM0d!#7!HZ%zBKj{Fh+QHG_eT1$
zhb#kmwS%y#^<ih_mz1QEGj7<lX?d{|gsKo&cP!?bEXkqWWg+jrX4!U+{`Y4zYSc4x
zMq^9Aeg671g2t<d37>Dc;8g#nqQZCzb;%qb<*t?QSB<wiyCgi^GlrF{{<wJ)_Yt7K
z%a$*9CP(P+FmUkT-4y&!3Ja5HEIKV&Vx0BfV3g_{I+4GT2C%`>s7{&Lyy9u=1q&8{
zWE!#!u{3XFu&H^!&YVgcZ2PDEw8_fa6JLL+T|2-|LRMX}i`4~#PMtbs7G>S$`zyDJ
z*THeDO*47rdo}v^Ct|k>nIn(#U_3|<OYjn<jN;9Q4~J<=)H&R=iTgWQ@yKZoP5aD`
zVy65z@BZ+#PCUX>v<>gZE@(x{Uw}OQ8M2ItL-Dm(N#oU*U~mhj>C4hmZC$z6*2daS
z|9fp1`cOpw&Eq2;fa1gfP-<sz^@ci^?MU5^qyGz*U{Iz4eP;r1g^7$qp}95!?e0I=
zn~xiNmi6hyv!(5!e71&cnWB3h%%|i7X2ZvA*lAdM7nkxSr4v~X_?0!py3!73he)f~
zP3-xh@6OQ6l88qi8+YQy{{HF@e&>Cs_ix7j`u=6RfOH8f`3<t?Yu3Db%Ap-4UWQ(G
zf)9>;yEf@?^AkA2w0oT3M-EQm)#e_bRZD^E9OF^`X8kQt)FBci%LG3|eSO87U`04B
zzdGG{wgVs$1d^i98#{yJaVDl7x<iJ{8@rktUkLAgCP(haPoL0+*TE@ZpYAmx1rI+9
zJ2Szi7=1w%ET4VwTKG>}KiZEK&mw3@fTBoa7=T`R5ce8rcNfBBH|Slrd%WP`9+Nzj
zlJpAs?gmWu9r{>AwTH9AZ1vW=Ca^-H4?z9shkqFRcO-l}?URea`EaPLrH^)o9J()2
zsm<;Yqf|A{xR-Y$x?p`gZ|c6<^_xG2+Hr^|-{v;YvMQeg2lLAyI!7)H_T4+hZPvVb
zV>rm>Ga7;-p&N0ijw-<C4vP!AY016*7Z6k|0m++l5d&H{;K#R_H#n@kecifs*|2u%
z_bT4M$IeUdTLilm7#BB)F$A?~xy%MYs$v)gw|)~&X$wad-q_usx-;pNtc0LTL=qoT
z*baO4w;?Fav+77!u)eOrsOh)p4QikCY}Q#*a|`ugKboENYd405POogy#Bd@hty<^K
z3fZ7h*2z4}y~~4+9MK@F#DJh>e;jNs*R|)J?6B~$`*C*c(4_CQ2J~q6x4lU%u&g-j
zOk+zctEZR*@1B*Fmlw!L?V@#~`O$@{J$h_lT4Z0O6kfi4cpCFndiLB1ZtKPFbS`^l
znB|7$J!sALS$*6)4O_X3)CnV@zXJo?fi<75@5zdv3<NY4H;-9gR=B&Lz-5aB*=}kb
zVlH6I2Eg3oG~Oh7UGV^YMmjQq6O(SUnv4hR9Li!o_w?G_B9(L&xHFS;%wi`o{D^CO
zcdHQjw=-yx*<43Aw?hC4;r98+IZ%fuR#XmheiqDkg{K6Uylrf=t?rVcU-$oC{%uE_
zsDH*<#jX0RbLZHM?kXyOQB8y2d2LlyDKm0%cD{7+;u^fVo&v%w32G>FN2mhF!0&?;
zLw9a1U5laR?FnC;WaAA2!8GpPyJu%I`}?7r{2L9)H)UnBX?h=WU~@edI{V~Mqc;Be
zbVp_KpyI~v0h>ui=1qO1qGD;daG@quV|R7+l;aKk{r#KwvhpR^nBZH<KsqK>7E-!r
zB=qU6qLNH;kpJSvhLDi1veMa+zUt=cx&3r>bkMf$lfXw*!4=BZ#7cIjiq?!K^Oh~s
z1tsp>ts82W!3QIbqUv9V*7b0N7u0-U#J9H!|5>*#Qy9&2nd#b<tv}AZ9g?Sh?8rW=
zJl52#jmwNfgw~7YRe#*cuVJ%o_5bbPY}c1`DboV$Av>8#ENai<2o4T5Yo)q*!}Z}D
zQ(;q2sNmdv0LckR>rATfC6Q*Ew{0t=;yu(xr&g_6g8(kHGm>|7tqZKTY{iPe`U3}O
zMg1=x{!Jh5Nq+txEcn&~7qu6^!<%2J^$j@d4abhXkeeF-GC9Y=fsI83*#zP3CV+im
z$3mVMd8M11+q3jN?Z{6#8BWB;UQJ1<&vd%Tqep2Kk<!-SZ&*GCI3;olunKClYuApy
z^@*K3e||F58_}T`RTcxrXg$_w)g-wi{?6o+bvW=Lc5H^{wE^^E!OWFBdc$$~nqCtW
zjq4Aj`|q`HUnl5fs{Q)ys9C%I!L$F1m|Fi31~IkkBdn9#$7USMoM|xX7IT|=klZf3
zdwdNb+ib!Wj3aq50PZRX8Ap#XrqKJHoWrMu!$XptfI_{Xxk7Q{k(tKYn<oST2q?eS
zhnXJ!4dCq{?a}oW!|iUg=G;_#ME$a48%aWoP0f2=!a8A7Lc-A7w{KTlEer4`+*`&u
ztI-OvLrG71(0zPft0p^EgZc#evtcp-Lm6gRk63pA&~FzTzm+m#H~eF>?%KcpntFCg
zPjJaDI9}$S#}^D()ydIuS<(AcKaDZi15Wlkp}(eP_~1vkZ|mf=ky&*S<u$;=cSpG0
znyuPIxw!PcZD5w!|M72Kqmt+}lkceFtsh=&5xtE9nSq3IojwmLJbZn8nz3}6_nge-
z)^#Q=KhectRJ12P(p*QqMvb0>Y~s^8H1|B5zY`#Jtjibo<Ft`KUZPiY;Yu<SIULb5
zNtWHP;gv^9xvE}y`yoSyF!RY){az%A47>A&GD%QA<cxV{H44E7ETe7fDcDk>oEqz&
z{oW*5o!B@A11znB$EYFuLak+ajF~$1uIaX(Q;&D%LOFBuA3yfov&Uw=ZL=oHMh6ZY
z;5d9n&Py`YgrRWvT6OH0&H-w9ZdpIIasCI1J!t=1tXQ$4Rx{iQuOCU*gk#k+WXMq0
z;^uI4R6BR>?A}^io9;t62QIP6x6h1@<5G?LS%%3z8hdJKdBJ$#{Ru#s^c{u@7P4qU
zN+R@wSsbq`Q9eofn74D_;_38QN3sUqzJD)~woYjCCdp@^8|9LBK~|q79t0FIxgI<l
zj<YmoHDlXoy1uYMy?SI*1uT>eUS6vT23_6UDk(3o-M2*0oi121Y?nG}E!igSr`Q1s
zuM|0H0!H$_0tPPbsDRuwm*E4FsO%_|9R^cp9taEt;K1DTFd&)Sq%Ai=D40BS*(s7b
z)F``1XV?W))_31riSJER$fv(`kp0Jetk%Ahxv%N7^0iKGpvC?dDDKw8ub(!LiBI$C
z-WZ)^O9-2XsFm8Z7{fR6trt+MT_-3Sm(Wo?#5+9+g|1HzBQ5ZsXHdT$b6OvAAgzXL
z^*2OtCcT(yYHBt#?xEtHfEMNips6fP0x4B+(ENP_ZQ$(zFv5*bXO#cHRHgl!o#CC?
z+RJYQ(|L*zAB?8=?%LE`qZ$gpi;12A#|J4N3!anV&n22onpC!;T4adQI&5<`@p}SK
z72XK&p?10nZ4vBd9q{#<p6rXABqA`G0<eXqhj^&Q{l_C_eevySx(p(a+y{wT@D`1y
z*70t=a4P6Pe_zYN#mVW(>gvy(XcMRrjM_B$<J;r91>;S0nwHF`j1Guq?m=svg~GL{
zbI9k$W0vre*AMpq1q7TT^SDK4OlfV*v({Q8Z5u2J2w+!12#oU*5)vw@-6V`e+?zOd
z>|+!>ztpMIv~}yTWU=_37{8xc?(~1wy#vPn&BHpU#Zew^vcD#Ij-oSBLVhhydcTmh
ztj*fU4sgqY>BEa@bb*Fm8>OC?(&|zBLC$}}ZHl38&C;QeC%M0hAayipTO(t#1O*7c
z9Sua8l1y^9>(pt0KeVWS*3^)2A70Vt>6VQfpYWPyvu6%D@Jg79<i*i_WSO!ye%8XJ
z^xgng@mj!(#??XB1GKxt$ZP|6)=zg&>$$AgC5Q)eUmSmZuKYLDQWvqzyI@N~-k(n3
zQ~6*z2Zu6TIHsVHjwHR95s^kjiJ{wTyRY`d{Fg6pFob*rg+}<KkU9jo9Ak6dVVO~b
z1`R@>izM&;rzUObwUl{neT{f+ljp!QmeQ1r4-lYdNdCd896s#VLaK+md3ntkRDu(0
z<_VrqPh=-4>~Vo~6fAq<kRLF7c;~UBRGZOq(B+hRN-fsk!NuA6DRAHc%Go;Inn^-B
z>QhT6`tAMzgm}Ev&N{_;Bs0HAOZQe3Pf@NQEwH`1t%ok}PfxkO0|BND%BE}g?%lg!
z>%D`tqRrqXJpe2Ex2VCO4hg0YJw7|gE4^Nh-OJ6;ajhmWtMlG%ATu6v`K5J!a8)H9
z=IV?aH;#ZfUVGv4<;!J`CMd%#)kF!0b|jNs&VPP2gw$y?8;LUa0MTc3Qd2IwIZW7m
z6ti=nev$aq@+4)`#x``VPpCDd<jTbA&%+fg<4e#k8ck4DQQ5Y6>sDq$-|2L0`~SEA
zy)~zux^eO-9{!l*G&VanUHMAA1!Z9|t>s)$%(Sz+#^XwXp?Dl@Uw4n}`PoI&u8^X-
zF}QEy<jGCnzke@9Wv5R2DvIT_W!v0^ni4Q<)!=ddn*AI!rk&EntfdPJzD_rvEx)&D
zVE~rtlh(hCjik-PBIRjh+)YKL&r{xP!?yi5&}G<Nzw8KQH*t*PXy&H8N(c{f&;WKM
zZ_;sCxbT(+BJ~Yp4=s6~(!vAZ6|@ecI5)OzrT3xIbNlw~`pf~*MRgKgDr<&3?%0*o
zyRaKBr=@w3n2@_9k=Z8i8=k0KLQOzI_QS($KXiHXWhNiBq0F!l45;rKO<;@lqzoHD
zkdfM9{xo{CmVjps8vBClsN{}j3Drao?!)TU!jX(5*LSZC8=9{XVB4(+*f*y#uJM?8
z<7}QbpW1b9(4cW+v5|QBFrl4-UBgfjeK-{jC&+U^Wo3@AT0CbIpkb74wK!mnGb_=>
z9J|uMi{GjInwWKT+CW&34fOGne{ttnCz?}NP?kQn?ODzW)`n=SsME0Z1*B~am1_Xy
zj~=P#Gpr31Y$0umSZNwNYH`|GQXW4iimU19wDcqrpKvR+g7h~}{<2B(-?tJsXp<un
zmA#s>oWMD*BIMZ18*A3KZGC-#5(g}3DKWLX&&ifxL8L9Z9Q+C%G9yEF;>mIOF%iSt
zis%;%O2MH}%OuRC3$cF`uWCyp?p*ohosQz;BhEiXIY(M4Spn`{1}=VhB`3#^LxiMv
z(9&1!N#SH8?5KXt0`LFlO;50k8YZY&32Hl^j%W^KR1U#iG^s2|MdLbu=boJ3jKZX^
zZUAou)zyse7*Q@+PiwRT^*G^c1yc%3iA?g-F39+nk*%#NR{+xS>oSx3qEb*d3!5oV
z#887YSWZ`_Cn@UV=S%db`qx%|y7I;ArP04^J={Z2K@6OW?~gKN_=h+?1{TQpwOuU2
zOx8dSGvyhq_1U;#tkod5m#t~*KH;e=Kv9lkg4Di4heom_R01|^q73It)q!5eEx4vc
zlzN`=@k0m;msli^ZN07NX`z$Y4nAF@@#qZ?7&;}hS}!K^Q%9khVEEgW+_6RvHYB~|
zXQ?|g<c+X^{~_#y58X<g&VQ8|Q+Qz;?#wt|m{7Va-fJ?4PG_+Hh7Af5mQc<@S`RHn
zn<xGp%z#j?+WDnO%d&a;WCV)G`T3Ut<d2^^rSt81H_fkT8LNCQ2YZtKB4ytSJ^&2I
zP|0bvgDmw4B&6+HEgV~_n{5z1kf-S=)y*j+Km>~r8vF`eW&UEZ{LXNC<(k~_@(D>S
z?4Y)sp9BiI36hI%@zMBCa3F?Pwev)mru;0QGdQn^=>+H)Zpvg&qich*TB3z)hf`Ab
z)nBU=0xScCwhfXKNUPcZV8kMo{{4MPZznMbH6&zg+mCbz#2Q=S=<GZjeyq%3d~C+r
zehBy=)8yk;R%$7TfihZC;6gUgC-(hia3>!J7gyK0U|InDS;72}O28c`?jYWwv|dJ!
zNj5=`!_()7Mi9?*SAFU)>q`0bVfW3XVSQokrM>KQ**t4aSvbp46%OZQ)oyfBoC0O2
zjib8!&R_nA*&S>QB+$aqt_iQ50Q;)~Y(uoTH75Y8NOL68I<6^ENC9wkgNNE^f*Kd*
z{f;%U39ildJ~M`8oe~)Q+%l(H{K^fsy*c-1LpW!hK7R6K1ZQ>fqB*$}19iNpdKBUm
zCGe7MQx-%27LJ^hWjItFk#`yxW)-VWF`RVcMng|@b|;YJ%58Aq1qXav<D;I<co64O
z7ts>kng$o(ezt}hcVa*5_%cs-Gskt!?Gyomf`R}v{C>4i+sXjgw4fqu_UdJc+jZs+
zd5`OuX4-do#h+wfJ`OmTc1J1Ew}<&P>WYgT2E6<^vu0hSo)!CVISitur^Oh|4DxP;
zbVy{_*0*0TK9Xc5^*pJATsWTzN8cV+om<A=G5fODV%?$sxZ>^GS#S|#+c)Vd2Py)F
z8jh4Wkgw-Cs_=q#yzIg}m~%K;u^QI{WTm~(l?5@H>DI=zgMblH43qyf6im}GdW#&E
z7<m46foFM4-ktG*G21#Gs+le_E?LaE*#J)NSXU|}^pY$L(A~kNYd!vdAoF!o+!yzO
z%`mp!^XSoDII$jr7p~khYmvoGI=|AL$;mY?u%G6mZj-Ko@=JBdS+);f{TJNc0^Ts0
zl@QW(ZWsmp<AQ?Ggivx`6lz;|F6}yuqe`20Vs>p%k=hroUR_6xBn5!1%6=&jv1xv*
z>gf2B*E#}xN=%j7ph>^={{shi9R5r$3v!~ENmeLGc5*@e<~=TuZI3U3X52SzSr?v^
z%CS0Mz7ifS*MXsc><Pk|8|B$db_sV=<L3EQE0pW7CMRb@1ZF3P@idkH$EAh3*-Q#K
z#g(g9`$4{4+f2^u9sk=g8}u$_WNf3LuPMnV3vDaREO{EL+6-rrrkX<2>f{E|MFx$P
zCp1|_OG_Fu6smuGZ%6>@d<#acmV9_+W8~Xo1PLtJDfP2F=a4xYH`uN~-?5(pP9cGt
zu-95XO=~UxXA-lBVFLZC&&fiq{4I>sI``2yh!#d6kGZG`gRJCBYa_XCfd3e;wxKMF
zCjH3#x6F5kT}Mkt&J0zT@^|ml0B*rB88*}%Twxgv@Ieg2%FN`i1BseX$Z$s|Et|`5
zq<^S+6q3MxdvX9!TRIMHtH3TdE1E|X8#@5{|G=ZeKMmW;f6{TYzEX+bgmkcoaetVZ
znJu@KkjGao6f8T@sMV4f!TV8WApz1>pgOMkm^37EdcwWOqvTb3H|5yTrTcRmwniGQ
zmHv)CeEbG}nKqjjY<u($6@wSIj|OJ$m~IH*{2c7E#)bR$gFt_kFF_&LGxu0y8Dt=-
z<&}wVfFZ~ZBYXQRz{y1)q1=#7x}Gy>GH=wVUkK~WrQXX<pA9-UJx=*VM_uVmwXW6c
zLup<sm-1E}tEmN}nM3y}z@Kt;&P?U)8HSVmNZka{U>pmAW^|vqDZcw<KaFYDWQXP}
znJNId+OVC@77hcT=usfG<eB{j-_w{!;tnrXAwKvrZ9)0sJ6D$H(D-S$OWtZ%N`qlg
zbF$9vJqA*5JJAlaKEs|vmr?c^?;F-(nd916&f}t%#^0`4>mfJM!z}D~p3T+d<c-AZ
zIdsTmXC>Pb`+Ay3H0Lre^Ta@7*tK@bP1nS|2Ac!u-!L(W!{n@t)tz53p4SZkW(@jW
zlo?W28cezmj&ud6rCe^-vgP`9>uOVC@5fVF)_C%JiN%zOHG0r3fw-<MfyZ_<S_A;E
zzuEhw3BT5>m-$%&TyE-_R=X$6r53%gwp|2QJz=ME38<*d4x$WqVz(${yu^dPEt~8(
ztTk}pVUUFjYyV0g{`KSKZ>y#303qJ|0=@J$Qp?d{*u}D!0*(x)4W~tY-5EDY`s+1l
zP%D`FQW-*2W#m<sGetlnV=f~gVC~vk05f$vHeZ5;a2w@D*0S2PS@-hg%j^zS1&D^=
zE=4URAK+^XGSRkKGIGH|)<E*hjvbzrW@a>&wVpg3XTuEGT!V0!&78!(@^J=_0Kz<g
zWzjiQbfc5p)u6uile8Q(rZqypiyNt4_m6qKjfy?xf-1Wpx2Q6Lv+%uLz^z)fYStWU
zZ2ZfGyu2X~YAVZ#vG1MfkkgU}i2f&YQgi}R3BcLLdaY~RWW<6z8}9(Wo~G-Ysu?+=
zwMIO&anq(taCBZw<&IUn*GRQWQp$@_`1Y9e7iUoxD-hE8V#mRxwG9B9L81d`GPLcW
zHZ4%kU5jn0SjiX&I{C`2TGNr)K(rm|O1rD)l*nCj6RO%YL^_hsfp-2wal4(q3s8=n
zrlVBX9$eNfokTgynQebdjjp={j!q;a6~#H5xdazb&5-<G-@w^$;lmZefxkqTzL
zkea`@YL&#UXf#1eJC!yLBpZinPVj_9qa`C}7Us^L$-!diMG0^MGd^KKOgb>C9!c?3
z<ZziMqGKtfKnUvVlTh_<aCpPmZqblb1#Si{-gt37mlzXs;>3PvmI{WWk?r<YZpTqt
zlUWRyTfDB@eOKUu{sZMbH!+MQW}M&yD7lkB3n@7-Jh}YO_*81sI^6_PFrbxSoVv|J
z;v=u}@$=_b1wWuMzybQ~MnE%9a^to^?Az(rSbsgO&-7)STt~cWYq3FV&17TL>7M#f
zMp<*iijP$1J*S^L^KG%Pm;?OZtz=5_<foY#o5ys|WhL&2>|<aM1lWJ5T?~Eji`TAo
zv-#uOin(pU0hUz;H=ae`7>&V|lU4hwtCD23l|?M8DCbj@_m$ic>><oLP76JpE&HA~
z2XC+r43sTfr7G=037cGJVf3j}hrYE*TDMXUjHkm%HC(IXjZ1^*3CbyYJ)}aeZn`hJ
zq4}0dCPVJ#xkkU8<y#3&wJ)Tl^|Gn;eL25*F*Z{oV&9E=6^?H@dJ^~BO7xrFVw=Zw
z%8c7`nXKD3^s#PnIcBF_C`&jO#{ypW)z%)j)|S5&1C>Lh&)G*T8u#hbhosDFtMAcO
z0vH<@)DZk7;k25Hce38f^Y-)S8)C1cTujIQqNnoxVXCe$oYbU{w=q7Iwrf$#Z)a<C
zW3pmK^PXApA*!hMFa0Pc|DHdHUjDZiz0S+45vM#%7S+$~zb%eNuz=+7q~Ke7`t(3X
z;MAeNo0OW`fR0T_K1aUQhu4piQc~3aKdSx&tmm$4|HnUO%B;*p8Yn~NOht-HQmJGr
zAu6JbMafVa6crH)Whx0JgcQ+uA(ae;k|;w+(xm#ocJAl<KYks@ec#V>UG*9Ed+)W^
zI?r>R%gjNHCil<PwSSx7w2$Z|>2@lr^Wf>P#3}xj;8l}4Xwab8c!Qj;34FyTAD^6j
z8SlzpwBcIx)}1pi$I2fT{=!Fg?D?gA|2;xWc!eiixQDw89G2$dy!q9eH_O}`2<yI6
zrcUJ!IP2xtvYcyR?Ij(%b)z!x18FM}bHraWs=X<h0-y3*=oc7yYt2y;(SX})z@YiL
z8Dy99vlp;=#Z~+dQrPKi^@f;H+_IafhsKTi`Y(V5MaVxTI}ADM;NXxq%5QohY<C^*
zH>d7wL>NHHG<IJvy(MTsy1?l>uiOdJ@|yO8RzBK(7v1|rO=egtVvok>Ze&Og80@sK
z*S*lozqWd3<jwfoAg`p9R;<vvxpVQVwQIewS$JTl5G~!cYub}16;Tc3U=P6o#!m|H
zPK+lYwXt2}Al653Jiy9XAOS!*$Hw}t5-RPvK0Osh{Tk$w$Uz@Tp78pn$V*Z5+$%iX
z4i+I8u%gup2ZtxSTeWHV&I_u3*4dlO4XLp?^;JwQ`-@(0!Hl&W8Mwo9)>?s=tp%Zu
z58r~flks?mC_f&%M`5|e;>CgE*BZiNwH#lClkB*)hzadpKYl#&ZHo7n@@IRPoZRN;
zheMjb#1Spl4l)NXNJmdapIc`Mo~ma^LdLwc+_Gf?Yg<av_LtO)JR+niBr|N<WN@Ge
z@I1=!mjSJ>0#e2?FbIp{42VIrJSk6~9^_vegM-uTa387HhF96o<C6x<%SSmuCezCJ
zlwwpAoq~TR8AF*BG;R9^*u=`8KhJ=V$*7u-%x|Rp^b@iLDj@3RIl!*KJQuZndJ0H{
z+Tk$b$O*MsCZl-?vf5(QJS1j<E)j$UAdA3y!G{;3DVp$OJG`?iIopM5knOPuoF@Dz
zTRgb8l#-&NjcxL;E#0o}Yx+(`)h9fAbIWsF$2q+StRg`}>I*I}po+KGyZ3O><^dm;
zbt-uE$_VupayG}UTPKbVIp>3k)yej6e;^PPScXVWrayVuP||<MB4W*>hdY<;NWz^X
z=-xThVwR67QTl<|u?FH@fazz<FJLB=8VlNf1tA*HVp?jMl9G}@``Br7(bLv}SdD)>
zzk$+mVXL*Lu$IYx907(pVN`Qbb7N8R!I{;U=q70gt}MhoQT7-Q`S9wMJcT%>!f`R5
z-th+M)otP=sg*Up`A0@B4kEiX3D@#8tv=n8#D6VdFAA=oK5l@KCA>8cXg`*EVX!S@
z*+89}Z-tn@9Cpv9+PzOdYs8Wla&D0keI{*=-D7EDUN=?YHf9j;Q1{vVAWjf|<l}s&
zX;3Nt5I{fKP^irY;RC44LA6TAhNc-$>j|pv+DA!{ujn3hxOMje;DBa^gUyV2ll~v-
zsI@#*WR|M0|9G8Vi%axc{$Aji=(()zj!#NX@$&Fc<4-2o&C@Qm0pw8HIRw`HA%d>X
z1}$-(q|BJ_o%hZ1E%p=W$%ZhVDxB|@aQE)T*Nu~U1p1lZ+OQmHqb`7lsB|M4ioL&^
z$%xOxz-ZW9tyipFdvP&1!Xw8qRm(PV5Sf=M*-esZ!ixR&8y*zSwh)vxAg~ymFmhQ!
z_i5Aan%-)lMd26!Iq_m*NU#X8FCvspK8yM@6z$n=Fn~QkUVSyrmIBCwnY5FDWUc@M
z0r|~)W^kwwziGSIG?JfPXw!=P%CG-Kl_!>uKt{(2UhQTKmkm5!Z(t2BhuMe$kBzZP
zqsk=9By0~N06wJA_%SrFm}YTewaW>CCnN04vkhU_UI3s4q#uMX0K)h+n2$&ph`mCH
zg}df%c;1C#l;VHYMoX-^^$2{9G-4D|_DL}($;6-lQT)$gby1iqlgKAGH=U%v`Y;)H
z;E^MbcnIoS%DYqdq`ZAQBefA7(gA435FU7IBce4?fB(gc%5d>aH9bM!jt_?r66P_p
zIWx$#MZuN#cb)rl3WPcu+;(>;+DBKC7m%INIo%#yJFnOp;v=&*k56j~f(4DD44JUW
zTYU1*&Og0a+)XV%uODFab!vTgJ+5CdYmYp(i-Ds2>r|vg9i^m7SR;V?BYcVxW;G3h
zDL`kGs}{bxH{VxiX(g@<_>viE@$7%)0$vz%&=hu<p72tQH-#Bl=_oKVa(ENdM~g`p
z^EwAmNX&mZbMY*Cj^e7xtoH@|-MRm9z-OmJeVnqQ@i}yZ=m-TbY9r7mF)HkW8@u0=
z{jE5_VK`V0+Y~LiAsqpHp$?$dLRTT)`r9`HLQ~#^4)-~%Pw4Tr+YwOm{`~_+F<y&}
z9XseS5S!q%LCwqaE_S1_-~&gJICB#H6n`E1>Gt3(@2^AUkN*b=YS8UF{WY)eZom4#
zr)fD^tw1E<!VPb}efRDZrLqZ93vmkqK~q5}j2jn3$`(r>6@c>T!d=8`VYmVk3Kh%t
z?FDI{H&zH>C|42F20{i0CnTtg2F>0hEoR29OwIix-PFuu0IK)3a|2w+Y(K!!$%*LH
zl}#DOEkGLjfd2_mcan#0(>E((gYCS~<gP_J*Ycw<1Ey4IX)-_(phEle;9n~NLVs5j
z9;2aZb)gqN_S+pxJ3+FQauC{zQu+4}%uC6KL`IZe^x)v3Lz9QMq>hRDCKK|-J?lc2
zC;qdw&6IWkZ~A=JYuB%Tz~}x9abwP0p#+Jy&zFKX<^N9wAl$I!+-jj-l9FnDp9>ML
zgVN8^oCC@2dy$eF6-F4>WF?ZQ914&L6&Lv(aSJSNRrbNR&zPqmk_KPNwhcnm_Ar@h
z!jm8l5ZZkkmE!oDH`maHkSV1u7tCc3BC;<mY|zgY+Y;s;aLzTEvs@g^S}2Co!;4*)
zp=T2AGwYTh7Fj{>^ay)*ifH6XeI-=?5KzIFcUKvcex>K<%kCC|*gC~Srn)jh)E*>M
z^Y^)P<Ca>jJ+N)_=CfAGyDn67LWBzK*mq1u{9w{r7k~a-ms_D*P>4jxcU=pKz~jix
zK?7ld+ypt)Sr~&Oqm~{8`4!k*ochk44a!pLt=L#AXZFwhCZKCRLo2Z}TPPF(Dtb@f
z0J<sEQtQ^uqhk47c<X8l-l|<H8xSFH=py#hN<^yb3vc>^lnfZu!C9@%OCaz{ZVFOx
z<Y8Imx|y{N4JUp3GIv{)v=4krZD^Mh8nb|q5M6>%Esko8w%vB3CXc-)GpOw2M;%f&
zgRcSwBOkRpVZ&h|56FstM~IW^J+k)kW)WYZ4d#DEIshm+a{lI7X=34z*;rf80<d4W
z<r_KytH)28>#Dc%46+AhlBpC-STc4d{(jV2waJ4wBCKK;u3u7gcl#Lb3Pa<hK9z_z
zMD?4u0}SD`iz}bcx#Gr4gRhi8J3lAk_&A%)Or$~!PHT*Ld+^*n2%ah$h>E{_(TCns
zcm8?i-x`Sn5qE@5mIEonRhvQKqt4(0-wp?M9z_%20iXGkIX!A{x<t@N+!47$V!iv0
zg80hc*Opce0sj**Qivs)+?6ThZ9y}}^lp{l_`WqQ!IJwA9?ZHfB%a=@P52{#8c(D~
zqCsKbzJ34qpjOyzk@VP24_C-h)@$90o_f-J`nA{^2s@y+_0ECDfiq^z5RislBS9C%
zj>eO?_3sj=yxsv_W3q*?Qf<MoK|43@W4jYy!~GISCdV*cqYLBSWd@y8UND$K+(0vr
zk&(R2NWMtZ%gy8;GqL`#yGQ<?U2%~BiVh}ZjYX4q+PtRkxC`1aLNs%##R#5^=&n(d
z$XOH>zk~us9e4(?%_TeuyVKb~q!OxD1uE1rTas4$QNXL=G0E=>DBYRjS^%DhWq*m~
z5I?$~Vj-|ap+k42-BNgtR|L~NQQD<C?K*V04y{E+^n}wxDjaVYQ2e~}k%~{B(l}eW
zYEi4rm}@0V2$P7=g-gsou%IQzY3KTaVLybv@FKTs6+j4JYXd|^8OPfilQ<k5-EJSO
zTz~o(*3uFN3wyCP5#^o2Qui&NuE&Fi?^*qo{?HLquaen>fT6r#^0C4A+zY|K6Gfh$
z!-K)jTr?k8l_&OTJpR3`Z0(Yp*RI{q%d5KW#yT32-sja5AxMXwN`G`c3iJYPuwe-^
z%3m(r^6o?W3=Qx5RUW5j_c8rHEr6F-3tLQrbEITehesAn*I7w#H8~oJSqCV;;7`f_
z{aZVvUH{F~*C!;G_*6uVsO{NfmFOvelo^Mo1z_lZU=1Rt<O-iO^~CBnLY+dKm#9yf
zl5^Ec^B>aw?=dXd@o`N_=dN8p5ti4WTOe|_AyCs3B3i>-cxZ?rCr({?Nx(8f7Du9d
z#qwSj(J1rS7&(~zi|io5I#RQ(w!Hx55sQGA2b<trDG!rI&Go*xSZDzv&13qLUAuq&
z(Fkg9Fsu_+P@{AzzQKo(9BzV#V2unBRtqg==HU)t`3QxWmGj{B>$>yf&U<mw#nRZZ
za1D}`UuD?~sI{=HRU|>xGiyM|y#7r>Ae+z;&o}n7C%S8t5n=(OOce1<eX^zb)atar
zA&hkmidnkkLdBt^bm<W$^)tBt1+*fN3WW9#$n+dHZ=QF3{qf2j(^c*@FPA(@ON)bg
zCRcyJj-+Z=9y+uYS)f=Yaq4o<W%ly4o^zM#iYMGh6o%wr9$sFD*?pm*p^n?PgFhTV
z(2W4qVas!ap-}w56hhsJr>CgO$m(R^U75EcwiiLvcFSX#%rD7b&%OAdnRmK+XAYnO
zqQeO>$A%lXR#lCbsa{C$PTuf3jsY5mlN4_Ev`du9k!HN!p}*F8G#oRG5Naez@sA%1
zz!4U{`9ZiAXk<=KATGi3ANMFOBsp14_+DR>6b8@ujA`ez{LQ3MG$UB`SWcyOZa#(E
zaTqv;PL^^#2}T#Tmm!~74EdWYbR=C4{+do^FK!wPt#IMDtuHL(E$E>l@#-SFpi1N$
zyjXQ{+X0HNC65zIcR*=@mM}kUj#x#8=Lj>K%Cel<$d^`;Cks-Jif<DTKUiVLX-!Rn
zNdzyNh@Ba*cBy9pst|qxpM#cvV9iDI3|trYj~?+OOetHPL{p3A3v>|+{wm97;z`Ev
zzI~pasq=1CI#+HbT{jIF)o%^?6iotco+Z1xxA5r{LqSux_58sX<B)<fAa(&s8@b?l
zq9bM*UL1erX-&YeLU$tWd~Dijq91N_MzBTTs;aY<=e~-wk#_%E*?}u>7@Z~HmhVeT
zwW%f}m);t^F}ZOPNUgzEO;M_$OA=3kY`>?R+yag()?kC>+<x~UZQc2P()eF4Hn1fm
zwk3eHWiU73n1|(ce2tQKxgi8<PI2!AuYVoC8j?g(s}i;w0^AS0pE!JURf!kl3(Vj8
z$1jdE%EU@#Azs(j?MNmSbtmm}^$dKyttowYj!u7ToygJjNM0f4L5l*}o0T~1B|a}&
zj`U)rWnze+3OH<(b7-RF#jjm5^`Ucv6vJzM^OY8@(&*zCyf<^FgR9j>HE1-|u3}c7
z4@v<dh2rSZPvaSA;7iT037bH?h#v6Fhnru1)ysjD(U;AlIes?xwlmh6haDQYoQ|tI
zl_Z}MUlTqrFye1^75!f0cm7$waMp*}<f}rGAl8vuZ!C)OvYXYyG=n=iWnm3gX=PS&
zD`<`TMe|-Pb|<9xHIJHr+^9HUN32PsFcH~@6zv%wL}u+yTXy(`kCa{-Rj&x<;w6>W
zXn1sJ9z(e3LGcAcDRO^0yrhuJwgTwq%7}G*uLa&+|NN#q-UVobe?R)epgTY4*cq|+
zbOYtFfRf9u`!ZN2z;D3hgJ44%v16uAJpn6E-`70o%f9N4#hVLviV^A|s9u2rT_Ucv
z8~nF&VDH{(By)At$)bkl_Sdh{N_+gRQvSbq*Jr&43^@6%=rY?|+HE7_=?rv2f3Dc_
z<lE2U83eNF@tE9*&KlnE)T`~_O(K-@YC`1MjDs<=$EJPyl&KtL_#^vJI_-z!5GaX#
zLPN&M^;T%$06fLf<)DrwF@iamOZs~H_?Q{^)g1$rnjVY2^cz!2jXD;6jPpDFs)-iq
zvFO$NJaxn*f`aBh<vFts`ui*L+^In0aK%O-Jf9QNrD1Ar!}dYDNu=TkXUaDzlBuKe
zFP;(%2|6mt^aP&<=CDk!P7s<}Zdc`-c+7;L*Zm$<CdAiIq|eDUAGeHKo#{T~55VJv
zy>p8_cO~5c=~$`>d4EFINu#ORUVk%VN#4`_TYvobPit-c_O-8Y`nUV^l(5)ZS@~ig
zgRotE4WaZ;OG_&se$|g!OkAuWYq7xZqM~sqJSu#RC!bcFr>Czkq!=_-DZI|!yqn}l
z@L-UkdZcI;364X_HwwlVaxV;IwN`Pq-%x}3G}5vKNgj);R3u8|vPd0v`D*;fo$&ct
zcpGxwT`R3P^h7u)YZ^<J5u3^jZm5#koQD?6cGyYK{r&ajA?Udou|Eia%MzUf#lr#h
zBq(H4b(>VjMf}Sf5<?-COo;6h+*&!4bZ54|)W(^N1lVH1iRz)hRA{M`BXd=IJAg^%
zf?^FNxAHmm$4Z-Ofk4=mv`FAU<?sA;mw*DS6zwXd*{3<6^(`rw<wUmbj*_?M)6vV%
zsGf0*{Mv~##S>)Q6307Gso%l~N?yu^FHsGI%U4Nm(-64|Ey3#~P%kHJM3Y3>%5Y{b
z6@x+!Zx$Kx;X&@~9@4io^7OvrR(B+17@i(e-aHL)Dd>*tmz|N_P8$yfiJ^T(2(1`#
zU-!Hwi|YA4!{~lL*O=qi_kmFLPO$9`j_X{qJlCs8wC(3%qQGH2QL<-k-WQjTpsIBI
zx!B8blto7m9d>l$=`rTbu86E>ic?8jV5edfAfkps{ZxPZc4K2B^g(CtNY^iSZ;(s0
zM4^IsAIQfO0LIKbedX@fO<A!nZz=T&hLvL*zt=_{yPam&IAe#H6e$F({GH3nA_3Ea
zSP^$tQ3VM72}($jzLa$>4h;#(pers!&AK&Gv&=u_1c#v31aL(gmC#gkKSYoxWj_(R
z$e<pNPrK`Wx<;n0WEiT;de#ywg+w2&@B&njpu(JgrW=q4AC}cfR6#nNf}6QP>Iy!E
z&`ybz3Jff6(?@+&O%HewgouWP%^`;KkXo;%{zUvjd=rq4sFKJ*IZ4BKkQ4pudkq@&
zgpvi-Xr**@uReX|QYHL6V{`!-PV)|OmYsD};q%*t!eXuh<m3Dv<qI(rWDU<w4;K)F
zvVrQ3yD5`3Zb?+{z2VEU(G*lonb9$pUG=c2NY35eT|8lC`lbS|SzdCEf`k<L5N)HV
zK!1zz-nAiDn%#@9e(Uk5=OdE_bD8|M!A7(6y_bJjRdnmsOL==e5APec!!ag4|7W>u
zkD>NHiXKaR_fL>tU8H!tNUrYVt}??lw}*{C?(wp!@~87-pUpqqvUjZgKIKPM8fB3q
z_x@ilCeFj&<YY}wT{k=?aC%=si7H|@qdT^^EIJU+f_1z#YUIdg9EGroLWrT?`zByU
zU@>T+RO&g0ROvZ#5h>1AlHI(DU>=9=xy~I~`}yMO?lLkS#l<UxR2AvXZXcgc7`nXw
z@#7G?w``}O)WSJ)+6(PC-2^?bg;AS0F%_on8oDlsiH@f?yY!lgI(#2Bz36|OWbt_a
zQPf8yb_|m2i8A;BcEUY+_S_2?*$eHlR_?YXOZubz=PfoCdz&>pOa5A}wPVNBQ=0Ch
z<4;apcj0EqPkwip{kp?OmtHM$vT7HeaFq?H|IS?F3dYqGVzzITG&C%ARhqG}>wKa%
zhC#z)c<!EL`u(RKSvg|Hj6ozl>1+?<m?aYtXmm!ikoTpbqp|VFuU`)!%nj$u3m!y#
z^5@S@36$1|ic;=d85wmar#{lPf7Spf0#MWaOdSTxkRt6862siw+<>5<y(oCB*;$du
z{dd1f?I>J)HnwV|gSJJ7yT-HaQ*?>Nz^2{@56&?+?=AZBND9!^4<h92(lQ2$r^dW2
z|3eP|>MOA_?N}4TUyHhiY&0M=)DNAGLD*E;3l}cvGEJ1<%DK2e?!4Zme}$;&-{Au$
z=f-;n%8<@J<jU>duU}n`<;DvoG?KO^uBGq<uyRo{C^@$2$+AJ<O(X+d4Gj$qiqB~&
zYI=RZ0CP5Ja2LUB8s`X^FDk->goI-5UETTfMRNy*=or%ukF&Dqa-W7+?*NcC`F#(X
z{)7SzP9u=DVDljCk!o0ebA3PI8htB);m~A@9oS<a@U=MO@Zo$|N6u<%WC1Cyivd#~
z0B!idpxC-yM*e&ss*<RiMt}sa_8@8EQVJ)ENH+D!eM_Rtr&6{%waoh9BDh;rXYFXr
zPT2ajH~y2ApPmdLWu03)Dl{-~4@pf4-@Shg9;Q3tPuh<=-kJ5;&&DTe1sp%_1NO_-
zT>*^5X4cuZZIXaZAZ_}*zI}sV7I++CLCrd)El}aT``NKcgU@0Xv4T?|7KVH2J-Y};
zk#iR=ln_*Kf@sI|r4b`j@EqDp=oyBCQ^l^l+}vlFG$ADGNTU>N_Cr36BkGrP=g+?f
zT^HdphrSre?fj`7y7X^LP7~E_yeK$!W%@>P8)bd__H8xwY8;F59|rfM?`|$Tn45$s
zP`19{_3KOkyogQ7+!x)e-!gfkG=#QL8}a*G7q+s=c=Q$|QJwcroj?Y=*xdYRZW5j}
zK@7{vgz?iUWY^pI(T(g7;b1xl+nG$XfkS_vWK_pU9NTP)n5wY29NuHzYgAej(}K#_
zgef;+c#K~|wXKcg$aQj4HV7H=Uq*(}>t^W6h=%@r{Hxhi?mBLi$>+zuBYiR~FGA7m
z-*{})Ql+VBfXJqQW-u_|@w<1Hzu(-n%bW9I?5}zzfi3Timr7g_?L4cwwR_Yzllu0r
ziJ3<?PLCPAk=cwo@_`3q{(IMNthnmN-q}{6a%baJ%X>lb%bDU-J|$<7YgXScbpe@D
zWp3flmefZFvXY=s;^^z|<VM7dj<>Q7)f8ru-;${>;@EzX-x`>_=v_ilV{&(6UG`Rb
z%3kvk6w=4!lbLImwHZ-bq{->;-=O;L5Z@^?G%U;}`?Thw0?lE=W={gu%bCQvm{i*s
zkomi2`NbhMvn}=Yeak~w*t~=%R4dwrbP2C>Teel$(&n!{#)FA$2Eb92`^udc;n1P$
zK#ONTaZxqEw{BX_uEHDZF6RFsUGL-IPO6t#7*7&cw3(b~Xh9KsDBjm=%U5BE-RfDU
zCc1;y^qQ{bYuYFOr)vKuA-41G#{~sT4%Hmdy-f)wef2$eXa)}6Hr$+MubW8-#&L;Q
z58#L{X5G>nPcojaQRC;YkMkCjQ+hQ1!T87F4Y9jNJ<07Qw`|x^BPO3Kv>5_Da=M_g
ziGGcUnrc!)vwx407+O9Jb@Gh{rQC_9G+R=_-^1`0{Zss)&1;e-8?f3qXzmQgbR28_
z;}&3c%0^wBKaa=6#9ZcsP4y1Z9m%=*Lz6=0(b3CQgn8b2B%E>d4rdTRrQ(aYeLG#X
z2B29?WwM%B)<69}Z{G+4(EI9T*VoKnjOAPSY)<(o+W9ji85odSmm4t?)!^vi!^5Jj
zl9L=aM488&_{O0Py6M*R_x5Ah()KfZ<DPZ%>fZ!Qz>10Ar>UtKP53U$z(1#x;F-QR
zsg+M<P3h82P6PMx<34F!mM>ejnrh2zYW4)2uZbQr!rjS1C#F?t44&>hM0Ii8Imb|1
z;Pj84IB_X{BO-vykWX4goS*2>ZozY#DnH}=^_WGgSeD2ZIB=8Z!&3!~Uw0<Pu@qYF
zbLY-oOkB4ilQJ);dH>LB9Fp$vX71ey->sUanPYL7wTq_kafJ2oW<uk{?@Jf|tsl}{
z3%{{^4WSz{;dEA0fkt}0vmd>k+S&O85wkz@_4EQUpPp0B1-AS~rSN_%U$<tBQUjs{
z(?4deBe9mgw5`Z9CNfMs#7Lu&V|oXxF9=oNL$cW)b@sygW&*F)`OXqL2QFv1O_?_B
zEa;uqPGFaymAd-+DWvieiI~oI&fy&4GL@UoV;dj=M8ybG8Dmor1%s25x^QfB7kB;w
zi0}|+TL+hVWZcCxH%bDQUf9KR2&$3m+jsbzZJRck6wFmR{NUlkHrNGAb_XfWOM!)r
z)W2ZBTjrZrkL*mP1GgFQjKeum^!CLi!(AhTeb;b13~(vy=&0yl&D5)EuC<QgjYQpf
zOU6>(u0lPagL?EDQ!q1D$JO{|8;J|=jlf=4QK(PVOrs4?$zId7)+l!y@{x^6!3Td?
zmdRZrkxJZAEhVxU6jGo1;$E25#D+*kL*lWGp{0>K)(v($tqHDk2*60Xf@|<S<bvtz
zA~FDiOYZ{me>t1f^Pp5{3OfMQnASHV9=6~PMf&KO%rwr#4713*kK4n#z2-{sEM)O#
zlg8x#S#)`sC5&cfdXW1Q$xK2`sYAZ;n?sYvwkk;93VHv2ye;a}C1%6UmsD`g2wqn7
zPw<~r<SE>lN5db=2aQxwkwUxih<|MM=Q9pvktb{bYeYTn-*AfVE~-J(CU+`i(Ww&d
zh8Bbzd4WwccY*XR(#^|%YBp1wJrkjQEO+t>GI}kbxgLN_=C6zL^UZ!+TwZ#sD{xN#
zL;qSewRKLOKd&fMucDQhmbypaDMHuK`p0CZuO^H-oY{TQAb&va)<0_aSBMVM+5Ap?
zDc-vq%yQ}m*T7AmSH2PRomuvgOaT+^B=p)zI1Xf6qo3{4Z1^@FVUH}Rj$CeDUS)0N
zDwxUst_LMK4W!Jc91Ag=0=;bSx8K{_cN#GWqHmV%+&ObfsNjNbHUgjEox2bDg>==d
z13`@YpuJtSF&Wb447ZThGzEpWB7xzGKJs+@jkvg;{;6;8?ObGHGuHt>*v<@L40&mc
zF7GeeLX|QfVUh$P&?iY6toKsbqKjfmWM3x!6gcn_?YVHm?|}ja2X~X+nfQxznfiGm
zp#1sY%>yRb4lzB^c2l}z(Y>9@c14Jsdq25rLIL0bz-MmTP+L2MEPrLGXBl^{*oXyF
zybGqM|JqX5SBzB;=~IZUB50TX$6Xj7U!(l-uWTjy9j|RJGYl7VOKJUFY$HZNKvqas
z0Kv->wsT`e2>*TFLcs|gObXM~KQ;@68wj6P1F4cq!-<n85A3=;!{wKD?&?ou(?dzR
zMV}R0Y}nXMPw?j6LlPJM7D@n=KI-iSRxUzY-YxFdt+}L~6aUc8+hc>(?M*tl-9i{1
zNxIj`!Avg{x!8RYL62?XbStTT`iyN~nnt}0bDjGK99eM2R7YRVM*rx@s6SqYgX_vZ
zv%~d!<(JYt(-E9PhXR=VS$sJGFWg%>!2_Cp1+i3XC+XX8PSsP<tT5c(3Pb+fZQPo5
zO@G3boqDG=2-^%&?LGxe0Wq2uI9S5-ZoBHk8!!c>ht!_Gs&i`UW@{KDN*@RXZgSTO
zu-EuBH;4e-`hTc<hKjKSy^nbzULoy8m~0^Esxyx(X?C9P=;W`ik$^8i7V{;$5m<`}
zc9r*HoZruX+q%22Ov9HiRg0Z83%Wj1{j=!UqvK@3)X8lm-~%J>gv^&ahRaDGtcmZ7
zZg+4tF-Fk2{&y6IIoB2rSEY}77Rv8|IduX)c$vWZHQ0ZHR_%Zn(qh}_hC#m(Cv>_+
zjymD{(zi2BP7!sypZ25H&2xTt67S^lX%zsD69#*=T;JXrhJ4EuvW1+x+c#3Ux2LgY
zCQcAiiAB87ARf{jasJ7&u|AUFn~wQe`V#As>2CEhq7cGAb<c-$e<}4azVRZLil8w|
zZSv%~V9wHsR%X)DX^$Ss*d{CL9}%1YlCI#^6<!%)_SSx|*)|gT>Ag7(1B9r&zPlgO
zTaFA*09*DiT)k#Z3iY`}4T09ZRP*kD5@yqW2%<Y;8?}$Nbm_V;rzb1xEaHOm0~zZm
z_I1jO7v>d{_!8Ymz1b`*Wk0EBD!(4FYV~Rl5|fTx5bHrCjO~h224D5__m@@lR~e?6
zuFe3t30Ey-(b}s|o3`$ovSzm*d{>=q50=%HE$HWP>cjLqyZn&scov$ZWph49+ZorB
zHb{wT9640y8;$<{{yJ{JmBCM?MdOu<!G--@XZnJ8+a=b5_iuf_4RHcP4n)W}@77TL
z)r&9EDZh^7O<CX1Na1kiHOz;tm(JJJju#M?@zSiU0P09vjD%RM3)j_UK*Zhy$p<yK
zBaBI@AhV#>()uv7h)l;*b~#~0G<}Y!P`0|!EhEf8m2*65g}uFffc-jZdI0Rb&qkBA
zoXd}*d&r9`+gGX8x`-!L=W6>BMvH`yI;#m~YxbfTVNZLcvl)tln+6F1P~X15rR5*h
z`OKbVwm)|Oqr|5j7%gliRi~(!TQmg7q&3LXaw!Uz0-koj5$BET)*WC+NU2_Gq?MH(
z;8{u1?)6jM#TB!~_%nGxz>&*WEM|SdUMoU7?~rIOQ3yLmR&xjvu44t*uD<@6;Un)f
zU8xF(MtB53;2|(shHN<a7iKDNX|tPLt#p-2QnPGLb&^cYD*H=sUpl2WoEd$EYOn;Z
zke261I6B;zdT{7;q6dZ7{;m(tG&R;(-IM!3jKl2X5vjm?JY0Ze=_Ttz-Md3`QsB*V
zuI^WX<avUgWN5838vTW?J;(QiOKI`}eFnBH50_88Gpi5*RLA6YDEJSF7a%9Maff#V
z<c$8?eEIUozhu!`cWEdoXO2ReGcR=z9ABg}l1<7_$!3C8YiMHU$5`WpQd-g&oBZiy
z%Qup`UMeSq_r*b=e51L!xiIy^10t{H-9{Y0AZSD<V~)i>EJ@IT{(RTaZqurxa%<gc
zz7Jv(1aDkFcPZ<ah6<?NknOpfNdHh8>?Kj@W19@AT6e`AwrUt9V2DSf22&>diLqxP
z+yvUO-oe3|XGa#+d$?spw^@dUU1>E`h+lNs!ninl9@|?YdPKm}hxkn<`x;EvtPMkB
zwn6wCB<^bL&my)Tb1K~Tg3DO~(DO^lC5eX_tyLp!;ECW%@A=WVXX`CB>sCrA`(}~w
zOx&=D1-+pc?LxgL1r-<WYXjCt<F)pIul?;4P4dRlP%y?`ldGuW@1nYuy-onU%zd8Z
zQgz{6ii#(W1alz7uB7cU;jEGlmzK?#15pPCie^8wz>Q=(y0Mwn_YI0*gI&G3$#!sE
z%4Eq;GPV_HEc@RQoe8<eRytKLs623boUc_Hq0LljlCTxTY#{+STQWpVtvh6?it6v>
z$-4&e$m%X#lVt*fQWJS&I34XHUDR|d)4jt-C{3j&&CZ~tbLY-78=^0z91jcg=WV4O
zxJ_>K=~cSe!lGjv3(pF6U_@ySRE1vBD@#g@m>~5V*7o*N{F?QBH6~3sG;`t))R#5J
zsFTck8GVjJ%FI#P^X{O{h7AX)&}npOGjoI1lfR2!XifM(EkIorN;%~v`6Czb?2!K-
zv8=5N=oKgR!L5>7Ek|kvP0E3;eiRkbdZ8|-v&lPqRp-2{o9$$F2~F=j-`52N{b>bB
z(eVQU5krkHTT~AX-0@BbrFAMW-XRAH8~xa*yEvRfaaDc5Df4_;ht65iU2b7w$519_
zNTnv-(T`i?r&pM&B-0S>`;~MrWv?6vb8e>Z=lYu@DtfZNbti*wcE<{W63+nBGp1+V
zhySY@RXl6;aA?t^@dZi&cO`Qy8Ild8Gkku@^P_*)UcU3CJNdwU__tQ2N`XMfGO%l@
zyua}UgMKlG2K4W*a!Dl@Dm+uV#{>p?_LZ5xV8L8lTiZR#UV@q6M{gp9y}G*k5Xf|%
zD~^PDdRiv>`ua)9%uVTCd&hxtnB_S3;|KT4NXMRKygUi&(E37dOzFSV8Y%GW(t@|d
z&OC|6fu0^~2Zt_PBA7XLg^#B)8uiRV8yTkJ{BcV4ecQEd+p)@8xbkr$^lQ2C0_a55
zJK}MR8&6CzwrM72wGp-jW~&c$x2->DNef|2z3Q&)V7b$(k;D`0@A3gl2jx6{YM6VI
z=qz*N{AknYC$7RR@ICjaWYdrB3>mUL6*@Fcl?(Q2tce{Ak4^<fIC^9Ctk<k*4=lg~
zZDXXVv3~d+*Mmh~&(k?N8(;<YKiin%(Fqt@K=|*bjWw@yD<+MyyGu=bmNyU>m_!)U
zWMpNPl#BqRT&<Q`vQ!t4bmAZ5(C$~GmIMJa1b9>daR>z*2)s$b<f}_85hoX|Ykfh;
zH8UPRmMK72mUXOil<yH)?JYyckMBh*atgU^WqHmpV3c7L+^bUd9I-e1Wp&9+(TT#t
zyJwsV?9a$c-ei2*&yQX;z2tM`<=t`2?Mvejh+Qp!M>b-_%VXnCoVrL}heo}fpwFof
zaya=bjXkSC5Pw!xwF#J}mViJg^G8e#15MVF%$QoV!)3cO;P`zOLU`*5XCqkc1dQC^
zUiOgN({?uyrFt5on9<+LW?OE(srIhVFP};#f6nQWnQk<&ILW;^DOb8ry}G;1gzt%^
zH<@M0YsY6O)_*<xyviC~ptX&SV(3EJIC?OJWXcrnqTiLv%VrBv3VyZyW`5UqLcn+|
zF(x=TI5OsQ;_wAdoBT_X-W@9`ODu7kVC?!XIBYU#Xe{it<hFFbzbFs$#_89;;)xF9
z^CNeE`Nd7^+=3G-rvLGsPeq1J9v+!R*vEQwH%M495r=&~n~Ji)Y|cmbV=BLp#~@ho
zu)o8@37KHm6Y9i=j3Wj372@XUtw;|QU$ZgsbsH|KUvuFKRC4-ve2l|!-g@@mk7uG}
za&K<R$jA_ds#@1e>Uv9;UP?=>o_Kux#@6d*mHNw49Xv=$X$=PaT(2i{9r_THq!Q)i
zgFcbFh!(GBmMESA7rWw6pwSt4Q^;~epY)LJ;fNXJ&Tp0zod9*eZ2I==hkmk?=b%fg
z+1CpjntczvTleLv`e~y+MK7zw9nT)nYB;<=!+1$&t>Z8_XVA@gA3D@QA}*#eR!Ps*
zUtVD|2Yooa;AfjnR&9P&m3AI$xfl|`-%Unt_SRJizX0|<U`U7JBZ>{<EG@0QX)cRQ
zYL<t@51PHGW}s^B$5p1>S5Rx6JQuAV*N;}5?yN5-<1Cm9H@!6cf<pGOQRVB#jqw>j
zaiZDBL?iB(h!mrdLF(uKIvX$hsI$<ghNyLh1Orp9wh{zb-6=|T6Nn;<K;T3{EV=D7
z?G6J(hS;yWzW4l<o}Hd|>e_WbL<sTYeD%!}gPU=<DW-yuY*H<ZT2ejyhnMZC5)-G(
zYNrb+#U5ab1tpkcKW%1JRa*&onu@uq21CpRFR;%*X*L3_Sb}hvl2tUS)V4yJPS!sI
zRuwS66OLNv-7qroWP6BQDnh|<ezcyfEJCReaJ#f!BEdjME0KBHpcQ-P8{x-^F2Kz6
zz?wf5OV7K%Os|_g_B)(W;`ZuS9Ms;!eMq7FwG>Be6^?Wi6u<8sI;(0j#<BJ7t5fO@
z%%zE0N6BZQT+vpIwy4ba{p?o<EheC&=vM1fv->&pv^uXaFR5E<_IQWGPO=TE4X&d4
zu0$yV#3WepI*rr5m)N|$52Q~0{h2E%!hUc48V3h|^3@gS^00N>5BUXVV`ZDJY%=%c
z4`1SR#5N`a<nH3_L%>8p`J<{`OqVx<s0t56;z4^bTUFC|1|!@@zU4@IF1a0i3$b%+
zOt@X6A7w3E7E32s3@}WzdF3aU(?wPP7#wXK|LfPs114`xDuS@?`0@-6%z;ObwwKs)
zE9|Mc$2lIVH(|{CI1-q4?|y)*NhNlZAD0NrRRT2MD&X!ANAjZ<0>{l84%QoOL+gw|
z%0kH^NMYATHc?v%I__D(x>6J8IM^;7YPv{Phc{Au@|7xF8XZ#?iF4H_|MKSeuw)Ca
z$DB8>shDjIN}hkLy~IUv{_%gtp$jAI9@p>;1;B94IdRgKiDSf6Z`1<OX}dHuGz6?t
zpi$p<_RE*67>2%w)PAMl)0R06*`8EIni31f#&b9ef-7Jf{{4k*h9M>EHUaSVWhmW?
zv2AW&)oTeKf#EU-6b#)ArZX>O4i?yep6w;Kt+FVc5jF2LPDZpPstKA+wMZAX^xs#a
zqEKKe^}J^FUxL{~W)z3GF5#O9+al64lz~f<<j#xRfXAiqLlby6;Vorxa3g7)c16j!
zSgXq0`AQocP8Cw2hzKIuas%$30IF;(E-Be7&Ms=rx?QgXkm78qIRPoq1B#8n;9yVw
zrAQ_FFM6IkanYhhfLX0Lc-lZ|gEe;vWe3?1fL1ZEwkHWWW2iebKn6XdcKp3d9-q3+
zvQ8Yc<#y|%D|_B{9eGE?U1rW3ITZ(u@v>H%b?j2(OM+G}3c0!AtuOKyNyeZ=M29cx
zolKUUUr@RC)Kp)$8&QKVA3T1%9~;tx2;U~7@roD)I%VkMO0DFT%K+a*+lNF<FJkJ*
z&8`un>8$uY6CSAZp#8I+giTxYLoXCCOgQqq|MY1;dIGm+X7{6<^FEEYsvJK7=E~|c
z34-`uQ2afcK7IKjMG*s7^pG}GtKcpO_UNE7H0exE*s){B-E@%%E*2K;fcd0$yCI-b
zXpiK6c?6%WhUDjb8{@1JcPrnrE`&YtQ?E&wdReFS)J1?^#~OPYR~`!r5@ytr-GPBU
zqKk0?dIzc|LB!HC=oaq$p+wHHV|!qur@H&ixdpjhQY$qadPgmKVHNAVcbxC4u!3(^
zV=ZU<EmAQz86{jS&VW#*d@XwZTn2T<eTpY$qOPLVLaE3i23XlpHSWuaE9uXw8~Y+u
zYMu>t;)HQKDM1GWc3!rI&Sdh?ZF^~aJA+VF<l9-Pht&5AM=Uwtm4T0<#t=eN=&11q
zE|lB{d(qZYrz%&+Fj(?-(fhW)t`0oh$19xQl0x^~y=Nm1B!@6W)p0oA-9c}DiK(j}
zQ?9r4-ls9bJT-Uj8(2CEW*-LJw*#Mms{*N-RhF*O6^?EXS<NsqBR&Wfw^g{Z<EL1}
zL<|<)hp+IQRaSLo`vEYdO=rbMs`}M<maI(+7&==RmxamAeOcBmHQ{B<QZF9TXSJ2B
ze<gpCI(up2z0+*4j;ggZlJdB|-s`!XO|5EQrf8@i-a^6~Q-b6)ty;NKWHd1!#*H02
zi^@Q!S#IFKk(alKB-J|AAd2YOjrIap_hhFJ`^_7|BT_Lygv{H64G)tZ{kU`cPF0W(
z%Zd&Pyhw9RT|E{v-aXaJo}BE!T^^Qh0zJUp`G76M_4)wxm+WN7)NKd^gkBeyw-xYe
zb$`MWRK6ghyGC{tmj1u~eV&EqMx;ktI+w&cDCfJ=OL8QZG9O%5^!7{c6FVKT6eEY7
zQ_c{*(F)4SXNeEOcOqP?v3U5wnh^)r+DE*yD%3w!@ZHME^+v*>MW2vcU_GL>DT3X|
z4bmQ>yv<y5=Ls{?J*)cyGYD(|4XEJhk&RuW(B;N9Iuhc7wp0LTB87!YIJ!~O_i83P
zQCNaZ^%q@ek!rg7C5hWAP8eNqL+{d8!!^NjpUOzI?|Zl$gb9=+fvedv(RmNoO`DAQ
z6I`o4b)f?~M|3?8v^q6O)$`oNi(jV>d_Y&DPJ<knfMGB7vk?(L$(JR!J4JlS%2ic*
z&up--BM;UPuoR=LAh2~D98N8b(tHI@=Z{oEx&pT9%h~zGiKMj5q2B{pPtaNcsYM4~
zQ24{3QubU*1+b&-y)Q?@DU58xo-OSALpn9PvzytH=hby*uaz8KiIYvpI2--&K?}Z=
zT~aAfXrp>+_7;~t*U!{^m>qr~eJC)!Xc3bLqUSyp-;E`sj4ZjER;*kpe_2xVw{CN0
zp~|0pn;<u!$L`FPo_h3%!c#sBddN-{q(culU2C0kye?w1Efh)nyUAV2`%$IsIqT#e
zgVf-)lr>}QFJ8DHl^@pX43GsF&PXFy6VW-2p+Fi3+KJ9!4BfSBn>tD;yVId;ioY>w
z`U<}tadY${Spm`g`+11Wy-!a&Mlec!oT}KP2eS*_z7@u4IV+$2(hCLaYcKJW(`>re
z`(;yKizQ2>4xKjjrHrnuT;gz5y=jC^_?qPhulBB~%%XgSZ0}S7irjzXp1jml9b|PX
zR9Iaq`(jF&Li-PJu@E=WquW&shJfbwcHt@lTrf#;IMA?8_`eT7bk1ry;zI5xZ8W8^
zQ69sCT|FlpH`q_Km97vi2@-SyU13PW<nB@_g0}W4K+F{s?PT9@)^GrN3fUXrDp7fj
z|1Em|7x~QdrrN8<XZJ^HEr(NlYPpX=zY%}X22R*8Y@$y*xn#sm*Bd(XW!$Gn$v3x=
zde&HFx=e5}(3Cppc-V=c30}0cJbGrW2Zp&%Kp7R?+$LU!Boo6MBL09|E7$cc$o*J%
zR_((J<+|*A;QsK4>r>m!e|l_(Mhn!M&IF5SgFllVH4gXv?Q+W0h6pVB8zpB*ek{nR
z_ayD(i$9>YwG}|muNrU8P^Vk;$!)zOwTX~mHh5lGa&yyCUEOw@lIM<K{>@Y&XQ(S`
zA>g-vdu42=siM(;@&-SO$qyVB(I3wpe*C^(s>!>V{vw<2aYMF`6oa!w3G{43;mw%x
zTh7*^+z@OA`Im1~5-aHHmRwsRZM^iPPRD;hxj4t=)Ji=r4PwN2PZE=Hj>Mwh;nC=@
zx$sBE<22(%RwWU=E!_tX?h2cW>rR2u&|LunySg-x(q)8;aWgGEIj+NLk<hCex^+s@
zk~is$gSjnl&&SW18CZUF^yyWzHRpgXr;&a20LD<+?L^_j)QeAFt=?E27MZ1sM*aEC
zCnh<l=${gnb6hGvzr7ysv44L`Mn*@$%UT|EShHq7Nttv74`RAtH*YdV?~o;zf$}`P
zC{rcH2a4IE?kfauF)DmX2p_d1zWX52@^N%aC>CJi_J8+oFmld}nr4i{BX0gZXSucE
zNCd|+BFloM3|tmw@MZJKkdWc_|23B{(pjWhFCz;U_=rn4;KT_jVZ89_+EA;AGl>YD
zcs<d$@7edfJw-{6(Fx>=U(c6pG&bq`;{f|Oob*epfy-ctn}*0)YTrCNvij10RbCjj
zMmAF5XO1l^Ej<IubM4kGujuIL(u&`+3bXT}S2GXviIlIt5ndZ|DY~_TcZ0DE0GoHu
zDO=fP!Y{}I%W26OE<+*GwH?|)2fV`*S8DR<$Zg5Qrk)$faA-d^T{RM@JVIv_h-l2D
z-VPH1j^g!80FxCQgxcA-VfO6|gLaDm^o4GV`&#TUfYL2XW*p7v&e2!;{HD#2qP2JF
z(4zJpp>n81<?N*l<)_h<vE85{Lr$E~|B_4_XHM&Ov{d}}GM(-Id)i-!$R%zUw{G4H
zz|@0T!oPP^?EZf{@7w`;F1l&{l?>TFeG}^X_QIA#qxY{W6`s5hXG(6HP1*A%Rxwp&
zyQvyvp6$ykVb6|ct`7PX%~+?eFLfMfHqk5e^zxeb<rwv({?F?-Z?-VsUly|eiq^VX
zC>@rqtGi*+$LH9ufnTe}inGPXxP}{xzZ-U`J~Zk0G+kKVdAZZUBhPDk1x!oMeet3<
zp#+8S{?ZEZS%1GjyP%})5dAm-)*~exYHCdR7;KpwKf?UoU4}xPt6+%6<wK5=(gp30
z#O)9Wg+j$NV9v<twg4khJ601CbR6pRPkqVIJo-!1y9x%v{P_#hy6=36py&_8QQw}X
z<kpI2U?yLJOXXrXD7|mUx|@yw*E*Yj=ySw3lqjNu6*<WYAX0dovWD=*A+%<;p+^?!
zu+FbZFbzF7uv3?r$Zv2m=ZXVx{r&CrLfiDz)b}8MK{w|Ue#!48wBNQ5GFQvXuRAOD
zy~i`m0WzbnLWInBZXPf-d|u_|iO|{(?orwPQ(dx;EB%u0DEXpV)W2j``Mx{)FHLS8
z(|oBrrNcYQp?e2$l^+4mj1DUtlpZMRznbZZSHUn3XjtJjf8OaNVqv6XogC`2?%p{Q
zQtC#_Y75J=?N{t%hL1##cP|xX0N?Qgs9Y_b9I>SHu4a}#zSlYFD2L{(N=DI}vptKE
zf)k2b_xq)~z4dKrHQk{fuTNQk5RQ`Na_niiNj`yMeDJ8AT+Dlkz?~|HOfa;1zQcGT
z>Q;cJVz^Rpyg<7D{d_*g*jKgo)3HBl@tWnI=A?^R8j*{yUwGC8Ls|)b@E-J5a>l80
z{SHma{a!b^4W15$m{_KmP4|$Z!*(#F@T^Nyg{Do`lETm^Y08{bx5Qs=X<i(V1ApHX
z5G^odvHhm}amr}FbY1%gr{FAo3$oIlRa8{y<g2q(o{*4w4VttgS|;!6GNjpc3-@_M
zN}7V9gjUH21y3a1!r57}Mbpm87+IK(Y6x23nX7ID(o5$rTOy^EG}ZQ9<%B6nT+X=(
zrnt7}kMi=4gl+%A_;oK0kG!EC8XCSMT!m3-^cIfU{YG}>5yeR`oAb7~&X0aB^1tZd
zZKS*-FRl0kk$_~Cp>@B;{;r#z$xMX42=M6p9(g&*#uhDEIy4u~`V#gjEw|-~ch;ao
zMZ~cW?b8Y%)F9P4`}gl>9){kkWLWM_K50bDRI=KM8)!IdjpqOovB3hN1eQ}sH9b<^
z;MPhqdw=IWfQ8T4U(x3($Sg)HICSYZPCIwfo$2lwRtfVKEI9W2l@Jrd0ZTTGIn{+B
zu+gi&oLHGZxlVt7mhL_EsHc?F6V1nxb^8BIl*|NI%U4Ql*}#9>#jvRoch(sGZYIUh
z`iS)VrOI0|GQgt{Dt%#tC8*jR^UTpMLORN=3#HTn_J^%K8?HD0-60imK$lxIA<3!y
z{dr+<<Mp^W`pL$`^C+v#wk%vj4I&ED9E@=FuKEB#P;<_)-N+b0hh`8a-@#T6y}O+0
z<^{q|CU_8hREUmI%6kxF%irk#9%{UGSI{j4<~enCtY=+4$dajP<GA6&X~^khU$Ya(
zotQ<QZ?CUCl%_!@58NhG0Jw5Hcb6zO%|edF>5q_bETvz{P@F2-4fwz0P(JsnNSr?`
z5VtW_&FhMZa^<U5!t|Ut_M1$$^Fa0P^gPaHZ55yR45_Ft&zXH>3yR`N{yv*H9T6$r
z!Pt7q&Not4Rh8y5n--#TvStfu6+U{@mak`Ca7+ErB9+ngJ4-Wq_!pMSZWx}$By$0X
z+;BUmk?vX9H%!BA$%lmLlAP&R@C5Vz6rH)0qS0~M#EG(0+|Lt!KxzvM3<+fKiK}0f
zd0%7eRXvqAML*n%;L=v}A1Hc_nG`g*pMSZE20*PXPLn9EVTiR*2j(`xYT$>VvnP68
zR338GkdTm$6>x{L(7}sdH*<vssd{HE(mGuKA(sq;47^WJ+^?RmQIQA#$%g%*r@|x4
zPJEveBcYt}cP{W=xzNmE)b!~C3AVW^QTR_QMW`(0BrZI8L>9USC|!To1{irGpx1CW
z9uke5RyNJxGj;~ujc-1Im+gU42!#m^<_CI)cYe?Qq7!m1%P+1bM!#?(EH9>e;{u9v
zEwfytamKf%7XgeSAf60JiRoPtlX5;&SK&_}2{`E|pmx1ZPlR}yWKYPvL2^XjeuTbL
z>Yk*cJ~KDi#DQ6MtRVGY$-}cheVA(QJx?FnAn+Y&eQxX)m!?|sYHigSjnKR5+rNiq
zHBrmcX*!E>C=yXfVv%~uJpK6%`f_~DKNld~&1zdNBo&}CJr&<1b>qgts>sx0)I4K8
zCv|TC#M0j<uz9G#Rch?K8^XK!M%QF1`$@W6PLmi>nxAh7a37RR)$nnpgq_XX<xH&;
z+ngM2CPd%AME&A1GcVYRL%ehAjC*HpvrNIAY5?U@QJc1sa4J(si6?$<X32zZ47B%W
zef2tGlC440DC3B>pP1h1D!MuQ6QYd@YN!gy&cmL4&%1c2TuamK72>rVM9<#}Z*`-a
z3!J0_bCe?fZ1yXkE$$M6mvFvOX+WLR6G^C6A;V5s!zV9AKXf@-pJ#^p`pC`iW|?U*
zzgTfpT!bzCo47OHDFA5HNbx-bS|!Z&a(8i5dX1VR{A%cuTv)Na00={9Uph#H5esoB
z_s?j0ZvXuR4n~NrYvH(+ewL}rpB<q~+!ymz?@t#$KB-wPYuGq;+_)4pRZ21)bqx*Y
zuu;sO*32_B+z)G|mL5f~QG|!#e2AuwIBJDzx?9@F5q9ZVU22<+f)CQvxaGJ`oiar@
z&1p?jSGO2`<ji9ckd>qQ2LUYuG@RdN82-F0L2d9^MCd{Y5ZbMq1kOETY%|Hkh?qy6
zBwS81isuk^MTU(J3k&N4*cLwX5tp|S(uSDtZTLEYSH}8OHFF#Nx9EYGe!64+(Z_i3
z`;j0^S8#<1sZ3<oAhfColUH)ASg+v_OJf1@@1-sTz@hqBVteuYd23p-#e*h0-A`#P
zl<Dx6RLvd?*d9(uMoA*HbiyjWJw$oTN-<UQ=e|F{o?Oq}K`4Ak4OZLOFve~$61=`M
zcEktRzGBdU$eBLw!s=+m^mHPtXj29p!Mc4nHPleTG}oYTxaKBAAEZ2G&5&DCet&%@
z3W?L+cGBH(q2$fQC!*{~6a5nO1Pb4ma<sGO+)kc2u@Vw5qSIgCwELvl|5Mfr%23>>
zWH=sFd163zbnOq?z&EbEe!=&Ay$oT4iHZy*YCrS)rCCI|o+_Dk^^>7{_0nDFOHERE
z4~L!U)V^IiYx?fECuZSQK-?840b6rHs)IXg+1slCckIJBXx7K9S-z#cuIzr3`SHiX
z>Zi|s*Cl1-0nh-t&USY1(6+6x84_}T(WxNTitx%)dB|CN|M{~I*QlUZh@3CKT(>$_
zCOU6Epy_BIu(O9X9V)f>?KlD@(Qb{{ZvamF4~S1f%_^+RXKsiHd;ap4xV@|eO4UH~
z100MB`VmhSkYVS=><fz|*D35j&vw#Q+I^T*Iz*zUJ<GJWxB2>=ax~sKH~bj|S0|>^
zQA~=a252$ipigt98hy_TpIhZg?cJXsSO1fC4&TxD8|vzYq98@&lYq{`G3R+!mcGq6
zSIQP)KNeo`@uR1BEeKmKH+U=2FM@vy#kI6w-?3VoTtzQC*|hSq$az!*pfC#V7v{yp
zNs9pw7T4Ew(Woo!CAkmQuST~ayT3gzBDyc|A{H#92$wUzyS!lhcy8*n)a<p#4U|5+
zel9IlSn~b}`cAaDwX`D<IQJll7EX79cOafgBuLm&=v7oGswpX^DJPXm_=e^BlG(_n
zJF~@I5j+Vtn^u<Hz=-WBZ})fgk9<atO9U^49YzmYO8p6#!`Sj-0k;1b%v2A5&X81D
zpx^^|H<jdlF&42}@g1KMQ}%<dNs%px+evIBg!Xh152C0R`C7c;QA0hw2i(7&9FP*x
zext81uHI|iQ<|G<G;{91;EpmIeNG}=Fdor~=FvcB;8Z}kVsiYsm>M7u_JV2%#kd_D
zvhS2l3g{-pwIHUQ2JO1|Pemjm$1bLKclxk^c9us#2>-YvKT$PMDfl#*3)%#a!kInX
z1|l8WE#Ph_Qpu)XLutv{{H@@IG{H*H^q(Su;g{)7ibfM8{FC`q`Wi1t;&b4~=^_#y
z&4M3?kQV(s0ARQs4G_q8{r0afx?rYhU$b~wy6Th1kFPPVhPLKT?kB0t=%d^NtS=4k
z0BJ1jl)7N~$lA+KoR~i(*8U*^YI0v4j?Y-E16#L1u#J36M@Pr|JOK|EH&Sxv-{n0p
zH0y}8_Ag5$*-edLfBDL7GAHX+Wk@A39+G2krLo{YAtc3_WH-|YJ&TmH60W9IM%cV7
zpQ{DFZ`#*y^_QbMb<J8@TB2u5a-V(+ZPmr)S?mFw{MV-a)E$SA9JZ4Pupd8=i5uv9
zYVY4^z|8M`o+Zl7;?kzCye&qP6E<{^pd9X@@)fSoaM1gb%Ml#4Riua!_SU{UhPDu^
zBBgdVl&StvWEJ`SHTv1V+-dzkFn(UB)wSKLP(`B~etu<d>r(R=Zh>BANr&;$C=%^b
zLX{t|rI^u9$f9li<}uAQ``XG>De#|56>Hb8_d{mC0$j0wuPAB@5B5Riz0;!O2|=}p
zMSQFg71wF*!*yeFQ4vfb7H((b?wFtA8w%2xW)&f<3I1dJXF9E>ofrZ#%hffpps}!^
zAccQVy|8GB(*<&hx=U{Wm%3Ah^oTjdsP|6LV>i+{AAQ8Yv73j+k!Wcl{4{N9reuw!
zRXe68Is-Ya^HtR<Vl5kH>5&YadRO1b)B|Sdp)y5ZR086%PnbC@75rG|v>Y6*SX}z`
z!?#Uyn<3XO{6~$>DPrj=(uSa*<zb<r*i-MJAYPZ$MoMZ^Q;VK*XUzsq;PS}abG}hg
zBM3~s+zG?)mEQo<7^t~x*@v+ovt!x?A3od$Fc!wl)JBelhO0P3MJw2XT$K}m^gYbG
zEZnXs-15MvL6zGQq*M+GzcGO&n0iKE`{6Y-=_|dJOLbqErU9gh<$^~u{lj#<bWWhB
zf|gAZaN=c)*w#2>nfCiZHmSPfFO-2Vr!Vr&MO0=tQb^bos=*x|kxW0oqT4}&bj||<
zQXE|r?2?ht{I@7J4@E4L%;9)9jP=-tVkURoA?(NKiP($(eecNDA9zP6A$`3G2XNqp
z^kUnys54Lzk3doFL*{UA&_y98>i|H4h4g-wc!0uwP=BEfxpwoWdBX{XTu;22q79Cc
zcgzsDD9{JE{>+M9JXtSxGs2$!_6<UhEw&Z$?fD<dx)ff@iJJL*nIG7~dk8<R2BEmA
zu1@6)KEE>kq!^1VJUp;Vii{l-&@jywDY^5nQlGRWp{#*g>nMOx8jNKB*I-1!b%f)3
zQZ0KmtdB}refJ*5m8<RSM5`grlUnf^&c<*|ilSn-T_C`nfreW@?C|9lFhU_I?xBya
zV_YCPdVRVn_(h&=u&W<C7;W?5{ta^6uDtb}O`GD3bhX2XkWn!|FUHXOAbbIZXf`cL
z%+(U(!ncgRBwv@(hTexX7@wBAdD=FEib-_KJ>Y^t(>Zj+uH^?8(YN1^#Mr(jC(&hd
z_=!@=BH^*&FBa;`@I5^wVo60S45AAATQQdRrtWI27Oe8D&(ljNfLBpjJ6C^^Wom82
z*xnM#n^`#3C{zdE!aGOgeV?00Stj=7&^_Qzef~83#iuDRs#mp@@VLZH2U>s8DcMEb
zU~pl+zFi{akr%^_`EQQd{bqGOsRs;t^vtT5-{I;lJwUo^U436n?%75Jnjl9ES8CsN
zQvb1zVY9*_LI|u$>P7S<?h5Ir{qsm+UxVhM3#IWKU0v^YAB5c|b--TkuVOxgbKNgR
zxc3z{HlFBqBY+RfnS=+fI_mEV6J^2ASCp3iwj7&(W%&TuiU)Mebnn~Oo2m<E*!I|@
z4__RoAQA+D8}r4qZj72Q&=T0efjmPU+Ubp!KL&!ttUw16{~6bPjc=cf#)t|+9MADR
zRVy<&bdbwJKM1>i4EnnR_U^rYy=te~_jM?%MR;T%ln^=Cg`G&Bt$_?Ve_|M#4uvby
z0Qn^o=lXKDti%dajL8LZ75y+bd^F^Iv7;05l5}<^@@{B=%TD>R1iG_?*pAZD57}9w
zEtV1%we3Sf10xH>&<6T$o?+u}EITAhXgavoh5QR2BB3~#sN~q&tbUhODWj7gi3VCA
z@AnkLA_u{F7Ws&h>l%U1At5;&dilKH=xjM~SdssOiFyivgyt3O%S3yx$Pv&GqL%G|
zte~7EBJkvyjocBa!^3%D^-ZN6sRq1|PNXo-1HI*bCdrYX)L8y#Wya?eOlPRNQ){Mm
zij0YVU~5hv*Vq{_Q0(LmnI8%s`rS(d?~1m-g~BG=5rgxgJ@V*}0A-oNS@$NoM82I%
zMlD3{bmsLzL&V9RNq)_ZX$1Gfbd5HYb|ZV|Cy6qa-dhN&5^@cZU;y#?bMJ)<eKI2_
zY5Ni#3C6Mu=Oe1|pu#JL?H5h!JaR>4$JR6PzFM(y;~`;^PXH31n}rk7doeN^LS7LD
z?aJ6GKE9d`mtOaD>ntLpfIGxV7P%a|{XM`jxu_>ki|JNkz65bq*~(^%jYyh+a#ny<
zBWscqWIg4PWG3=Hp*T>VHf<H(1|jucqv~b7AQt#1(!raNo{2VvVvuZ#&;5+MKxggw
zBaCD&sJ&N}I7`HbG_igd0*Hc)7^17Ew+G<s%ZnK44jno$|HAaDUX5UYT7J%j381i3
zC>O;emg--o_qgDu!vV5xY3WmU?%o}s9&u<c;aZ$6Y(<JuunK`Ed>u}%ZGqQ&Q(+4g
zIgi{6;3}!ci74*HwIt9}VciD><r^6}90*Zp)$k>3L-0Sp)!M6Ms^w=o1IT%Wj{-2t
zBW{Ksefo4D|2K^^YK4rC{Wgt~<6Qpqp|hF9J6QNtiN`Mz48E+#xmPtV2dgXlzJ`aU
z)up8f!t*{9B@?O^k=)}P-vRCumA&vy7DK|t-I4z_4H!zWy|52LdM7qLP}5bb<T*gx
z;u{cx#8f)ic6Hu)v`dj#v3m2>Do(jxqo`LH>MR}uS10<YM$7!~>__kNhbj7@I?}39
z0NV}4b^F9W<7c=g!Or>{7{(yM-tQ4w@#U)&CyrA{)ejXD(w3=dpZj-XY3OVGuzKgG
zD~(WRE?(M_GekZE;jR7u`HAS3O9_XE03UM|VYY?Q`S)htoqhB|O^0Lo>sd(q>wal5
zc<}%8qq`vP`t`BQ$O#wvF|fM6YXAQiA2(s)h%iAQ4eH=l46H<AWL>T%{yx$>Fmv_(
zu6y`DrM$~F|L;#Cnm^zJK}2mO;V#|>8{DEy&Yf(j%+NIPGhe~1BtAY~W&>El=TUuI
z{#ktjeeta&443LWuv4p<JiAsB()oe5e9=rOVsNH;$0j){v}{xUYkx3q2f%mI6-1a#
zm9Fv;-7MbK+P_P{kUvm~vX(+Xa=(&+f)R)O&nXzzjp^ueSj;LBA68|=h)$9*`n_do
zux%$1QddRA)cZMP0!}?()ry;+4*S3Nk&C8q5Eqq1q&Dp5{z7J*dol5AiOb7Oq#_dn
zHP<@2BFiZMAV0$AI)GG)&_>^D_pf_V?hs%qctor2&7X!m9I^0%pCkO#)7^pM&4jXI
zWWAF6;FiHPyl(v~QF(|;R&t-<iGBEZK?<MS=<+g^7OWwMGdmCL;#Q7KuNBP3`_t3O
z{NMi&(DNR71^E#)v3V2@v_gzfpMXRpJ~md!ZE_efNa?qvXlx6pl{0sUyu4Q54E*!r
z_M9CmiL|_T$5GV(?fuvoz6S(AUADQdO7ymfpPckHFoeX7RuVDgh-fqU|9z(M@WcQu
z{nh)^q11J-hU@S*pK1|h=cMbH^t_0|Ow_|Xpa|~U`h%<%KN_tCV!7dE^rGzDfkvW2
z=ydDHo7=1I>$0N~0yM2MPS0)mMe(~za6K)Od<l;#b$gN9T7j5HstEyp!l2&0*WA}F
zUnPot1atn)8;X(~%8h(<yBxfVq}VM{LGgdm;?E~s&(BqI>fOblg#F~;6wu;U$}4pY
znX*}l>RKWJ^4v|&l{ktKDPlP4`$_4GaLJddJLmrI<(7p!(reik?S(ITi^>dT!1`j_
zjuKj7aKKZQ+$R$nDjpK+CcD+WsZM$Ko2uIYax8SqHfNb!0lb8Zh~x~=N(i3d|K2he
z%Y^sa-c@hn_slqsPECJZB*LUnWO{8{hP--mj$1x}X*og;F&Tq)w~6*1LLa3r-&#VL
z#9F(PgbX1we<hzihA;~b(@TN@wih=nzxBVLh!o0{eSBK!3bN{l7?kYhBO6<Tm>s>g
z!U>}CU6NZ1I}thj?xv4Y07`9yCUg(Fx0YxXk@85vu;s;6w!m6z1p|W2Z8LZFFK9`n
zHyS&BN46R^R6Zi&JB2|?-JAUUd~v@1{m3Q}s-+GryJ4Hty1AjK`QLKgO#J!*?VFa(
z8eY~0#gx+j?Io}O_q73vjj=J7>}JZR^}eBjG?Cqh_7Hx8;+(ZClTJP@aGr%|=Ekj1
zXjv})%X7@AQC-Nhdlx_v?~fgOk@4ql>r1a5+E{q=Ny}k07B&iETB<}$LZj6<|GxON
z#?~K#g*gBZZVTzrLr^g>DVNOe*|YZnp_C;L(6vj%DcE$wLSEkF3-6w=t)joA80c;;
zNMXyjYx&~F@0W10k!7_SGI`HMf`3bp?)-rEXOQ4ll4S{wt(dIR@=tw*XOc)6B)gFs
zpP>ozUqqK0a9F+N9Ayk42?YaPTdt<t3Ap$DA5zNBeLJ!7%usvx*+Bn&j`(K(eq@t~
z>X+%mltAkE^2}QrB*&ej5bJacH_`w7rBF#bE^SJj#hM{603RE_rTf1{G~Gg=JPY6~
zDnE%Bkt;+2d(LuRB1%{mF{V2JZ7Vi+?Df{r0^_z={J-^IL-F&9>4Or{@g33WHA<IC
zlu`_+n#f$3wk)|Q`)M3+`6E+p(H+ZvYbBXr|GXoMqDGeNXv)i(X1H4Th;bMEXrr>a
zhOu3E1$WYYTLMGm;%h5fwVY>0r;Y0MZ>fmQXMg>CKO5KKE)tQv=2S!O)~g4TtWd|b
zk%-PvM&*bHZfwl?)vnrrD+(eXK4sua#ee%;K1+Gpw1(!`?PWO=i+q~XtzNJ0u{fZ+
z%=|)+A&>k`^o$xd?K|#km)?2F*O7N@?QUH9YU^c^m(!(sU9`=Te8acV+kD@sCCE?k
z=pCb|8{Bqi(3{d4uZ>sx+oc7)S#Y%MLugZAw%eF7Wt+P`xmQ!|BJQ063}_WTU(eRa
zbE|o~)=@>WI|m-@97sxZLV^Y5VLcd@86FmASG5A3l}HjEQ@_Qd{?f}WYWg#Iut!Hp
zfWpVp(&4mUqNzRV<>e&?OA##l4U(3uxQ0diOj`4gDcoRKG$O2vPzf`uAL)gP7u&tg
z#o5^w%kZbC)BOirB=_wb(owP^J~45+KTLq8X0Huk*)r-aZ$0xy9PriJ7W1ARjJfDy
z*j5rLa*nXnU0x5vDNZBLoVj0&x$H0-8=KEO-R(_6N@h~rR&u8QS>BhQ;W?z;6EPRl
z6Uj{Vqh1U8hVqW*s96?MC<dJ)D%g_>w^wd0eqG`iMpYjDPWd$51GM&aHsC#N2fLK^
z5N;JgM~)u7fbHynmY2Yh-a@nS_hAJ)lA|v<XR(#$8wS4db>FpRSHIyZDnWOr-7^v|
z8mR&_d4>K&^#$+$7VqmeVN1EuzJ;7l2k-*%mjk<cQLctlTtrgy%t`;>XE_eoVBuc-
zP)<RQBKZX$wf2;Zq_{u&AA1gpl#~>3RumP@h@nFtlv_w7eJD*&k@_m?U82*FrXtz?
z{q1?kk`-qbWtE0KiinVB^x9k;4@~=Q(`s4e_JsI6K({c>=8t3C|9_Od2UJw&7WRMC
z7!x&m6MHviP*AL(0@lQ696+Sm5W!x+0thNtAjyqhyN=iptRT&<C}6|bKm{U2L`6|V
z1gsPh5&i%60K&cBx4yOhv$(m54m0Px=Y4m1_OqXDg>uR7+{TebC{<<<fx8*C)dUjb
z?ylWM?nDV0TToco7a_EqMB>@tm~RzbE3l8^r&~swYCzSCW9l@1PE`JNaBU?ri7RGi
zmQhY36EU0DwNRA3eLVZW4GrF%1d?7v;O<PqI^(!XK6&pS61TSZdj6w23BbSj{-aN0
z!5RA)HB(}mxh-7i!2oW(Zr#i@DL|1q+egdqC`#B`nhkTw7&L)!ox{LO<Vo62Cj~zz
zx;$!8W`+K6h%KgF^2bmdq7eh~_kO*xO@9DyMrX3z&3>Qg+Ny29PJjPH?33`2kdfp(
z_+ZV7vipy=4IeDuHMH)g?c0M%l+#}>QX^RW@)R6QQj?pR(-ofrYfH<$NNUa|#nJ?l
zOvcVX6HWo{>n?TVciC9%_UA~8rm%ssA3xrQ!wMq;k%w*TBKt@tw$S5jrXj^IrW^St
zpy%DmijUhu(hZN?q~SP9O(1Z<Z2DNkU^HEdlWTfoy3_wks}3CyMkzk32}JMl{D-`7
z+`^~{`rpYnhbYUIEjtCENXz+14JFCb;k(;@qlDP8>tHL;G)TM6&8f#NIXCi3O=!^b
z++LopnWp`+H$T_+SA6oVL5Y{YOP`gn>WQ6xb2+Pw7?WG`U4_IO8k!~z`~eKl4hEg)
z7nGP#bBiY*(;3FS4^Eh+=IN_fN5CAyLHTAx4CG6E>h~Bj#F0lDuz&ylbsPS0D`03(
zSJc#CS8Bu@<%>?q<jH&GCTHmIUhj`t9g5M3Gu6<^9|4s@N2O}%-%xwWF%j5uDr$%S
z_@j1dS(!6cQ}&{PNpofI4>B<&70I%k%dQ=kWY${FjIn}(f<8vAl)Z<9ibMn5iS)Ue
zKuA*%#MQ72w2!RW$aMcJr6+M_b5N+(1afrF7bTe`Sj#!^3G3d!|2XdK*ocko4+Ao^
zYF97&>5;RqlE6TkK@?ZtJ2vIdSLaI{5Wh&0?Qufpq!qu)xIZDYh}H>M*du;#qP=!o
zGl8mq{`q&+92}b1zW#?une8H7_MJQx1sW{?d{k7FcD4Tk3F^vWpOA^1^&N6ZJ(t>8
zRo`5-zsA<y{t$JfNu<#34*2UmY(f?DvQIpy&le4?>x%gFD1Rd>amcDrGSv?ZmtEpD
zyV+?VyW-sB>z<uvv9n~2!z7a6={`6Fg!x@-!;zGiG=bDsoXDam0&SP4A}{+yr--h-
zds}JT@6S7<hYl6BbU9D9V*%b%F})tdoN8-0EV2RFGp%@HE+?spgiuUyBr(|TURq{N
zJ;o9%@}0%kiqduOi9xGvG$@lAQVT6FP4wPI>031iq}YpR=K($a#KT5>x6h?*Jspn_
zCJbocPrL}#v~}Htu1DD4B#B!%yPdR$U!Fqz?m-i0d_2@q@o4}H#S(Npq4q2gK?(rg
z!|En@<H5W#f3L^XQT=(4dH}t<9*xxAppxB{y*GN`+kY44Q0x@n@<2mLa}I2%`i&ZG
zb1%KLrsToGjk59oB&D2h>*{(OUC$%D$GCz2;tY@NAdI^5lpM!A>*D$;iB&9O;q&PH
z((rgXtZ!v7;v%}V*aRzw`(pWBeg~SH!%cuh1{B(%U@spCM$vLd!y`o~Bst-5?9O{d
z6xv}R4N**G+yZ~6xCjuw4~X4~igbcA+xa#H>89=5uNdac-kS&&$Nkd_Tn1Un)$7;0
z!J+HCMNK6wq29#`Z*OZWvz!?d-ch|~<Hj^Hf6vi%6wA&c)hPP_qvz&;EPE1=mH3v)
zec*1Q;_JgXSe{xEB#T|1@A&!2Yga|F^c#00;)<S{;@NpVQU)X%A7u~ZuI{qi(%s$N
zbH3Q8y=|K|Z=Uz)^QSZDe~!W`CU~C5OnBvm2HV0AsFQr@8)}7K&vwo-q;n)k&z~*k
z$9RzTE8b5j{qUi+|G$(sk8p+LZvzLF7bV|FNLb_`{Rh-SyR-e<ILIM3VbE9TG#WHh
zK;=AuvE`weci!T3=aA>+r4={tyPb+0OR0<G0JZ3Q(A9Xy?c<6v25)fW+J<)FjKGcc
zJj}E8%HBH8kjl5Gmsjf@jTB4vt9?9W_M_(t)?i*#YCgaVgDI{u3W$rV#LQ<qKJg7C
zXUIEz=3!EoJ<G6toz&8g42Zm?&5Hd)WY0}&+N{|eibh^J0gI1xot<)I(nOktETvV-
zeASxCIGomuUKRb=oCpH+y!2UnQinAjq@q5V^>-~rHR)5yM|w+d!*DL58fSczym&Md
z;#T4YmnAz7XxnJU^yzq~oBYf?sYgAtbj4&fFica(-&^9I4m^%J>9b|A-^#@(4Gmbi
z^GK0v0?{tX!%Fuv`d--^_Tl+~jYW5-ZOjF>O>hPiNT!z}gjn9SzM*2Pl$MtI8O++?
zu%dK^m_W~RF^&Mo(#u>Yxt8K%dZn*U!k(8@q!=slgJ&&G6&K(H3I1Lj=Nxo$j@)a_
zCB%#}S@>+7e)F{ysMdzx*o9~Avn$@~#qsN?@l2lli>(UtsNEpw3mUg=lv6BaR#8k~
z&H&;qvnT(egzR4N@$GNtQP1MC62@eY>fgUVl}U4hnYS!qP~BYi2%uRH#i4$?M$w&n
z4}5)$;|M)9B3dAd+4>#;BvwL=Eu5=e7UgqyT4trp*0n31e-nybNAVyMHHRI8S8TO$
zV8MTg_}o*AIc##q35t7eFL@2>BqN~Y)XJ)wxIX!JVKgKc#(Eyj*}c>P!ob&^al~8K
zIh*+>Wp^o<sNb9%#&q!m*HzulU>_BiI&vD%mp<DiJCtzjsMkxF8Fw5%e;6kg*0)i!
zNrZ8`m%p1(PAYt-r}xXEq9R+2+Vp<AQVE_iJaYoRKKPYS*h`q3wEFKnG&US|*#W%H
zq+%*E8GWfPL!wLwih?{B5M*&SL#*ng!q2TgLzc5gIT$HBew--1bs4CQZ+!Zs@nq+r
zTcat$;pX4|twUegC()ohx8c?Ke{NKM3%xXQXj-evC5G!e`=6M-|5hv!)jl(NZ1@<U
zsbn4;lYO5qqn~0Fb@4m0r&zx_HrP+o8^t;aC){2=FlOFw$+kPQkr<?F!Q0FkoG)ze
z-pqqJWo2cL3R*mZHN7mg$>8}TsO(}x*6-S#SRVODnQPsWtUj-<H+!(LTV#jru0xJw
zuCTvU9(kzK11YpQ0njJx$;ik^#X@|?6zA+2jcpxZ;irZlz0h^-)%_R`X9v*R3rOf7
z$>=QM6cq<dj|RP&T61dcb-Qu%rVWG%u$HYmcgj4OIq(C<1?(+1Ae|QCr9|HR(RU-4
z9>!px;BZ5G4xJ$(e_&Ub1^{#(g=c?rdLKQ4A*|t8ib!jJs958$*}!HW=nU+s#%}GT
zS>Wn0*cd&|P*@+eXYeGP{y4Sc>v^c;$TcD?!5R;hYi!uicjLy5+G~uaURYR|%67y>
zPck(#vl%<~PTy(g&z&=52pFh=8W!!>fpxKZzh-xLJs1=;hq(b~PF(}N4_I*F;UU0-
z=Zvg5ms=`H2?G)qo3(-p6H2GX!=%<!yvL0k89OR5N4vPJdaIJG`r1{$8QEc+i+0t=
zTD|(s+FC<YFOgaKv;G$$cdws6XOOKU_;oU6vLaR^kFb8k^;$ko!TfFOKGXjrp#FeY
zk(ZwveYTl7^Wcj`Wn;Q5dX}BN)ufBQ{;dgl>(;Kd!9zWTw!8m|_L%PNBb<J}{tI3$
zJC-8MQZBxwe|^O=fC=S}qydtZ1rmTUH%JLX%l{iO3XKO9{nNdJ6Q9sAx7p|e+BhvF
z@}Xx=J1T(_F+U@I`&lQAF%pCDB3`eea!x^1m~eRP;Rz>45Tu{6eekv7s}J#x1!9m~
zhU|9o$Ai}8OCKVGCJ%+EQxo|2-@h>L+qgl%qO+6Wd_*}VkbBICu;WK((u%J0-Q7=-
zjrwONAaJt%^91c34ZT`Ij!U>b^W;u)vKSW-20wH$9G1T9`||GbVH_k0y{MOM4G5U$
zK2Qm9lhhK<zN8RhmH5bY8?Z28J4`1&Y53iDsStVR&smT5&BTPag>k?3M`hy0HUl)`
z*v|Re?!c2i8&s-2G(5f@S^wvA?Ruk}npZ;iqqRPsp{f&(`|#^E<quUZxz@e&OV41<
zgTT^e%$n7$OPA}JzKW$MgMZ8_4v#hE&XoPb8)PiHKX1hKt~O9|Dy&O_GvQ$x*jGT8
zMF#@|pA#@8q&28t-~X?~J>|>na(a$_f&QvT?jE)(FI#;$ez%#k(K)xE#hF>Rc1&^U
zL?X+SlZ6+KmOogSsMf4tP66W)egn5ywBmvnmgLVZwTEWEEu0lD*YfSRR^!L-z}yFk
z0mn_>k?08#jFahy;QEITKESF)PMCj{*N9cxFC_|{_}InBck|}Q`W~ITba94h-Sy>5
zF!a#`5=IRID&niHc`4S~_q#Z+C^`Emm&)x{)hN{dz={kolf1@B&?<E(>J>jYUd%so
zHszwXH<?veqO{$+x4vuJoiyHmoUUsHIG}`RM8=gJ*45Z}0vm|Zh6w>u=t2+@w&{Rb
zp$1qG0^=U06-knP9NzxNAAjcR>+{rMQlKq5+g)G2y~kKv+dx=)6ZD(2kJDei`{BB5
zb*bK_&mBuT)vRgLr%$jodo}wjfEE7rgAAHFnKI{Ox-a^o#4?WUUX$3;*#XGM*LUtb
zf+(nmxi$+`ZrE8_ZAN_mDCXWw!!`*9!I85_ZZcfPLDGja=EQh?)_<!wwN_bt7TK;Y
z=>_$A2!w&PC>RXY`FV9k;smTD*~QBTQOOhX9*}PV(wPR>$h@83e(T1)<EQfe+4m=O
zDSO|Ra+iwlUns22h7PF-1iNI6Ou(vs68lAtAil%w22=ha6V_!?;}usRLc$EBcC4{U
zrR6O9#~*aSFto4tLC{pw=l0849RSFcMkUEK05QjB`W}Afoio7JBtfsT+ihG){)Uej
zAu3{;Kr9TW#X<`^D3-SgSmr%EH-U3Y(Km=Z*0ik%O_7lOhLOmFWC|jjtzG*+_EoLD
z5~QcA%TV{T{y_<Jnb02>l44AW)fSJ3&MZHKWnKEtyO+}UcWrPxG*ki%Ne?weKs2)v
zq7a~jAib4{>Na0HTrR-7Jc7KLrH5+;&apForUd<5uwh3%Md{V@%k4*6Q4c$D|E2t-
z&*di?3U}y2TqIxZ-(?Z1a5t`Bzdn4m_RD=#EXZP~+x!3G!J>Q~4(A{Yu6+g2Wf_MB
zFlE@)R^kwNDWTL%w;A=IV+nA^)ZE;buF7oKn6d;hif;uts3@6Zn>BAv1hSlOPiQiL
zuq8a{;fi7+9i}I<D_H`pN{fRMGO<^485~Ex*hB60_^%d*MCE@rk!uTaB3k$L_t(e4
zgBjTaAa!@OhXp{ZZcpyN820ePhYx#Luutv(jl5IE%XbxISz@C233;c|=MtWphc1+H
z;;_cp`}_b-=<4dmRxUuuJ&1ChPp0duqhFrBkG1%b!AJr}aRGV2bYk<5kN4GRte^Zj
z8R$HPX$>5GCy)mU!@ITn-njbVAIa33oFC6-ywdl8)fW~L@_bwltT5d^eXP;X_MAEk
zDql8-@9(GD>$P1`x32V3Z51g$^#{N@L&yS^7cif5bBYK8zRq|`8F$#rN3@3zU^sSe
zKW7FstXZ?BkL&>5VPBR%Mi;i1jh&s}+<v<#rbbY~;4y}!7}u&*OTOu3N;*Zt)KHtM
zO%UQwL-)QeyO;c;X^Or!l$bc8Jb`a<MJE9*W)p{>usduQ?H)=RYxN`&ug4Q$9m0ss
zc}#Au7*UtCUo7Cz;ZVzS^*!#rT*RXoMamJB0G(}|y}dtq8$Vc^#8qdsEAUYUfzpa-
zO%H_I;>F@tnl))Mt-|`bND%Mj6h?}{7JRX2>FO--RY~*WxPKl3@sqpu%bV2ra$t#Y
zOeSC;c0eLu=mq*78!leFC=8-4ae<s54@6C22~?Wkp;!ArW0qmgnm|IDIkW&&973NY
z4&gr&o{JSDo@Dt+(z(QLbXr|3jbD%gXQPgW7&w1umGM)$?vft#(88Q10}2ZYs;tko
zt5+rNEO_^Bk=0b~ov)jMeTISuFv!q3XATUGt~byaFZ(mVe9p4&*j?5gCXQ*_7R{Ok
zQEq1G@f7lmu3~m$j_&<pd$vxRGG!n2J^#NXGoVkv*M!MmW>7)atX;cs+W?s9lPV0Z
zPeG20QPVbU!U<$eH8QY#eY*j7DxSGZA^0PdHU03^u$~v*|3W`VPgi?D88VQ=e7_h#
zMEbQSZ^iFlPM+=X|0600N+>NomEgGhZf<^ler4e2dsy&7DI4gh*&L}DcL0_tf`zk<
z2f&^4y*l+a6{iTqg_<>MHp$=`6mH3EDoj3I8x2R+SB6H6__E+(b*MUkhvC`qxUdn%
z%vtdLQZFwiI{!W4Io<D^63OKoY|%#RE?G1sgi?#vLl~E%h?x>^Bya-u=mGqZFE;Pc
zc%eCQ2^kra2)^YK%*}n{rfbh!YNIb!H{{6wKbId<+^XN)9fLo<8KzE>COvY_3ajUn
zZ%;EJsV;UOZV2W4>!xkfs8EZ!n|V@X&8;b;+50zDe4MJ+K64E@nTc5dzet$Nq4|*d
zwtf5dYu2sfV%@fxpvnQz`|9r$xkbhP|FQ8}cBUVK2y0uaQz}IQTkG|KtW-1!@m62o
zK%Rd4r!0PYwaI($0oyYTTsR;o0e_^+aJUp>@?vX=k+ZUzVNr+`-ptSj$v$RAZ5v^P
z39i-uv+FEfKfbx#*p6ZL9Cj4b4@=T(aP9KtVbWucLOP0(d%2WWo4EC^e}36^nb)Z3
z-Iz&x>I-^pmagvb>XS9WqSnv2<J`i@O)5`$NRLl%H+nxO1Kz4Ff{{XQrAl@wR2F6L
zRLQ@XveZqs{l$;gVl-Us(Zjnn^CZB3D(xCzO3_AG+pVp?Jo%Wz|3Cfxm?Yr#)I$>t
z2)b{wt$1YIV6Tc}Z(G07s$uZFs*V6UT*P~bRbn}|rq8_tZVQN=DGt}x<cq~uX7n^L
zup!dqhmYe2qOiV)1<Qb~ExeL|^DBM-K2o>0_C`;%At+&NO-FTWW4FL(2?5R$kK^#p
zF)3bm?bS{!fP_LM?tZ=Lr~+@?&UX_j1hbaf^{>D2%lhyA_<!;C```WFy}fO7!XRt>
z(-<Ie=)hw1q##y#_Dp5mjC3v#?RKZX4VR#^f%bK49zRck{OD<AxM4f8`tGmA#>L%+
zt>YhLu`B){Ad;_1Fd+Jx_3O9N#2z=@8VNRDm7%8P_grt+e*MN$tJ4Hh8kq>B-j`Yy
zFWXT7njlPT831ueOrJC-x!RNH!;K7LrcaHaW9VOB!d=C(H`}@byBlB@P;~;0t54wW
zyXmlbfg8{EGL|oj8HssMBk0soW8;&FLEfJShvaCgIDQ0t9DmY=uA%#%K*uGT6eypd
zQ)(yT?pIcyzmQ1IUht_?hWI)}T$}-G;5FKkZPczULtJNt${e3Co8S$VL9cx-76*$c
zwLBt`Bfh>(BNjY*C{;>=&hF(*MY6lFEDC=sC_Wtwj{g_)mpcO&B-J{BeluLJw_BOb
z@_AN2l&xmq035<I_bE5`%g)K!Zqns{5GsI;b5#$;L5O*B>NHZ8JpcAun+pG{j7%c(
zkAQwm_QTJNnKQQ;y`mDnc=tc);9A>PtUTk9Kb9MCAiNP^R)XlrxeIU*zkG2JmQjSM
zU4aEF4ha0q6b_-;{}a=>@i)BQR5}&a*3^O{z!8YlT4}Q)U~Sfq)C%y4!_xI^hQpIc
z8d#)Z78?z#y=Tdzw@kCzzumzruCx$hDH(u1nE1B*3)t;d)rTi9?$#~$Z6@A>j~n#t
zc{IhCI)8%oEaD^%M;AH^TIgt_<Of2(4528=c+5>I84n)~VEUbfHo=Z;FE~B~BOO9z
zm;`!5hJ5zx&E^xYhUMxDdPI)1&2s~(6-ilPm@|z<|4^0e8A0#Pt&%#5?ZoOQH^z~Y
z7YJ6izB}<7S$S}ZG1E=fY}hcC>pJ<MRr~g@Dtx(5(!k;rn-4&lJ=9Vx=MrXAn39<I
z*!Xyvj&h3ZgTsui+iui@=g;zxJLKIxegLJM1#=z&c7>B~=v;V4@<AELiJ50-PSsPC
z&ovgw%VfpJNV?Ns4Lojux+engx*iFM!5*Iqu2|4(S-D0dt+CE<rED!NJF;@Pn5{Qg
zYjbENjbiyd#$U6iWu!SL)z?VsKzT5b#k3<|PKukqvczxD($kby#^vpRNt*`@Y{T(f
zuw=QS)ChExr0m9j!G^2I@PItW1)X32s-k!^YhN+|AsG@_R1b40hAFAPQFszPJJ@t)
zn%&-?JO!MpNHVF?;<#^2Js#^6ktf3NQ>HxbJB_Mv(+(Ya0&alXhcX%>En*SVFwm9@
z`)vH`^!@qCpnQjrvu%^mSc&;uZ3|+KiXZ`EaABmLguj@q=Mawxz8f~Q1%Km;34IAR
z&sMks+j&r#E=Vm|ofNS6VJH;bn<R%7VW3ZeOj?oZp2Yt6WkQ^ZPa>h2>FDU_VULL3
zy1#uKj{6@6907<Glz)%O2{wrX3)cz84Wr~i(jJOb-L0N{B`d)za|FT%|7deF(@~?w
zSXtdZx}oIF8%r<*T-C{xR)9JnvK7gp&yCkyz0<>UQ6xetHI~f=e^I&XT_PY3VDLZ%
zp2NOMqm|RFZCaqZ2?ZK6wwXk~XU?2a&7rfq^}r=>zI%Ni55AnZGi~`jvc(!3OgZy}
zcIJCg3xiLlNC#L>i#6V(XsVIAHS2|?3#CFvaPC?Wxrg`*-U*hb*1Wp-c?E|lHuG1x
zJD)Wj|J!_b&Z;)wG$=_}yTIuvn<4LW`Nhv&PNpvlKVC~ot@kCI0F1$IlDXlvfn)Ni
zTJ!W8G+4OJWK`!ZuP&6$PYtoUJbd+nNnkdDEDkD&{V5^1r3AJ^Cmbh|O$ptCacatN
z8glurpsy8%-KYLD%zkAyaRDmz=h9Xg@nCSt^r{&$EPHT^I*NT`Q!N`v?#1^EC3|-N
z%?Wwt78Z99Ndm%wyC8HMurq$hDy@OZCN+8A8HKTMtW$Jh!hqsuyB35Iw~Y8r3t|j^
znv)aGPBBGFLM3(`8l`_l#HBRL0GSr528LbSJssn^$;E&Q;c#vAI(6D_o>o_}HLf;I
zy?$YFUMxDx4@<;P4>IFCf4XsrDFJ+_`k=DpYV{;39+6H2QC&U_oF4lxDgUu640?%^
z_X*1k+U}4<4Nj%?z2IHd`Nz;1))jEuAc9d)4!x5e3|H)nNI%Sq=v#Nb`mf6g`38TC
z>ls`DH&rAaSyhQg9*Kt_D-CQMdwUhlMwO@}%o#M(PvlS6wUTW<%uNe4Jdm=HKw~|7
zjs|3v;t_Ui-3G074+&&+Km>KyK92oKI%#RNvb40^dGtFar07e|t5{mW{6g-3UV?RS
z{LEx5DNhSB=~dvK>3v98govcmp*h4&X8;+U3wMZ>&-1vPc!E(m^1v-U$o|8DS)_qm
zOL-8>KEEO{8#-g_=FR(9tL^tc01%|2)R@ASaJ$d()~i0=dr!C+wY?D=yD4M-;%z2K
z?ToRtEec=wza`8WUVWa?)YTMaz*U#uhEKy6^re_7l@h%8jMM$K&q*jqttl6)Lh%sn
zpK;VOD-VCjAt}65Pv4f`o)y!U80og_Q$CQ2Z)lC{=|eE>PeweQR2v2nqSw#C-#Aj=
zd}B#xAqJ5oCkz<8f4~$rd)~wIWghFlgLp-0DU2I@ShPacCQOEdhH~N)9xcF$(C#1%
zJzhg++yTy`o*$>PRNOzzVoSn>Nhdledw;29y2;zdEJWNEYc+r@<@CH=plxOWP1?6V
zLi0`qEF}>Pue69qLgpK_dtm9xJ~-hq@UynD0T#?|&&T-WJ#Os%{?EaTF;*v`*oZOf
z&<e^MFUy@&*hhysUcA7boOF5?F{`APZ2Oxtmey9fs;W=6tp`<Etnb#0`P<nm_&X0K
z-C5@#lo1YP4UC_Ne;h$NJOaBXcgguC9KO>gLEu4QFu>^q#JKRdRK7OsAdK%Y+G{j{
z63zp%U;I#}y&xrR8eE6gq#8F%Jiv7*H5Q?FA>66#vN}rFuWDS-Z(x=Q);Wp6Uus6?
zXykz%GU!aAAgHmUp&#`R$d+RDt_2rDAp<`sk`lBMwvucWD)U$q$PTJG>=_}WRPB=&
z!Z0I(Y;)QhtT+PxXx}+WSB`<ZU$*?>loV?RhuxHs_kI4DGep9F>DQx)1@r{yL1HBk
zpPQ;-6vm(~OqyeHmqPSy8gzXF0uGaJw4_AUwHbA5B1E#mk@P3k|K^)sCO<%(TY??|
zXs@AKvYv<)<zB3PWfW<-ld!m(O7$z&vakN1s3Tz;55e?yM4y9-!6|x64p&rX+H7l}
zbQOz{lVc8QRZvpW-z4dCb%3k*zMu$r@51u4_zeR3NpO_xOB&X%pM+~X3d<rcBjdb%
zN%Z$XarclhBh3}$2P|#Uja#=2Nj_~`0V_z(F5+d>sJc;)U_S~F9C?=n1gz+7{crA9
zaRI_{4xY?BBBrJfv42JOx>qxi5&@}F@sR5*Mcn=S69xIklT!FQ?BgTc1NlQrADHRQ
zM~@z*B9i^`Wp{6HZ^&Uc`78MHv@vhpxG`~zGugIOP@o?Pn#o}Z0x68n9NkDs9fsAC
zDyldm9DNEA)lZ0R9GF&KT?kNImV;vf+gHYE1hV$;*xZsNa5ocs<U4Mp&L@tCs?u?R
z9Kd>7(IgVKi))os53fIlh@;!RdM{!7f$gS%oX*RL9;49`h2VEoCXRc%-bEltXq`~Z
zptd-6ci0y6@#C`Y=tvT|r{M8#l>f|dvTg!j(nsfv5Cg_$>F^~!hSkpBr4YM+imPy&
zAc2&ypNe6rW2o%1PyD7P3Myip^&ANLkSE8lo`$nO02ZvPuWv33S5mTg!eBV9XsjLu
z1&6e1PLeCNr`1`qwFr@%iLbS}Aa#5D84x+VAYbbUw~l)A7UHE!*Bsef%Dr%j`qC)`
ziFd0}XErL2dK`v@I27LBkQTvT5urgHp>@$8K0*Zh*m1dTp+u6j)y;;y%Q0|qHSJP;
z)H9z6zJ(XOk(k&sZ8_a=d&fC=4riger!A*CFRm6dZ$?mgF)xCx6iPQq9u)#^ID{gc
zuXA%HWCJtkx{~S>am<0#NxGXd=4idIzs`ecoHlLapb?(!si6{BcQrAw<Gcy1jFkh5
znpFWSD@X2gdW$I6YWY6chofGHfeh%B+r^^-s1M}-TKoQ>gdM>SJRkD8bfX*@!MD0}
z$q7qiPlm%1vi2^$>G%|EGjB<CkP;KGwy+or=5X_!!LIlZIYT~&(wx05H<~!o3qY#)
zmO-MuM}2?JJJWiPg7ATW%;y?=CL_spG2yYxY(%uQsch;v$2f|Zjt##3_BKt!G$Qm8
z^ReRx#j-hwE@rVO4ZwEQIWq)5UYQ?&BV-kOo_q86nBW*sPtQ=sDSpW6@HqraR_M@K
ziL?YMmMxE35q)u^fDGL(+@WwTzXrtWHlv2a$b5E-j(RB1k!B@Dg@wrr69)xfZ@<(W
z`*7ymL&}v9E_A`!Z+jjlFcuEf?qvn)zH{%s*;b+`5B!ADz*hAe;(Y-O-9K$<(er(@
z_ly%>>hHOVaz-+(WWrM&HI2M?kSRVut+CQd`h1>8z#?$~5YMx7k|)9}bf#1&XnC)8
z%1{mXVxW+X;iN?7?b}C9GDUm@+a<Eo*ELSp!YXHQVllZyF(?AX$WI5;n0rn{oOUsf
z@LZ$9j+X-yrU=6YqNeW{TV-e~W1LIp(Yp_M!=X!X2L0APN#di2G1C>FT^=-D3V#l|
zwkD8<i*P(*WrmmOU3PuQr+ugg7<Cw!1K#rEu$9k`UvF;%l-tmvp7MSH_<CBmqvs@b
zH$F;omj7qt!2q69Jr|7rf?y~Q{Mo_ztkKGsub(|TNJNzTR||(t!NEBmiD%t&HKIn@
z|KXC$o-etrH0ZM_g#QKqsZV7H4(C5~<`iFUJjCdzFoea;jS&W)&>~X=*OLO9Mz)vm
zu&#Ywv5UNAFxiL=Z9_%8)w4S#8lfQ4AnL3o7mkPLxZ5jFFTlw4m!$01?U){H0ck68
z-|Kb{I@UPx*TfFtT)L(WqR|3Q!?3>dAB-o6978eh$OrLGc!Q{YeMQ5NLpX=SmEbHV
zOVSct#@6BvwG>MWv1-4y_xu$^YO#aa$dUgDqqk|(>Dr6=KtqbJudhl=uSZ$r0F<wa
zDw^^ppAhdLwl<$lvDrQ9eIZA30>nKy<J&K>{nFq*2=m@I)ycylA4ovZKggilu5YVx
zs@G3s(ga(|-AvT3Lb&QQUjxw0FrG^|Ir@wbfj6P6P^WVcW`!z_&;fOrgJjp2`Vb~(
z+93qA?^99m5E(6?354Dv=iKS;v3<hX4I-qlqJzl&#ogZOq~JX(^7P%ckHPYL;Ng*|
zCHw;<p;CjaC#`E`AjB0dtMv}k<pYJly?5_W4UEJA%ZpRTi9wmLV0ew)6UqL29+BqI
z&BJ`?(3rkgZ#nMX--@dE{!>YOtW-7I4PG{0g9_1443@MfPIjO0HC@Ta?!TnyAzV0>
z9`KFum|#Q+9r%4;8cOy)^!~hDtx2D|?dsIt{j><C67-%7klnXf#tUjvs9h|_$2i?P
z6?*&<XMHY5&+GZVRlW>dRXe4h2ol5k&p-d1c(C#dET%-d5&}uc4}>i~0{yk{Pi3WE
z!)ZDSqV-rfLP^<fiNt;G*V`=?wz=h4F#`y^<T0OOi%O_;rr2TaDdTDvNEm@35#Slh
z&GN$d{;+R0R{+m6qoAD745~g)&`iU#KS*X;@bD+a_D3d-CKGD=d#6e61PvP#)-}9)
zb?W8QW-$^H1Ve4wWqAu9QabrTQp}=1PPxfPHT~g-sV;rU&If>E(fJTYrAlqh&<LS5
zXzg4><&BhCb@5j8Hg8gifGuvvumY4;bOx~elu`mNUWdXUi+L{R8)eKtm_ez@z>qW5
zh=v+=2lOUbf1CH!^ZH{Yv(&y)&1UGjn6zyKY9qX7*ow@;`5BITSO|$T!I|lqj78A6
zxly5VEjiL#v2BYu)~Uj^N6~M&&5(|UH2Clu8pmvtKw2q`yf@B+=0*)+sP=O>0?zOj
zB(PBB0%P3)@w_>Ad9@MPYbuo62vYqDG?gX+kan@~mb>Slec4a0LgYeKV1=UsOi~4b
zQVU`yRADvQk}%J}{%2NXU5e>DNUlyiQq7U3O9>(++}9l~_a=1>!bx=!3SmW<!tHVQ
z=3gDSsA<JofKm4bT1N-d5>O8O`4ka3epx!CGCP6LSdqbGyBIN|VxK7AkkS(Y`#1>>
z<AmHZmgOh4kh<V(EU3Dc(~~dNhL&inArc6uMvpJOAvZ8?0t`+v{R0$u*&aeAj4pe1
zcEOCeaYofg(^e1C(;Yg|iwYO&^RE`NqC1Je^TocaAfXU-m{SSCpZMfClc~XwN1~u8
z_NR@~F~!SA11z2gFK^MOfk6ND{T0jjqWrYc><nn|wLBz@b#Qpt{jj>`*Rs{qFdRB+
z{N>XIPZY~hLekD7Ob)#1c+QVzRobTZdPzRy;9c&a)({;60CXYFuC%O{Cx(>gj)vB~
zO;!GC@x^3Vfzk@r>u+$w!(h$oBszZ#iOwe4B_4jCbKO!~ew8b>b(6J*IIU8X3{)O;
z2|Y)YGLuCj`v3&=s917@?-1Mz9U!Ei>^_13lhKd;8AiE#$&+utTnJ<F%mJ1)fvCUE
zyL;?sU<C2MEp_O8z!8oG(w3)9KLwC?!OU%5e{HFvt0+UaT4EFk=ctHaT4@Ow+E+@h
z<u4a_Z&6W#o`W0`<p`g=?47fAyEZq1S9Fy4=T-&ir}!Q^cFdC0q{ix#_L%dy`R%4n
zpHtJikQ-6+k$=k^yHMeqsWqUU+)`(P4!M_g&&V-gum%MOn`sc-?<U<J>?O-$>3$$5
z2FND)-j*?c9KNCwAsV{+;qsD6)bGb?QgT0^T%7Z^@9xsi*B^CR`|IX^-@E!l_XDOQ
zttPfu>g+ht)G0jZ)bRM0GY+Q4|Fg(s&4!?mLAJx3ABEaY?cf{|pmO^5MC=9sM;%r~
zT~w>wetD9xI<4?+Y?iBIj(ytliwEu*Z0WWqzwkykJA7f>YDMw<nUa7|$^ZKIzej4A
z&@KVsjFNut)w%V%p5$<>g~7GYgjl+nUnwIsZQHezKB_iZLIvo2u$E7kVSJ9zI5ARc
zjbQZkD;|&MSfr!mzttL80hpChY_M3F-7GxR*0fxFdcW`0u<rqMD_+Nz;**o;<kC^2
zsTPKPo#EC=8S0Qku}EM9O%W{}qd|oXjlWl(F8KWZ`Vg>ET{Xzb3O`3Tt;_9rbOHr1
z#>wcYF-+FrLfUUNd$=U)yFrDw2=M?kg+iHrq2BN>fN9Jt@T-i-XDzPOQgX3l>s$8z
zyJe!kaq<;IWz$6|6G&54;L_j}<RE|Buwg^zifFy#!qAeP1Qye${=jvjlo3co-6m^Q
z`m7V<507=WSgE}xxw|F)T+^6PYZgf-j{J}+FJ5#@I>ZG}a~6<+AxG&myn)Opr;3>n
zk^EIh$JLCcqj7Aq`{R!gz$I4Sq9r&fo1*Z9gHU$Y1mZ}YZ~&dV)K;DjCmR$~zJLFI
z=?){Jq2pus36IjROsXi3igNa}G<wO?lE)@btCgFps}=KJRT<s-2!zP~==iym*mp`(
z7utCo#iOz2@@jk<R#R3qp;G70wUnnbhOcRAg-KJXVfJBf%ue62x$>>{o&4j4U0x@D
z5D32=Un!f?;O(SG9{f)e#qACuT`_GG>gaJsLl|qzU=qKI@|;gSv+{+1L#fNSqEdX$
z9l^6)^H@}}Q-5rDs$cT&RUfRhUxoa8vz`w2P&*2Wic)~GrXL@F3*0sK0Qy$8bPxZv
za^(#e?_e%&NP`S$uG~G(@0@@0cY_b?J(XIBd9vl*v}mBxULP5FT)<eMB9`<ZBv8M0
z-zu9{fy0LJzBQ{?TSd8LHzjP%f8E7=Z<FeMJCRzm6(bhdAV1mAy}9IH#2eCIplSO3
z_xms}ei-|LRnq68o5|Sw--*oUu(joNg5ua<?Zcqg%E_%Y`zR+bFAUDUsRojzI}-qQ
zrCHZl(mM=-T9;4ayn{uZ+|yyn<GNWNI9)srpN~`Us~<4KI_lAbihV`Ul`E;*G7%|d
z=Fslsx!qk$bkjNwpx~ruyBg<Ue!C$n9z-5NU0}6ZQQmbb&KaJU_`Ndp-CwI#NuMF*
zBuyZP)0Bj#ud%l9uAEkv7ha-&ZI||Xti2s9tFLzVH&pz39id5jCP&2Q{K)fBE1;J)
z*`oNAm(jHA&b$`2HN3jwhf81YtZ<3S{h$|+fs8|~gG`^T<XZm{NrbfY^jZ7aJC(1y
zhp_`Aqt8UWf`G)T`3%zK%(w;=7}r)}FR&Y&2x>GcU^gj>Q#+|Vu`!GEhIDXWxRA5A
zC2kJ4xEK2WH4kFd_<Y@J%eSc844W(Q@#`I1)>G~V4*k7$*YU^p2ELegq>l0_5h$fE
zOuuR3u*Za0Ub><!ASu%wZzR@R+f;9=oAyqZM;cZyZ02fFFjZS~QKKws5A<W7Q8}Fc
zzArP5PiECBwYbr92HfTy{$D9CBxH4GkcnC-vXbfUstgrgn5EC^Y8$dCPQ9reFickl
zRjqWSe0S3&o<Oh9q$dCR*P9&|>nQIIal!2QZNBH1cV6?wSNdeg(^J)}cbZ<&(`Ka)
zMwYf^b?3i&5GAAmRp_s)fABM2cu<@JuO&t=3~DLuS6-dESw3eli&gd7B3&0S9r)q+
z&Ad*$8f)p|a!eBO)>#<vaO71wcdjsgqx~qx1C?HRPrfu|oAZUTR|BS9xOnmNw2|Kq
z9L(vI@^jeO!(jxcgwgk2jEq&P7f$<??RD`(gaUWKJymLI^y2_OM-ew_Ujx%g^#l1U
zf;UsLoHU@X_znMVSYsQ?`!au*s=k&dO)gtZ&y}1??Q1CAgnq6NRn+P?9h+7gO8Lv!
z6z-=lkeyod=bwK{h)NJud7=~rHO@eqp+bmQ!zeMdHgegUzIjuvE+@Z)OWH4p-6f@J
zx-za_xiV5Cb8Uc?DlHx{plYlB1Z#kanb(Ile_nlf<X1Zm6M&OOsGjOD1Yjg~tu!2p
za;)e7Tx(<^wzep~|D%x_|9}8ySPX2YKmMPm#IxGhWJmv7+Ol@Ve-8W>wG8Qe%%gAr
zHf1OGoP>5;b*+A3NT)Ro77`MZ%zoIsXCZy-QlgM}5;hKsRo&sk$U}CbT}bHX#6j%m
zr!k9pCIv`tyFi#gy+^H)c2j(JV%0@>UYFAYFe%(lHnHi*k(O9h;bMN*ve!`R&*pxm
zATX&^bt7OC1q@m`mV8niA_!l&w4;MFZ0ue}ws?wqFQZkTQ_`v5xbb%AV92rsVX8wx
zD7@ZYZPB6BSP*wo5P=YX_+dY-C~|{BaihsopxssXPJ)W;lHOVNRX*>d0&5LPV+<mK
z(wi))1A2=cjJ=_g3E7sAo<WahT`l+eAggcynur?E%^IP;6CM{)S(JQ;+ymxUuzT6N
zdB#RYu~5Ybat{fYndBhcKFV#Qe>=~$SDhm4;v0QQ_TKc8&!f*o3?J0*{pqeg<`NLU
zK3BNmh{JimTz~!|E-`U5h>U6SV<$peX&5}Q>>9pG`X&Kf^uoKx;2UNG4-*Gi_@%LF
z$6Q!es=vpBPUTb-HshT!8OfYU1OLxF({6@_hBwP<Z)~uqgR&_C6)5=vO%c$wGeL!$
zySrhfnK!<knmPt9r*QCDuGx$HmrVn~W%u8$lOtpj6S{4dA59Cf8pfEHIG0gWGzGVM
zP~ydAqN%W>S^yI!Sa*2;0+1fh9^aDQG67&p;+N9;=}rNt;cJ<BBN%}sda$ooE@Y&p
z>>_rQsRkgaC$y1C%ymhgOG2NF<Lj{FvsHkMzkmz&=~nxvAg`(a{<ewj@u^qK9@5%k
z&Ex4~zr7$)s)QdPDv}m#%{7o?i_TLBG@5?djT<&h;iIGvT0e))JQj>8_RYUip&P|s
zzux7t2gSH#!~mp+I~0YzXht5cEUe#DS5&@4EMmgUbVNYJ!a(R5baVMY{H{z~Kv-_`
z7MpA3ZRW*oe2|r$9Rw)>KHuG<#a2mQt$NU?(uQ#&2gZpQBwe(ZbTjqQpOU>sZ?P_&
zDnx`Iz{kfhNu4uc2haA_I|J(HA?$VGNMP5yTJ8V$L5qMeI>n>|nec2Y{c|FKrWW22
zjobS<csb8YpxhU%lB!x=e`rQ_2p-q141+&EqERaK<gC%uyljahk|CMV%n}0jb?NzD
z)W$*xU`FD0QxLKWmD($=lMmP|wJr)DxtI|vFq?H{7RYh3kK-dAKwPmVI$T#fVb*TX
z5kx|FKt7<G=tzIWXWG!$z0x2Rj{5maWH9&59pX%>O`r^poK*Y-pcIZOcuO(un5~C6
zo9cxv|L5lrJ*oLo`)xIRA?78GX>snzdrzW3fqdW{coDz=KdL2`mFfeMxpQ+G(~1NY
zM;aJJlR!R}ny8TA4rChA*+Qxvk$Wr_6Y7NHWlh6S@)H>rna3Wztn`H3CvlJ9@o>8F
z85(&;pikY1R_>>7?ti;FN59+MB=pCl13+Q9lV<JOb<ce`^F21p_QdqKvS$o74gm(n
z{*3jr*>N)ON-O9pR7~`c$JE%OSjkSkd3W;Y?sWU`ugUC4ZS%7U=U`<<|9a4A>A7i{
zB{=|uqzNk$am8F3jlelYcsA+Nop7X9`~HK6C><9{&F<a1e$fsx8VxkESJjE$XyG4q
z6ZR6S-~Cp6J&~9h_t-Nz;g3+DY&n;b!b|rkQUm5W&&GC1ND&~Nb8xTaxG-C-e##J@
zV%al?NSB`l7RHCFLSbihwGR9bImjtw>Wl{r2;SL&89kt;VmLK{)Sv_?ZhWh-m+&8X
zwr!S<r6=YeF#PPiJg-O0AbTB#4W+;QKY>eEX?+D$ft?SB6c1TNtvVlzFVrG{b2zVV
zk3N#qNwJBxB8~g&MA?X&{~UI$ygn(Mw#ddZCY|rGk2urj_u6$|?OKYBk<LBB2!nz{
z_E81DdzXkZBvf5>bXr^z1{)ij1Ss-)*V4y-*A+|vFLbSBuxG(j^Vi+ubYIaElfq?;
zLEE=s4qhmPBS&3li)#6&(j2J(wO{gjT|x3{gT*y=g%UdqIbx@|E6oJs>Z70OD<uoe
zcDj3#ixZenzxqZx(!tQGxr?baHF!A#&3f36*Pu6Wg!1x`%TcdU1MquGNonqllMH4g
z?y;Z{WJ~|=TsxwUf|c;cL=!_d6T!v)m4`cX-+2Az@~15w&n}98HZ>iLR0_0Egd6K;
z8k$k^LTn)5@lqn^JGA@FT!)}r2<ZQF$Q02>eQcu7EKY_{Ic*tItZj!Z`^PCl^b)S^
z9<%S4zYH;Q4I3Eggt|QbTN-ZH)s6$56itwRLtaz%Ai2CviQ9nN6+(I_dBnB%tSyfD
zrr~FX2Cc7c!7_~6*+=JcZ3}hS_~ZL|vO78_QlFfc;7ywssoc!*F?tzgvm^~}lZ6j%
zIsEK|oxP^~a<Hy{()nMW!uM#M1tadEuKQoh4^-y~N^D)qtI`J88@<lsG_M!M^aZ4{
z!79yNagBy-(WjVZj}(ibOTRnc*w#J#%tNo6Ein~#Qp#Mgn<ige_kwjT^vCa|p;O-=
zsOi#oM8Ghe;@uFji5(zt4~=h=lFlbTC9rU+4ACYhvZ;0-v~<C-O*o>isC($>jdGpl
zki<7PY|zK%9NqVGu7R^zaZv^Vd%T8>(4cGdX?CsTbjmG$b$-UHhBE$Tpvx%Yq$GF!
z3`UB8*32t3s7Fn8Mp`Pjeo(K#{Cv*Vdz@92<@Ba~q|_XW`f1~HQZ38Q&YrO@=hn^x
z?2L)~yIou3V^+ll(AU?$`KPJP>(Y$#OZ#6wf4XQ*Q@#CEb#-ofva$LdS@0lobC!39
z$&ISh2*t;q2`o<xvnWU2Kh<NvaDS>g(jL_N6%&I;<wHW4XFm~EnB6Je%bua5Kdk@H
ze`4Jo)<|eH>Fm1;5q$US0n6SW8(dywqYbK|v3KM7FC1183Y~oha)Enkodps<Yr>1u
zOm1NVh_~tY`tZQ{)EQS~mSGR(T$|9YV1jJGEW-XF_Q%WvU%R9q9MxvmouG%KHLpBI
zMqm>UpYXRiF<njP<^7g>I4rQF&M$EIFz5_M^`7rOijUVkL~G#X{aMK-YA;GK9p10V
zt8JwQmrtsbgIcQaFefxWZCYZo`rwG~XMGx44OuCh8YJx*A-iYS9dP3NAJ>VVS-erW
zW9yMte79lNEenfRpdEf&)&UTIiYeo8+kkdT2O0sbvkfgnlP`cv2=x(=P%8*AvvjRf
zMT~xW<fBmr;q(3BCT&tl*2RpgsAl<_k+UuR{WZC@(Zs?A&AR%|Y}>3^vCE}Cn{d+y
zm)xFu{O-9HTccyFH89EN+T{R=BcyuQ@C?Uq&Nbo(a{~)=^Ay0U^~VuKUS4LuGnCNJ
z?=1x-?b|oc2u><`{o9BU_uLmuG%_}hqYtU&BW8(QfAw8q&BW^IA4S1W1_P}1JKSyJ
z=|k~}M#Q0huA~ri?QjvHhFL{zp6G%3N!4rGGI#~%4F)5TX^5(X+WqlYO3XtMwq|X%
zxxda_E#=Cm73p-xi*Me&v#@Cxi4P%dnr>7A`%8#I^q4$cQTFnBv)=b1pJ^Y~z+4Bb
z%s-9rG_AVm=nS$y))0-pA4gfuMwjtvUY=9(Jrsnc>#F6dl`He8nBH=mwCG)Q(&AH+
z!~4A@9!h?SPVc!<8TvtG4Z3e^H(+LRG|p<4aq#3zf6TcCJF#EJQd)MU%}=^OrI8m~
zIhHf|s?4-}amSP1j*}hdmQ8F>@!<Gp%uJjzr0_WoH=TwJ8U7)8XNRJ7MO|LcoMkgG
zf-p*#n4@m}rjf7DOIi8;f{SC&B3I49R<&C6(VmR_h~q^WTpG%bZM+rJK5zv)DhYw4
zj=2tLt8~n^Jt$fGXT{fT+)q$i;V#=X7)R?9Pcws|l;=D(fux#Ye9T6Jsi&u(88+L(
zqv`s!Yj2(i4_3ja`c?XcR2dhR4~SnHy0*_e#zY?^OPaQf;9vjj+=H+6KXlA3xtjF)
zXwahuXX&y*9%)u#xy@Nd0FNr9h8(J&6ZpvHq}}1h{$i<dn>U`#-<uxQ|3#gEFVKVH
z)3$lDW;2=&I(V$uHmeAYy3QshCcllI+UUFQ;@@~6vU2|2J?Cq8A3buEi(c>$_eP@s
zq4;F!QX=GM&;_hm{Pm7s?Ls{)KHz+XrI?!v1H&*w7sKrG$e$MJa?%#ft7gkP%?f-q
zRWJR5<JR<#DRl#O9*?k@Iilnug8}?pKYZNSY5C92Gv{W6pB#f07TMDDrY=nOk+@E{
z($ZK@M*y&RVCjwAhP^uc*L{o)vHX#oE>Q0&x{eXKPaP%>BKMJakl5#~=nBOL3_LlF
zVAk&9it{QFoS^mg4%5lkl`uLp6thaE{2cHz{%uYj#m9d;+(ryUn~a812;=}(;Q~*8
z<FG?CI*Sg3o?7a-QYGbbTP~FD4#?bFdpbHm`Y`C`bYaViN4kt(o0y-Rm^i;n;){nv
zmOm_-2LEna;*ItULKf1}h*ef}830p+lC*U0aye^6m#WTdZ1O5_?`GZY4C9^kcAcDX
zy!h8CyqWypB;FZN`E=b|2cav4kbz!XLUK8$zO7BP%V{u9GTXUFrn3!=oj<;RE6KD<
z0B%CYK9!JhY$8F!PiwDEU57v2^Q_$Wl|O<kZZ3Nt=rM7yM~TPWThE<IW3KrXEfp9!
z-D~EDgk5Ms-E?Cr!n*j9(HGBbjh+_n?)|AKz@#vmQMe+Ib1f3<-qdCAzV}D9Y(nix
z*wlb{J9!;Lf)*r4`_3PKdI_D$N$Yw}ik@D0?*(&2{vq&K|CV~IwxU_m@20dQ<T_ku
zB{r2v-s!mP?O!VFofT!>Q@JhPR2;DOW4WGUSVyh1$T#xw7Mehk*mBY`kT%@y`59xc
zEQ_OK6F|K8H3G|%QLzPtwsY1PnK8Zs^E*B272$5b9K<uG`n`pSvwf!j;Lm&-mEh@x
z<?)NyGlo=K+U)ZEYUY23FuDDwBriys`HZ?nzaa;3S&fSus!co#0<nda=1@PUG586~
zdCe%O>=`tLX02L<qBt8uY}{{12SwTO6I3%i>NsLO)f`4c!3JF&wNHC{F-`ViKdyOD
zuwigor`*rcZotx?E+nj7Vo0?^u$1-3Q=R0v&9#F9blWhCP~+aI9&}?r`{g#}$lz9t
z9q%7+wEPndYk)R)4lXrIqVS8OGR96Oyf1raMA<l~Rq$&=zwc?h0nVSedGZhc6<1{D
z=kJ<%&MCDk4%|KR)Nm>2V!hWP&^v2a8VT>+NMa1<ZG{*u=>ch#MJ$j9QpC{GXw@8g
zs=+HxmBMa~Y0Hp=-qDLM7ToC8dcdMXQNLH-=K+E?>IH_^gxW_}htN9JbJh;ryfEqY
zgrJX}vvkS_PHa?nYap>wIY9b^1X5Ewzh7=6E;kn3D4NjWBL&dodaG1214tu9A2@#L
z=XwN3o;@4f3I{VY>KRu_?v!wNWUp^)V3|XEyua!AX+P(vWfq%<%r5_TXFz^nm?7yO
z*CC9d4XS0hr)0b*?OT9ZF(v4tCJ+cbgni#{&;oYJwTM)wG1x;7Qr>2|mP3I`ju}I|
z?j?g>B1BeH5G=_Z8cnI#z8lwr1Ce0PSoh~2e~411G=THpB_30LO<)Le5Yjhu*3l&I
zPL5RX2)WK?TCRKJfi4g-PcJl>74Nq-+aJ3TlxqZ`DyCshP~s3;U%`^P-7LdqPavSy
zs07sU?Qt}Bt#$wT4|Zhxy=WTx_lp0D7c>Ov4uEBSt_h&a6EN^^)22-~KppBVGxJ+X
zr7>x)Qug6P9%0wc1jgQ-#^wd$l!Rcgv(MFMPu2^@2dVU$QFha8lD4qvahmiSC}Olf
zN=D3>W@`<_AW?%7c)k*P%v2|XZIF$&$>N($_cxd|pRxc&7r*)~dg!aAs}A<buH4;d
zCHYd*lTLHB22c@DXKG>Ln&#cZ_`Sg{@x+ZHi#E+(1SMXiCm;NXlNaQdIB!>OIeNY#
z&-hPlVadhM8P7f%hticUIZf|}XBO-MK;D%5k*i!@_VUcepIWy!_^%W2Bs*K^nOWPv
zXSeGjWEW4kNd7LmabyfMWOU&VWhBxEm^vql8!~oBXoOS-H>t>8N0CX$V7_9hlw!8!
z_fU`S%bXpWPlic{w);^<L_!nq6k0F-WWG6i7<H`JrRZb|^g}0unMQ~~9t@@avT(NN
zmbgkQV7#+b`{?a8H!YeTf-ZSw<gPARGmB9kdXP6VB3*~Ed*I6W(HVJ=|K*VZY61yB
z=8(^F^iGvdAk|5(*J0USDI^KFDWor+FJz8fp7{r;=F+^UEs9a^*b4xE^U{xnea;k>
zGHW}Sc*cpE1>RCaSE)4m0Llr0(2|(N0}LC`+Y)Lw@N`i7cN<@ACVtH&jYUvl%iDW6
zVncMug)$|*^{fw=i6Mh+xnnD;83(5ie|2KKO-aBPWbW8cv?ttV_4z;@5T@HNjE0IY
zlv*!!4OOx+<K6v?T=xq_k+goMQ4R%i!It_g>xqQB@D8O*d|L7V0|L!7gj@{<-G2Ww
z_=s3XDr;sMj$bHuF{5m-&(8NBl{bN{Am7s`(L*h8EAaoG+|LiSM>$jdnDQXK2l70D
zCk+QAG?{@d$a@fVviB=yEXu&4U3VC87fJ_OzxY8OdA=h1$UV!<$_WV_dQJ8jT=v31
zUD7DR9?o}s;$2|<Q5nBVhA3JlNEb@&eOsY)^8o(+c+>MV*zO^r!}$34=;vI(sB*tk
z{VX&)bD`b3CgKdBgUygVS%m0};9<!gob5E+^ce1OSAk?+zy!B}z)tv2WbmhGVo{82
zbpb$PUIw1x_YzUK7wrPcyfo5ll)doL($|AvN3ipt#idf^0#bij?+6^XL!#9uO-Zd@
zl?apr*qw?uc$$^9Wkz!~Kv^WyXJ#hb-@S%UCb5F2cpL0--h@7v^zyZhycL$Q{cwV6
zNt&ygXFt8qYMc?Z@HhOu4fwW*l8I`9bjEQ=I^?wy4O<w7*q|WFc+!L;VEr${hSA0o
zhz$?JGV!PDy`ai=8op*=%0k?5AR1Y-!S^-K|3<pm;S5eiTAMUQDK?{}Pp>Irmh!SW
z4<I0w9+*)&Y79<4<J(&fP}hJVK5pnxAKXGPu4B^06_e1Yf_XfS-aWR$PeP}s?fJ>D
zA=fWm`WYx`ERIX?H@u7)GbzSl)ntwqPc{^=)+r-8g2*AhL<DGy=`@i34~baiERs!k
zgE1(u5}!_#)t%jML8Q6mz=5n`-i?i=o#q})$!$F*g;9GDOHJ#_mbGiwvgn<OI+2Y3
zg-CbX$&)A3T{kzVQPg_TJRk)Y!JmnZmC=F;NM3IA*c++5)@;nUQ0*$jw&6FP`WFQl
z(ZU6R<LT~IxmR*G-LHsNv~S@b106)m2hx*OOgh0w{i<{$=Bxjyi<REOBytNq)Fl=x
z_wej0OPmx1<9@A9=8r9fWbi_2f<t#7=K+<&5#FZ4V@PmX@a9bwZtB^j%v|w#Frvn9
zPQ;^WR9MelRB$Jcrt3#XErFo%x60FV+@)}V-q%eL_O28`s%~?s^}C(0h0QH$E5}-3
zHZe!r|F>vsn%fyI&zTiPWvXR7N7e1EjfP&>g8m3dl^N9)953pdAcMQBK6#h*ioq?v
z|9*=0*O*n`|H#RciBEmoRK^3`WBYm($JL+5ZQ9~CwU46gR8M-pN2=$gGE66eR7ss=
zkT~!Fn;>@T@g0@yjB|R~Ie)V8K*v9N&+gcUYG4YJ(i=W%KvNdE@Vd8@y&(I&i`-u;
z3T2q2$b^K!O{x<ppNy_#*{;i<-D(8~KzRYXU~S5tpOne3MJth>bPLVXCr|cMbzD8w
zL;IZDcjOy6zv#mDGiKaId&aXq{69Wb@a-dH?Va(CO_KTGOEgTZ&)0r*;t~6({S*#w
zI7OIhq*P2e@#TKdm$TzYYz<r*P}{iZyJM3$@>B|^E^3&0yg|c;hO%tGR;Njm&sZ;5
z6WHTMEv2IYn=Ez#qdZcWhe{Y^rYXYwNdQ+x4tgEVbadB%Xx(~>&#f9k&~78h{eM8D
z(osYCskkeuP{TF0dxzE{9C+HmyX^iWZN*Hhb_zhbrAaMWjmaJ0S<JGB=>1yD;`kF?
z15x|bN<kAM-aWUi)0dQP@)-`}K_Z!v8d0!ClVfKabmoJF+e^~3Nt^zI1NS3M8|X(K
zB1C_UiiiU9YL1nwqD&071?%VUx2vRTLvgP+ztOSh0P0Fi^60MiDlbmM_DwIetBz%S
zCOKmQUC-0~FAusPn%{4eMlRB~_qWQxXv$RK0QN#i5C*h#E>}hycenfQY0_CoM>wsW
zgN0!vYjot~*~FSB)fs5o@CZ#S^R&#XnGtry6tYb<m^8}KQX*fn1k5HcDQK;0-cm+(
zA`jTHvT8rs+xoQunEOs4C2c1Vz5(o@U3)1wPt4b=K?8Y-Cm%<hQAZ63GY0#sWzO9u
ztZ@hIP+M1`tJ?1)u)V;<G=2343G%6@SMdwVq`F<f)v8PxE4W(1RlsGb)L6=~n3$6h
zyK6kX5+?giwP&*3RV@=L4{7KAM{>v3i`gy0%k+|NRxwm;q)0<*StZ(d#i!h~yzp<V
z^UfCB+$`A0X^|^-EbqKF)hXiH^p4sS#Xo$mcSl77mTZc+%*@Qa5WGUb24`M}L-svQ
z4dbYpPLHt5rZb*IDVBL3FUBjI=E)+k8i+hWrKR%%RiSy6^r;sU;m}C51`H!6lRBOp
zbuGc|3Zu0Sf=H<~yO+K)mi^a!t<HcmJBHRcSS5@K?n|$&+8Pv0#=psyFVN78q?^4=
zxPm)sTPTZi$)M$-TE!UC@la{;V{1{#Y>m#nN|gBMOc3Cq<I!Vwhb1cO{fYj96BA?f
zR$F}G`*qy&pY`j<08&aDD=-HeK7_1VZhyyIqEit$P=(@k$+1-=a4#&lC~~@BVj44z
zu-e&wA+T&Zqvy@5qc<jY94!GJQt>x2al%)MNrH41Go9h8Cz*WuqgEu%og{UY-jl#;
z_E>BO?!1_OG<Edi*3UpEs20ki`s@^X2DGSFS%_<Bw0F;l34{7Zhzm)mU3K?A!?l$h
zSB2_`T7u23J9f<0egHAa`bR%+3M3RAY}>*@U4uXMv<>K5GOa7<dD8+0>M)B<?6Y(5
zhvz?0{4>)4Lmr@;CmA$$({hfWjDOWp0|rRNS3tnZ=U%Xfs4AS0M-xa^Lk%!q@LL0+
zx<@a}*`NxgE=FvaJ?zoVsS$REjhj{ICbDQTb2-0bpi77vWW9RzMCg-XLKZEv_e6j=
zE=+W+n5mM>my5nUe>MFk*v%MHFG>QY8hU@61#+x39#0)*hf;$^o2w(EyTp+3$ogwP
zvaqnwU`6wZ<TA1Yflrs+y+5i3jW1U4>it?F6Va{{nnARC+yhHaOCEW39-@KP35K7@
zz4S7_n!H#tCVDRp_?)D0{DHv=GpknQ3=U-uAc;mamxNsorYS6nOjij@KT-8=okb=+
z$h20lYIXIA-}HJJB4!fDt8&4Y@~EZ|M+nVnNn7%r15l`SNhh>ab&AfraLFfL6K$ta
z08(WP2$@cPRZVp3__+*NLBxvGQfn%rdXDI20t@AH{V|VQ64tkm%4q}{ij4KOpB|-6
z9cAqGg=^oj1A{hg*&`&BdTP-M5aSpX9qmWI#-@n`Ek*a7kIvzgYXU(CW)nSXscCUa
z$E6sOAaW12M&e#VSYc0A`jX5zSI9%qP{xr(YE+cmtVNiR%q;4XqC|D3QAyY-Y*>|A
zD#Db3aEibGcD}d<fz@YO=*Z2T?2b?UYfX!uJ=D@V%k%>6uBTUHKEtikn2`W=mGIOK
z3Pqq#6Q2xYWHm`sX0G!_ue9>$EV;FN#fLexuZHt^nm`B=`LDe{pCwToMVCzMcO;0?
zzi1Vc3vmN$`SjQSd14EQjYm<+Ht2VkVeCgRPTgRzJ*BP8AHDG)uHK38Tich$&b9yu
z>u20bnV!g=6iq4+4^z$H_nCDBMyTAv19#$u5AeBh&}}q=4CT^OmKkHI#LoKr@?}ZV
zg^#SUWe)9O$BPc+rg6t-)rc+%r1esi^o6phnu8_sdKr->+%B2U=Jq#tDZ4_5MH;5{
z{V2zdGBB3R=Y^CgWxNCttJ)eUvY6$#Uv**WqW%v!fps_22v-G->Fsrf)#|uMrk{%j
zc{?MQp&Hv$v#o^OC0$@{MaI>kxX-eqfNwIxfwGN`n)F0NW%c;?b(CYHS2gZ1md%D%
z!U}aszdQZOt|{WDvyz*Z(}E}(Wjbm}!;uh|D5=tw`|#(^{9af4$Ya_r7)fiWuuWtB
zUd(~JFml9<!80GGWdu{A9W<^{{h;dd@WAgKJhPh#o<j8GtICBIC9o*O*#+_umq80w
zvb{xK-c$o^P?89+mRRkDQTI$E^U+b0|9=2d5z1KAg*9cF)7Wpa8fpmaIY=(lV-F<A
z=6FHeBao#~4oKlgYQ*r+3mrl*8dYFk9Yw`7Ie=v~Ve8^^m>z(R4?WGKOatiA-s#^+
znY@;;E|>34gEWc;L>IMJZqW8qP>RPKb;x+?&hpVtfriSl-k*4u;Vm4O-L9mocKiw7
zkCa3i4c^_=$jzlda}i!EhSg8|(GsM=!uUW@g~O$*mOd~pM+dbd`=vdqam~!tPlyvJ
ze2x;()nOu}8^JTk1eiVY@QORpFS^ItX$11;d~NHaoWfPq3fi>GSMCPE70O-T|6#8+
zNx!XYxer)HY@58McMh`IMX&kWdt}_wjz-wls_xGa!*yu5!qaOFljJHw&cvz(izf#S
zjQX%l8}j6RAq*=8AxOavmmKNS$HLKBeJMIE|BxFjE$e^Q$(d5yCtdWGU?xgNJSV#I
zxDd;qQTiH3PlCC5J*9C^XjRgdfNAcM>oD#yv1}Z15zkXzG0kb}NMd@?j)M(yaMR&{
zexbZY4D!J`%19A16IE=2-F!N6uH`fH^){>}X=EO}p=FaA+~d)wU{46ePxM0a<~89$
zMKG3aZUdraZvKrj`XI!9C-%7?nnz?eQ_sC3hQkfyAy>bh68_YwzbH!#rI{8ktd3d&
zJqk-=GOG6Hi%8GHgDV@G<mdcc#Rcea1fLk_d7i;wUVxJ#26F?|$v}7I%~P_kJFs&E
zTL$5~yd)z%eF8Bffo+!UvB{UNH(OQh#}u^*)qNT4z>_}cd7dyy$T|`}0i8-Iva`-t
zJ|ACGqY!?UB!1A25H0vIzKnj!e>nJ|JZ?YVe<^O)0FY=#?xog}VHW0Ci1ulvLCTyW
zveN=85qa-^=Fy@>3n^%F@T9@U$f&8}b{h(th^R^j4m(*2tZ#SIj3O<@(l?8lUH;&9
zjkO;+59$|_evY1^Nz`zci5_QO26s{0NBzQvNDCZbiGQ$rZJ&&_Gz1XyBF{?{c}Orw
zP=;uX@A8H1+O<nPJ#V5Y7D>AQ?VMU2XGxxhF}9(`LYBf9EXf;BRtyC5X^UL=*V5Q1
z&a6+@w70N}q*Ysn7ciks6fgntNR6ldVu!f|r(E-&|3Ht7lokOzbltDN?rLaQbRmlA
zrudI7B2pu%aFS__n6KkBx=vEqbfWA9h4ShPBjwGNP}@+AsDnbk4p~-O0Gr#Vtt+&A
z61LHR_?uMHc1(+H6%RCa4nyv5OQC_`Gv(w*@*7<&h?-L&ILlleP_cdk8dci|Wr8tt
z2BlY&)A<~9fePN~^vz1gM|#kDMOA^55dwPc1$8PO6?J22{wEMSp-rd6eTmYLwn?NS
zdVJcPxKLWtSx}MOkx5`28dh`Y1skUQ!mt#@H9k53X1UX%z?2%#IQ9tDT?qjJPDE}}
z9w11`B7Nmp8?s&=u%+;Si4!Gf>k8B;tvG2k^aw|T;Cr)*uKZlAr5E-?A_aN1KO@vj
z_<Iag?)Ppdky|{?D2q5im){FHQ!F3gpkKTEpKDx;{HB}8I2EywX3^tfFU(48Z_-Gc
zRA5xLO!?!_^l#-&RWF~9lC4(MD8c#I==*)DY~1eu<*r9%zIl9EgwoKxQRTAftw^fB
z@aE}B`rj+L%R{Hg_nWp1@ZsIdaC%Lu%F&AQCerG^$>~pFg|;L>ltw^JpfI6nBCkTp
zxcr9I&R=`|##fvo!eD=C<z9s4EHND15cA44q=Hqa#<t;ihBjh`%V_PSJnPM!hM$58
z1cCeSN(;anPS%Kg<U5}FKu(=NL2|o_{en`(Xxw#;OEUkTOWH^|ZI}dk+9eVa262gb
zRcnFLtIdC3Ps~q|Ud3TTDTT75mZ2NMyRI~<e0@RfYUeGN<Fm@W<kcji1`EuamWts-
z@w_q^mdkDSKiAXvN0K<I;~yS<{?tWH&XYttuZr=J61-Ks3hMvwcl5$S9=4CN*2Z2E
zXzG~yR^M~Sp8Ki;9{GsagEY5~Af(?S;ATF9=%lh)ebced|FbVnTw>34SYEVQ6*u+x
zY!G6x=aon!pXd4ge=m~HdYLu@;4N(%8rj*RsZe#(@4a7Sct&nt`1g=Ei^F~Q*G*m5
z*ZEq(H%+F6UT?4VGtgVp@^+IM&QE@v`0Kl%(&5|Itu*!h=G$p{*T#Rnadq!4Bc@%u
z*GFfLaX?|&ft%Y;%`TqjdH%>vkBmoUpOy_SDO)<<^~gdm?;*;e(CYsG$*s)&je65;
z?@ng2RVCbk9w6nNDth=Pk2+YrtA|VmqZf@vMo$&0)Q}k5aD^=@3A4^7)ru73*HBjr
z#({W#E-%@l!qp{Ep0H*iJ5^u;lSj#v#>$tOB{D@g=QK~kGmQ#=9?LYLoum4t2gb%}
z-!r~Vbz6GO5B4sX>P7+cZBezEcgm{0_s6$$qq4URuh^F{_iCsIHP1p<=?3xuyHr?m
zYeelneN6K%AGg^?yI+p=t5($VdF@G7o{V|^JD(<s>>blK=uNHYLc24fy5ajxJHi|%
zA#xzTrBVZ)ExguhR@16AIB?@?qnPOIMDI@v+vDJ-UYE<iGX?V<e{k1)yB9JPAjU*w
zLCmmD1+ejT)cF4G?1wL((M`u<Y^X4YqnPBnww;)}@zM}qu>|2)(Ma_FY2&~?HV4gV
zA)(kxC}U1av$kS1olnHpJ-sSzq;l-xuEEQ_1eFjmN;Ly7&a7+MyHlIF$9~>iab}M7
zJK3a5csvkG0R0H(?uwMIk4FrBGXBB4y}~O0JB8X;0Knce_|p~|#+m98TEtyzv%tPG
z#5?rT*J#4LMd>rPw{^Yp{nZV!zK7}%<B!$W3}Q^2pn2wEP6&N=snXq(M-?&|b2?7F
zC%J^Y?$m9zww?Ch<R5FY0j3hTB7apV;FIug+bz1gFE&*Bj`7t)DU@Rl(R89DGbTr+
z2DB>pXISGIiGxb?D*=Z)I@J&F&|<WfH+4G_*-32;Gx7lnA;vF`U0zd(-+QdD7HY`f
z%8@YSxphBxfjU|;@nfgX+A7rKNkLz>Zj4vf)fSHC)B;7xNfnj(M*}wk<f<c)-?jQk
z+-U&Lw`jwlcXw~?9yCDh_GEPzN|=MIc5Srz7ka#p2e{Vx?ZX-mGQ<p-Y1(?EShX^8
znGA$rS+&|Xs}Jz_;BsjAx4!?a?Pf|`ozy4DHD+MAL+PZcdd5d=zQDGV0|ZXXK6bg$
zP1$3rxb{n5Y-URhl>XVOM^UfFnr(sA?oXC&L?F3BGaXfmTQ;KyZLN7a9jj00v4owy
zr;XS!upo9_harR2N%a4JMB6QP=B{~Tt1f<Y&5K_KK5qPM$Fu^t%U#qf3byQdN?*AJ
zKx)-~F0T1l)&bv&U@=94Uo?ThrmYNSnx9UexvR>sEDx-HIwn&pH=WoBCYsts^r7yE
z5N~9aU2)X=($IG7j>r^(y!4^jLKU~N&HNqo!s}Ct)vH>TSv8iWctzo^M^6(5yf)X^
zQn6)#k7c8QZ(^$K=JIO$7Gw8v|Joim$V2yNtx)?VnZKWcuUom7tM&DLBJQH#NbdVV
zr53yhhdYR7486*oJig{TleUg?*eN?X?vL6z3n+<vgc{PL@}2LuRc~bHfHw0Ze+2b2
zJ+U{&^Je!mfh5J1yH06PV_zJ0OCHrfn<{wEb1=5U7v7mLt6A@gwr2g^H9l|s#Q9M=
zgx>QXd(ae}1VUALFnn}sewqs`|7ZRS6ZFnWMSLMdv*uNw7Et5U&Os{~0Flm%$v<0E
z<Lq6B_lSPJzj76Qf@*x?`bC`TK&rEU)t-gGLK0bC)!RB$JE@q~`HL@>O|ls9@{})w
z>D4@-bM{MLMpWJL&1!XhpT0?K>m6+72+PcYIsOaX*osw0SARf_qiY1<Z%TA%;Qi^N
zszH5!^oXWax{#QHY8Ntjy<6I(#o5GUJ(?Pg<_aFTT>kFM7guW@`SiUePt2WZKq!!@
zWw>MZh5@feQYKwfVZ)VedupuFzvT(e3h)@KT_Ylr74xIAwZm;OhIj0$H)YdF03Zv=
z6I#^w2Qj(jkv3^uWi)uqt@_aEh5>9m`=db*-E$XS{C~uqiC>QC+y9%@3^ROZ##po8
z*2<P5WtqitCmE$wT2Qi-t&}Zk7GubI7fC5g32jJdp=>QuAtYogC1p*rv^?+QPR#uN
zf#3alUY_r8FV}Tm=Xorj<8yqD!&z?GdigEiKbxp_TRs|LpDN&KGTtqdd#Lv#0w}}f
z!8hLefqx0~Zj3>-R{n9`$u}kHH7Z<W4uoU>{?H$M^wCB<+tY`WMuKr!j5h0uPRkTX
zF*c3wo%8WjKHC#GVx_k{&Tlk_km(W_-{Q6mh<tt9H{?5(pM88(-YsRkRauzd3)Hd^
zO(J~*Yz+r%81+7kj{K;<tPxp=o|N0^uX`5Lm|?0u$B3)tp!z+z@lxJ?yzGNQO6ZoZ
z>25+>`6{(y(F+>pBEaoFe0jjf#}=7QD3ynvXieFKmBuR9BbP?E9qzTHIg`HL+bW-{
zvL`Gz6G@;kH7_t7jJ<ag%>^UxRe>{ZCdYHzHGnc!hS;L<g9``UbdAe)n+dMri3H{u
zlOTN^k-^lz>FRfWOY8cSdTiqZEJ&Z^+r05IiPO0awG93o=w0*rbUE&`3#}oxPrNiz
z-`?A}vc{3@Z#yO5mWFCPX~bbdga_PEF2yt~6`T>2NY>O)m2Y&>uNSf!seJ~5y%2YJ
z1*X%_*bOp~b~5b>d6k3%+6%ozJ(upU`r147*;}y{SK=8pocFS#Ko{cI0x>P8-_YB&
zm%>28{k2*$sdZI}38rLM&6K(E7IJ6jV(!3szvWe#y(*u^)7-W*gAZLX;j+tdUE|r4
zhpVnliQu71wCZ_lVfu7xQxt&p@SD|cfM=z9HV$XWi3)KK>q5V6i1zF`sy<hDg3=Yt
zJyy7NsPma`2ix9{|FikfeG81Zr(}jf1P=9>^W=EZhHp9=4cR#Cm6h1^b2+tCFt$=n
z7(s41N>v;&qIS0T+q}>c>_-Nu;byW&1K9aa_5h4mqNglg(^dr5q*a8PkKIErG(JTx
zEBxeO4WDywSYz6S?!0__b1;@^vc!L2KP5d!yip$7BbRz`?Fod84NySVrKL)d$@cfB
z*5~A_bF|S#zh1~0Nu6^Lg0U0T6s<wW$QYN((H~lkrzuS=kI52e!u^NAJCtT168;kx
zj|2Kd@E32`7l%)M+1}}H>hCf#P!)#KJ|!)89OP!{rJ{y+fy&~^>)+g*mo!&ejv(S|
zo89+<?pg`4w{;o;-amwe!{~ThCZGDOg;9U7stI6%(m*_Yj0B-myGsw=Yyc3%PO=o4
zNIP#~B2U+#d5N>%M<=NpW2YbDy(3(C9v8X?*s3teddh+1&bJV|@?H0i`GcY8>J3)9
zi5qDetTcm75e)Iw;ND+9s1(e~{Z&@}-0i14xp4%6481vs`PW%Amx`iN#p=e?wEs-w
z+H&lzEK;7(+9-O<VTPC%EYRikOI)so)h8f#CdB4VzsMmWdccOWxcj;M%hVGR_ql8~
zH#G=fP*Ynt)UWvWCLPDBCPi5dpcZCb(N3a8&6VaEH7D8MC7XU}o++U^wBS}b0b940
zU}JK=Z11`W96LH=3O}`*t0+M+Zyc^Au553R=sq$&#R}r!l!{sMo7#WgL4fEDW6}dj
zsCp`ejDeodKoY4w?}L(F9kG5^zOE;#uC0<v@RD6F_(+|WhKtcJaV0YHwQ~$z@k)uM
z@Kd0Bapn7`9wa(Fl~mBph76`j(}dPC^yi%Pk%9SfDJ>dV#aQ|rBDWWj4z+zK$4mhC
zt7P)(48o~%8wAoxoi4kVqi(F8HUYl36zMZFBCJN+cHfM{7N0RV)FxN~oNN8?25rYp
zlO{b*uFM%{yNBIcFl{xYSim#eS=mpAzPj#c&c}^7{;Lk39c;Lmhvs?i4m~h#&`@n)
zYWO>DFz6vBLi)x)Aj6ZBlN%nI`nej22S4>i>z3a3L}oQTILz}!1p=I3UcY&H+cejR
zJcY$*mNdO@CQUK{D4F!lPelCB=H3znEFB)+fJ5FaU99Azp^}EeT;GB?sR=P4Qy`1F
zI?9|W$-R)^&H%^`l07dQhSklZ`r-wqnYE>Z##D>f@LPBAaXm;#E=O-KD`yJ2KI>&u
zQLQVN6J?LV$;|8-8qBe32=`!N4#jJ0zkje+?qY1`Jfo5*lI-I;RPydHQq!ztP!e6x
z_fSPM?@zpM%#P4mv;s_XRT02&UytbZ`Zs1Bsyz7=X^&iyz95`MRI(cY{|p%|$Jomm
zIp1m=Oq7CZ8In&xSwT`AAgb=UW09Ez(+y|Ws#LlTc_`<{^*iCO5KKx;BarI;_wn=8
zY?;j$`4e@#ILw5O#|4$=J?(z|9A>r7Wy8*-=OKamk?yq9f1U2xMhIRGvu=a2dM|i9
ze-hNf1vO;AIty<h_q>D4?j_4gd=sLWee>vJv;30u=NEKyTN-|<p&dspc53{cB54hw
zBR7PmkWyT{2}g${DTt>!GyYC{7Ix~2N_<;4(XJOrLD6jZNNJuH*rL(iY#$?T_fJY0
zJ)r{H?@GWb_!gdX$NZQ8aJ?8&Upb;+Wqoby1p9;+$Lgo_)6$yqXc#)O&C+Ej-#-q|
zC_>Jv(UJvJZ?~}QCHwYlu-SU-)ErNaK`SmeYV-N?=U)&_3*A++3gP>6AzjFyZvC9J
zS#0CM>Gp^kLPI?K?310c-8gcGiJ@9i29;M-V4qfoylo#Is#lW^i+*c9<gSn8-pUtZ
z;)x-r=odKooy!pt^}QuPPvh)XQutU^)TxiICMnOVB94snE<2-7G_E;OkjrI@Z685C
zwzNF@*jmJ$S+3~8Zp9gR^W@*|-kK3$N30$zv%`6%nq6m|uJy;Bo~MSHaxo#_ez^|B
z<E2IG2g~gwGZ6R<!A;`DZQ1%aqZZVV3u{+(yi+s1_FZMG(to;b$k?^I<94xf{^q9E
zX^T%ix<<M%eT;_Bj91jnW}?|U;zli}aH~{*VDK9}+PBB+D<SKo_E|tI4I0x|MkK3Y
z4j^rZ`%N~fuT7(w91feQ`1>3v)Nvg4nJCCX$Q(yLduUZ!1sio$5fP1!#|b*~dnwJ>
zZkYV3`yKfoDBYz+O?9&jaEVFSd;L3&seyJ&A8d&{an>4LgbGp;*dm!<_YB@d!6N$F
zJ2jZmkLcU|&4YkauDo9`bJ8+_4@%F($x>-dZ97TZlmAyxfr5(x9e;ib&e?g%EiP7x
zsyeQFtV$NFH>aawD9!$T{f)Tlr&qVN(U@B0Ld!`^Ldo_TSd}>hZkLgn{6ffTJ{C?q
zQSo!F`YiB-9zYdwVuM5Snc)OQ$jaAt(WOxdDBkO0Q6$V--NDmJu!7^au}C#3_0{#_
zN>R*hHv3vsL01<5pxp~@vFe?=y)q6mTIyE!A|R!uJ!R!Z3H8%cG1D(%SJrAN#hRm3
zx+tLp@8!$|uF5A2H>cY-#nwebBXTq~aC0KPA0`+L09Ud*R3s}A<;QW{>bP;3K4`>r
zRTxI242EsakfL>st|8tR&f`=aDKRX1dV3hE>o@0@!rF6q_||&njx8{Yx-0u0zxxM3
z$?{$00&_v2@KXqujw4$6*A*=~Zx@PM^u163;6(}f5BY^@bwnj8w<;;XB>eU_499-e
z?KD!?-kk2Ma~aro9Ykr6?_BoyUR0<!R@+Us_3r|J>98v`;nL{#4@A56{mT9*qvu=#
zS=4?+;tN$_Btn6Pi|5_DA)y!=?Fyv`p3Xq{ESvbo52UP*Br&gepyC6RDn54TU~`Ss
z?fdAME2CFia4!k435hf1UZITfDwk5)jFj3}S!!b?U44~eG|)i&;0FvX`pVXNBO<8y
zIMcpK6^GTf`%cQrX^V3dbO21i*tlW7ikb3$9!;~vi#6D&QXrLFWzQWi9mgQY`;*)W
z{l`0qe;=SO)wg0`bFLuu(PGvh0tR{D;Q98d;+Z4fS&LDNK6>Loqm_Rwkx%4UC|vFr
ze`?YIBR&YZgYvy(HCdA;4dJ>50;^mI8r+XvktL6D?T@Wp5845<Nu&@F>}E`J|Mrxc
z2mPS<)cFx1Im_YYnT3K9_C4&3nQ8jw*e$oLP8WUyt~mYrT=cO%lw4vL#47e%M&EFY
zJ;cD?AFu6mNU3wnt$$K!cS(<En$v8vVd$fMQr5@lTkWdPSKHS6&=f*_`zH3ntmp(A
z-v6iDK&OmNuCRGVnKTroTvD3S{Qclu^HYgxn0R)?j;#tgNR{Od^@$m4Se#x*A%P9#
zYHS|bQfYL{=(0MDC~r1293laWutiZiri4_C`W>cJ$-RMp+O1V9CtdrNa?e=YQzd@5
zbq*o74zNMdz!k51$%EH8IBD`tHRSjDk~CO|7A`R2V^<kn`<s&}tPDuk^ut2*Q%am(
z%o)ig0I<C^RX(k8uz3USU>Qj&qrc+qOQDQNTGZN5o~nit@=lq=9BK$rMvJNus-X{e
zHl9KcUv;i}(1r;OC$WLNUT>hzS&YlQfOS_XlAa{GfMQSVl&Ypr;^)sz7oVYiO|8<x
zFmr_6sPln0h{MHjwQS&-Pt`m0fy9?ObRH3dSkz8+&sLvK5V!*{-1ywu;h$-$zss`q
zhYa4w<GKW0bdWgRKf6Y$`d0sXXF9mXa`4~dfg0CV3^#7Tr8pHGJlI9f93ZWhL)s6T
zwW+oJ5|MzFm-oMvzRR~E2~rGUXabpuc+Z|nX>MXZx1OM&zEC>kb*<CC;SY;a4&`KG
zCY<7@{m#s8<O0OD%t5WsKn^@9w9VNDlEcBef8X+b+<q>}jzzBpjF{CSds$5E)Ld=T
z*;k)_yy19T3Z@c+{nFe(L8NZ2a!Pu9G%FMX6YO@r_b8;a<y9U_X+izJ_#<^~5GX5D
zZGZTMt#djELOf{JZ)A9O@2|$^Bdz?JnIO;blgzjlI8A~KkslvWTR{yC7Y)M?7--Zg
z0mun#4blJeP~)ph@BZGzO@f;DLCLdk|8?mZ$cnmA|8OGBTIIg{Ny5yINFW0~=r<lR
zFNc5cnSx}NvH-u?8fW(ShMttM87VHWsNX*VRM!@8E=k^}xM6M{Py{eh$-$-VXBNIR
zfIYGE_$TU|-sju_+Y0sP<qz!w2_C5(sgv=0w=gu7B=Xmv1gYP0@NY^T@86u-?h&b7
zHo?NghE?H6XF*Z;@%{eQ`8a&^;NOHG*&OVxXPO&ohoS(llDb7MwR(5KQ}I`$OdG_~
zw0-~mkd8^#EY?<n++aiGsB~!#<SfSilc~N8p9eEOkbHgg&ay56)Cczbb~g&I*%yH=
z>HdGeo3H)s`2oZ4br^nz5yWVTeVUa@dLeiPSc8sqEAW~+Lt?jv*Uo=BGKL(WuWz#-
zhtOIdOlCNy@mt;aR-daC5?0I4OO`E(f*NTg+?z`^BeZLTSX3rST9GYo#``%jpmvoz
zpvO|#F2tC78*j)0ZSJYx4j>XS0-;;Vp!yeB_?fus+4NXL7W1b^XC^VQ?V}p;d}sgp
zE|6v^5*nFAUYt_@F2%j#Q2Jgx(75DM(t;BiZy-uKNobB4J`5sXOd^`itgxhu+aw-E
zvS+M1Jfy;2*5YKkU<aKT{ME-YY8%ks15lOB7CF{83km22HKb6Jm=O=U)vC*P8mSwf
zijPD@bQ8Lm^wm4Bxf3b@X4C`*eH93Kgv##3d$wR2QtOPM;Ly0Uql^JwU3`F?2Ft+Q
z7ZUuwdW*L4wivsly|l~w+Dh%sbZA)^kI`+j3pmJxqc`S6f+&4fz>#c~U{BEqgEeh`
zfuy&@*dxSG&Lfqd6l*_Bc^K;-5f!CPcAbrBeF1ZRAih8A`~0>8rvJ*?hA1+7M+6_X
zZ#jfyFhn<JNH@?=Nvj2w7ervc)uHJsCk&OA+s_(4ozfvZ>7ow44$*qq?*N?vvOi_n
zg+4VsT^{u7iE!u*1MEDlDV`>(QFMU@1VxuXW^<nU<Y`21|GwwQ%_P2B;9N6QVJIRQ
zJn%P{1jhN#Z}l^zh6NM16z*CE0Doe}Ei!3Mq^#eZ=~Tz8Kl~@%2%+qWVgV!PC6?`#
zJX?^6I>c-DvOUIKkm(S%BuT6Cff+9VON2mxD`ll}*|9qSzqAFUn`Q#nK>fo26b3X+
zN}1<e%rLSV35b=4GkXLTZ+D5YuIqQjMr4a~NTB@`)J<(&H@6~X)Wv}czATkg7Dh5k
z$<<O`pX(=u!iIIRimrQFU)K*$ogv`nk2xKP2Y3j}51(r_rR9Kzob6zuBmzEF1&~Xc
zeUT%X8&T&30@yr49?Cvlsf*a7J6$E)V82D=wXT&<NjQG~ounc60@#Aeq4v>aNrVJ=
zFCmySCh{0axs_OX#eQoU<pD7DW1Ru^#GQEK0VUZSdhB;7+ET*%xqA*#lFrQFg`-u)
zeAs6CnmG328}f<}MHwCL`h|;_Q}lCmIL66p$ZfE6Uy&xweqOj^aD+*qN+JNC&zg@V
zeQ^WolN5dw3_l`{Wh`3H09!ZZdso616>vj3AK$x@^8VdTeY#2<nPXIWK|O{{*X#8&
zSJvOR!%dDw&8uT6cEvSd+5S$a`Yn0)vMes6N^n(mJ1Z(^J$QIIZlzINa!lf*1E<R&
z4KCSL4o}*Y<F8Uk@q@rF<zf{lcrU~WWh?*TjEOjWtL-&Q@mcqlU@EjlPE{|RYI7wL
zfPyruGvt~PjLEh2xN7mHAlG}Voo7LJ+2lP;w*7WCj2qO5Ov9uFVg?7x?+LKwBq6F`
z@o_mrwLs!~ag5|O)dO^^mx{frQ2T=~l4db3Yv%r<6o*d%uv#<=y}bPTwAJA}DBm4y
z`Sc$*pn{w5vvrW3i{%h+ND-|Qr3LZl-w|eeE-`28L~nA-nJ{vdttv9!6z&MPrYFJh
z&q;ysMGh`_OvG;g&<3#l1q8gl3!~va4};*91wuM`NHrjl6wFO&xpAkmT*77-@G<$U
z9=;*k)-AKU8blNa25df_GEwUrU0%M+8Co7C5>G@}8G`XhDmSKs$0Q^v{0NwwI8F)V
ztO9Mx$!$vr?dH9O?OiB9rtkqsce7}xep-W0dX~UAP=#BjB_jtf?1ebMpPArm=XH+V
zQi=5bKl{aaY`p^p<c0xLrvS3(iv2*FF~{$(P3BVtMB;0M8-^~t9x#dQ$EWHlC&O$A
zp<@-X|GZBQl1h!!Jca}Kaf1Vf>A}ef>I%!MBFF+dZnZRyW;3oQP&@>VVYXNre&GQ$
z-{lOa2~|i&?UwpUlc<Ze`K?ygzq?a;x>-N%S2twTN=5>kizOyr?*{#UG$>*)WTQo(
zT+iZbVt}3mYiPE`3xa=;JO|0+yB6VfPQAn&wzggHlibRvm=xAdI4uLoNt4lVS+&C=
zl#)s^VY(;^iBS5FX&d2V@L11_B|lqDucJZ|HCR;JV8}ZYISaDtMyB}y+{=_$Q8hoM
z-%!rUWo)BN)!;dMNj6gaQnuzeeZ>Ey7@WhrwPU9f2q?Nq^!bV;rE8A{NAB&4*YBm8
z-*gLjR;F0_Ta_tE-AXJ$SI;h-;(BP5l%ExEBoDe(tyF=%?OY(>ML)L_FjOU<mp2fE
zBp=ijfGbq0@@)2mCyC{j4{y8pq2yVLY1FE5zpU)}V~mK4H?t!?EIlkXJ1<cV7=77d
z6K~-t+74T`NT5Yrf5$+M$gWv$)^qnj{nVmXK7lSH8k%J!yuvV%ER!$Dp}vuLcDa<7
z4e+hH)6f2M+B6E{Ko(7xVmzG@tX%ct8B~^JO^5E1D^k;vh8~GLnD!=HyY<_HDoP@y
zeAjs#Lx{ZjU{LA(;Z6ej@3exW5yH(xu&q{oS}KJ&Afq0XJQ5q@*3}hfzy3{NY?H$~
zovagHyr?gIO$4o_IUkQUPIEG)3Ne6zfY`#DdeCCK_L0wufdEpj4|rD*H^G#CF7Z85
zeDMU#HNvhdRh-5_K!R4gmdB1i3Zui76lYaFEADmg_0B(=+w3Yeg7JR~9*t~|%Q&o6
zF#sg9H+SeCitf-i9fl54%HMv|UO(!$`{Px9rbBy+xex>}GA6MP<sB1De5-_~QjtN_
zPG2bnLp5xs*N+msqM2vT(ARhWtPDu0=`JB59L+$SZ|f&1Mna+`@#t4=q%pPd2;MIY
z{%{%R0tMbs;(!O-T`oA1N<kbVg@l0w9U_b?^%n9ex3k}kX$YHJu*gxdQfCZanAY|U
z*1BowD?~!2#=GhcYH0E*?<8*JHMm=|q@LFfSh%dZMmpzw?>HxVT!(krRZnxQTXBSp
zrTUF~?V*$jD0!xabJrBb0;+w}X}A#>iKKoi1$+Q^cHL3i>IWp6RhBul`;TfGocV{H
zZIkAj+t&V=QfT##BVnwfv9mW0uKedOa-2@}9Cn9dzQJ9e&jvyQ5a!S-=1{ueKzeJ=
zC%}?OOAz0<(Aua4bwQ`+a8T>2Vpq=KdUB9+ki>La!kyClo=d*a7~9Irfp9#jm_M?_
zy<jf;3BWC7R2KeN8<IH*3As?7p}z2@yr26zXunVo(@uTVA6kg8;M(?|I`W<^IGx+M
zA14<_#pT#MT-?yUpZXS%d*@eDN*ZFnxv;Gt^}QntYJN$1U0c$TC{YbVIyl!eI%}JR
z6ZW~=^|S;Z!!oP1d;P)0^d<WZ=ggxTAcovsSWOaH%=P@mMv0>?`95|<{hd&XTRoJ5
zdNE^Z!6A3KNFn6}f+|j{siqerX@M<}D!ghbSWF<cYF3&uWn>74K_x^K31U{>?Vxi2
z1C}*x=8DB^=9nurQdSYvWxpQe>m5{B7lvP30?sgbP8|k%8#^ubN;}?go(kM?GnXYD
zn=R>h=Y`S92TQf@E{$p@@(J`mxPeobFxeuLzCp1LQU&2TUqB&r6Jd@Mpyha+@MW%Y
z)$pAX_D{7)gjeXEV^CVm@AL{blrW6Azf>x|-@PZ9x-?a9$9Zoa2dRo*AV;w5-s`0B
zi{}EV%TLQud3IwE>Z9q4e`6(`rC=u@6u@a-ixRrlT%=NpwVc78$vV}fQ_Y|5mf=!o
zJ*bseE$dsr&JP$`k6XzWJoCDX!0*_Tq|0ps#$ep?I8HO#y~7uj@VDb+MB0)yMPiP3
zc;4tO;2Y%F>T?_3c<+A^lz(SfePI!vT%jb+K5}8HwE<EM7mTy(=S^HZO2(vW?QonS
zJsny*c2PFe+Ht1RDz)<U*3#!XxsIRYI$niAVb**^%`;6>NOdWP$||fL1GM9er(K|2
z=6=;;@`D4BG3Oa~v$Z@XsjtA#-<gY}DOWL^DUALV;b`5n#eKR%HHlJ%7RjmR-<!)q
zifWdW)p%1efxW{nJuHp&kUEZgz|dFND;uYrYMC?%SmUAkqI(K&4rkFRQ_b~W5Rv=z
z07SQGUIRSudSUYi&b}OW&Yr~UZHt^xkdEnDQJH@nPTe?{Ox?ER(4C5rTeNLk+6R-B
z49Qfb^tjXSop+^4%JRCTbP6O6C3L|?o(BDT0ai2<RAXuJ<}t!|7?q~Bz=cBs9(<;h
zi}%hDN~F~6Bu&2eOJ6@3o{@lSkK@XI9MC{zJlI1WD3YuUlPpfuO{nMt0nAS*nGej<
zP_kW9gqyLy{@8uCp@RH)Fot?c%r`E(DGj#t>{xrU(~VWSN^tj3A=n(s-+IKi?9n8t
z<8^#jxMM(RcF?ga+}Z6!aKb37gWVtECjnb=>hBG8>}BopL}D6F*3+j)6T5zip`)JZ
z7@lw47UN-47JKeRjlq=f<&hUn2}wU@wDAk0jTY+EO;7>IkF7kr)W-nSmf#MgjQGOW
zp=NilJ7*-EE8oj?^`&T5@0WtjTjef{0k*?YVYxLT1A2jyE>^(6ZENW1swx&H2Y_dQ
zi#L0=SvrCC?eEi1U<iG8#Kq7|Rc4$W3+ih##ktVQZ@d)z{i&hL-Zo43Hm@Oz89#aU
z-(ob^7zXyurXS~|v3pbtfxs%hBHT~w)4OH#I;tEo->vHU-y7Sa3n2!_s12>_JX^SR
z!buXuFe-WiQf?m8G4TAB+T*f=tvqnfev(7-);__JY*mq85w>pa%n7m$K=`xo%psT}
zTqz+kH&qE3oMm_a7D1Oe&&UhS%%vn!0Yjc7<u^C1IQV0;ESp_mMsm<Ax$F{{e9EOu
z-%MW|g5j8hzEG_qlUDEqV_h^hyF){L>6Sa?2V)(6N}?8m@UapRt@1qfdNf1;H7L#I
zBB9>$RNyTiix@zI#f2V|#ny1FTIO_ua!B-4zwG%Z2O6&WJj}XYDlr+<MueO_5NyW?
zfB)uctatqNLl}c>%u)#3v2_kaZXJ*pSK&{SpBc&%lv?Ddl0t5RgvE3b9zE*h;Ir*^
z3F#Yjtdd}=gzBHMQkmJ+aURhNkFQ7O&-JF!O{IN{m9Z?_u9TvIEtHZ~`P8+f`Vru=
zUf9?2NtLK#^4HceZ%F=a_w8Ipb^o>Q7aEb*g;znGF#}9#DQT0rz5U6HRjjEL%Eehq
znS-iqtKPl2+k-LhAjk#_Jfbf3+fs&i<EfM$$upf&(PXw5ez3z0?9cCNNl>%1`lSR*
zm?{()L5JlrkXnybT?id9LU_*t@;nBD+I+fD=qhq0^dQR*cf?*u5UW&)Im3pi!yxTm
z!EeSrDL+Y*axovvVn(;R>yd)aYE8GXsxY{7l0->5Pjv<2hGbi#Op+>udrRc7>)A4@
zyICjFghS!t8h_O?Ygo#WBa3!@0~_RpTkIzq+K;%>3Boy-jZIm*uh;dng9QYWe3tZD
zNIo#d;<|PhF#lQ;%4(TFR>3&JFt$tPAE6V3kSLV|47Kn3^vmidb=H;>+5Imd43!8w
zOxpOs0E<=Q`ewpf+{O`eMr=Qjy{ks9r#RKI%{Q@ud&&Y*BzoacU*(_{)gvrZBaADm
zEL6qB`{N*TdS9s?y-<iMLg|-;0G2>~rWB6n9xdnr`$YYVu&Wfx!vyT5I876>9pB(K
z8djYIsge?)1WUBzIP(~M*U3*|qArUdA6978*sr<cD5;UX00tQ@7y=y*zNg=aSr&d7
zN$eClG3Awd10wd$3{SFmAutQmQ@%TenORaaZ<-H!8$#l{m*B;B$4a<^VHDpJD6Uq@
z0`ZMHwk}K?6po7EfTT;2wI>cb4*?daB#P)Q!HlrkWkQq;5!$RMn6+oUgz7J!6dFqI
zwb}1ejq*LjI1!YZ0&(75^W}1m&|S-u$zsi|{*sGLefy9UYaFxxIFe6W=CCMQLZB7w
zh1hlfKV|AW@`;CUo7!$CrR)Um+8tX|3PesV$x8+EP=;fYXuLk<@C_B<L2ztsxt6WO
zPctYyi|nUhmf<-!xyl#Vw(d$0sHF3^bLsvyv$WjGF^f5@mp_`XbK`$+Rb<s>)#6;#
zd=sIZ{ah^|lP)ZS3xt>ywt7Xy0}DiOT*n13CI*`U+ECz@rPF%o2*9EXz}mY8j^bvT
z)z?WiUCZCy`Z0!Q)$;CY1F?KP$~(GZj#OcUS6RdAlUCZRoFaC3+B~5kC=(9s9#&0&
zNiO9tA*2xsuJsp=qg5vaAnMQ8gX1}0-1*ZgcOh4<=^3(({Ks>ZH`6#ZH0nc7EZ>-$
zP**>sE?M93U|aWg<1Y+2{l832$FFwGy4U`J*W0e8H~clte*S0obW?{>Ym9$AoxkgE
ze{+-OXBPKJpB6ZGTS3)3$CMSv+jt(2S&`PUnmfGdabiSh?z`ILS8u$W<I6ox@H;*?
zrC-dzWX<>8X+2$I(xLqE;~l$prx-1L(Pq@&uVi3lQtq41$bkEWhC1xs5I^~oUl>(U
z{Pbz)lUg(u151_=06%Htj=E|_mETsxw49bk)1n<opJL$f+Y_r{awSZZ`YVxCIZlRs
zy8dB1$+#G-bd`4dmjsyQcoK8br?$t&i!R0NIy*JpP2SZxuCdptc>tgueUu0A3nF+a
z%9EZ7c@HAb(R5dVoK8aCb$}(Efikg;(5v{&UiBOJ!>6#ZIZpEt&*v)r*o+{U_I`Td
zrg7365A*gK@$vcrw5p1@lzL79;@s1RHh#;q{*A()L;RWFP|#2W9NE_c!6fO!<l02#
zY~6F5Ryh?eGn!(?tFcS<H@JQFpVqL{U%DfRg85uS)*6d2r(-fJ*KUM&GFLJ4<F9TC
z`fnFq$EGxXUPQ`Hm^iqS^KJSu?F7-~xg4Xhbi4QH@nHcIzWq=0*WKTbp{OakY&V#y
zJ)i_JClIYNj0S_jH?C<ouUzZMd80ncXYt=lrov2jkTzooezSC8Q0bBtjt9IM(7OS4
z=1o4uPS6BCzI@h+2C9h5B8L+6X$}X}j55Y$EUXNxd;i+I=R-+M{$|Nkn$c<USCW@)
zL?>kGx&WOimgI$#sbtSMWF-yO?_Udk_|Ls{P9KM8cH^?3aoVQ()hkQp9$`LZ($o(>
zH27IQed}Mk9{lGfM{!R`{K$Z1^gQd^7KyemI5&CaFjRN1oc*iuM>Q6Y`OnH^EJTl@
z3eaX22ZpZS{%JdcgCT?UlsyPQGGIWL#*e1fHtxZOzj5=&w`dI+VkL$1w(~hrP>+8>
zE$&R1Q{!he+7|tHQ5oJz>lXm{Zu-=3Hq3t${g?kPYKxZt*=aueMVsQ}isjNZ%p(|>
zCZMqhq@s-}P{q=QVy67+z}_1R#0)jQMm~|@23TVuu%Vi&pN3{f6rN4yQpj-Hk2g5-
z+J86d;j4vf2g8O9XB_nu%4D0+n_KhshcB=A^Iu)*JOv%;{Fh~;RX5=39KEy^D^}<s
zj7WuN+<$lV!G8a}y<ZBWE1k*mMLJc+x)6lveY`9m$NyfI<{$URH{RVQwGnLmFO<pV
z&`>9GJ!G3rtfMU*Z!v6`=1|eGLl<NG$KAJSSfgob__XICgbrk?f(^4t$3O3}V{qe*
z{=bJQLq#+6_~0QEQNGxdJ>*<NZJ2NAe9V0ybUS&aK4x2qEG4tA13NUZA8xI>I_E0(
zwd%^Wa!=C4&Kk;Jf0|@IVL#!Y{ujX%X)bfbc5_zPRHDc9{<X<P_T7SQ@FBCNACejK
z@)!+G6FF%I7z%+10;s?F)>34`w<B4=#-|afA9}7u(!p)rf-Z+={}VhI)k9G@A;KOH
zuC|qDWQagMuh%OXvMlP{_}gfBvyUKf4?enulS?H_TGs|mP1Q(9N($Q6^8H0=q<Ya?
zTm2oA<INXsd5b`qg5Mm5yrT>pL%$?HDd#(c^)wOLb<3}1|E{m4);l#WniMGQC?(lj
zRo`BK#isx2CN0q=_JgK2q3@nr!!d=D5Sg()uj>3Nxeo4m7U5yrp*cuRf?xP4WRw*q
zG{bBI`RzV>;H0g;ufxgU@C2@|qbK(w+zg2;y>^afo-j6S50KN}F%1$`+q4G^5dG;%
z@R6!O?Pn3$rs-qkY6%=g#!}?W)_!>Y8g-pU1&{2(Zn8V+Wevu6k@Q~Nt9r&sti$5h
zH>k~A0$Qzj=J9N*^}(sXPfT52obg>;(JTyMBC_s`Ws-s1p26SHhqn_&+$f?(84Mg4
zQu_LNk;vXFlp<TVq2sUvG8Jh~JbHQuNGM;em8X~Liyi@KuAWoI|GFy_V^v;WUTE+t
zl&ItE(Q2S1^pI(mST2jbCT-=)5Pi`<%AJga;B*-x+!Z8bXh{CoBkWvC>vDz<!cu!)
z%eWE*c|>DWWK)%w(T_l`<gRO`fETXTO)}P@($emnFHb=C_L8T)cvv!J0o{S+xwWQ9
zuOpwX9%oYO9TE~EZSiUuY|&|+6eFS}>85~c+)?(c4PiVcgB_d(%E_YtfONG|#N|XN
z0W$O~cCA`4Cb41a=%&J)kYSN9F87g0wbOUq1|x2UYotTUN#-^0qG)G93hMd#&Yz8Y
zQ={XgNh*y%R%lY#*zcr296Q;dVgoWH&EKU5J%$+>DU<}`mP8$Au6=TNI{zdJk!TV_
z1n9mx7DrM$fi+nM1gH*y$n^DZ(s2}n%!p+2YB_uQPVN{ukdHV%K{{(@pZw_gPQ9S>
z?=9hKmTVK499L_>+gf(&)KA%m*;91{CjwC+*E@+u0U@3-JSOP<?sW1)tXCodZ*On0
zWFl3pBu2X|iWMT|fzY@VPBN8^21>(iIQG;~Ml{j{)kn$rtiK^|^;{jzY_g>ypwCmd
zhQxNsK@Xp(ZH9|u6e6C&&o6qtjNXU8=oRTFdZ{UOZzHKyMS=7ft3;_)RB*ALbMBlx
zfm|Bq`Qp1A>jbwMNF148&d&Fnh(u^w5fP`;9H>P;t0&H>9?v<F0Z^B-83cqV2(h#4
zt7<y!K{Fqomqbd#=TA{EB0-LVJu^V9Wk%b-f(a7iiTY%2Mm!@_)mkL7QyN|^WJrqP
z^Xlrs9Sslv5TOt$3^U_t){i=wy8Dv3A^X3NSV-_v4t@^;2Dx|<0mMs-$O$qnCdf~k
zQ{hC}`*~8%Vxm5o#6(^r<K}P=7R+c8k|?`n@vtW-pJQ72Ka9x@yJ{_#mpwYMczL8y
zx82#hV~|*rP*i40J;)B4O#hwwDd8BpWheBU5aOyi|Bw~^JcAuY!qoLBHkqjcbuk8y
zMcZpAx*cKuU2`fBTHeiFmqQ2=p*@7<x+`TQE<$h-)Tu;|86lnV)xnJPQ-!gg+lBUD
z(NA>A--IB`d9BBt$E8jjn#DUrDvm3bQj1!y4}sxb1PHshmq>(rL@LfM=`X-f3()k*
z#Pf1M&>cIUo<0U3?=s2dE;QE%ZFFIdDtC(j${>bl$^=DJ0OwrVy$b0KkGC&piy$s#
zugRnbS7=4Dt#&7_a~i)aasKUZ?F`gA^I!_9l@i*cktE_qma+D8YX%<;2nYZ^+y}yd
zrphG1>4641jSo5X&(%k#!aOr5j9PQjyEgHceA#-ShI<i&&|$t72W#QOWd@gsU#=XT
zJEdrOEy?&J;FRBh0^<>0aZC|~irC~AYmd$=T$ZjW8-P++7%|Dsbz?@2k}%OGdC4K*
z%{eW{4K@4q*T2{@;ow7U9O3AaHAc$h^I8tS1!z)-?WgQz9G_A1ufFQv{mkQ1t+>)y
zwwWr7W(TZ}sgOC`j$SwpsE2orM?=R^8!026{S^*rl>BT<L(LK1wurM_fz2|v#DMxC
zhkp-KuVz7OlHs6Cy4zH;U1UeZvKcu5KH<^3p%of*`aVT~{lD${S@Mb#_I<G_WAQ|&
zNX?-sVf6F8=cZ4jpKTDK>Y~L9COkghwq;8xp!8<c`or@HC=_Q0>ZU)urapKYPl$_6
zvE$bOkQTjuHVpvW+gtQGpbsw3PwC$H95gzwKqB~hX7o8T;p{%NLAgczx?(m&tLAoE
z5ol#yqY$=-7OF8AKV%=+tFo@_U7Sz%6)!S-s)s?h-O<H7yw&gac+T6mZ*SL~xq%^r
zv48#bxlFk~w_0?i!Qv{O<<oyZLZ(OJ{U)L2w0rk%3KK&_L4#}xiLGZH(ka8)q9OcR
zv@s9qbnSZDIS3!Z+1IQas+f(mzA6l`a}x3#8C&(j{qL6n?1+!g193wA8i7nP>>Ui;
zeq|b{M1;7lz$XUfSN1!$oK;VfTV1#kGK&74O32?gi<4@DMd8`bppN!2EwuCG$lxp(
z7I{DzAs`!zD}or@$Dx@h+V4moQ4tJBq7&R@Jb}V*Kr5)w>d!Tf`^EJD<pCG=7n6<$
z#H1iwX^zq_y3Tp1mdof1W8>bhKiAMW9uT$YPICK3*JsevH2Al%WB(Cs64dxQ8N&gK
zarnG=3^K|{CZ9AC5;c4}5GP+hCBrwEO`uev7fr5F{!EX|OAEP1nWpg;u1trM_NGM)
zf%>l7Xc5_@(q9)28bTZTuAjd^D-ST#{t;5=&z+xf9|f8t&<#aAp6TOgg3FX7RBbKU
zxfUtjD?}p?!|JaPnaCx3680QFetZ@o!ZK=YTgdo_kp{M7itEIGjx%ANV7R~|n*fWL
z&gpd|bFQj`7jtzg1x_QW<U|0!SH@LNrm}x^=TDzWR?kXrQ49k_myA~uZDh3eAzCR=
zcVbC)=!lew{@BQQs~@BUNbEp`P&A9g#L&x;g-;QEE|^E6T2Qd4o%FN5#Ap~BC*)8k
zUa<ugL$VWT^rhD6($axI(iR6WJLV=6!ek(%2z8T&4!8~4dX;hA>|>F4RFt7-V97)b
zMY`Q&w60H~=&HxuB?J!<=N+#yOPv4eN#T_DKJf6Mm2)QNaOT?-4mMA6lbKGQ^&Qlc
zReYKhAhah6e?Tgx@Bk(;mgP`1D7Far$RtKWt6ogE>CvMhk>HdrIsPo5b?0WNWXj}#
z6To3I!A3C~%5xv<!(nIgL5h!v9IPaX7`I8_44NmMP0g6q6*Qt_KpzWZCQ2MG`(w@_
ztC#c&JcMl~Y~TUjkZLhiCaNo8${+OR-Fm&@P342E2)G62_~uCj(-Lz**gH2Z7S7;s
zWDxImD8k`iVogztWPIB5w3op$otAMl0ScLUbMu-GQICvxK(_Y)-Oehrrp?R~jvs$W
z-aUz;`XGq)-*|il1AHI5S|}+=rKTH6gj|T+M^AkI81v*(=|uvoQZOG_@Q>LYVaK;w
z0d}u4TRR-$_JZ``lCF0|gv%Bymx%Ys9_Cq|n}~^-QEo8ie9+u^rO4MmSox=vrpMe%
zB+!#6Gkhtq@qC7=yxQ;_S1@DtZKiw;dYa3L%RF<YTV7?49zF0vD;BQ{zzd?~7#K2H
zg-l%p@}ofmP-7=e(jQmZ5vaOk{A3jav7{Ew*N|e}s4*Fj2CE(()_K`&MqUJ&mh5K+
zp4R;NPfc>Q7Oo7tUw3@XyP~}n$s>sK_Aw%~{kb5s^H1AI`NZvc$z!u5H<1r%`t7${
zNY@hG_Il1vsrs{@+E8ccps6RKapOaW+tIDHI$NJXZ!&3t6KIZSH8K7Gwmf?V^p)Ww
z<4zW@D4I{%=Ovi0$buDYjaXS<CtHo-G`nSHD{`F$M~@cJ9qAep;dv|h=4))q`a_(u
zM>K_~WXfQn$NDM95PU4$H7XeQK6}FmdX3YHZZ>Oicn;L2?M6!v+m$Ah`H1k7xG&Eg
z@9bsjC~XYyo|H2C7?GqNzqm)7+;HXol?rEep|mNw+#zHr`nC;_fjL;^<y*0oW<fB`
zF}s%xs_2^-N8X`dw7)uFQDsN>vmUFCKoNMi;See69JGb3w;$uY3Br<XHFD(0#3zTA
z3nk=o#UDX$b-Tq~i*<<Q$=w-nlFLY{a~;2tWe$HcEM(w}wU^2U$5HiJG3L@_@SK4)
zm1UkpzBY-A_GepVs=BXlu{|S-tt%|##p^<cEbKaNl!KgYHHKPG+H|<)UCZh*O`mKG
zIzD3F`J8uWGBcHAqG?5xdF|NCDdkUlS#P<qMYmV4v3$r3rTI%++!;=QQ#9RB|64<5
zT_MtoxUxJf*3;K3JIkB|dL&-LGS14+&ky_SgNsgB?N{)L&+6Y1a1;ks4xE#7%>yhc
z24Qj`3T&s}=xEG|si!`**4{cO<%va`rCK9Kq^R?9&s;}4TieTC-p8IxdEN0?t1oXw
zMhb<uxT3-<eZYnV$3DF^#Ub~x-F~;-IQjG?2*Z2-;CFq(oJ;q~=9r*f<?4_E-dXnK
zxHx=q5Q)|CE6QIb+1)HlNT0XDt9%%3GMzJf-Ss!~KK2Az=`&4-%)cr|b<WlE&J`@n
zRkl-ylltmgau&gJf%?!uWjL5PI+6@XIc7Y2D3&8XI=YeiyME}o=@TYT%1DJU|34Bj
z{5<C2mr-Monx71s!aN)=8qzajUH-U;RxNHbA8VrMiJ`!$9w7>|V_<U5nfvoPn3)&e
z$P_8fC0?<EH&4-SC6PRnm`6_w-s4o#E^|%OR@$?SHmqN-H$Z8;I6BI+zu&x*&1cRm
zHEh~G?iWz2%O~b9i+w!hSVZ-WoQz?AS6pJgs$rVIj}MkgyFYMCwsVE|s8eMVd6I*4
zXIk*k^Mdo4d_1ml*^(t;2C*qAE4P+x2_yO|ykwU^ONe#IG2*_(EmuSw`>Mts_#m&w
ze#w$0PP6ShbqXC+8lPU3R=fF+cSSweZimb3#+sT+8q0W(C<0!IRcO@qm*Cy4c&r$o
zrP)VZ3llA%?~0!Bx<fub8YnqVz(%-bY*(-Uw#T&aN=7T{;K<jWat;FS6M<Jq@B&dG
zE7`dBG77=Y28x6(aI&N;4=O5ztRm4NDMq+Bo}d-F=C8Nhb|fP`AowBUa<Hxzf9GdL
zQPw)ZuCw0WjS&WYtp`53Hu!2!h2oaS2y_QQC&#MLh189|#y%V9Ve8q3l!z%os(VGC
z9*g*iLC1dVZT4#`-%(5e#N1z)b5;{5rI{)Wt&Hu5PbMWTPplsQRQBQp`<si4{wZ%d
z8`|vFUrN4h-rO@InHrcmN4#SAd@>%*7bh5m=y|<*2^drJ!2It<`Is3$pRz2;b>^=;
zrVg)$mDG(rRlD$q>h$piD^~usdUYh1ej~v>Lgoj;11y7wmrr_yvfbRI=#^EEEY`Qx
zm08UC-ZBCrUnb&Em0Gl7Y(w)+i|zP@IfzNiAfBhMlY+>HWQ~BbGIDcs{k<9&S!3+;
z1xLI;GtnozmcUpgdIRX`-*8QdH@rtu`*dh-bUFj7I@`IlW=s_xUZr55k#1q2yDswD
zMbAl+o+LV8XP$rgY|6&XuD-5*D%EA9;rM3yiO(BfX|jPwJh7sUNTRoUH+Dq_@t_8o
zwR~x6uj1r<S;CvKbGf4B%&a*;$NS*%+xnk+c(N^!GFymq`Vd27ZC}I_HYWkqBUdb{
zt_IAWL+DGrVQ+(c;<JkdteC)XOls?;P@LWC6Nl{(l#V?gP6ML(dj>eawdpg7V}Fl`
zh@i}=(>uKXBe+cQ1y9P$wS3PZs1AM3u2iz9=f~j|mD#7@zSLQ9AA*O%^aqxp<bxC2
z^L*-J-W4WBldkFCd{LSkizwyxwxL5k`eYX^uDHH1cemYgcXxN)j_K#l6)vuqK?!r>
zonM!G7k&=|vbXXwnPgymYyZ!{S=*i4K<qZiAiLIpaYvLs{S|Wy6R$p0EKeelZ8!hn
z@!A_Xea18df?Ik{=m@YNBK@PQvq=;MEGBEe2E}8b-cv4*85p~<br0c{Nt|f9t^ZGi
zoDzc;+6!$&L&KTqJ&!Xr>BKS*T-+2kjecG8yPPx0HYndU;*>?dsH=|Z4^36$K=3G&
zo-2<Mot`8<Tt8Pp4>pHFDKaW{%?wg%UW`-rUS%aLis~DrLUYLV@zb<W`)5wxwETEN
z0+eHWl|oP&{K-kjQ1Z1EH2V{`>9Z(HgC3ijnx+Mvo*Zp6V^K}QRZ8>~9cn{3p6k*H
z=d-TG&)>4_aR}>89(>`7CY3oV4Vx)8RQ<3Uv>*mAuQVe_8K1TJI}kE|#kutK_W92q
zlYKcFnLaKK%F}T;dFyx&5QyxEwGopoV<W{0^*AGQosDQFNRTO468g%1e669n;Q8Pd
zddGzgpl!m>r1e<8N#HriXivsZ>m3#R2PI7z<0%6k!GqIYPqkLBoBUC%37+jKNDK+3
zv2oearOTgRA;s;c)`bSe5_L@A2PI%}9<_B;)(7v=h=Vr+q-XN?wF!^Y+G3lp2B@^w
zQ3GMo{S;iFrTB6(JVxAGh)%8uS7U({;VqMwE?ahcz!-3g?n)8pW=gK$aFRL_nwxP#
zkDv+fxL(G$&lR7fR?QR4K@RWHDs#Qrm(MOq2FP=DxD#SlX=rE|Nr2KLLK#Xr5s9%?
znZ;CcGkyB>F$SlTLG|!?rnOv0?pfKkPaBpo<G_`eU?#+8Oq!f?<w+2_6=Cms$ZFi(
z<x%_ALlObJ>@o-`f&$5OuIu)6Z4juBWscSM{?q>0-|3$QXn_y;yd%auC`$Swb*l=K
zf%XvB2OJ#+O<5GV5hImilnnbLsq#-sN&-Tjw`dV*@fL#0E%)M?pC{wcl9D&v+R1mk
zMOVKV^=pJs|CM-3YRKtBzn!Sag^@6`a|QtLNx4Gxl4iesgzhrCiHgcgip4VV`+Vq#
zl}AtXphWR4AcCqT(;M0D`<OmEIzRP_OWq~n1R~8$k_&^()jDtD$=X*_lxD!QqT=qC
z7q$_RU@0mwl6)dz-7;D*1TIWSNDx_z?Stosg!$*}rm8SV;H-@3fq7f}8rlYZRu__J
zMx(kfL_;bpJM>+zcV<Xq?+3rEgFLM#+Qkfbn@Ef5`ue>DO%1P+Im|$sItIGmB`BMU
z@I_}efrGPOp*1b}hzxW>mTuwFB}+2!#HgCeSS-oW8Cf^fys$k1p0w@CTHc4mdP&MS
z$f=T#Sb)&Gd3ab5*D~L_iNLj;R@NNSZZ&Ruy0{X0xdpH;#^O>-7v}@+EHe{j8n&zV
zC>iA!DWe?N95Rc3_I#%aCt8U{KX_M=s!Js0o_M6YlxEO#OicX-;8$ce*&xONzt5o0
z`}N!?86j0r7n($F9ACIc;w;Ze*ni*fV_H4mALSNtq*($Yas>g@>_w6J{@Je`mxHGc
zns)kgQ5u#~4#v#H1j0hY%jd#DEAtVFKOFoM_Dh>{Q+DmzC3p|xWwv!;T0I-X49Wt(
z9f5NQ?9ltnAAkPIaZ+sQZ9W=QW|1eQ<^?+mrOFX#x`>C5<tmiDJ53Ul3M)EcC>q>{
z=8qidLoZP*;K_xk<NcM~?7Ey6j95Kj7QU9>;?=owgQmgmix+>>u{t;se-KwtLViev
z?EtMul*u#gMlASHQqyShR|}ctb<mKkQtF^8S+HTVLR<f@efvnhM0uHZO~zwzas7EP
zV!%`C>ykiwd-o1M=q}0fC(<e@0|{$68}G-SbsWp_mrRCTD#?MA=P5Yuw;L?e6lq1U
zp+XV<LFicHZWo!so!W(-y>BJb;*O*!TCi}S6F&idbqQi$499$%D1da%m&tNRvFgMd
z=dWJ9+NQ-<UtOf-K|uH6Q`PS5St?U4edPNDNtVq-8}9lRsf*m*^I&}TlW_@R^nh8o
zK4-V-@1LWF#&tDGV0AC1EyySb<z-rIw*&{fy6t|sX8roB;C%`uVjN$N5^_L+cDz<8
zJ9!t7{-qV__SI)2*YY-^!>h?l-7WT;jUqA=(B{yv_;P1LBbhDFlm9E`Llsw}#eO4a
z?#n0{%G`c-?e-4cejyiL^!)y)I7fb^co{zP22=eC2B<GHV-t*{Ine6_|HOf~9>pk>
z2_{X`-j8kCjJ>2bsq)^DeSi87`yf#EvTO||kYAK=?YjM*(chAsj>ln}5!8+!`yqqZ
z(EShnVC0v!cnU4csEgcOJcFR%Koq$4W1VPKiiJ`<o*GjB-X9@dxBrSKk6n!`c?3!Q
zBD&ekh;fX|Ms;4~-6h=wBs<SQ?=XM$9m*dW%Ky~M@Aj3cS2#8oiWBfAjyt}SmT)4R
z1E>LzvS-!#eEjUcMc{ERt?*!%LFWXg1P9KX6Mw_Cnqn9(5ZartgsJ9i^Z$4-3_ARq
zD&k}Nlw3tjMytNI`<^n^j0tA?Hz95?)oZ6XKfE&4twCz`X2Vr<RQyK@+AW^qg$vEY
zj9#qhaFfVuw|ow)h(e2Amya^C9i2XY?yxrFrokT)z1|!KB@EtgASbVN>j0CP3_Q9D
zy05z;S-}tm>E*EA?ntBni}3u*=TayEhmP~_LS}{)$hxc<ys>5Q#-Jgc2ETN&_ip3@
zEF5mOVEDZIUbkNE3~4c8!i1ntP4QSz*C+9;L?6AoQVI)1s(E^yN*j{DWy2enIrYzv
zw3v}wPe65~F1~zgi52Appcma0UPMLG{vQ4faggwwx0e31W=-bH=OpA(o;&P64Ud4K
zn7y^+u2<MyuUlu|9A8tp`H#wlKa>{jZBpfBMC2f&9-;s3XT1nRt-&3C-J24@%?RxA
z=uP9*Xte0jGp-?)<*zz=!IbNdtdYjZz3>RM2uVovCXt3wq`iPB{ri{C>d!yw4p>_r
zn`gIU#7cYI6}s^mS2URrV152K0tA5^#<@#_&A9Y&W|)petglI@r;Ip$@QviY@GJtt
z4Tf~;%@Qw^HGO|OWj_t?1l*hEn4^Yj+)n@df@P@1_70!6*ofEIM>R+lhS60|kNH)7
zeRro_WU7E9&MJvxKo7U6r8*j~UP7R^shJfYZb$y%coM#2?(O6PwRW@mt)2fpkb=L-
zIOUuh5>1XRLzZP2Xu(4D`}kwzmmkS57g001utEN%^S6+^kI5`jrD*J{r>s!FXcPhG
z9c3R0V8zpwN+odl0Wl2J(c+5aD~^xnS?hV%Ecvmot(B@J4n}^T>{`er1|JT<@lGG8
zI-LV#o-4xqQfIR}zThXswl0#RUqFeUL#;1(j5rxRmmW$P!9tY=iMZW7=WXxRF?idp
zW*<)Sre^2Pp52GFkfMZo%nL!-mf#&b8btp)>rjV~5R+2f0ZPncRYzS5c7uxd*4MxG
zUXx9JcisoP<YUX+dmAYs6Cx>O{;AMm7^2vqfPdE;4$MBO32`0+6nLQt@?LCM<CFou
z?()wc40Lf<X52}X#r%W5%085TK{_nJFdD?-$HpQ<_%nH#CDQo{5f;j$y*vhx_oJA6
z14omu<&|(2cwg|R4#SpBQcVJgXMWsd>Te}q$}!nAjT3T__IMeMpu8->nvgwu^pKHI
z?_ampeqdZ>d_l%WejE)dYph}@iHyXfikn{SByS!FSU4F7Y}2u0#}vZcUPgb!M-6(7
zfAIs0KczV@TtZT{AU=W|#C<LeLcrD!N=qZi&b1s)da1%lTwTCK{<X{Ye_Z?3w}5p9
zIi`K1lu5uM2gea?m^9;a!U!@!yJ5?Ykk2V*(^Dd4f{zOKJ|AZf8eNfbl2AP!@(rpm
zqKHVIc*8GFozD(a&yLHaTFsvB(Nmnh`la$u|ES`T{)SFwkzP=W0@anI#q(v_9YK>n
z`$8(+l+oQt{2PRB<n#aguqmHil<^5N_)IBdK}K)T>`m#cVFbZ~mP`uR)4r$r7x^Qj
z?UG{+pbpVOm_**)d-fcs5}1Ry<cdsc`}i99<mV4k9ZSZ7ob01iPm;EUvWt_a=EpZm
zMD=l`Ku0rDZK=B>!QM!BegEFhlieEWX8-;)|2y@}KOZ*AfB&h;&i@W8X#CgIPd+@!
z#=mO#{|hueod5s+gQm~_$Dhb>{0~!AV~8mwV_{zcLyK&=l(ZRv^8S^f*X#aYXOhXT
z+cEd~ebkov%U#fC=P~N{-2Gh;OwOmV3dm%<X`|obp?%j=vxGM$pZdPx+eXH?-8=n-
z^drxn$5_^j!h+9UGqoS6;8hVGDmh9+#6JG3p#4_|;|M4u5*+J(lg{{EqG;&eb=y4}
zp7S|XdAE)dsJNPbmKwtN8GleCVz}NQ(M*nc|0?d&a^dM1ytf6c8+G)Fk6FEq%o*}>
z=gEk#9aisEnU!$`$v3|rPT6w@RS;iigRiK<D0h@oWxMdwivt5z7a$;>Xho%O>@=Bb
z9u*`RLrDx7N!Co`Z-f;7fQ)aHhlgd6v%aLI&IRKp6WsAXa+Q)3K!LHCI&~*=q@!TN
zIlIJ6dNsV0MrS4Y{v|Ss1BhnzW}F`MskOeoevy%p0C4?)_GKI#pX~#ZvcayK7U8lC
zpOG@Mcr@V#fTc39bPQD`8P5tuxpz`*62ou=fJ&>RwLsDrPlr<b;NW0|(lSzSN`7Nm
zwDU0l2v3-HVT@RNzgiex=}TS&XNAB+hOj!PXI>I^Ci$MEa#91t*z}|{qMbYC_=0Ic
zrZWIxEP&e+H&SiPmt+^WDKmh%t8BL(%WozxjeGu@1||J|WA$Ac4rdpov55(2U6qNc
z6GOKjUlw`<-obfk3=erW!h)2<Lm46#P$FI28xfK1B%Xi>UW_zoV3;()WtHEVKxzu!
zy_>p0Zv%q@=bJ!>aK~b$9tsCg9Fb@;xQ@8<%BWLW71i78H8qczpNK)(GPk_jOe;|Q
z=c4AgXzGfw3aXbN-r|{M(0R#l_JJ{j=-7+#E8clRBsl<z-Sq3PCsJN@KilrrGrE@^
z!YIEAOysER&1M&0kKRat<hlaxQ1(6Vmp**{{vlM1opXX`2MF&{{er;r!7MBpTY-yZ
z>-bS=Etu$*Osqh40HqhW{=$rW%j%Ce?Jhlg!cJ8PP}7u2#{|0LO+E66r-f;xmh7G+
z!{2`B5=pn$E@Co5u%{dTNV_23nK*Q&QFT?71;11Uk@CQ=Yrc2$^t^^@<Vytb%V=xb
zXc{^^8VpI0dE)&`{Iw1Rp#be;hA`aGQt^P9pj6?oBBVCK^vGp&Z7u?`laah&K$mIQ
zq4LmHl5%>!x2g$(PKjH(<upNv^@I)1HRSm%7-&4RIfaSw!fT^{o3m^Ww0Di1mRlV`
zo-+qHP>2jNA|52^(o4O=b6_N;HQ8aM-RcB=(oLH-H6t)%2J9Y^a#<1S7nGF;M1@5)
zn_J6yaYKHP;mC)L8oDu9`z8H9riU3Gtkc5WQs;ydv$P_LjpG*<jV@XUhG}B*>2i5!
zxV8NfI2e~Hmh*s<V!k7Bf+OA=!5jS|7#3I91X0|y3|zj!D00PY08Ju);TlzJ|3!7b
zs7Sb|dwlY4-V}62-)@PWot;I5X$NR~V;C^KA8-YUG{QGQdC2H)44ZtQVn`<q%_63$
z<Vk>*`TEUfVFApdV0-{FXZCSQAwodLOhvLM+YOlh7w1$$eIfhNFJ(d5#Qj3MwpCo+
zEY>1TP1ISmvM&>$BnFnrm)w>m{0ShIy+V2?#k00~Cp8-I@Q`qW_xn^M_0R0gj}e2@
zG2qAjEcp3=exbBN?}Vcg)PdnBl+T190{2ZOhaxlZbzHZC<MIDUccOm3uMSthdaz{U
z3EG9xJU}S|lNfB5-%DCLFlmO=JjK9bNg3Gdm3AHTwthqGE1BEVe#gA-a)Dqm$H1CG
zDf4v?Z)r6Fo-7et-15G?m$Sddno&x5+AAj64ttB<t3}aL?vD)=rx`zbv@KLV48!2z
z+Iujo@=d}pCGMVXRFC3cJPbvX*cJ5U2}eLBlDQ_ijowY8qwB}y)~y3^&U)m+C=d7w
zDq^wkw^J2k)$>Oma$v6VK=l^f9T``2eF#*Oi5in(4uU0KE&erQtq6iqG8NnlmF{r@
zU@$7p)2;~4&6a?G{1w$&^p}qxFn;><-N1-5^R;DG8bu=)qkfoQ>Ojlx6j3#DHiF}H
z1RlcleB1qpi@Y>^qy~xCidi8PTLXhaFT+#9tAl>0d1&-#0ivOq3=Uz5-i5paxv&9r
z-uJu)fGX$Z4Jm=(VPr}qU`VgL%EqVf)8t>1r|zDd_w@8^X-)Yvim))x=abI{F{ltY
z`C1q0#Oqf04wm!HlBJf3A>k~t4u*bE2n6&YcIjVy{`vJq-wU(q@pdf>&)^msy;y#b
z%xy6%nlYc1a#x$&qWkw3A+atO%$PGk^$SbvNh2t}K028$#qcG`Y#&Tb|1&PJHu1(_
z!J8FCh`p$EwGH|3w2U<W6{A%{!0e$u(-)e3nk3Q|GA=i$`K(umsZ%_@eyE$>j=tv7
z`%>~Ix<g*YG*(DIH}B?(!4Qe`M{Qxx=207{*goE^;*Ew64A)uw)muPA-9;j};2LtL
ze5Si+)aJHAasV)Uo=l~#Z0M6$wWJE0c6s!Jt^!$PtV(7BJ!$d54r#QAP{U^pOKoJ!
z36A%A<5r7^rIcP!Nt=X^B@EDl2O;F+g74xKkk5sv$&>bYcK0Xh3==tVeV_voc}38^
z+QqIzNQI0KmHDOuHDU}Oagv1PBb7adCuT83i)i0W2&H=UP}GE0tyED(B&+ne;|Omx
zo=5|x)cm3DCF>Zm%e=qcpMKGFAM(kvanUyrNs$?S{JNb@PZ!;&iv%0UD=ie-KlO?$
zC<yI_EU)H-HEo&kvPMEGxMniq4mbdsRWA&_TsfHd^I>$69y;Uu#yFtw<BM-};q%`x
zTD)-Kq4Jtzt)7h)a4&B~6kS_*<@rW;@Cn$S`Nv$%atu3FyFq~x&>L~>eEb-FR68X?
zqd+0$b-IUCyG&1Pc>88Xyp0sW)Zr5jT4?f=>Zl0RI4ob>(ZV93eqqgLU-fR<d`V_+
zw~)Mnnm?Z#Fn(Oz5$%ZN8FNZT?%5iU82-0SSyh15W}6LxaRy;Ma#Y=ODmVS`)##>w
z{PStklo3`>hA<4g<&b}#=G!cet_`lNn7J~yV0DwHC%kedi>wa~%il2&FoTc{+LeKg
zA=PEN67mQ<p;<3v)*b&fg?R4>7^=v42p>K0^z2om*i^!UB`Rq5ZaF?rLR_hsN$^E1
z^4M-~Z^Q<#aN{0#Q$L|Xm{nLP!`>``duV1TeK*`{NB(kmZ(B!dXssOMmMZ~GAuoqf
zzCf~nUZK>~x1D@KgT;@nTSri(DWg|W*P(x-73k&v)d%59lX9x2gc1P{+yzp{zahjh
zU&au&${jU&bRU2_?!&xcB;7NU(pcvD%FMRx^440G+IgLd#+ecH0iy&xFrXxNm0p4I
zSD#Yntb8jxxRjKXQ9H*OH+y;Z?AeF>8{C|jzgu*IO1iATw)`m7{BjCc7hWWwQE}CI
z>RoR7EgCrQaoriqh68L6OUnsrrERqnTU?HbvqVJ0<f`XQ9QC?Qqp(aCVgFRsFYm%G
z#Mxhl%j~aGO@D67U`~PsDSA<C?Zvqhh9?0dp~dvi_k->4=k(3ANy1@bif#6$Qx}+4
zRa-~U0xhxB)xi-4Dz#a^-W9DGyu5E;F9A;|YMqcgALs6zWWNjyRwCS&T-PJWz=0tf
zeSM`d^t%nqiQM%(E360&dXC9Y2$)fk?0nRGuQi`&RTTXB1tzrXq)DZC%l0RF`}gk`
z)CMxOZmsK`+eO&feTX^EMtfx$-C1*E0s`Jm2FL{anEO|5UQ%tP=Zj*cj$NYgelL=&
z#4_R3z<kX0C|AnY0&S4@#hT=bQ4yyj<pCMcN1ys7%J1$vU(MQhV!)nFoBoGQn)!Ce
za>5NM^Agm^Ly}$%o7&5N*DUFPv}zzTueU!ifZpjx%nmDQC$W;1QR@yJqWJ<$_AJWM
zLVhFS+xlQA-!sw4$!TwTJ-rkRIi0yQhyHt!VOCzo_(D}m(JmD>qYU93IBYmxN2Wz(
zxpOZ3%1_-}x7>YRlk1g5?H~akY%_Vk*|{rATNpj_!BWINd$vG%8O(=a>aKm9xW$xq
zI}Gp8??}8dbhSmJ*YG+0_x~CJi_Vj8C3=jL_*L*CqFm7sb6c=LDi7#%50Zzf8U%$^
z8pK3wZHHD)ps<!{bsn9OOL>6&=PLS`)X(~qfh0>wnZPNWee+;kbqp=`zY<<=_HQ*v
z!Wpc21nbbJd-qY))P$4=y0`~8Q0-|T*fSg3S*Ggkjg1p$cAWUjvQ(av%%mni?jG*(
zSky1BCVIFCHI_l|a_)~X%vIcpbm)WR{rJEC$uN3ij>paKtVxO^8<oNsZ}wM6Nmv?-
ziMpjN3cPU^@Vk_v<?xZG4jogT8#=;4`YIGnnlvHS90h^coD4(&0Yc8%^b^W{l-@?l
zJf;XhY-d1Gdub8NcbNZ^kX3yTMdudm$Y^smnZ}zkcik_)$qhE})6y$+k;S2zMluTN
zh=LOq2}h|99XvE1T_bVke9CC*SqxDj@;}_R_(Ei@mcG6vy&b<%ol=z1Tz*kvafyjp
zDrxPJWQHdsDWw3i^r8INvE#hY>!NA(-#aSfAm(QoyjW?hVn`=(tGmEVl)I&80fG>i
z#E#1tMfY2e<2nlx$F$r@ho-C5eCFV@c(^3`eP?6uBpU%Fklr7PdE-Njtl6K378h6x
zoGi>_Y0Bc}WCSZ-@CaN(HF>yYyLJZ%(~NHR+tw>eayAm-c17g4#jnS8ZhBfzR@3)T
z;K5gOf%ZrLQ;COw){`t9dzHcLkT%QLKx>qw@XXrAE~k;xe*J)5O|^Nu!2)>m;8$_7
z6C{NfFx*$ek(QR0ZQAU+ePVh0mHX;JKK4QJWRLeRL2=odCulim&H*Vhv9HG&zx(Vh
z8|y}es;#D`cJJu+w@^fi^!INm$Sj^l-!4VxeXW1JMg)u;!CHJbti#dCj86D2gD`8%
zxEMu$L_tg;wRBK|Q*8zVEtm%+RSYpS6u=?y)0I3g3rvGpK$*-VJ%N2~kM9)z7$$3Q
zyeD&uL6^ISpGcG8Hd#xl*iuB({WO>n6e$9$N;tlaNOCuL;XXn!;Sn%-ChD)h{xWe1
ze|i*f-vK<~it~CQD$V(Z#sV6~W<H8nnlZ?@4gr%kpXBgp1>$ej<$J129*=KWfDGi}
z<EldIYtmo7Z7bvm&UpNiD99JW;g^K^k<`F(G5-GkVOXs>WP;1kbbj)BS@l{H@%HK9
z!nsv<rE%`uYhtv$x!Ko;UA4)-bGm{MO*WXM%aY}{%C(5Kx1($zy^I1av(y4EAbj|{
z=;Y1aIqX<<EZ9xA)Pmiknz7#s?k`>sc!YFOEcPZ+GINL*>rKxbjj~O++*QiTBT33H
zPB-OFgo4M-n;>ntfx3P;MYBGP3>J~Lqxl)8-cQzB%jhKOTR@4xW>JK;!$~Rf_J%Dd
z872e<OA-{kO*rx60ph|pbwdSJ+>=$sz%%KAElR*PNzXS;k%t~`vVz;a9^n8siJY;p
zBk$_y<RF?f4vaZOw5IK_I)20Mw&bPxOYFY>`fG}d*{_1Jx^3;0LYKhsIxo-JapREV
z>gP!lWl;2rT)QH?dn-G4^66+#%WUJc+<8DzN4w-m3JG~Z+#GxmH_MN4%#jy$E~$;@
zq<R$>&luwoS*bb5pzTDxGPBK?(W4(h@wbjCVw5?kdix@rIDDjYlfGr-@<e0fwW1Ey
zJ9X(`njoYM+O`N7)9UjfdG7cJeok084<A0XjXbp#|61UjKAIhb$82AO)t>RlWJWdt
zABA~srSzU%eRf6^ZmpBY2(M(6kM16!1mZ^a&dC@@x_a#4d4bPt9)GHqduTs6=c~#R
z0nY}0!7nacc!{2%sPDTQk5$#d&8LT<>rIaqorVLoEX$;@A?OEr-IIO8mlps|d{?(6
zjAj<Azn-}L0xwx74QzApig|By<HNN7OkY)qHG9c(FGuJ0=uk=|>C;0;0P@2E3weAa
z;l#@D7N0MEbS1N7rXT=h3l>N9?D0Y<jr3205_JXNI?ux!7)blJFAisB`$@^IiQ{H5
zeZ+EQ3;%u&_HY38(@20t*w%Z?%p35KUlP7eVf~h)DYmZY%9Tk)zFnUThCJf8@D}_f
z0^L_3Nog1v?KFe07Z{rn@e<oy!V<>Qf2LHyb%6!zzGzruwEEkU@9sAi7K_)Cn%I2%
z>)6XgTk{4CCoH_0K(uk_;qc=~X1T_O1_wt;Bb@3b{mo8>ccO@iXOnWIT?ZlW8tnI@
zR5Hvhz5D;L4EpQQhohxE$J`~N#jj*HL$eInoY&B7UAGW#NV8|0YyK1X%rZ>+>gLws
zfUu<PiUijA#!qQR?REUBlZU653<qKbyiIRIRj;t52J+w|7X2DKS5g>Ow@j2$J6xUF
zLc%;)_XE6Xu8s0sexg0>(YUWFg;aTEDVSMK%e0f7ehD_SS)E*wIOi22KKECIKtVrR
zUh=AL%W$U-aP1!7aDH`wfWYRj6Yr2@oqO>(HQTgzTefdMK)U9%e8(R#7&HAdZ&XyY
zB8f@c7x6gT?3mbWEqiMpDg&0Gv1ERuw%xgwGP!9aj#J#d>hj5d&TxM6&s4oxxPIL`
z4^zV_GlS=Aom~nbC{lN-Fv8wQR@5M|p?#i5)a%yAT*7UX=TF;KcU+cne`6&B%A<2U
zAfu68uYV4@C>lvY<?|uD5YJ1mP6$#Ceik$P4H|Uiz?BYT8@T{oTNXcltTTCXzH4qJ
zRwKS$)YmxUS+i#C?fd673g4MShU0sVRzm4*`Th4@m2RX{Z@1H98I6~Bbk(T{oCvTe
zh@825<!QnPoiyjC)}~J%WA1-!*KW?G!tAZVw<4rj?W9-1{X_IwJ>Wb^%=_IY$%nxR
z1AFE_fAPYM#A5wI%FHxEZhO2GSDy3IDlNyg+V|ztP(1(<>48}H96&kx%eyb;)F&2Z
zTB^KJf^ju071o|a(nz##&Ao$S`0GD3dPn<(2JU>0R}n3Pwna3vU7a49+&b{hH)9Wf
zmB<(=Y2sRHak;|)_j<+wbSWC5t*zZJtM>xV{2(gUjz++8aCshtB>k=MeHO2vAfXsj
z3IXz_ZNC!)qcX)u(1TIccmL?2Q2Z)a0737>xJDa!_vFtXVu?*!GQ_^}4f;13g@uJ}
zX6AR%aU7gok+_hK94pmJYKbLt7N^s&KV=<dow<*a@o`iBTGRB8mmR`^5f1OD2VQJ%
zr*38jedv?5l;4xYMtA0gZGu)OEs7HP;B(hq3sC{f@OAs9?5R0nm6^GDbPhz<<WcEE
zl&!iX{^1H?sOdM~5VwLk3Q-9l=0(LaYQzHi51<G2VB6*n;4RSAbccm_uYb2{_3ANu
za|89-R%a0B78`%-TpjzeOF7Tz0LY&}Gy7wLtqo{29yK@xR#af~Kf>>50m7K9Ds6U{
zQvLSr0h2?2zojZnnXIs0<elC3E8A_vi5Z|N0{Mp>X`W0zexfvH;gmE}7V-9lOK;ki
zPHFaB&_!t%P?Sk|>8Rs}9^T{i#TAh~8dv(@>AXDMn+9d|+iqJ9?$%r9H|Rxndwg#a
z>Pb7fcD+v>nuZc2XliOw!K&7@oOhFuSv##P>C}t}eDGpT&GTy{3&Fz=UGi+FSRbP2
zN5ljAXI2)Rs2AD&9!1wJ7iX_Kg$PA#g>PStCogZ;iTLoI@^i@1;1oCj0YWvW3Lp$y
zbowlF6Fkbd0#8MM>E}y6Bz#7SPEA|1nA{MZEFM$!AcE+zOo|}{I#QUId1hlqOj&#y
zEehV12On<>9iYNbg=fqjk>(+32d8lVh#M117+pvGloI|+e_@pQ#nT0Ec5z>OOZPsV
z+AI8V?$P2`b$TeD2o01jiT|ta-2ZCM*EYUpj2YV=hP_P_Qwp)O9a3q88HEv1X+$CA
zPzmKw357Hm3<)_)*h)!65gm<T<dACUB&y+w(wR~b)$_h%Ei?P|dj5c?U(9q^>$|?+
z&*wf|*L7c?pfCn>7cq?>eI(4^)8gV#L8mDv!W}aUE0>mS@DEy!y#y`Dm6xyEZ_m#h
zZDrE#b^4v2M_T^qG2d^tnwpFfQ=a!Sr~0%x@^;j%9c~5t^p_uHvMsdEu;|U{F>k3x
z1gX}EWV{1yq-*CWWpYt8wj1-#D1(D!CUj$-4%rQI(YUnwGv5E2s28DWFniVG_H84+
z_eQJ~pG{wknGMpD3QyG0uUIp)<h6WzB|Pl)d53o^>>U5thAKj4^`uBjzhYw>8+??;
zhj`oF$*osLc6N4>?~cm8Mn*<9S90A8hgYc`B!+WQGXDq0=Np(|RE(|SUFu1%wS~#T
zz)IVUv~Mrfp`g_w$DSQsNY#j4-O46xx}zA^9e_cTIGC_P4AP0hdDKACO>XN_O87aT
z7gr?F68Oq_WQ}xMpf4_7R#pwM0lgxaWF1)gG^b@6)Dy{Rl0XUgn%MQx384^v*X~aU
z!XWRYD96U{=y~w!zA~4}rUVA^fFp*EuP<8hy4Gcz2g9s%1-*hxc|~=jhr#(NnU$_r
z9PcDEMC4j*9C0*rbq<fIZD{wND66E8?%cU^V}a4VwD^@zhwcpa@Uvb0nAtrO>5l0>
zY8xB(hlV<3{9Va>Dt_wO%j$ilpsOQNge1p5Zr$pimq(EjiV2|mC#Gt8j)(jI=EH#J
z8HXg>DKnoLUUK=o@I+0&Lm|m;$F>mL7E+;&@OvBoVmif6Hc}@cR6scHET1e&9&8P+
zWj7)4H@rGYa5x%)^M_Zm4J<-*ylQ*qrvYOb<}Cik`EXWd6LT*@9e-U+jE|%o5St<<
zAA`#h>Dzu6>(fxTf&rkixQxt!ev@e;^c_OH)y8d-auKpeRF(qdv9Sh#fYB5wfy7vs
zNy5QP0!3^*C<8oh{^eZzE>Cov_r54?WxF3Nc~ajZT|WS$)rL<y-6PsDFnt=BcKV$J
z8ZeaJBjv#Y10;&2`_l@G2@{5RTN)aCT2x~+e_aF#>AsEQ+tp7Uzk*W6N?~+RHZ2|&
zN8jYNcG|6#ck?Eg2pufkc+j4P8Ofm0Wu~>1n*&q3k2E8g+mkwXxB1n;W8eDelrk#*
zxtFz`0|zF80kf}?DUB=0&oYpq?+OS3QEQFRIUA&4*4bW8c_%14XZ?~I(&6*ZFLW@K
zo`tlX9#>CXcGtMHFe|k@8GPY3lZQ|t20by<8k%!3=LjfFzwzV8Z_{?=VBP3EO$`nA
zu8-V#AmLUahl-;w{4^S@LDab*owu)FAIDO6x%`>&Qm^I9r!u|n?ssu%n)?5oripdj
zJ5ATQppW4Smp!$9t%>P`L9tx}Nc&PvEu0`JQCcm}ien;St9UX`v71}CJ)pj^ENeV@
zEWy!j?z|F1|MkV->(=N!bs{Olgb4QfZq`AqymR3sDYGp$ZFN5k>cJ7intf2f_yQpg
zI`Rh?(%s0MnRc%B)FQevM7v}2rcEUaRB|=z=O2pXZVT|N60e>Blc3qe;c;8MsEC5X
zpzKCX{`e!-^#yfm)!<RXoW9B|@GMU41mu2C9c}7NmdouNx3DifSbPu4zXcf*-7*DR
zsuQ5)!J<oVJ?ly=ZR*HbPfAL{@qR0gAt&7~Ed1`FmzU@4uLjRrpzlW@s!3D0QkcoT
zB~_>KV8jdqkR>I}pzqqx#-rli)*PWt%X2GE+uG`C1&KVs#O8I(Iv24por2CkM;d!P
z3-EA1LrkCh?Jx7Lca$Mtfnto$Vu_<xTu^z(CuzW3zadG(9t0gYu;5gd=Zd30ChhwA
zZ0Nb0+Z*ShuouK8?1EnR1I8Z5w|2W#XwvKE4!8Rpo@@zhsa{OQaHyk4zkdBdy6>rT
z>gfy!JfA6jl--B!+uG8KLCW<TKbAsA4qgs;K@WiT@=J5E7!r|%I5xogrSGobZ0Bq9
z4#x~QI;gDO`h+sFq1x!H2&0DDZD=p927tI>7arL_fVpR!bx68+{T1Oo!FJn*4I4br
zb{wvHdw8RJ&C8;aCzl!6{@r*5RW$q6@~HiEQp-45cNG6mXlR*2q!W*eT{$DXD*yZq
z;|A0K*dkU=-@f+A6GJU*sPr>ldF{)4v%fnqVGiaOA`07fgJTGeE?kn@T-8vtV5v{S
zu$fK`UONk2wZB`P9JpS$`Hl9|%+)cRxWb(_4`NVVQxJ)%6D}Qd=ra=P@tnFA?jPAc
zl=kPJ88Dv~yf9Ek6vmFVY;{HGpjLVbawg43c@K|0?}fVm-Iwlj&H!&;Yz$i>3XnZ&
zop*{>$F+RlK6@;V7U++nW{a$&;^hIKzyXNv5{|<m<KE`3?)euM(nZn&x|h0Eo?OuI
zU6729-hzrp`Z{%A=2y^u#+PBcE>^gF{zblLL}LR9CcLCD-eWABRjTo&nyeIUtUxfw
zy(>VgxpdSVY~Ws}!RP@<wHpiX^$*7_iMTTBs9(+au4YusQLKrwG#41QXt-3)I<)^y
z9?;Lzs;V;_R;`P6>EUaT_3&}71*bnggpW>4hw7dteg)YNF2O6`du37jWRfCw_x{zP
zM)MmlBZ`?3u-N--+>Hv|Ej?E~`e>VW#KFe@S#o*EV>W0m97$hug63UP@(JiY`w*>e
z-E?FwqPszusl&AZ)}0_UsjqnWL%fTidY9-gfe4DuRprELy^Ze<<KmXLcO<1rVdj*K
zmZ%xUsoDJ@Yi}A};=`t8Wo4-otshQ}pGS7{vE7aG@4E4b5f8F`qzI1mDI0qjFezh2
z3o4GqT{li#cX<#AKL-Y6khw@&_IU|&r~%3{KWTnb^w0UX)cb+8M@81%DSAjmp1E8-
z0Nal*TMSko?U*Q;7W0;OjtvMb#1FH|HWS!~dpQU&%EqBuM~rIBR>k1C3DoH0xNrKL
z6JcS~H_kZo^VY4g9XxE=p{(V&1G`z(du-1h^bZD~yk4{K<&Re%YG`T-8|<Wktl#%2
zzF;=+H#(72kuqY)McqtGxqIm0*}fy`(7y@P<k#_Fi}I>^F`{;TXmMflAJu_@fodjS
z&mVCau&6OGx=nFGDKXK~avBK<h;@O7mgiW%S=R;<9O?3|we5we>aReD^aHgZ1qJJz
zT(0CH6pf#$ic0^FGw<I00i)Q%`hi=v{Eq99H_;TBA#48rvjy=Dlk`%Y1L$i7P7W@m
z8+isCbm#mb85Zk$r~t$VdT#L%FX%-ZhezP3bC%oJJ%9c?{A^IwaAu8SI!w?23Z42$
z1nQ|VaoS5@1OkesYXB{`&6C+m)=z9o^<s{cKQ~UdhNPRnVqE|lH>!a}O+CuSY9YbM
z;=?b_m~3Hj$3wwUAlG1{_pia!5Gz`Xkj_>31B^ywCp@~2xlki*XJh0ULobU?<8@Qh
zBHRMZwC`IirYpF;!jTbe<Wl|H+`PclEY7f|!^Bn-G$ogw)iT6O?*OhT7rjIe4rb(3
zz%^TV)q1v0TuKv@pJ8rgb@5d}Hb%W^J}_9jsDAZ3ckE~|?Han>fG*3J+Oi07vURN9
zo-4W};&{w(S1Mq&j~)<R?ALT1W7e=r)2G+k&9^YxarVXTBhGBRt=nvMYcfIcD;%iR
zOeS^byA^?nK}$d>1i+(ez-{$EaNog=rB+XF7<ndUinj?=cj_y&A#uak5t!9#YR=8Q
zR-N2-E@@ko`;dzZ`iQ#YZqD-+!=hh4{x$g?gK68AUFV|RH#Nicd83a=s=nhQU@yJ$
z(tv-$S9a+#CoZj+5L3^<?SYvi*M61GX#6dP7qWe*d*W8@kXiL$ha6bBMH@XNet9hM
zl|-O!|M%a+SvSD>=R3?is1IJi_~#w<bsVi~JF;EBzI|`e;1b-z^J>7guCKy3#dfT?
zw~Jzl*$#K#=Klh(9zB{v^LPE_YoA`Y2U5z2PB`pB;~WG$34S3^;ilsFgAP>`mMG0q
zUnU6B*?mPPG=%u0!wnB$uYj3GQ4;4zCn3se-@bi+AYEoD(Y^hS;LS2`nl+@2%=ZaP
zl{1{-Od0Um?m<anu(stya^Wxcf^kvm+WTS&u!9}%7j_}~aQCFTq7ooljElZ57D7oz
zyvDA8N*Q|ftw>^yAniPS`IF#U-=%kMQ@nE{v}RN}e+?(>@2_yG#19A&&yA8pvOFGB
zdKd)i&5GYy_8PqV9m4O~NruV=;J0lTg)*KizUeW<lM1&7%5GUAyar2TTcQ_nq1dV+
zUrzd=%^K3;a8Wd#OeLlzh(44`VEYSkvjb^=TBI7<Q#79F-7xrlepDz7@$(2p#)Y2F
zJJku)+Zx`g_U(glTf5yQJBSvEwt+`x&JjzSaEOT_2S)Oq$aAvUk{n6>t;nodDa<Fa
zus7F+g)`&OAJ;K%cu*MOvy99lR!3yqll*4|AFc_&Quvl}-+zDeT^${pu%xMrD|N6%
zp71YB-?$MBnrLq7v8QB%IyMQJ333}nU6SZi7cQewl$Xzlg}e6T`cxu2IG?zLEs{fj
z8anbRZ}s%qvy+&h%h?seW6JgNrXmPs(MX$Wna9z$?WK1S71NTU<VbQaEx{z=GbNWX
zrURC{kT4}HMWi2MvjFdJ5Z8jibtn(HoZKj-1;@5sl0j3^X<WNDRw5gAb5JD%p?ov5
z*Cq}!Ffg!rW_Wg#*ZhLcdy6{Hm~@X@fo`XER*DrTG~d;4xOjMD-#)#2i@k<0Scq_c
zmIo9cZ_ET&e;5&Xgu7g>2r&3G<xHIF$1Q0(wom6ocz8LtKYYXkHD_6hXq{hz89gKJ
z`!Z<G5YXc9ILc0}u)t`IAi?`=nZajATQ$8?Y-vu*-Xa=HBiI)Xw4B7&ND+@hvZ}bz
zNV^I*HUI9kJlqrJ-#<-VA&N;6&<c?Uxr;Ryry-`i<k&}Hn??Kz?{a}MfFXS}8I6MJ
zCiE=NtVq!xATuqpJ~I*-9}zW}*lD?>smV@-m!0yUW+VG$TlMDlPraYCXGe%D-ISRM
z_@INnXnHY1m18`*7xu=J=$M>5nPA5KHge6-2pw6;F@~p*m)H|uMHEYQ_mWLq0l{EP
zy05=CNlzS{!pk30bFn#G-hR#_p0T`pW~bL4oysyA^I2qHAD!F3Dy<)$P|j;|!5=R`
zcq8sBd}jO*<#v|C0cDY{aQB!HSN~;<az};LJ>IM(SXh3>LkdV9gU$)>8OfU2G2zTu
zH1We&hn4RKi}kA+mrYm9F%!oSs_fX1VGP@{qy}ddGY$&=konEPTxtwa3=*1Rdz{2B
zYlBe7N-fL9a4^1^%M*5o33!K&PU@X>FP849$JaKyLCKpP81fw~5;6cYS}W0>3mpzL
z{|tB_?wQ{rGgwiu3lZketUA3i7*2Rp%PS{(v3n)SqUTlDzdGM8h0~F&Av+ReVPP=u
zGCYDOVB1bl)I;3yMRn)P*|%){NVIH-SZ{e+**5kkdE+x1(;>j+`meu^Cdk-OXdCey
z2bK;*qSt%DJ~$Nq{K4_nVh~$Tz|2q1WogS62=<DVWAsEGYkPDD71@LIArVlR(K7f^
zUZ;2Y)p2mj#Q~hw_BT$Sw6iTmSv!MK#pp7m5wmAQ7}-*jSRDN)hy<@W^V@H{N-PA?
zEAjF1Ht~dU9|KOAUYo!&Sja{btPT~Tn^%ZfwQ=D;$gyA<X;sY?7r|x5n!J6!0sXhj
zd7F~O01&w%T4xw@{z$ut6~;}R)1V86{xE?mHdoy6&?S;kBWmX@t)#Ng8GKtHa%2ud
za7Psy_l=a{d$?bVm3nu!<;Ad9wnZPy<(!SLMBc1J_Wa}3`j70H#zsXq@5OXTbq<}-
z6#Ayz2e?oM_oqURMtODC8Z{8zKhlfoAsju_XS<e(Bh*O_bG+vd*Fln1aB)LZ`9ybt
zDNaW(v?YY0M!7KXXxa|92`evW7F`721$4>npy7I*!cmkvV!9c2Gi06KMH;TI*56c!
zLKQB|??#>q#%w=D@nM#*F@PsCRjBbXCcpwW^D694O*IFKvS#L(8!IED{ehUoZ;67w
zUV;TQ=r<c&6K(-=uq^IUF<wA=LNESX2fsbw0kkJSP+Qf*5Kvc7dB9XadOaRI?w{yU
zJRB)jMC{4c%%RX=&$S)ii8~}|<rz`V)5BvI(nQHSvc#5OiJ;A6j(L+1IFMAL5tG3Q
z8WF?|Z8ajxL7G~N_EE76YZ?xMoRH~Xs3<v0nmBO_c^pjKZZw4)uhl)&aF$hkV&cEG
zwM}KrO;od_4s}h3!h!;E`jZ1m<TdH!CG}*v!Jkx+)bfyqcmuG?*ePYs@U^cS!|)|@
zqxu&=_#Q(SXyeuwT!*`Ny=c9_egu|BsU9leI-NOexQLKncrg+-lJadbVCL{sk-VAc
z^-=d8hiiF;4BSw2;J=^o)WeC#r&ttn>I!9Q1TF#<d@gwrk<3IK@tw$m(67Jzw4udY
zQk=}FC>`y|WZo#i5E=^Fivz}9BCB|wiopX_Imoq9^C+r;StMW^p0?;mDGKCivlZ=x
zC5xQWmuYotk%yomv^cx^U$st4?{HAOAB=qtX{Mg_C6=1^2cW_^&PIzwAp`b1{YO8|
zgmmJEAnxR$a49Od)l-RFOfpWB`^<ghGWPPg)4h4A2nG%-c<58;$=-WmBG^0$G+xeD
zL?(q-W5gO?m-d;3)#v%xyHJ{maT%+EJ!~PYA!-}ePz3#48EJOc{{416FU#>1ZC}fN
z!5vHaKRb0q(m|sVbnjzNlKmyGampXbzl}pjs1%S2iHP(mK|0`($T@Fhd`TwJ9wsg@
zR9WOg8y0(&gL2|c(b!4V(J4$(s@fql)QN<C6Rx1`eQsKE_{}=ejqvFf(#AFn{_OqR
z`@i{A1z0PNk<6q{YZ$C*7x}eg)*csDs*;)f-OZ6KEy|@Joe|N>-`trgmI?2^e)^5M
z0+HfEH}D`VLCPBj%MVRI>Y}B!RAayH4_5>CzW-&#vepl9%um086M_ySUtk8c=bGoE
zN7_jsIi?+tHL*u!Ny+>h6P14$-)j6IN%=pf)f2Q3_EsO(93a<fy)&$}<}<e8d8fY0
zUny9p;)YFo{eYoQ@heJ<6{qD5Oh<cS0Z?q*yl$!T*A+fHRFb`J|6rRA3CwEUB`#_H
zZ|s}&d_G*YX-n@9pZ$A^!)Ykp&E+_{J%<i0D%~otc;6mz2Me~Zxh$KWey`@f`(*O7
zE6qRs>74-l1ew%foo`*zMgUz{*Xjp<Hc{-xsq38Iy`ubO{4ZUUVg&y&VQjk87p(ru
z>2kCE*UtfmN%bJZSG9dZ{y@?AFV$+X-h!{{l)%7+tywj!<GpB9-(PDQ2yz_yb@QO}
z=M6usk4pa!m)RS<ogBVPM^w2uNwuv~#ku_=E}&)&3PZ|%l4!glTIrnN|K<zTOX)hJ
z{aT`dO~RFT3*lS@7vKE`TQ>RK1C(oh)G|&*4|hBWE)zvt5fnyvSOiS;?v+jbq)21P
zs*kyNV~G>XpKt8XAFkK{EJXsJerHg7CnI@-%Ky;F`l1XG4LWP~Ha%@oU*%d~HrsQ8
zrTO|9Jr#;i9p#JwOUt;GUFGli@-N_i9S3YJ$00B%3~9wFs+|?lvj0u}#3vqF=5MN9
zJxA-K<sY8ZP0}a(%qSIYi#6I@%>gn&JLd~VkF`|>T>hG>zTJNeHJq(%37!5%M$=F-
z|5pIH(tKh|imHQq5<)2>sGKhccs;GIUQ**Jm#qBqf3tP$A}WQHZOZTb<zHRrk2R_s
zlsDloKl(FY@|PcIf1z53fBC-RX1fpA-(P-Gt57Y5zkJ{STT|6D|H}{l`z>@I!P|?`
zE2{gVq7%yB_1DmySEHIomha9fq;M92U{zs~omk8|rX-5gpe9!JH-A$-8-=qc!edUO
zvImuD7%UiO{8EQs{2asw+z~!TUL<HaB!wW{8GBW?>AVJ)V^gI%bLZF`9~5#j@rA$C
zqAUI%VFek5-gjQx+xov5LzI?8B~#rXoU00`Zm33cMM!j`TzO@toj>_-W2YOF_r;r^
zhN@Vfid|wW1@mrFnr4&);*Mw0k%;nBDiSdZy@44=cEn^Y8JH8rvgQC)SvLHGjB{r`
zS3!OpD$ZDCt**}Kc;yI*ws~h%>7t<toF+H5@iIibC{pF92cRdn7cT#%s_?~TTKGel
zqWl<Dd|tL$6>k-%#a5C;DPlls3HyUb8lOLx2}9QPAnp(@Czgme<-9-#?+zI89im`e
zGwg);`=7G@&sB#rF$6)_{Gu1HN!)fwU8+xQ;s6lBPf<<}lJXP3EpK-9Y%JC)*fF$V
z$GT-EEx$HTxYqpYGrqNQwFwa-luwDtv<;@&886r17LXlo`L&RDP#=lGQ<RU=f30fY
zDV+6i$e;|$LzQs++_`h^rDrkn1;a`%t`Lxbf+D53uCKtz;1{h9XI@Z*mID;6T;oRt
zGZdI5>C{}<e`)Kg7j>#0-=q45NxW9;Bfqf6es4FtdGUqnTk#LyypS%mZA^F-sY|Sg
z#j*>}jribyC@;_S*Dlt-6@95(Ast7t5ajVvRA!??m$R~(Dd(ljqbsd+AbKg5P39oQ
zO*)Yl2Mq`$a@1EiYwvbiTH1#d>|bD}+ITzrNKH*eDm|7ti0FQ?Y8KTav|A#Xm=>?P
zya%|W1kdV4+c`k}Np!}3F$zF7-4K|GjZAVy51Hi=Rl&>+Z+2~x&nlmNVcW;&vn#(<
z-QWimWJW_~#~?-ZqCc^S!^4WIS`C#SUHO^Y{`C*|e-Wls*ZhA+!Yi%qhB*G1xi9Yi
RX8CRNaaP|)kD0Une*r_c+CBgP

diff --git a/setup.py b/setup.py
index 5b7d12bb3..8b2b4f7e5 100644
--- a/setup.py
+++ b/setup.py
@@ -461,14 +461,22 @@ class precompiled_wheel_utils:
                     "vllm/cumem_allocator.abi3.so",
                 ]
 
-                compiled_regex = re.compile(
+                flash_attn_regex = re.compile(
                     r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
                 )
+                triton_kernels_regex = re.compile(
+                    r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+                )
                 file_members = list(
                     filter(lambda x: x.filename in files_to_copy, wheel.filelist)
                 )
                 file_members += list(
-                    filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
+                    filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
+                )
+                file_members += list(
+                    filter(
+                        lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
+                    )
                 )
 
                 for file in file_members:
@@ -648,7 +656,7 @@ def get_vllm_version() -> str:
         if envs.VLLM_TARGET_DEVICE == "empty":
             version += f"{sep}empty"
     elif _is_cuda():
-        if envs.VLLM_USE_PRECOMPILED:
+        if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
             version += f"{sep}precompiled"
         else:
             cuda_version = str(get_nvcc_cuda_version())
diff --git a/vllm/envs.py b/vllm/envs.py
index 8b954fa14..4b594e54f 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -78,6 +78,7 @@ if TYPE_CHECKING:
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
+    VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -462,6 +463,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     .lower()
     in ("1", "true")
     or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    # If set, skip adding +precompiled suffix to version string
+    "VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
+        int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
+    ),
     # Used to mark that setup.py is running in a Docker build context,
     # in order to force the use of precompiled binaries.
     "VLLM_DOCKER_BUILD_CONTEXT": lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "")
-- 
GitLab


From b78772c433515a22bfeeaea41f3524002609e264 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Wed, 3 Dec 2025 20:53:44 +0800
Subject: [PATCH 044/258] [Frontend] supports deepseekv32 chat template
 (#29837)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/config/model.py                      |   3 +-
 vllm/entrypoints/openai/serving_engine.py |   9 +-
 vllm/tokenizers/__init__.py               |   2 +
 vllm/tokenizers/deepseek_v32_encoding.py  | 456 ++++++++++++++++++++++
 vllm/tokenizers/deepseekv32.py            | 148 +++++++
 5 files changed, 616 insertions(+), 2 deletions(-)
 create mode 100644 vllm/tokenizers/deepseek_v32_encoding.py
 create mode 100644 vllm/tokenizers/deepseekv32.py

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 5de976976..655b7c995 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -84,7 +84,7 @@ TaskOption = Literal[
     "transcription",
     "draft",
 ]
-TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
+TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
     "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -141,6 +141,7 @@ class ModelConfig:
     - "hf" will use the fast tokenizer if available.\n
     - "slow" will always use the slow tokenizer.\n
     - "mistral" will always use the tokenizer from `mistral_common`.\n
+    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
     - Other custom values can be supported via plugins."""
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 67291f45a..9642024dd 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -105,7 +105,7 @@ from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.tokenizers import MistralTokenizer, TokenizerLike
+from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer, TokenizerLike
 from vllm.tracing import (
     contains_trace_headers,
     extract_trace_headers,
@@ -1128,6 +1128,13 @@ class OpenAIServing:
                 messages=messages,
                 **_chat_template_kwargs,
             )
+        elif isinstance(tokenizer, DeepseekV32Tokenizer):
+            request_prompt = tokenizer.apply_chat_template(
+                conversation=conversation,
+                messages=messages,
+                model_config=model_config,
+                **_chat_template_kwargs,
+            )
         else:
             request_prompt = apply_hf_chat_template(
                 tokenizer=tokenizer,
diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py
index 42487f5f5..67a6d7c8e 100644
--- a/vllm/tokenizers/__init__.py
+++ b/vllm/tokenizers/__init__.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from .deepseekv32 import DeepseekV32Tokenizer
 from .hf import HfTokenizer
 from .mistral import MistralTokenizer
 from .protocol import TokenizerLike
@@ -21,4 +22,5 @@ __all__ = [
     "get_tokenizer",
     "cached_tokenizer_from_config",
     "init_tokenizer_from_config",
+    "DeepseekV32Tokenizer",
 ]
diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py
new file mode 100644
index 000000000..72f43395b
--- /dev/null
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
@@ -0,0 +1,456 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
+import copy
+import json
+import re
+from typing import Any
+
+# flake8: noqa: E501
+TOOLS_SYSTEM_TEMPLATE = """## Tools
+You have access to a set of tools you can use to answer the user's question.
+You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
+<{dsml_token}function_calls>
+<{dsml_token}invoke name="$FUNCTION_NAME">
+<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
+...
+</{dsml_token}invoke>
+<{dsml_token}invoke name="$FUNCTION_NAME2">
+...
+</{dsml_token}invoke>
+</{dsml_token}function_calls>
+String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
+If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
+<{dsml_token}function_calls>
+...
+</{dsml_token}function_calls>
+<function_results>
+...
+</function_results>
+{thinking_start_token}...thinking about results{thinking_end_token}
+Here are the functions available in JSONSchema format:
+<functions>
+{tool_schemas}
+</functions>
+"""
+
+bos_token: str = "<｜begin▁of▁sentence｜>"
+eos_token: str = "<｜end▁of▁sentence｜>"
+thinking_start_token: str = "<think>"
+thinking_end_token: str = "</think>"
+dsml_token: str = "｜DSML｜"
+system_msg_template: str = "{content}"
+user_msg_template: str = "<｜User｜>{content}<｜Assistant｜>"
+assistant_msg_template: str = "{reasoning}{content}{tool_calls}<｜end▁of▁sentence｜>"
+thinking_template = "{reasoning_content}"
+
+response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
+tool_call_template: str = (
+    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
+)
+tool_calls_template = (
+    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
+)
+
+tool_output_template: str = "\n<result>{content}</result>"
+
+
+def to_json(value: Any) -> str:
+    try:
+        return json.dumps(value, ensure_ascii=False)
+    except Exception:
+        return json.dumps(value, ensure_ascii=True)
+
+
+def tools_from_openai_format(tools):
+    return [tool["function"] for tool in tools]
+
+
+def tool_calls_from_openai_format(tool_calls):
+    return [
+        {
+            "name": tool_call["function"]["name"],
+            "arguments": tool_call["function"]["arguments"],
+        }
+        for tool_call in tool_calls
+    ]
+
+
+def tool_calls_to_openai_format(tool_calls):
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": tool_call["name"],
+                "arguments": tool_call["arguments"],
+            },
+        }
+        for tool_call in tool_calls
+    ]
+
+
+def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
+    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
+    P_dsml_strs = []
+
+    arguments = json.loads(tool_call["arguments"])
+
+    for k, v in arguments.items():
+        p_dsml_str = p_dsml_template.format(
+            dsml_token=dsml_token,
+            key=k,
+            is_str="true" if isinstance(v, str) else "false",
+            value=v if isinstance(v, str) else to_json(v),
+        )
+
+        P_dsml_strs.append(p_dsml_str)
+
+    return "\n".join(P_dsml_strs)
+
+
+def decode_dsml_to_arguments(
+    tool_name: str, tool_args: dict[str, tuple[str, str]]
+) -> dict[str, str]:
+    def _decode_value(key: str, value: str, string: str):
+        if string == "true":
+            value = to_json(value)
+        return f"{to_json(key)}: {value}"
+
+    tool_args_json = (
+        "{"
+        + ", ".join(
+            [_decode_value(k, v, string=is_str) for k, (v, is_str) in tool_args.items()]
+        )
+        + "}"
+    )
+    return dict(name=tool_name, arguments=tool_args_json)
+
+
+def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
+    tools_json = [to_json(t) for t in tools]
+
+    return TOOLS_SYSTEM_TEMPLATE.format(
+        tool_schemas="\n".join(tools_json),
+        dsml_token=dsml_token,
+        thinking_start_token=thinking_start_token,
+        thinking_end_token=thinking_end_token,
+    )
+
+
+def find_last_user_index(messages: list[dict[str, Any]]) -> int:
+    last_user_index = -1
+    for idx in range(len(messages) - 1, -1, -1):
+        if messages[idx].get("role") in ["user", "developer"]:
+            last_user_index = idx
+            break
+    return last_user_index
+
+
+def render_message(
+    index: int, messages: list[dict[str, Any]], thinking_mode: str
+) -> str:
+    assert 0 <= index < len(messages)
+    assert thinking_mode in ["chat", "thinking"], (
+        f"Invalid thinking_mode `{thinking_mode}`"
+    )
+
+    prompt = ""
+    msg = messages[index]
+    last_user_idx = find_last_user_index(messages)
+
+    role = msg.get("role")
+    content = msg.get("content")
+    tools = msg.get("tools")
+    response_format = msg.get("response_format")
+    tool_calls = msg.get("tool_calls")
+    reasoning_content = msg.get("reasoning") or msg.get("reasoning_content")
+
+    if tools:
+        tools = tools_from_openai_format(tools)
+    if tool_calls:
+        tool_calls = tool_calls_from_openai_format(tool_calls)
+
+    if role == "system":
+        prompt += system_msg_template.format(content=content or "")
+        if tools:
+            prompt += "\n\n" + render_tools(tools)
+
+        if response_format:
+            prompt += "\n\n" + response_format_template.format(
+                schema=to_json(response_format)
+            )
+
+    elif role == "developer":
+        assert content, f"Invalid message for role `{role}`: {msg}"
+        content_developer = ""
+        if tools:
+            content_developer += "\n\n" + render_tools(tools)
+
+        if response_format:
+            content_developer += "\n\n" + response_format_template.format(
+                schema=to_json(response_format)
+            )
+
+        content_developer += "\n\n# The user's message is: {}".format(content)
+
+        prompt += user_msg_template.format(content=content_developer)
+        if index == last_user_idx and thinking_mode == "thinking":
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == "user":
+        prompt += user_msg_template.format(content=content)
+
+        if index == last_user_idx and thinking_mode == "thinking":
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == "tool":
+        prev_assistant_idx = index - 1
+        assistant_msg = messages[prev_assistant_idx]
+        while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool":
+            prev_assistant_idx -= 1
+            assistant_msg = messages[prev_assistant_idx]
+
+        assert (
+            index == 0
+            or prev_assistant_idx >= 0
+            and assistant_msg.get("role") == "assistant"
+        ), f"Invalid messages at {index}:\n{assistant_msg}"
+
+        tool_call_order = index - prev_assistant_idx
+        assistant_tool_calls = assistant_msg.get("tool_calls")
+        assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, (
+            "No tool calls but found tool output"
+        )
+
+        if tool_call_order == 1:
+            prompt += "\n\n<function_results>"
+
+        prompt += tool_output_template.format(content=content)
+
+        if tool_call_order == len(assistant_tool_calls):
+            prompt += "\n</function_results>"
+
+            if index >= last_user_idx and thinking_mode == "thinking":
+                prompt += "\n\n" + thinking_start_token
+            else:
+                prompt += "\n\n" + thinking_end_token
+
+    elif role == "assistant":
+        prev_assistant_idx = index
+        thinking_part = ""
+
+        tool_calls_content = ""
+        if tool_calls:
+            tool_calls = [
+                tool_call_template.format(
+                    dsml_token=dsml_token,
+                    name=tool_call.get("name"),
+                    arguments=encode_arguments_to_dsml(tool_call),
+                )
+                for tool_call in tool_calls
+            ]
+            tool_calls_content += "\n\n" + tool_calls_template.format(
+                dsml_token=dsml_token, tool_calls="\n".join(tool_calls)
+            )
+
+        summary_content = content or ""
+
+        if thinking_mode == "thinking" and index > last_user_idx:
+            assert reasoning_content or tool_calls, (
+                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"
+            )
+            thinking_part = (
+                thinking_template.format(reasoning_content=reasoning_content or "")
+                + thinking_end_token
+            )
+
+        prompt += assistant_msg_template.format(
+            reasoning=thinking_part,
+            content=summary_content,
+            tool_calls=tool_calls_content,
+        )
+    else:
+        raise NotImplementedError(f"Unknown role: {role}")
+
+    return prompt
+
+
+def drop_thinking_messages(
+    messages: list[dict[str, Any]], last_user_idx: int | None = None
+) -> list[dict[str, Any]]:
+    messages_wo_thinking: list[dict[str, Any]] = []
+    last_user_idx = (
+        find_last_user_index(messages) if last_user_idx is None else last_user_idx
+    )
+    for idx, msg in enumerate(messages):
+        role = msg.get("role")
+        if role in ["user", "system", "tool"] or idx >= last_user_idx:
+            messages_wo_thinking.append(msg)
+            continue
+
+        elif role == "assistant":
+            msg_wo_thinking = copy.copy(msg)
+            msg_wo_thinking.pop("reasoning_content", None)
+            msg_wo_thinking.pop("reasoning", None)
+            messages_wo_thinking.append(msg_wo_thinking)
+
+    return messages_wo_thinking
+
+
+def encode_messages(
+    messages: list[dict[str, Any]],
+    thinking_mode: str,
+    context: list[dict[str, Any]] | None = None,
+    drop_thinking: bool = True,
+    add_default_bos_token: bool = True,
+) -> str:
+    context = context if context else []
+    full_messages = context + messages
+
+    prompt = bos_token if add_default_bos_token and len(context) == 0 else ""
+
+    if thinking_mode == "thinking" and drop_thinking:
+        full_messages = drop_thinking_messages(full_messages)
+
+    for idx in range(len(messages)):
+        prompt += render_message(
+            idx + len(context), full_messages, thinking_mode=thinking_mode
+        )
+
+    return prompt
+
+
+def _read_until_stop(
+    index: int, text: str, stop: list[str]
+) -> tuple[int, str, None | str]:
+    min_pos = len(text)
+    matched_stop = None
+
+    for s in stop:
+        pos = text.find(s, index)
+        if pos != -1 and pos < min_pos:
+            min_pos = pos
+            matched_stop = s
+
+    if matched_stop:
+        content = text[index:min_pos]
+        return min_pos + len(matched_stop), content, matched_stop
+    else:
+        content = text[index:]
+        return len(text), content, None
+
+
+def parse_tool_calls(index: int, text: str):
+    tool_calls: list[dict[str, Any]] = []
+    stop_token = None
+    tool_calls_end_token = f"</{dsml_token}function_calls>"
+
+    while index < len(text):
+        index, _, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
+        )
+        assert _ == ">\n", "Tool call format error"
+
+        if stop_token == tool_calls_end_token:
+            break
+
+        assert stop_token is not None, "Missing special token"
+
+        index, tool_name_content, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+        )
+
+        p_tool_name = re.findall(
+            r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
+        )
+        assert len(p_tool_name) == 1, "Tool name format error"
+        tool_name = p_tool_name[0]
+
+        tool_args: dict[str, tuple[str, str]] = {}
+        while stop_token == f"<{dsml_token}parameter":
+            index, param_content, stop_token = _read_until_stop(
+                index, text, [f"/{dsml_token}parameter"]
+            )
+
+            param_kv = re.findall(
+                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
+                param_content,
+                flags=re.DOTALL,
+            )
+            assert len(param_kv) == 1, "Parameter format error"
+            param_name, string, param_value = param_kv[0]
+
+            assert param_name not in tool_args, "Duplicate parameter name"
+            tool_args[param_name] = (param_value, string)
+
+            index, content, stop_token = _read_until_stop(
+                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+            )
+            assert content == ">\n", "Parameter format error"
+
+        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
+        tool_calls.append(tool_call)
+
+    return index, stop_token, tool_calls
+
+
+# NOTE: This function is designed to parse only correctly
+# formatted string and will not attempt to correct malformed output
+# that may be generated by the model.
+def parse_message_from_completion_text(text: str, thinking_mode: str):
+    summary_content, reasoning_content, tool_calls = "", "", []
+    index, stop_token = 0, None
+    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
+
+    is_thinking, is_tool_calling = thinking_mode == "thinking", False
+
+    if is_thinking:
+        index, content_delta, stop_token = _read_until_stop(
+            index, text, [thinking_end_token, tool_calls_start_token]
+        )
+        reasoning_content = content_delta
+        assert stop_token == thinking_end_token, "Invalid thinking format"
+
+    index, content_delta, stop_token = _read_until_stop(
+        index, text, [eos_token, tool_calls_start_token]
+    )
+    summary_content = content_delta
+    if stop_token == tool_calls_start_token:
+        is_tool_calling = True
+    else:
+        assert stop_token == eos_token, "Invalid summary format"
+
+    if is_tool_calling:
+        index, stop_token, tool_calls = parse_tool_calls(index, text)
+
+        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
+        assert not tool_ends_text, "Unexpected content after tool calls"
+
+    assert len(text) == index and stop_token in [eos_token, None], (
+        "Unexpected content at end"
+    )
+
+    for sp_token in [
+        bos_token,
+        eos_token,
+        thinking_start_token,
+        thinking_end_token,
+        dsml_token,
+    ]:
+        assert sp_token not in summary_content and sp_token not in reasoning_content, (
+            "Unexpected special token in content"
+        )
+
+    return {
+        "role": "assistant",
+        "content": summary_content,
+        "reasoning_content": reasoning_content,
+        "reasoning": reasoning_content,
+        "tool_calls": tool_calls_to_openai_format(tool_calls),
+    }
diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
new file mode 100644
index 000000000..7466ad407
--- /dev/null
+++ b/vllm/tokenizers/deepseekv32.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pathlib import Path
+
+from transformers import BatchEncoding
+
+from .deepseek_v32_encoding import encode_messages
+from .hf import HfTokenizer, TokenizerLike
+from .registry import TokenizerRegistry
+
+
+@TokenizerRegistry.register("deepseek_v32")
+class DeepseekV32Tokenizer(HfTokenizer):
+    def __init__(self, tokenizer: TokenizerLike):
+        self.tokenizer = tokenizer
+        self.name_or_path = (
+            tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
+        )
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        path_or_repo_id: str | Path,
+        *args,
+        trust_remote_code: bool = False,
+        revision: str | None = None,
+        download_dir: str | None = None,
+        **kwargs,
+    ) -> "TokenizerLike":
+        tokenizer = super().from_pretrained(
+            path_or_repo_id,
+            *args,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            download_dir=download_dir,
+            **kwargs,
+        )
+        return DeepseekV32Tokenizer(tokenizer)
+
+    def apply_chat_template(self, messages, tools=None, **kwargs):
+        thinking = kwargs.get("thinking", False)
+        thinking_mode = "thinking"
+        if not thinking:
+            thinking_mode = "chat"
+        messages = messages.copy()
+        drop_thinking = True
+        if tools is not None and len(tools) > 0:
+            messages.insert(0, {"role": "system"})
+            messages[0]["tools"] = tools
+            drop_thinking = False
+        encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
+        prompt_str = encode_messages(messages, **encode_config)  # type: ignore
+        return prompt_str
+
+    @property
+    def all_special_tokens(self) -> list[str]:
+        return self.tokenizer.all_special_tokens
+
+    @property
+    def all_special_ids(self) -> list[int]:
+        return self.tokenizer.all_special_ids
+
+    @property
+    def bos_token_id(self) -> int:
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_token_id(self) -> int:
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_token_id(self) -> int:
+        return self.tokenizer.pad_token_id
+
+    @property
+    def is_fast(self) -> bool:
+        return self.tokenizer.is_fast
+
+    @property
+    def vocab_size(self) -> int:
+        return self.tokenizer.vocab_size
+
+    @property
+    def max_token_id(self) -> int:
+        return self.tokenizer.max_token_id
+
+    @property
+    def truncation_side(self) -> str:
+        return self.tokenizer.truncation_side
+
+    def __hash__(self) -> int:
+        return hash(id(self))
+
+    def __len__(self) -> int:
+        # </think> is an added token in DeepseekV32 tokenizer
+        return self.vocab_size + len(self.get_added_vocab())
+
+    def __call__(
+        self,
+        text: str | list[str],
+        text_pair: str | None = None,
+        add_special_tokens: bool = True,
+        truncation: bool = False,
+        max_length: int | None = None,
+    ) -> "BatchEncoding":
+        return self.tokenizer(
+            text,
+            text_pair=text_pair,
+            add_special_tokens=add_special_tokens,
+            truncation=truncation,
+            max_length=max_length,
+        )
+
+    def get_vocab(self) -> dict[str, int]:
+        return self.tokenizer.get_vocab()
+
+    def get_added_vocab(self) -> dict[str, int]:
+        return self.tokenizer.get_added_vocab()
+
+    def encode(
+        self,
+        text: str,
+        truncation: bool | None = None,
+        max_length: int | None = None,
+        add_special_tokens: bool = True,
+    ) -> list[int]:
+        return self.tokenizer.encode(
+            text,
+            truncation=truncation,
+            max_length=max_length,
+            add_special_tokens=add_special_tokens,
+        )
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        return self.tokenizer.convert_tokens_to_string(tokens)
+
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
+        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
+
+    def convert_ids_to_tokens(
+        self,
+        ids: list[int],
+        skip_special_tokens: bool = False,
+    ) -> list[str]:
+        return self.tokenizer.convert_ids_to_tokens(
+            ids, skip_special_tokens=skip_special_tokens
+        )
-- 
GitLab


From 15b1511a15dfb1d56048847da755213632c07b29 Mon Sep 17 00:00:00 2001
From: ioana ghiban <ioana.gbn@gmail.com>
Date: Wed, 3 Dec 2025 13:56:47 +0100
Subject: [PATCH 045/258] [GPU Backend] [Doc]: Remove duplicate statements on
 missing GPU wheels. (#29962)

Signed-off-by: Ioana Ghiban <ioana.ghiban@arm.com>
---
 docs/getting_started/installation/gpu.rocm.inc.md | 3 ---
 docs/getting_started/installation/gpu.xpu.inc.md  | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/docs/getting_started/installation/gpu.rocm.inc.md b/docs/getting_started/installation/gpu.rocm.inc.md
index c80ba9478..21120cc6f 100644
--- a/docs/getting_started/installation/gpu.rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -5,9 +5,6 @@ vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.
 !!! tip
     [Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
 
-!!! warning
-    There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
-
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md
index 620a660a2..7e9c6a2b9 100644
--- a/docs/getting_started/installation/gpu.xpu.inc.md
+++ b/docs/getting_started/installation/gpu.xpu.inc.md
@@ -2,9 +2,6 @@
 
 vLLM initially supports basic model inference and serving on Intel GPU platform.
 
-!!! warning
-    There are no pre-built wheels for this device, so you need build vLLM from source. Or you can use pre-built images which are based on vLLM released versions.
-
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
-- 
GitLab


From 1bb17ecb396f911beaa26ab0d3926d46154c7155 Mon Sep 17 00:00:00 2001
From: ioana ghiban <ioana.gbn@gmail.com>
Date: Wed, 3 Dec 2025 14:33:50 +0100
Subject: [PATCH 046/258] [CPU Backend] [Doc]: Update Installation Docs for
 CPUs (#29868)

Signed-off-by: Ioana Ghiban <ioana.ghiban@arm.com>
---
 .../installation/cpu.apple.inc.md             |  7 +++--
 .../installation/cpu.arm.inc.md               | 26 ++++++++++++----
 docs/getting_started/installation/cpu.md      | 30 +++++++++++++++++--
 .../installation/cpu.s390x.inc.md             |  7 +++--
 .../installation/cpu.x86.inc.md               |  2 ++
 5 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/docs/getting_started/installation/cpu.apple.inc.md b/docs/getting_started/installation/cpu.apple.inc.md
index 4dc707d5f..9f1f6e382 100644
--- a/docs/getting_started/installation/cpu.apple.inc.md
+++ b/docs/getting_started/installation/cpu.apple.inc.md
@@ -4,9 +4,6 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
-!!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
-
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
@@ -20,6 +17,8 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
 
+Currently, there are no pre-built Apple silicon CPU wheels.
+
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
@@ -78,6 +77,8 @@ uv pip install -e .
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
+Currently, there are no pre-built Arm silicon CPU images.
+
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
 
diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md
index 9cae9ed1a..156f31f63 100644
--- a/docs/getting_started/installation/cpu.arm.inc.md
+++ b/docs/getting_started/installation/cpu.arm.inc.md
@@ -1,11 +1,6 @@
 # --8<-- [start:installation]
 
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
-
-ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
-
-!!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
+vLLM offers basic model inferencing and serving on Arm CPU platform, with support NEON, data types FP32, FP16 and BF16.
 
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
@@ -20,6 +15,23 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
 
+Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels contain pre-compiled C++ binaries.
+Please replace `<version>` in the commands below with a specific version string (e.g., `0.11.2`).
+
+```bash
+uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
+```
+
+??? console "pip"
+    ```bash
+    pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
+    ```
+
+The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
+
+!!! note
+    Nightly wheels are currently unsupported for this architecture. (e.g. to bisect the behavior change, performance regression).
+
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
@@ -69,6 +81,8 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
+Currently, there are no pre-built Arm CPU images.
+
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
 ```bash
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 4b68cb481..210f720e2 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -46,11 +46,25 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 
 ### Pre-built wheels
 
-Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
-
 When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
 For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
 
+=== "Intel/AMD x86"
+
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-wheels"
+
+=== "ARM AArch64"
+
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-wheels"
+
+=== "Apple silicon"
+
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-wheels"
+
+=== "IBM Z (S390X)"
+
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-wheels"
+
 ### Build wheel from source
 
 #### Set up using Python-only build (without compilation) {#python-only-build}
@@ -87,6 +101,18 @@ VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu
 
     --8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"
 
+=== "ARM AArch64"
+
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-images"
+
+=== "Apple silicon"
+
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-images"
+
+=== "IBM Z (S390X)"
+
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-images"
+
 ### Build image from source
 
 === "Intel/AMD x86"
diff --git a/docs/getting_started/installation/cpu.s390x.inc.md b/docs/getting_started/installation/cpu.s390x.inc.md
index c2163139a..4984c87c1 100644
--- a/docs/getting_started/installation/cpu.s390x.inc.md
+++ b/docs/getting_started/installation/cpu.s390x.inc.md
@@ -4,9 +4,6 @@ vLLM has experimental support for s390x architecture on IBM Z platform. For now,
 
 Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
 
-!!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
-
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
@@ -21,6 +18,8 @@ Currently, the CPU implementation for s390x architecture supports FP32 datatype
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
 
+Currently, there are no pre-built IBM Z CPU wheels.
+
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
@@ -69,6 +68,8 @@ Execute the following commands to build and install vLLM from source.
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
+Currently, there are no pre-built IBM Z CPU images.
+
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
 
diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md
index 310f179cb..1fad7f433 100644
--- a/docs/getting_started/installation/cpu.x86.inc.md
+++ b/docs/getting_started/installation/cpu.x86.inc.md
@@ -17,6 +17,8 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
 
+Currently, there are no pre-built x86 CPU wheels.
+
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-- 
GitLab


From 5aa9b090407d5fb9b89c05d28fab808623e3070c Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Wed, 3 Dec 2025 08:56:35 -0600
Subject: [PATCH 047/258] [CI/Build][AMD] Skip
 test_shared_storage_connector_hashes in test_shared_storage_connector.py due
 to hipErrorLaunchFailure when calling .cpu() (#29839)

Signed-off-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
---
 .../kv_connector/unit/test_shared_storage_connector.py   | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
index e7013a794..ff4697a97 100644
--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
@@ -3,12 +3,14 @@
 from dataclasses import asdict
 from typing import NamedTuple
 
+import pytest
 from PIL import Image
 
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import KVTransferConfig
 from vllm.multimodal.utils import encode_image_base64
+from vllm.platforms import current_platform
 
 MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
 
@@ -108,6 +110,13 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
         print("-" * 50)
 
 
+@pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason=(
+        "hipErrorLaunchFailure when running this test, see issue:"
+        "https://github.com/ROCm/pytorch/issues/2822"
+    ),
+)
 def test_shared_storage_connector_hashes(tmp_path):
     """
     Tests that SharedStorageConnector saves KV to the storage locations
-- 
GitLab


From 9bcf92295a918c9579d59a9c4d003cb563f495f7 Mon Sep 17 00:00:00 2001
From: Lumis Chen <lumischen01@gmail.com>
Date: Thu, 4 Dec 2025 00:06:57 +0800
Subject: [PATCH 048/258] [Core] Add xxHash as a high-performance hash option
 for accelerating prefix caching (#29163)

Signed-off-by: LuminolT <lumischen01@gmail.com>
Signed-off-by: Lumis Chen <lumischen01@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
---
 benchmarks/benchmark_hash.py              | 120 ++++++++++++++++++++++
 benchmarks/benchmark_prefix_block_hash.py | 110 ++++++++++++++++++++
 docs/benchmarking/cli.md                  |  29 ++++++
 tests/v1/engine/test_engine_args.py       |  16 +++
 vllm/config/cache.py                      |  18 +++-
 vllm/utils/hashing.py                     |  36 +++++++
 vllm/v1/core/kv_cache_utils.py            |  11 +-
 7 files changed, 332 insertions(+), 8 deletions(-)
 create mode 100644 benchmarks/benchmark_hash.py
 create mode 100644 benchmarks/benchmark_prefix_block_hash.py

diff --git a/benchmarks/benchmark_hash.py b/benchmarks/benchmark_hash.py
new file mode 100644
index 000000000..08cdc012d
--- /dev/null
+++ b/benchmarks/benchmark_hash.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+    (32-byte bytes object, 32-int tuple)
+
+Usage:
+    python benchmarks/hash_micro_benchmark.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+    """Generate a deterministic test payload."""
+    random.seed(seed)
+    bytes_data = bytes(random.getrandbits(8) for _ in range(32))
+    int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
+    return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+    """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
+    times: list[float] = []
+
+    # Warm-up to avoid first-run noise.
+    for _ in range(200):
+        func(data)
+
+    for _ in range(iterations):
+        start = time.perf_counter()
+        func(data)
+        end = time.perf_counter()
+        times.append(end - start)
+
+    avg = statistics.mean(times)
+    std = statistics.stdev(times) if len(times) > 1 else 0.0
+    return avg, std
+
+
+def _run_benchmarks(
+    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+    data: tuple,
+    iterations: int,
+):
+    """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
+    for name, func in benchmarks:
+        try:
+            avg, std = _benchmark_func(func, data, iterations)
+        except ModuleNotFoundError as exc:
+            print(f"Skipping {name}: {exc}")
+            continue
+        yield name, avg, std
+
+
+def builtin_hash(data: tuple) -> int:
+    """Wrapper for Python's built-in hash()."""
+    return hash(data)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=10_000,
+        help="Number of measured iterations per hash function.",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42, help="Random seed for test payload."
+    )
+    args = parser.parse_args()
+
+    data = _generate_test_data(args.seed)
+    benchmarks = (
+        ("SHA256 (pickle)", sha256),
+        ("xxHash (pickle)", xxhash),
+        ("built-in hash()", builtin_hash),
+    )
+
+    print("=" * 60)
+    print("HASH FUNCTION MICRO BENCHMARK")
+    print("=" * 60)
+    print("Test data: (32-byte bytes object, 32-int tuple)")
+    print(f"Iterations: {args.iterations:,}")
+    print("=" * 60)
+
+    results = list(_run_benchmarks(benchmarks, data, args.iterations))
+    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+    print("\nResults:")
+    for name, avg, std in results:
+        print(f"  {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+    if builtin_entry:
+        _, builtin_avg, _ = builtin_entry
+        print("\n" + "=" * 60)
+        print("SUMMARY (relative to built-in hash())")
+        print("=" * 60)
+        for name, avg, _ in results:
+            if name == "built-in hash()":
+                continue
+            speed_ratio = avg / builtin_avg
+            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+    else:
+        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/benchmark_prefix_block_hash.py b/benchmarks/benchmark_prefix_block_hash.py
new file mode 100644
index 000000000..8bcd8af0d
--- /dev/null
+++ b/benchmarks/benchmark_prefix_block_hash.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simple benchmark to compare prefix-cache block hashing algorithms.
+
+Example:
+    python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import sys
+import time
+from collections.abc import Callable, Iterable, Sequence
+
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
+
+SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
+
+
+def _generate_blocks(
+    num_blocks: int, block_size: int, vocab_size: int, seed: int
+) -> list[list[int]]:
+    rng = random.Random(seed)
+    return [
+        [rng.randrange(vocab_size) for _ in range(block_size)]
+        for _ in range(num_blocks)
+    ]
+
+
+def _hash_all_blocks(
+    hash_fn: Callable[[object], bytes],
+    blocks: Iterable[Sequence[int]],
+) -> float:
+    parent_hash: BlockHash | None = None
+    start = time.perf_counter()
+    for block in blocks:
+        parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
+    end = time.perf_counter()
+    return end - start
+
+
+def _benchmark(
+    hash_algo: str,
+    blocks: list[list[int]],
+    trials: int,
+) -> tuple[float, float, float] | None:
+    try:
+        hash_fn = get_hash_fn_by_name(hash_algo)
+        init_none_hash(hash_fn)
+        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
+    except ModuleNotFoundError as exc:
+        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
+        return None
+
+    avg = statistics.mean(timings)
+    best = min(timings)
+    # throughput: tokens / second
+    tokens_hashed = len(blocks) * len(blocks[0])
+    throughput = tokens_hashed / best
+    return avg, best, throughput
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
+    parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
+    parser.add_argument(
+        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
+    parser.add_argument(
+        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
+    )
+    parser.add_argument(
+        "--algorithms",
+        nargs="+",
+        default=SUPPORTED_ALGOS,
+        choices=SUPPORTED_ALGOS,
+        help="Hash algorithms to benchmark.",
+    )
+    args = parser.parse_args()
+
+    blocks = _generate_blocks(
+        args.num_blocks, args.block_size, args.vocab_size, args.seed
+    )
+    print(
+        f"Benchmarking {len(args.algorithms)} algorithms on "
+        f"{args.num_blocks} blocks (block size={args.block_size})."
+    )
+
+    for algo in args.algorithms:
+        result = _benchmark(algo, blocks, args.trials)
+        if result is None:
+            continue
+
+        avg, best, throughput = result
+        print(
+            f"{algo:14s} avg: {avg:.6f}s  best: {best:.6f}s  "
+            f"throughput: {throughput / 1e6:.2f}M tokens/s"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md
index 44a4c4012..1ce6b6117 100644
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -670,6 +670,35 @@ vllm bench serve \
 
 </details>
 
+### 🧪 Hashing Benchmarks
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Two helper scripts live in `benchmarks/` to compare hashing options used by prefix caching and related utilities. They are standalone (no server required) and help choose a hash algorithm before enabling prefix caching in production.
+
+- `benchmarks/benchmark_hash.py`: Micro-benchmark that measures per-call latency of three implementations on a representative `(bytes, tuple[int])` payload.
+
+```bash
+python benchmarks/benchmark_hash.py --iterations 20000 --seed 42
+```
+
+- `benchmarks/benchmark_prefix_block_hash.py`: End-to-end block hashing benchmark that runs the full prefix-cache hash pipeline (`hash_block_tokens`) across many fake blocks and reports throughput.
+
+```bash
+python benchmarks/benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32 --trials 5
+```
+
+Supported algorithms: `sha256`, `sha256_cbor`, `xxhash`, `xxhash_cbor`. Install optional deps to exercise all variants:
+
+```bash
+uv pip install xxhash cbor2
+```
+
+If an algorithm’s dependency is missing, the script will skip it and continue.
+
+</details>
+
 ### ⚡ Request Prioritization Benchmark
 
 <details class="admonition abstract" markdown="1">
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index e96759ed6..527a56ff4 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -9,6 +9,7 @@ from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.hashing import _xxhash
 
 
 def test_prefix_caching_from_cli():
@@ -48,6 +49,21 @@ def test_prefix_caching_from_cli():
         args = parser.parse_args(["--prefix-caching-hash-algo", "invalid"])
 
 
+@pytest.mark.skipif(_xxhash is None, reason="xxhash not installed")
+def test_prefix_caching_xxhash_from_cli():
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+
+    # set hash algorithm to xxhash (pickle)
+    args = parser.parse_args(["--prefix-caching-hash-algo", "xxhash"])
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert vllm_config.cache_config.prefix_caching_hash_algo == "xxhash"
+
+    # set hash algorithm to xxhash_cbor
+    args = parser.parse_args(["--prefix-caching-hash-algo", "xxhash_cbor"])
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert vllm_config.cache_config.prefix_caching_hash_algo == "xxhash_cbor"
+
+
 def test_defaults_with_usage_context():
     engine_args = EngineArgs(model="facebook/opt-125m")
     vllm_config: VllmConfig = engine_args.create_engine_config(UsageContext.LLM_CLASS)
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 00530846f..91f083a55 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -30,7 +30,7 @@ CacheDType = Literal[
     "fp8_ds_mla",
 ]
 MambaDType = Literal["auto", "float32"]
-PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
+PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
 KVOffloadingBackend = Literal["native", "lmcache"]
 
 
@@ -77,9 +77,21 @@ class CacheConfig:
     """Whether to enable prefix caching."""
     prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
     """Set the hash algorithm for prefix caching:\n
-    - "sha256" uses Pickle for object serialization before hashing.\n
+    - "sha256" uses Pickle for object serialization before hashing. This is the
+    current default, as SHA256 is the most secure choice to avoid potential
+    hash collisions.\n
     - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
-    serializes objects using canonical CBOR and hashes them with SHA-256."""
+    serializes objects using canonical CBOR and hashes them with SHA-256.\n
+    - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
+    non-cryptographic hashing. Requires the optional ``xxhash`` package.
+    IMPORTANT: Use of a hashing algorithm that is not considered 
+    cryptographically secure theoretically increases the risk of hash collisions,
+    which can cause undefined behavior or even leak private information in
+    multi-tenant environments. Even if collisions are still very unlikely, it is
+    important to consider your security risk tolerance against the performance
+    benefits before turning this on.\n
+    - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
+    reproducible hashing. Requires the optional ``xxhash`` package."""
     cpu_offload_gb: float = Field(default=0, ge=0)
     """The space in GiB to offload to CPU, per GPU. Default is 0, which means
     no offloading. Intuitively, this argument can be seen as a virtual way to
diff --git a/vllm/utils/hashing.py b/vllm/utils/hashing.py
index edf1e9cb3..f01c6b074 100644
--- a/vllm/utils/hashing.py
+++ b/vllm/utils/hashing.py
@@ -11,6 +11,17 @@ from typing import Any
 
 import cbor2
 
+try:
+    # It is important that this remains an optional dependency.
+    # It would not be allowed in environments with strict security controls,
+    # so it's best not to have it installed when not in use.
+    import xxhash as _xxhash
+
+    if not hasattr(_xxhash, "xxh3_128_digest"):
+        _xxhash = None
+except ImportError:  # pragma: no cover
+    _xxhash = None
+
 
 def sha256(input: Any) -> bytes:
     """Hash any picklable Python object using SHA-256.
@@ -47,6 +58,27 @@ def sha256_cbor(input: Any) -> bytes:
     return hashlib.sha256(input_bytes).digest()
 
 
+def _xxhash_digest(input_bytes: bytes) -> bytes:
+    if _xxhash is None:
+        raise ModuleNotFoundError(
+            "xxhash is required for the 'xxhash' prefix caching hash algorithms. "
+            "Install it via `pip install xxhash`."
+        )
+    return _xxhash.xxh3_128_digest(input_bytes)
+
+
+def xxhash(input: Any) -> bytes:
+    """Hash picklable objects using xxHash."""
+    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
+    return _xxhash_digest(input_bytes)
+
+
+def xxhash_cbor(input: Any) -> bytes:
+    """Hash objects serialized with CBOR using xxHash."""
+    input_bytes = cbor2.dumps(input, canonical=True)
+    return _xxhash_digest(input_bytes)
+
+
 def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
     """Get a hash function by name, or raise an error if the function is not found.
 
@@ -60,6 +92,10 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
         return sha256
     if hash_fn_name == "sha256_cbor":
         return sha256_cbor
+    if hash_fn_name == "xxhash":
+        return xxhash
+    if hash_fn_name == "xxhash_cbor":
+        return xxhash_cbor
 
     raise ValueError(f"Unsupported hash function: {hash_fn_name}")
 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 602eb81be..774200dee 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -12,7 +12,7 @@ from typing import Any, NewType, TypeAlias, overload
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils.hashing import sha256_cbor
+from vllm.utils.hashing import sha256_cbor, xxhash_cbor
 from vllm.utils.math_utils import cdiv
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.v1.kv_cache_interface import (
@@ -83,18 +83,19 @@ logger = init_logger(__name__)
 #
 # The function `init_none_hash` initializes this variable globally.
 NONE_HASH: BlockHash
+_CBOR_HASH_FUNCTIONS = frozenset({sha256_cbor, xxhash_cbor})
 
 
 def init_none_hash(hash_fn: Callable[[Any], bytes]):
     global NONE_HASH
 
     hash_seed = os.getenv("PYTHONHASHSEED")
-    if hash_seed is None and hash_fn is sha256_cbor:
+    if hash_seed is None and hash_fn in _CBOR_HASH_FUNCTIONS:
         logger.warning(
             "PYTHONHASHSEED is not set. This will lead to non-reproducible "
-            "block-hashes when using sha256_cbor as the hash function."
-            "Consider setting PYTHONHASHSEED to a fixed value for "
-            "reproducibility."
+            "block-hashes when using CBOR-based hash functions such as "
+            "sha256_cbor or xxhash_cbor. Consider setting PYTHONHASHSEED to a "
+            "fixed value for reproducibility."
         )
 
     if hash_seed is None:
-- 
GitLab


From 9ae3c55b10318ad7b0c19becb0dc8ad41c171db2 Mon Sep 17 00:00:00 2001
From: Yu Jiaqi <54204033+piood@users.noreply.github.com>
Date: Thu, 4 Dec 2025 00:12:58 +0800
Subject: [PATCH 049/258] SigLIP example add chat_template (#29902)

Signed-off-by: piood <2477084691@qq.com>
---
 ...ai_chat_embedding_client_for_multimodal.py |  3 +-
 vllm/entrypoints/chat_utils.py                | 35 ++++++++++++++-----
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py b/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
index 47c2c5030..a7ab7e73e 100644
--- a/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
@@ -150,7 +150,8 @@ def run_siglip(client: OpenAI, model: str):
     Start the server using:
 
     vllm serve google/siglip-base-patch16-224 \
-        --runner pooling
+        --runner pooling \
+        --chat-template template_basic.jinja
     """
 
     response = create_chat_embeddings(
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 2dd5b9c8f..1b3a7d266 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1139,11 +1139,19 @@ def validate_chat_template(chat_template: Path | str | None):
             not any(c in chat_template for c in JINJA_CHARS)
             and not Path(chat_template).exists()
         ):
-            raise ValueError(
-                f"The supplied chat template string ({chat_template}) "
-                f"appears path-like, but doesn't exist!"
+            # Try to find the template in the built-in templates directory
+            from vllm.transformers_utils.chat_templates.registry import (
+                CHAT_TEMPLATES_DIR,
             )
 
+            builtin_template_path = CHAT_TEMPLATES_DIR / chat_template
+            if not builtin_template_path.exists():
+                raise ValueError(
+                    f"The supplied chat template string ({chat_template}) "
+                    f"appears path-like, but doesn't exist! "
+                    f"Tried: {chat_template} and {builtin_template_path}"
+                )
+
     else:
         raise TypeError(f"{type(chat_template)} is not a valid chat template type")
 
@@ -1173,12 +1181,23 @@ def _load_chat_template(
 
         JINJA_CHARS = "{}\n"
         if not any(c in chat_template for c in JINJA_CHARS):
-            msg = (
-                f"The supplied chat template ({chat_template}) "
-                f"looks like a file path, but it failed to be "
-                f"opened. Reason: {e}"
+            # Try to load from the built-in templates directory
+            from vllm.transformers_utils.chat_templates.registry import (
+                CHAT_TEMPLATES_DIR,
             )
-            raise ValueError(msg) from e
+
+            builtin_template_path = CHAT_TEMPLATES_DIR / chat_template
+            try:
+                with open(builtin_template_path) as f:
+                    return f.read()
+            except OSError:
+                msg = (
+                    f"The supplied chat template ({chat_template}) "
+                    f"looks like a file path, but it failed to be opened. "
+                    f"Tried: {chat_template} and {builtin_template_path}. "
+                    f"Reason: {e}"
+                )
+                raise ValueError(msg) from e
 
         # If opening a file fails, set chat template to be args to
         # ensure we decode so our escape are interpreted correctly
-- 
GitLab


From d1f7392c5f774245d0a0776d141a64e72ca3e8ca Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Wed, 3 Dec 2025 11:17:07 -0600
Subject: [PATCH 050/258] [ROCm][CI] Fix v1/logits_processors failure on ROCm
 (#29927)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 tests/v1/logits_processors/test_custom_offline.py |  5 -----
 tests/v1/logits_processors/test_custom_online.py  | 12 ++----------
 tests/v1/logits_processors/utils.py               |  2 +-
 3 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py
index 189973773..e3ddb6138 100644
--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-import sys
 from typing import Any
 
 import pytest
@@ -10,7 +9,6 @@ from tests.utils import create_new_process_for_each_test
 from tests.v1.logits_processors.utils import (
     DUMMY_LOGITPROC_ARG,
     DUMMY_LOGITPROC_FQCN,
-    DUMMY_LOGITPROC_MODULE,
     MAX_TOKENS,
     MODEL_NAME,
     POOLING_MODEL_NAME,
@@ -18,7 +16,6 @@ from tests.v1.logits_processors.utils import (
     CustomLogitprocSource,
     DummyLogitsProcessor,
     WrappedPerReqLogitsProcessor,
-    dummy_module,
     prompts,
 )
 from tests.v1.logits_processors.utils import entry_points as fake_entry_points
@@ -162,8 +159,6 @@ def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource
     kwargs: dict[str, list[str | type[LogitsProcessor]]] = {}
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
         # Scenario: load logitproc based on fully-qualified class name (FQCN)
-        # Inject dummy module which defines logitproc
-        sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
         kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
     elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
         # Scenario: load logitproc from provided class object
diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py
index 3e0bb02ed..3dc6b8979 100644
--- a/tests/v1/logits_processors/test_custom_online.py
+++ b/tests/v1/logits_processors/test_custom_online.py
@@ -14,11 +14,9 @@ from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_te
 from tests.v1.logits_processors.utils import (
     DUMMY_LOGITPROC_ARG,
     DUMMY_LOGITPROC_FQCN,
-    DUMMY_LOGITPROC_MODULE,
     MAX_TOKENS,
     MODEL_NAME,
     TEMP_GREEDY,
-    dummy_module,
     prompts,
 )
 from tests.v1.logits_processors.utils import entry_points as fake_entry_points
@@ -47,20 +45,14 @@ def _server_with_logitproc_entrypoint(
     main.main()
 
 
-def _server_with_logitproc_module(
+def _server_with_logitproc_fqcn(
     env_dict: dict[str, str] | None,
     model: str,
     vllm_serve_args: list[str],
 ) -> None:
     """Start vLLM server, inject module with dummy logitproc"""
-
-    # Patch `modules` to inject dummy logitproc module
     from vllm.entrypoints.cli import main
 
-    sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
-
-    # fork is required for workers to see entrypoint patch
-    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"
     if env_dict is not None:
         os.environ.update(env_dict)
 
@@ -99,7 +91,7 @@ def server(default_server_args, request, monkeypatch):
     if request.param:
         # Launch server, append FQCN argument, inject dummy logitproc module
         args = default_server_args + request.param
-        _server_fxn = _server_with_logitproc_module
+        _server_fxn = _server_with_logitproc_fqcn
     else:
         # Launch server, inject dummy logitproc entrypoint
         args = default_server_args
diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py
index b8548bc31..e54da72e5 100644
--- a/tests/v1/logits_processors/utils.py
+++ b/tests/v1/logits_processors/utils.py
@@ -27,7 +27,7 @@ DUMMY_LOGITPROC_ARG = "target_token"
 TEMP_GREEDY = 0.0
 MAX_TOKENS = 20
 DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc"
-DUMMY_LOGITPROC_MODULE = "DummyModule"
+DUMMY_LOGITPROC_MODULE = "tests.v1.logits_processors.utils"
 DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor"
 
 
-- 
GitLab


From dd5d1ef780b5b73b9817e5dc8fe9b3e98a399e20 Mon Sep 17 00:00:00 2001
From: avigny <47987522+avigny@users.noreply.github.com>
Date: Wed, 3 Dec 2025 18:45:31 +0100
Subject: [PATCH 051/258] [Bugfix] Mistral tool parser streaming update
 (#19425)

Signed-off-by: avigny <47987522+avigny@users.noreply.github.com>
Signed-off-by: Chauncey <chaunceyjiang@gmail.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: Jeff Cook <jeff@jeffcook.io>
Co-authored-by: sfbemerk <benjaminmerkel@mail.de>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 requirements/common.txt                       |   1 +
 tests/tool_use/test_mistral_tool_parser.py    | 847 ++++++++++++++++++
 tests/tool_use/utils.py                       |  28 +-
 .../tool_parsers/mistral_tool_parser.py       | 598 ++++++++-----
 4 files changed, 1272 insertions(+), 202 deletions(-)
 create mode 100644 tests/tool_use/test_mistral_tool_parser.py

diff --git a/requirements/common.txt b/requirements/common.txt
index 8b9e6b935..f18560b98 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -46,6 +46,7 @@ scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
+ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0
diff --git a/tests/tool_use/test_mistral_tool_parser.py b/tests/tool_use/test_mistral_tool_parser.py
new file mode 100644
index 000000000..e5deb7f40
--- /dev/null
+++ b/tests/tool_use/test_mistral_tool_parser.py
@@ -0,0 +1,847 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Generator
+
+import partial_json_parser
+import pytest
+from mistral_common.protocol.instruct.messages import AssistantMessage
+from mistral_common.protocol.instruct.request import InstructRequest
+from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
+from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
+from vllm.tokenizers import (
+    MistralTokenizer,
+    TokenizerLike,
+    get_tokenizer,
+)
+from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
+
+
+@pytest.fixture(scope="module")
+def mistral_pre_v11_tokenizer():
+    MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
+    return get_tokenizer(tokenizer_name=MODEL)
+
+
+@pytest.fixture(scope="module")
+def mistral_tokenizer():
+    MODEL = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
+    return get_tokenizer(tokenizer_name=MODEL, tokenizer_mode="mistral")
+
+
+@pytest.fixture
+def mistral_pre_v11_tool_parser(mistral_pre_v11_tokenizer):
+    return MistralToolParser(mistral_pre_v11_tokenizer)
+
+
+@pytest.fixture
+def mistral_tool_parser(mistral_tokenizer):
+    return MistralToolParser(mistral_tokenizer)
+
+
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall] | list[DeltaToolCall],
+    expected_tool_calls: list[ToolCall],
+):
+    assert len(actual_tool_calls) == len(expected_tool_calls)
+
+    for actual_tool_call, expected_tool_call in zip(
+        actual_tool_calls, expected_tool_calls
+    ):
+        assert isinstance(actual_tool_call.id, str)
+        assert len(actual_tool_call.id) == 9
+
+        if isinstance(actual_tool_call, ToolCall):
+            assert actual_tool_call.type == "function"
+        elif isinstance(actual_tool_call, DeltaToolCall):
+            assert actual_tool_call.function is not None
+            assert actual_tool_call.function.name is not None
+            assert actual_tool_call.function.arguments is not None
+        assert actual_tool_call.function is not None
+        assert actual_tool_call.function.name == expected_tool_call.function.name, (
+            f"got wrong function name:${actual_tool_call.function.name}"
+        )
+        assert (
+            actual_tool_call.function.arguments == expected_tool_call.function.arguments
+        ), f"got wrong function argument:${actual_tool_call.function.arguments}"
+
+
+def fix_tool_call_tokenization(
+    tokens: list[int],
+    mistral_tool_parser: MistralToolParser,
+    mistral_tokenizer: TokenizerLike,
+):
+    """
+    Replaces the textual token sequence for [TOOL_CALLS]
+    with its single special token ID.
+    """
+    textual_tool_call_token_ids = mistral_tokenizer.encode(
+        text=mistral_tool_parser.bot_token,
+        add_special_tokens=False,
+    )
+    # textual_tool_call_token_ids must not contain special tokens like bos, eos etc
+    special_tool_call_token_ids = [mistral_tool_parser.bot_token_id]
+
+    # If the input is too short to contain the sequence, no replacement is possible
+    if not tokens or len(tokens) < len(textual_tool_call_token_ids):
+        return tokens
+
+    result_tokens = []
+    i = 0
+    target_len = len(textual_tool_call_token_ids)
+
+    while i < len(tokens):
+        # Check if the slice from the current position matches the target sequence
+        if tokens[i : i + target_len] == textual_tool_call_token_ids:
+            # If it matches, add the replacement and jump the index forward
+            result_tokens.extend(special_tool_call_token_ids)
+            i += target_len
+        else:
+            # Otherwise, just add the current token and move to the next one
+            result_tokens.append(tokens[i])
+            i += 1
+
+    return result_tokens
+
+
+def stream_delta_message_generator(
+    mistral_tool_parser: MistralToolParser,
+    mistral_tokenizer: TokenizerLike,
+    model_output: str | None,
+    tools: list[tuple[str, str]] | None,
+) -> Generator[DeltaMessage, None, None]:
+    if (
+        isinstance(mistral_tokenizer, MistralTokenizer)
+        and mistral_tokenizer.version >= 11
+    ):
+        # With the newer versions of the tokenizer,
+        # we cannot tokenize free text
+        # so we need to create a list of messages to get tokenized
+        assert tools is not None
+        assistant_msg = AssistantMessage(
+            tool_calls=[
+                ToolCall(
+                    function=FunctionCall(
+                        name=name,
+                        arguments=arg,
+                    )
+                )
+                for (name, arg) in tools
+            ],
+        )
+        request = InstructRequest(
+            messages=[assistant_msg],
+        )
+        all_token_ids = mistral_tokenizer.instruct.encode_instruct(request).tokens
+    else:
+        # Older versions of the tokenizer are
+        # able to encode directly the model's output (free text) into tokens
+        assert model_output is not None
+        all_token_ids = mistral_tokenizer.encode(model_output, add_special_tokens=False)
+
+    all_token_ids = fix_tool_call_tokenization(
+        all_token_ids, mistral_tool_parser, mistral_tokenizer
+    )
+
+    previous_text = ""
+    previous_tokens = None
+    prefix_offset = 0
+    read_offset = 0
+    for i, delta_token in enumerate(all_token_ids):
+        delta_token_ids = [delta_token]
+        previous_token_ids = all_token_ids[:i]
+        current_token_ids = all_token_ids[: i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
+            detokenize_incrementally(
+                tokenizer=mistral_tokenizer,
+                all_input_ids=current_token_ids,
+                prev_tokens=previous_tokens,
+                prefix_offset=prefix_offset,
+                read_offset=read_offset,
+                skip_special_tokens=isinstance(mistral_tokenizer, MistralTokenizer),
+                spaces_between_special_tokens=True,
+            )
+        )
+
+        current_text = previous_text + delta_text
+
+        delta_message = mistral_tool_parser.extract_tool_calls_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+            request=None,  # type: ignore[arg-type]
+        )
+        if delta_message:
+            yield delta_message
+
+        previous_text = current_text
+        previous_tokens = (
+            previous_tokens + new_tokens if previous_tokens else new_tokens
+        )
+        prefix_offset = new_prefix_offset
+        read_offset = new_read_offset
+
+
+def test_extract_tool_calls_no_tools(mistral_pre_v11_tool_parser):
+    model_output = "This is a test"
+    extracted_tool_calls = mistral_pre_v11_tool_parser.extract_tool_calls(
+        model_output, request=None
+    )  # type: ignore[arg-type]
+    assert not extracted_tool_calls.tools_called
+    assert extracted_tool_calls.tool_calls == []
+    assert extracted_tool_calls.content == model_output
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool_add",
+        "single_tool_weather",
+        "argument_before_name",
+        "argument_before_name_and_name_in_argument",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            """[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """[TOOL_CALLS] [{"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """[TOOL_CALLS] [{"arguments":{"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_age",
+                        arguments=json.dumps(
+                            {
+                                "name": "John Doe",
+                            }
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+    ],
+)
+def test_extract_tool_calls_pre_v11_tokenizer(
+    mistral_pre_v11_tool_parser, model_output, expected_tool_calls, expected_content
+):
+    extracted_tool_calls = mistral_pre_v11_tool_parser.extract_tool_calls(
+        model_output, request=None
+    )  # type: ignore[arg-type]
+    assert extracted_tool_calls.tools_called
+
+    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
+
+    assert extracted_tool_calls.content == expected_content
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool_add",
+        "single_tool_weather",
+        "multiple_tool_calls",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add_this_and_that",
+                        arguments=json.dumps({"a": 3.5, "b": 4}),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            None,
+        ),
+        (
+            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="multiply", arguments=json.dumps({"a": 3, "b": 6})
+                    )
+                ),
+            ],
+            None,
+        ),
+    ],
+)
+def test_extract_tool_calls(
+    mistral_tool_parser, model_output, expected_tool_calls, expected_content
+):
+    extracted_tool_calls = mistral_tool_parser.extract_tool_calls(
+        model_output, request=None
+    )  # type: ignore[arg-type]
+    assert extracted_tool_calls.tools_called
+
+    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
+
+    assert extracted_tool_calls.content == expected_content
+
+
+def _test_extract_tool_calls_streaming(
+    tool_parser, tokenizer, model_output, tools, expected_tool_calls, expected_content
+):
+    other_content: str = ""
+    function_names: list[str] = []
+    function_args_strs: list[str] = []
+    tool_call_idx: int = -1
+    tool_call_ids: list[str | None] = []
+
+    for delta_message in stream_delta_message_generator(
+        tool_parser, tokenizer, model_output, tools
+    ):
+        # role should never be streamed from tool parser
+        assert not delta_message.role
+
+        if delta_message.content:
+            other_content += delta_message.content
+
+        streamed_tool_calls = delta_message.tool_calls
+
+        if streamed_tool_calls and len(streamed_tool_calls) > 0:
+            # make sure only one diff is present - correct even for parallel
+            assert len(streamed_tool_calls) == 1
+            tool_call = streamed_tool_calls[0]
+
+            assert len(tool_parser.prev_tool_call_arr) > 0
+
+            # if a new tool is being called, set up empty arguments
+            if tool_call.index != tool_call_idx:
+                tool_call_idx = tool_call.index
+                function_args_strs.append("")
+                tool_call_ids.append(None)
+
+            # if a tool call ID is streamed, make sure one hasn't been already
+            if tool_call.id and not tool_call_ids[tool_call.index]:
+                tool_call_ids[tool_call.index] = tool_call.id
+
+            # if parts of the function start being streamed
+            if tool_call.function:
+                # if the function name is defined, set it. it should be streamed
+                # IN ENTIRETY, exactly one time.
+                if tool_call.function.name:
+                    assert isinstance(tool_call.function.name, str)
+                    function_names.append(tool_call.function.name)
+
+                if tool_call.function.arguments:
+                    # make sure they're a string and then add them to the list
+                    assert isinstance(tool_call.function.arguments, str)
+
+                    function_args_strs[tool_call.index] += tool_call.function.arguments
+
+    assert other_content == expected_content
+
+    actual_tool_calls = [
+        ToolCall(
+            id=tool_call_id,
+            function=FunctionCall(
+                name=function_name,
+                arguments=partial_json_parser.ensure_json(
+                    function_args_str, Allow.OBJ | Allow.STR
+                ),
+            ),
+        )
+        for tool_call_id, function_name, function_args_str in zip(
+            tool_call_ids, function_names, function_args_strs
+        )
+    ]
+    assert_tool_calls(actual_tool_calls, expected_tool_calls)
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "no_tools",
+        "single_tool_add",
+        "single_tool_add_strings",
+        "single_tool_weather",
+        "argument_before_name",
+        "argument_before_name_and_name_in_argument",
+        "multiple_tools",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        ("""This is a test""", [], """This is a test"""),
+        (
+            """[TOOL_CALLS]  [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3, "b": 4})
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": "3", "b": "4"})
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_age",
+                        arguments=json.dumps(
+                            {
+                                "name": "John Doe",
+                            }
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"name": "add", "arguments": {"a": 3.5, "b": 4}}, {"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                ),
+            ],
+            "",
+        ),
+    ],
+)
+def test_extract_tool_calls_streaming_pre_v11_tokenizer(
+    mistral_pre_v11_tool_parser,
+    mistral_pre_v11_tokenizer,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    _test_extract_tool_calls_streaming(
+        mistral_pre_v11_tool_parser,
+        mistral_pre_v11_tokenizer,
+        model_output,
+        None,
+        expected_tool_calls,
+        expected_content,
+    )
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool_add",
+        "single_tool_add_strings",
+        "multiple_tools",
+    ],
+    argnames=["tools", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            [("add", '{"a": 3, "b": 4}')],
+            # [TOOL_CALLS]add{"a": 3, "b": 4}
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3, "b": 4})
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            [("add_two_strings", '{"a": "3", "b": "4"}')],
+            # [TOOL_CALLS]add_two_strings{"a": "3", "b": "4"}
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add_two_strings",
+                        arguments=json.dumps({"a": "3", "b": "4"}),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            [
+                ("add", '{"a": 3.5, "b": 4}'),
+                (
+                    "get_current_weather",
+                    '{"city": "San Francisco", "state": "CA", "unit": "celsius"}',  # noqa: E501
+                ),
+            ],
+            # [TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                ),
+            ],
+            "",
+        ),
+    ],
+)
+def test_extract_tool_calls_streaming(
+    mistral_tool_parser,
+    mistral_tokenizer,
+    tools,
+    expected_tool_calls,
+    expected_content,
+):
+    _test_extract_tool_calls_streaming(
+        mistral_tool_parser,
+        mistral_tokenizer,
+        None,
+        tools,
+        expected_tool_calls,
+        expected_content,
+    )
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool_add",
+        "single_tool_weather",
+        "multiple_tool_calls",
+        "content_before_tool",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add_this_and_that",
+                        arguments=json.dumps({"a": 3.5, "b": 4}),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="multiply", arguments=json.dumps({"a": 3, "b": 6})
+                    )
+                ),
+            ],
+            "",
+        ),
+        (
+            # Additional content should not be after the tool calls
+            """bla[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add_this_and_that",
+                        arguments=json.dumps({"a": 3.5, "b": 4}),
+                    )
+                )
+            ],
+            "bla",
+        ),
+    ],
+)
+def test_extract_tool_calls_streaming_one_chunk(
+    mistral_tool_parser,
+    mistral_tokenizer,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    if isinstance(mistral_tokenizer, MistralTokenizer):
+        all_token_ids = mistral_tokenizer.encode(model_output)
+    else:
+        all_token_ids = mistral_tokenizer.encode(model_output, add_special_tokens=False)
+    all_token_ids = fix_tool_call_tokenization(
+        all_token_ids, mistral_tool_parser, mistral_tokenizer
+    )
+
+    delta_message = mistral_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text=model_output,
+        delta_text=model_output,
+        previous_token_ids=[],
+        current_token_ids=all_token_ids,
+        delta_token_ids=all_token_ids,
+        request=None,
+    )  # type: ignore[arg-type]
+    assert isinstance(delta_message, DeltaMessage)
+    assert len(delta_message.tool_calls) == len(expected_tool_calls)
+
+    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)
+
+    if delta_message.content is None:
+        assert expected_content == ""
+    else:
+        assert delta_message.content == expected_content
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "no_tools",
+        "single_tool_add",
+        "single_tool_add_strings",
+        "single_tool_weather",
+        "argument_before_name",
+        "argument_before_name_and_name_in_argument",
+        "multiple_tools",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        ("""This is a test""", [], """This is a test"""),
+        (
+            """[TOOL_CALLS]  [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3, "b": 4})
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": "3", "b": "4"})
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_age",
+                        arguments=json.dumps(
+                            {
+                                "name": "John Doe",
+                            }
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS] [{"arguments": {"a": 3.5, "b": 4}, "name": "add"}, {"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
+                    )
+                ),
+                ToolCall(
+                    function=FunctionCall(
+                        name="get_current_weather",
+                        arguments=json.dumps(
+                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
+                        ),
+                    )
+                ),
+            ],
+            "",
+        ),
+    ],
+)
+def test_extract_tool_calls_streaming_pre_v11_tokenizer_one_chunk(
+    mistral_pre_v11_tool_parser,
+    mistral_pre_v11_tokenizer,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    if isinstance(mistral_pre_v11_tokenizer, MistralTokenizer):
+        all_token_ids = mistral_pre_v11_tokenizer.encode(model_output)
+    else:
+        all_token_ids = mistral_pre_v11_tokenizer.encode(
+            model_output, add_special_tokens=False
+        )
+    all_token_ids = fix_tool_call_tokenization(
+        all_token_ids, mistral_pre_v11_tool_parser, mistral_pre_v11_tokenizer
+    )
+
+    delta_message = mistral_pre_v11_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text=model_output,
+        delta_text=model_output,
+        previous_token_ids=[],
+        current_token_ids=all_token_ids,
+        delta_token_ids=all_token_ids,
+        request=None,
+    )  # type: ignore[arg-type]
+    assert isinstance(delta_message, DeltaMessage)
+    assert len(delta_message.tool_calls) == len(expected_tool_calls)
+
+    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)
+
+    if delta_message.content is None:
+        assert expected_content == ""
+    else:
+        assert delta_message.content == expected_content
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index 7584b9031..de7284a30 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -123,7 +123,7 @@ CONFIGS: dict[str, ServerConfig] = {
         "supports_parallel": True,
         "extended": True,
     },
-    "mistral": {
+    "mistral-7b": {
         "model": "mistralai/Mistral-7B-Instruct-v0.3",
         "arguments": [
             "--enforce-eager",
@@ -145,6 +145,32 @@ CONFIGS: dict[str, ServerConfig] = {
         "call the tool. Otherwise, answer the user's query directly "
         "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
         "to the user's question - just respond to it normally.",
+        "supports_parallel": True,
+    },
+    "mistral-small-3.2": {
+        "model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
+        "arguments": [
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--tool-call-parser",
+            "mistral",
+            "--tokenizer-mode",
+            "mistral",
+            "--config-format",
+            "mistral",
+            "--load-format",
+            "mistral",
+            "--tensor-parallel-size",
+            "4",
+            '--ignore-patterns="consolidated.safetensors"',
+        ],
+        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
+        " that you have would be helpful to answer a user query, "
+        "call the tool. Otherwise, answer the user's query directly "
+        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
+        "to the user's question - just respond to it normally.",
+        "supports_parallel": True,
+        "extended": True,
     },
     # FIXME: This test currently fails, need to debug why.
     # "granite20b": {
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index b89db6054..aa5089ffe 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -3,12 +3,12 @@
 
 import json
 from collections.abc import Sequence
+from enum import Enum, auto
 from random import choices
 from string import ascii_letters, digits
 
-import partial_json_parser
+import ijson
 import regex as re
-from partial_json_parser.core.options import Allow
 from pydantic import Field
 
 from vllm.entrypoints.openai.protocol import (
@@ -23,7 +23,6 @@ from vllm.entrypoints.openai.protocol import (
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
 )
-from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
 from vllm.tokenizers import MistralTokenizer, TokenizerLike
 
@@ -32,6 +31,22 @@ logger = init_logger(__name__)
 ALPHANUMERIC = ascii_letters + digits
 
 
+class StreamingState(Enum):
+    """Enum for tracking the current streaming parsing state."""
+
+    WAITING_FOR_TOOL_START = auto()
+    WAITING_FOR_TOOL_KEY = (
+        auto()
+    )  # waiting for the "name" or "arguments" key to be complete
+    PARSING_NAME = auto()
+    PARSING_NAME_COMPLETED = auto()
+    WAITING_FOR_ARGUMENTS_START = auto()
+    PARSING_ARGUMENTS = auto()
+    PARSING_ARGUMENTS_COMPLETED = auto()
+    TOOL_COMPLETE = auto()
+    ALL_TOOLS_COMPLETE = auto()
+
+
 class MistralToolCall(ToolCall):
     id: str = Field(default_factory=lambda: MistralToolCall.generate_random_id())
 
@@ -46,8 +61,8 @@ class MistralToolCall(ToolCall):
         return id.isalnum() and len(id) == 9
 
 
-def _is_fn_name_regex_support(model_tokenizer: TokenizerLike) -> bool:
-    return (
+def _is_pre_v11_tokeniser(model_tokenizer: TokenizerLike) -> bool:
+    return not (
         isinstance(model_tokenizer, MistralTokenizer) and model_tokenizer.version >= 11
     )
 
@@ -69,16 +84,22 @@ class MistralToolParser(ToolParser):
 
         # initialize properties used for state when parsing tool calls in
         # streaming mode
-        self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
-        self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: list[
-            str
-        ] = []  # map what has been streamed for each tool so far to a list
+        self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START
+
+        # For streaming pre v11 tokenizer tool calls
+        self.current_tool_name: str | None = None
+        self.current_tool_mistral_id: str | None = None
+        self.starting_new_tool = False
+        if _is_pre_v11_tokeniser(self.model_tokenizer):
+            self.parse_coro = ijson.parse_coro(
+                self.update_stream_state_pre_v11_tokenizer()
+            )
+
         self.bot_token = "[TOOL_CALLS]"
         self.bot_token_id = self.vocab.get(self.bot_token)
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
-        if _is_fn_name_regex_support(self.model_tokenizer):
+        if not _is_pre_v11_tokeniser(self.model_tokenizer):
             self.fn_name_regex = re.compile(
                 r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL
             )
@@ -131,18 +152,19 @@ class MistralToolParser(ToolParser):
             # jsons is difficult
             try:
                 if self.fn_name_regex:
-                    matches = self.fn_name_regex.findall(tool_content)
-
                     function_call_arr = []
-                    for match in matches:
-                        fn_name = match[0]
-                        args = match[1]
-
-                        # fn_name is encoded outside serialized json dump
-                        # only arguments are serialized
-                        function_call_arr.append(
-                            {"name": fn_name, "arguments": json.loads(args)}
-                        )
+                    for single_tool_content in model_output.split(self.bot_token):
+                        matches = self.fn_name_regex.findall(single_tool_content)
+
+                        for match in matches:
+                            fn_name = match[0]
+                            args = match[1]
+
+                            # fn_name is encoded outside serialized json dump
+                            # only arguments are serialized
+                            function_call_arr.append(
+                                {"name": fn_name, "arguments": json.loads(args)}
+                            )
                 else:
                     function_call_arr = json.loads(tool_content)
             except json.JSONDecodeError:
@@ -193,198 +215,372 @@ class MistralToolParser(ToolParser):
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
     ) -> DeltaMessage | None:
-        # if the tool call token is not in the tokens generated so far, append
-        # output to contents since it's not a tool
-        if self.bot_token not in current_text:
+        if self.bot_token_id not in current_token_ids:
+            # if the tool call token is not in the tokens generated so far,
+            # append output to contents since it's not a tool
             return DeltaMessage(content=delta_text)
 
-        # if the tool call token ID IS in the tokens generated so far, that
+        # if the tool call token IS in the tokens generated so far, that
         # means we're parsing as tool calls now
-
-        # handle if we detected the BOT token which means the start of tool
-        # calling
-        if self.bot_token_id in delta_token_ids and len(delta_token_ids) == 1:
-            # if it's the only token, return None, so we don't send a chat
-            # completion any don't send a control token
-            return None
-
-        # bit mask flags for partial JSON parsing. If the name hasn't been
-        # sent yet, don't allow sending
-        # an incomplete string since OpenAI only ever (as far as I have
-        # seen) allows sending the entire tool/ function name at once.
-        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
         try:
-            # replace BOT token with empty string, and convert single quotes
-            # to double to allow parsing as JSON since mistral uses single
-            # quotes instead of double for tool calls
-            parsable_arr = current_text.split(self.bot_token)[-1]
-
-            # tool calls are generated in an array, so do partial JSON
-            # parsing on the entire array
-            try:
-                tool_call_arr: list[dict] = partial_json_parser.loads(
-                    parsable_arr, flags
+            if _is_pre_v11_tokeniser(self.model_tokenizer):
+                return self._extract_tool_calls_streaming_pre_v11_tokenizer(
+                    delta_text=delta_text,
+                    delta_token_ids=delta_token_ids,
                 )
-            except partial_json_parser.core.exceptions.MalformedJSON:
-                logger.debug("not enough tokens to parse into JSON yet")
-                return None
-
-            # select as the current tool call the one we're on the state at
+            else:
+                return self._extract_tool_calls_streaming(
+                    delta_text=delta_text, delta_token_ids=delta_token_ids
+                )
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            return None
 
-            current_tool_call: dict = (
-                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
-            )
+    def _extract_tool_calls_streaming(
+        self,
+        delta_text: str,
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Extracts tool calls for Mistral models
+        doing tool calls of the following format:
+        `[TOOL_CALLS]add{"a": 3.5, "b": 4}`
+        """
+        additional_content: str = ""
+        if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
+            # this is the first tool call
+            assert self.bot_token_id in delta_token_ids
+            if not delta_text.startswith(self.bot_token):
+                additional_content += delta_text.split(self.bot_token)[0]
+                delta_text = self.bot_token + "".join(
+                    delta_text.split(self.bot_token)[1:]
+                )
 
-            # case -- if no tokens have been streamed for the tool, e.g.
-            #   only the array brackets, stream nothing
-            if len(tool_call_arr) == 0:
+        delta_tool_calls = self._generate_delta_tool_call(delta_text)
+        if not additional_content and len(delta_tool_calls) == 0:
+            if self.streaming_state in [
+                StreamingState.PARSING_ARGUMENTS,
+                StreamingState.PARSING_ARGUMENTS_COMPLETED,
+                StreamingState.TOOL_COMPLETE,
+                StreamingState.ALL_TOOLS_COMPLETE,
+            ]:
+                # Return an empty DeltaMessage once the tool calls are all done
+                # so that finish_reason gets set.
+                return DeltaMessage()
+            else:
+                # return None when the tool is not likely to be finished
+                # This can occur when the name is being parsed for example
+                # and we wait for the name to be complete
+                # before sending the function name
                 return None
 
-            # case: we are starting a new tool in the array
-            #   -> array has > 0 length AND length has moved past cursor
-            elif (
-                len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1
-            ):
-                # if we're moving on to a new call, first make sure we
-                # haven't missed anything in the previous one that was
-                # auto-generated due to JSON completions, but wasn't
-                # streamed to the client yet.
-                if self.current_tool_id >= 0:
-                    diff: str | None = current_tool_call.get("arguments")
-
-                    if diff:
-                        diff = json.dumps(diff, ensure_ascii=False).replace(
-                            self.streamed_args_for_tool[self.current_tool_id], ""
-                        )
-                        delta = DeltaMessage(
-                            tool_calls=[
-                                DeltaToolCall(
-                                    index=self.current_tool_id,
-                                    function=DeltaFunctionCall(
-                                        arguments=diff
-                                    ).model_dump(exclude_none=True),
-                                )
-                            ]
-                        )
-                        self.streamed_args_for_tool[self.current_tool_id] += diff
-                    else:
-                        delta = None
-                else:
-                    delta = None
-                # re-set stuff pertaining to progress in the current tool
-                self.current_tool_id = len(tool_call_arr) - 1
-                self.current_tool_name_sent = False
-                self.streamed_args_for_tool.append("")
-                logger.debug("starting on new tool %d", self.current_tool_id)
-                return delta
-
-            # case: update an existing tool - this is handled below
-
-            # if the current tool name hasn't been sent, send if available
-            # - otherwise send nothing
-            if not self.current_tool_name_sent:
-                function_name = current_tool_call.get("name")
-                if function_name:
-                    delta = DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_id,
-                                type="function",
-                                id=MistralToolCall.generate_random_id(),
-                                function=DeltaFunctionCall(
-                                    name=function_name
-                                ).model_dump(exclude_none=True),
-                            )
-                        ]
+        delta = DeltaMessage()
+        if additional_content:
+            delta.content = additional_content
+        if len(delta_tool_calls) > 0:
+            delta.tool_calls = delta_tool_calls
+
+        # HACK: serving_chat.py inspects the internal state of tool parsers
+        # when determining its final streaming delta, automatically
+        # adding autocompleted JSON.
+        # These two lines avoid that nonsense while ensuring finish_reason
+        # is set to tool_calls when at least one tool is called.
+        if delta_tool_calls and not self.prev_tool_call_arr:
+            self.prev_tool_call_arr = [{"arguments": {}}]
+        return delta
+
+    def _generate_delta_tool_call(self, delta_text: str) -> list[DeltaToolCall]:
+        if delta_text == "" or delta_text is None:
+            return []
+        delta_function_name = None
+        tool_id = None
+        if self.streaming_state not in [
+            StreamingState.PARSING_NAME,
+            StreamingState.PARSING_ARGUMENTS,
+        ] and delta_text.startswith(self.bot_token):
+            self.current_tool_id += 1
+            self.streaming_state = StreamingState.PARSING_NAME
+            delta_text = delta_text.replace(self.bot_token, "", 1)
+        if self.streaming_state == StreamingState.PARSING_NAME:
+            if self.current_tool_name is None:
+                self.current_tool_name = ""
+            # The name stops where the arguments start
+            # And the arguments start with the `{` char
+            if "{" in delta_text:
+                tool_id = MistralToolCall.generate_random_id()
+                delta_function_name = delta_text.split("{")[0]
+                self.current_tool_name += delta_function_name
+                delta_text = delta_text[len(delta_function_name) :]
+                self.streaming_state = StreamingState.PARSING_ARGUMENTS
+            else:
+                # we want to send the tool name once it's complete
+                self.current_tool_name += delta_text
+                return []
+        if self.streaming_state == StreamingState.PARSING_ARGUMENTS:
+            next_function_text = None
+            if self.bot_token in delta_text:
+                # current tool call is over
+                delta_arguments = ""
+                delta_arguments += delta_text.split(self.bot_token)[0]
+                next_function_text = delta_text[len(delta_arguments) :]
+                self.streaming_state = StreamingState.TOOL_COMPLETE
+            else:
+                delta_arguments = delta_text
+            ret = []
+            if self.current_tool_name or delta_arguments:
+                ret += [
+                    DeltaToolCall(
+                        index=self.current_tool_id,
+                        type="function",
+                        id=tool_id,
+                        function=DeltaFunctionCall(
+                            name=self.current_tool_name, arguments=delta_arguments
+                        ).model_dump(exclude_none=True),
                     )
-                    self.current_tool_name_sent = True
-                else:
-                    delta = None
-
-            # now we know we're on the same tool call and we're streaming
-            # arguments
+                ]
+                self.current_tool_name = None
+            if next_function_text:
+                ret += self._generate_delta_tool_call(next_function_text)
+            return ret
+        # Should not happen
+        return []
+
+    @ijson.coroutine
+    def update_stream_state_pre_v11_tokenizer(self):
+        while True:
+            (prefix, event, value) = yield
+
+            if prefix == "item" and event == "start_map":
+                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
+            if prefix == "item" and event == "map_key" and value == "name":
+                self.streaming_state = StreamingState.PARSING_NAME
+            if prefix == "item.name" and event == "string":
+                self.current_tool_name = value
+                self.streaming_state = StreamingState.PARSING_NAME_COMPLETED
+            if prefix == "item" and event == "map_key" and value == "arguments":
+                self.streaming_state = StreamingState.WAITING_FOR_ARGUMENTS_START
+            if prefix == "item.arguments" and event == "start_map":
+                self.streaming_state = StreamingState.PARSING_ARGUMENTS
+            if prefix == "item.arguments" and event == "end_map":
+                self.streaming_state = StreamingState.PARSING_ARGUMENTS_COMPLETED
+            if prefix == "item" and event == "end_map":
+                self.streaming_state = StreamingState.TOOL_COMPLETE
+            if prefix == "" and event == "end_array":
+                self.streaming_state = StreamingState.ALL_TOOLS_COMPLETE
+
+    def _extract_tool_calls_streaming_pre_v11_tokenizer(
+        self,
+        delta_text: str,
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Extracts tool calls for Mistral models
+        doing tool calls of the following format:
+        `[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}`
+        """
+        assert self.parse_coro is not None
+        content = None
+        delta_tool_calls: list[DeltaToolCall] = []
+        current_tool_call: DeltaToolCall = DeltaToolCall(
+            index=self.current_tool_id, type="function"
+        )
+        current_tool_call_modified = False
+        if self.bot_token_id in delta_token_ids:
+            # this is the first tool call
+            if not delta_text.startswith(self.bot_token):
+                content = delta_text.split(self.bot_token)[0]
+            delta_text = "".join(delta_text.split(self.bot_token)[1:])
+
+        # Cut smartly the delta text to catch the ijson events
+        # as ijson does not give us the index in the text at each event.
+        # We need to cut so that we know
+        # where in the text the events are emitted from.
+        while len(delta_text) > 0:
+            streaming_state_before_parse = self.streaming_state
+
+            if self.streaming_state == StreamingState.WAITING_FOR_TOOL_START:
+                delta_to_be_parsed, delta_text = self._split_delta(
+                    delta_text=delta_text,
+                    stop_after_opening_curly_braces=1,
+                )
+            elif self.streaming_state == StreamingState.WAITING_FOR_TOOL_KEY:
+                # Wait until another key is sent
+                # or the current tool is completed
+                delta_to_be_parsed, delta_text = self._split_delta(
+                    delta_text=delta_text,
+                    stop_after_colon=1,
+                    stop_after_opening_curly_braces=1,
+                    # if the tool ends, we want to separate
+                    # at the start of the next tool
+                )
+            elif self.streaming_state == StreamingState.PARSING_NAME:
+                delta_to_be_parsed, delta_text = self._split_delta(
+                    delta_text=delta_text,
+                    stop_after_comma=1,
+                    stop_after_closing_brackets=1,
+                )
+            elif self.streaming_state == StreamingState.WAITING_FOR_ARGUMENTS_START:
+                delta_to_be_parsed, delta_text = self._split_delta(
+                    delta_text=delta_text,
+                    stop_after_opening_curly_braces=1,
+                )
+            elif self.streaming_state == StreamingState.PARSING_ARGUMENTS:
+                delta_to_be_parsed, delta_text = self._split_delta(
+                    delta_text=delta_text,
+                    stop_after_closing_curly_braces=1,
+                    # we could be more clever
+                    # by listening to item.arguments.* start_map events
+                    # and know how many curly braces we can allow
+                )
+            elif self.streaming_state in [
+                StreamingState.PARSING_ARGUMENTS_COMPLETED,
+                StreamingState.PARSING_NAME_COMPLETED,
+            ]:
+                delta_to_be_parsed, delta_text = self._split_delta(
+                    delta_text=delta_text,
+                    stop_after_closing_curly_braces=1,
+                    stop_after_closing_brackets=1,
+                )
+            elif self.streaming_state == StreamingState.TOOL_COMPLETE:
+                delta_to_be_parsed, delta_text = self._split_delta(
+                    delta_text=delta_text,
+                    stop_after_opening_curly_braces=1,
+                    stop_after_closing_brackets=1,
+                )
+            elif self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
+                content = delta_text
+                delta_text = ""
             else:
-                prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
-                    "arguments"
+                delta_to_be_parsed = delta_text
+                delta_text = ""
+
+            if self.streaming_state != StreamingState.ALL_TOOLS_COMPLETE:
+                self.parse_coro.send(delta_to_be_parsed.encode("utf-8"))
+
+            # Given the parsed text and the possible streaming state change,
+            # let's add to the tool delta
+            if (
+                (streaming_state_before_parse != self.streaming_state)
+                and streaming_state_before_parse
+                in [StreamingState.WAITING_FOR_TOOL_START, StreamingState.TOOL_COMPLETE]
+                and self.streaming_state
+                not in [
+                    StreamingState.ALL_TOOLS_COMPLETE,
+                    StreamingState.TOOL_COMPLETE,
+                    StreamingState.WAITING_FOR_TOOL_START,
+                ]
+            ):
+                # starting a new tool call
+                if current_tool_call_modified:
+                    if self.current_tool_mistral_id is not None:
+                        current_tool_call.id = self.current_tool_mistral_id
+                        self.current_tool_mistral_id = None
+                    delta_tool_calls.append(current_tool_call)
+                current_tool_call_modified = False
+                self.current_tool_id += 1
+                self.current_tool_mistral_id = MistralToolCall.generate_random_id()
+                current_tool_call = DeltaToolCall(
+                    index=self.current_tool_id,
+                    type="function",
                 )
-                cur_arguments = current_tool_call.get("arguments")
-
-                new_text = delta_text.replace("'", '"')
-                if '"}' in new_text:
-                    new_text = new_text[: new_text.rindex('"}')]
-
-                if not cur_arguments and not prev_arguments:
-                    delta = None
-                elif not cur_arguments and prev_arguments:
-                    logger.error(
-                        "INVARIANT - impossible to have arguments reset mid-arguments"
-                    )
-                    delta = None
-                elif cur_arguments and not prev_arguments:
-                    cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False)[
-                        :-2
-                    ]
-                    logger.debug("finding %s in %s", new_text, cur_arguments_json)
-
-                    if new_text not in cur_arguments_json:
-                        return None
-                    arguments_delta = cur_arguments_json[
-                        : cur_arguments_json.rindex(new_text) + len(new_text)
-                    ]
-                    logger.debug(
-                        "First tokens in arguments received: %s", arguments_delta
-                    )
-                    delta = DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_id,
-                                function=DeltaFunctionCall(
-                                    arguments=arguments_delta
-                                ).model_dump(exclude_none=True),
-                            )
-                        ]
-                    )
-                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
-
-                elif cur_arguments and prev_arguments:
-                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
-                    prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
-                    logger.debug(
-                        "Searching for diff between \n%s\n%s",
-                        cur_args_json,
-                        prev_args_json,
+            if current_tool_call.function is None:
+                current_tool_call.function = DeltaFunctionCall()
+
+            if self.current_tool_name is not None:
+                # we have the complete tool name
+                current_tool_call_modified = True
+                current_tool_call.function.name = self.current_tool_name
+                self.current_tool_name = None
+            if self.streaming_state == StreamingState.PARSING_NAME_COMPLETED:
+                self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
+            if self.streaming_state in [
+                StreamingState.PARSING_ARGUMENTS,
+                StreamingState.PARSING_ARGUMENTS_COMPLETED,
+            ]:
+                if self.streaming_state == StreamingState.PARSING_ARGUMENTS_COMPLETED:
+                    self.streaming_state = StreamingState.WAITING_FOR_TOOL_KEY
+                # the delta_to_be_parsed is part of arguments.
+                current_tool_call_modified = True
+                if current_tool_call.function.arguments is None:
+                    current_tool_call.function.arguments = delta_to_be_parsed
+                else:
+                    current_tool_call.function.arguments += delta_to_be_parsed
+                if streaming_state_before_parse != StreamingState.PARSING_ARGUMENTS:
+                    # It's the first chunk of arg. let's lstrip it
+                    current_tool_call.function.arguments = (
+                        current_tool_call.function.arguments.lstrip()
                     )
 
-                    argument_diff = extract_intermediate_diff(
-                        cur_args_json, prev_args_json
-                    )
-                    logger.debug("got arguments diff: %s", argument_diff)
-                    delta = DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_id,
-                                function=DeltaFunctionCall(
-                                    arguments=argument_diff
-                                ).model_dump(exclude_none=True),
-                            )
-                        ]
-                    )
-                    self.streamed_args_for_tool[self.current_tool_id] += argument_diff
-                else:
-                    # try parsing it with regular JSON - if it works we're
-                    # at the end, and we need to send the difference between
-                    # tokens streamed so far and the valid JSON
-                    delta = None
+        if current_tool_call_modified:
+            if self.current_tool_mistral_id is not None:
+                current_tool_call.id = self.current_tool_mistral_id
+                self.current_tool_mistral_id = None
+            delta_tool_calls.append(current_tool_call)
+
+        # HACK: serving_chat.py inspects the internal state of tool parsers
+        # when determining it's final streaming delta, automatically
+        # adding autocompleted JSON.
+        # These two lines avoid that nonsense while ensuring finish_reason
+        # is set to tool_calls when at least one tool is called.
+        if delta_tool_calls and not self.prev_tool_call_arr:
+            self.prev_tool_call_arr = [{"arguments": {}}]
+
+        if content or len(delta_tool_calls) > 0:
+            delta_message = DeltaMessage()
+            if content:
+                delta_message.content = content
+            if len(delta_tool_calls) > 0:
+                delta_message.tool_calls = delta_tool_calls
+            return delta_message
+        else:
+            if self.streaming_state == StreamingState.ALL_TOOLS_COMPLETE:
+                return DeltaMessage()
+            else:
+                return None
 
-            # check to see if the name is defined and has been sent. if so,
-            # stream the name - otherwise keep waiting
-            # finish by setting old and returning None as base case
-            self.prev_tool_call_arr = tool_call_arr
-            return delta
+    def _split_delta(
+        self,
+        delta_text: str,
+        stop_after_quotes: int = -1,
+        stop_after_opening_curly_braces: int = -1,
+        stop_after_closing_curly_braces: int = -1,
+        stop_after_closing_brackets: int = -1,
+        stop_after_colon: int = -1,
+        stop_after_comma=-1,
+    ) -> tuple[str, str]:
+        delta_to_be_parsed = ""
+        for i, c in enumerate(delta_text):
+            if c in ['"', "'"]:
+                delta_to_be_parsed += c
+                stop_after_quotes -= 1
+                if stop_after_quotes == 0:
+                    return (delta_to_be_parsed, delta_text[i + 1 :])
+            elif c == "{":
+                delta_to_be_parsed += c
+                stop_after_opening_curly_braces -= 1
+                if stop_after_opening_curly_braces == 0:
+                    return (delta_to_be_parsed, delta_text[i + 1 :])
+            elif c == "}":
+                delta_to_be_parsed += c
+                stop_after_closing_curly_braces -= 1
+                if stop_after_closing_curly_braces == 0:
+                    return (delta_to_be_parsed, delta_text[i + 1 :])
+            elif c == "]":
+                delta_to_be_parsed += c
+                stop_after_closing_brackets -= 1
+                if stop_after_closing_brackets == 0:
+                    return (delta_to_be_parsed, delta_text[i + 1 :])
+            elif c == ":":
+                delta_to_be_parsed += c
+                stop_after_colon -= 1
+                if stop_after_colon == 0:
+                    return (delta_to_be_parsed, delta_text[i + 1 :])
+            elif c == ",":
+                delta_to_be_parsed += c
+                stop_after_comma -= 1
+                if stop_after_comma == 0:
+                    return (delta_to_be_parsed, delta_text[i + 1 :])
+            else:
+                delta_to_be_parsed += c
 
-        except Exception:
-            logger.exception("Error trying to handle streaming tool call.")
-            logger.debug(
-                "Skipping chunk as a result of tool streaming extraction error"
-            )
-            return None
+        return (delta_to_be_parsed, "")
-- 
GitLab


From 19bee6d12d985c231b16374c99836376fc0c5706 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Wed, 3 Dec 2025 13:04:59 -0500
Subject: [PATCH 052/258] [Performance][DP/EP] Add
 silu_mul_per_token_group_quant_fp8_colmajor kernel (#29470)

Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 .../benchmark_2d_silu_mul_fp8_quant.py        | 244 ++++++++++++++++++
 ..._mul_per_token_group_quant_fp8_colmajor.py |  86 ++++++
 .../layers/fused_moe/deep_gemm_moe.py         | 114 +++-----
 .../layers/quantization/utils/fp8_utils.py    | 133 ++++++++++
 4 files changed, 496 insertions(+), 81 deletions(-)
 create mode 100644 benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
 create mode 100644 tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py

diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
new file mode 100644
index 000000000..04921dafb
--- /dev/null
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Any
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    _per_token_group_quant_fp8_colmajor,
+    silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+from .utils import ArgPool, Bench, CudaGraphBenchParams
+
+GROUP_SIZE = 128
+FLOAT8_T = torch.float8_e4m3fn
+
+
+def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
+    print(
+        f"Note : The timings reported above is for {cuda_graph_nops} "
+        "consecutive invocations of the benchmarking functions. "
+        f"Please divide by {cuda_graph_nops} for single invocation "
+        "timings."
+    )
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+
+class ImplType(Enum):
+    SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
+    REFERENCE = 2
+
+    def get_impl(self):
+        if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+            return silu_mul_per_token_group_quant_fp8_colmajor
+        elif self == ImplType.REFERENCE:
+            return reference
+        raise ValueError(f"Unrecognized ImplType {self}")
+
+
+@dataclass
+class BenchmarkTensors:
+    input: torch.Tensor
+    output: torch.Tensor
+
+    # Reference act output tensor
+    ref_act_out: torch.Tensor
+    ref_quant_out: torch.Tensor
+
+    @staticmethod
+    def make(T: int, N: int) -> "BenchmarkTensors":
+        assert T % GROUP_SIZE == 0
+        assert N % (GROUP_SIZE * 2) == 0
+
+        input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
+
+        # silu_mul_per_token_group_quant_fp8_colmajor output.
+        output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
+            FLOAT8_T
+        )
+
+        # reference output.
+        ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
+        ref_quant_out = torch.empty(
+            (T, N // 2), dtype=torch.bfloat16, device="cuda"
+        ).to(FLOAT8_T)
+
+        return BenchmarkTensors(
+            input=input,
+            output=output,
+            ref_act_out=ref_act_out,
+            ref_quant_out=ref_quant_out,
+        )
+
+    @property
+    def T(self):
+        return self.input.size(0)
+
+    @property
+    def N(self):
+        return self.input.size(1)
+
+    def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
+        if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+            return {
+                "input": self.input,
+                "output": self.output,
+                "use_ue8m0": is_deep_gemm_e8m0_used(),
+            }
+        elif impl_type == ImplType.REFERENCE:
+            return {
+                "input": self.input,
+                "act_out": self.ref_act_out,
+                "quant_out": self.ref_quant_out,
+                "use_ue8m0": is_deep_gemm_e8m0_used(),
+            }
+        raise ValueError(f"Unrecognized impl_type {impl_type}")
+
+
+def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
+    """
+    Reference triton quant kernel from,
+    vllm.model_executor.layers.quantization.utils.fp8_utils
+    """
+    assert quant_out.size() == x.size()
+    # Allocate the scale tensor column-major format.
+    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+    x_q = quant_out
+    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+    M = x.numel() // GROUP_SIZE
+    N = GROUP_SIZE
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+
+    finfo = torch.finfo(FLOAT8_T)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    _per_token_group_quant_fp8_colmajor[(M,)](
+        x,
+        x_q,
+        x_s,
+        GROUP_SIZE,
+        x.shape[1],
+        x.stride(0),
+        x_s.stride(1),
+        eps=1e-10,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        use_ue8m0=use_ue8m0,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return x_q, x_s
+
+
+def reference(
+    input: torch.Tensor,
+    act_out: torch.Tensor,
+    quant_out: torch.Tensor,
+    use_ue8m0: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    torch.ops._C.silu_and_mul(act_out, input)
+    return reference_quant(act_out, quant_out, use_ue8m0)
+
+
+def bench_impl(
+    bench_tensors: list[BenchmarkTensors], impl_type: ImplType
+) -> TMeasurement:
+    T = bench_tensors[0].T
+    N = bench_tensors[0].N
+
+    arg_pool_size = len(bench_tensors)
+    kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
+
+    # warmup
+    for kwargs in kwargs_list:
+        impl_type.get_impl()(**kwargs)
+    torch.cuda.synchronize()
+
+    # Merge into a single kwargs and qualify arguments as ArgPool
+    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+    for _kwargs in kwargs_list:
+        for k, v in _kwargs.items():
+            kwargs[k].values.append(v)
+
+    cuda_graph_params = None
+    cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
+    timer = None
+    with Bench(
+        cuda_graph_params,
+        "silu-mul-quant",
+        f"num_tokens={T}, N={N}",
+        impl_type.name,
+        impl_type.get_impl(),
+        **kwargs,
+    ) as bench:
+        timer = bench.run()
+    return timer
+
+
+def test_correctness(T: int, N: int):
+    print(f"Testing num_tokens={T}, N={N} ...")
+
+    bench_tensor = BenchmarkTensors.make(T, N)
+
+    def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
+        return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
+
+    # reference output
+    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
+
+    # test ouptut
+    out_q, out_s = output_from_impl(
+        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+    )
+
+    torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
+    torch.testing.assert_close(ref_out_s, out_s)
+
+
+def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
+    timers = []
+    for N, T in product(Ns, Ts):
+        test_correctness(T, N)
+
+        bench_tensors: list[BenchmarkTensors] = [
+            BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
+        ]
+
+        silu_mul_quant_timer = bench_impl(
+            bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+        )
+        timers.append(silu_mul_quant_timer)
+        reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
+        timers.append(reference_timer)
+
+        print_timers(
+            [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
+        )
+
+    print_timers(timers, cuda_graph_nops=arg_pool_size)
+
+    return timers
+
+
+if __name__ == "__main__":
+    T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
+    N = [2048, 4096, 8192]
+
+    print(f"T = {T}, N = {N}")
+    run(T, N, arg_pool_size=8)
diff --git a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
new file mode 100644
index 000000000..e4617072c
--- /dev/null
+++ b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    _per_token_group_quant_fp8_colmajor,
+    silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+FLOAT8_DTYPE = torch.float8_e4m3fn
+GROUP_SIZE = 128
+
+
+def reference_quant(x: torch.Tensor, use_ue8m0: bool):
+    """
+    Reference triton quant kernel from,
+    vllm.model_executor.layers.quantization.utils.fp8_utils
+    """
+
+    x_q = torch.empty_like(x, device=x.device, dtype=FLOAT8_DTYPE)
+
+    # Allocate the scale tensor in column-major format.
+    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+    M = x.numel() // GROUP_SIZE
+    N = GROUP_SIZE
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+
+    finfo = torch.finfo(FLOAT8_DTYPE)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    _per_token_group_quant_fp8_colmajor[(M,)](
+        x,
+        x_q,
+        x_s,
+        GROUP_SIZE,
+        x.shape[1],
+        x.stride(0),
+        x_s.stride(1),
+        eps=1e-10,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        use_ue8m0=use_ue8m0,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return x_q, x_s
+
+
+def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Tensor]:
+    T, N = x.size()
+    ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
+    torch.ops._C.silu_and_mul(ref_act_out, x)
+    return reference_quant(ref_act_out, use_ue8m0)
+
+
+@pytest.mark.parametrize("T", [128, 256, 512])
+@pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2])
+def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
+    current_platform.seed_everything(42)
+
+    input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
+
+    use_ue8m0 = is_deep_gemm_e8m0_used()
+
+    # Test
+    output, output_scales = silu_mul_per_token_group_quant_fp8_colmajor(
+        input, use_ue8m0=use_ue8m0
+    )
+
+    # Reference
+    ref_output, ref_output_scales = reference(input, use_ue8m0)
+
+    torch.testing.assert_close(output.to(torch.float32), ref_output.to(torch.float32))
+    torch.testing.assert_close(output_scales, ref_output_scales)
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 86cdd25f2..9f47e692d 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -2,9 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
-from tqdm import tqdm
 
-import vllm.envs as env
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
@@ -25,12 +23,12 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
+    silu_mul_per_token_group_quant_fp8_colmajor,
 )
 from vllm.utils.deep_gemm import (
     get_mk_alignment_for_contiguous_layout,
     m_grouped_fp8_gemm_nt_contiguous,
 )
-from vllm.utils.func_utils import run_once
 from vllm.utils.import_utils import has_deep_gemm
 
 logger = init_logger(__name__)
@@ -108,70 +106,6 @@ def _valid_deep_gemm(
     return True
 
 
-@run_once
-def warmup_deepgemm_gg_contiguous_kernels(
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
-    num_topk: int,
-):
-    """
-    DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the
-    input tensor shapes. In this function, we construct all possible input
-    tensor shapes so all the kernels are JIT'ed and cached.
-    Note that this warmup is expected to happen during the model profile
-    call and not during actual model inference.
-    """
-
-    assert w1.size(0) == w2.size(0), "w1 and w2 must have the same number of experts"
-
-    block_m = get_mk_alignment_for_contiguous_layout()[0]
-    num_experts = w1.size(0)
-    device = w1.device
-
-    # This is the maximum GroupedGemm M size that we expect to run
-    # the grouped_gemm with.
-    MAX_M = compute_aligned_M(
-        env.VLLM_FUSED_MOE_CHUNK_SIZE,
-        num_topk,
-        num_experts,
-        block_m,
-        expert_tokens_meta=None,
-    )
-    # Distribute expert-ids evenly.
-    MAX_BLOCKS = MAX_M // block_m
-    expert_ids_block = torch.randint(
-        low=0, high=num_experts, size=(MAX_BLOCKS,), device=device, dtype=torch.int32
-    )
-    expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0)
-
-    def _warmup(w: torch.Tensor, w_scale: torch.Tensor):
-        _, n, k = w.size()
-        a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn)
-        a1q_scales = torch.empty(
-            (MAX_M, k // block_m), device=device, dtype=torch.float32
-        )
-        out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16)
-
-        pbar = tqdm(
-            total=MAX_BLOCKS, desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})"
-        )
-        num_tokens = MAX_M
-        while num_tokens > 0:
-            m_grouped_fp8_gemm_nt_contiguous(
-                (a1q[:num_tokens], a1q_scales[:num_tokens]),
-                (w, w_scale),
-                out[:num_tokens],
-                expert_ids[:num_tokens],
-            )
-            pbar.update(1)
-            num_tokens = num_tokens - block_m
-
-    _warmup(w1, w1_scale)
-    _warmup(w2, w2_scale)
-
-
 class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def __init__(self, quant_config: FusedMoEQuantConfig):
         super().__init__(quant_config)
@@ -215,11 +149,32 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         )
         assert M_sum % block_m == 0
 
-        workspace1 = (M_sum, N)
-        workspace2 = (M_sum, max(N // 2, K))
+        workspace1 = (M_sum, max(N // 2, K))
+        workspace2 = (M_sum, max(N, K))
         output = (M, K)
         return (workspace1, workspace2, output)
 
+    def _act_mul_quant(
+        self, input: torch.Tensor, output: torch.Tensor, activation: str
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if activation == "silu":
+            return silu_mul_per_token_group_quant_fp8_colmajor(
+                input=input, output=output
+            )
+        else:
+            # This is a fallback path. If we find ourselves using any activation other
+            # than silu, we should add that activation to
+            # silu_mul_per_token_group_quant_fp8_colmajor kernel as it is much faster.
+            M_sum, N = input.size()
+            act_out = torch.empty(
+                (M_sum, N // 2), dtype=input.dtype, device=input.device
+            )
+            self.activation(activation, act_out, input)
+            assert self.block_shape is not None
+            return per_token_group_quant_fp8(
+                act_out, self.block_shape[1], column_major_scales=True, out_q=output
+            )
+
     def apply(
         self,
         output: torch.Tensor,
@@ -261,14 +216,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             expert_tokens_meta=expert_tokens_meta,
         )
 
-        a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), (M_sum, K))
-        mm1_out = _resize_cache(workspace13, (M_sum, N))
-        act_out = _resize_cache(workspace2, (M_sum, N // 2))
-        quant_out = _resize_cache(
-            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, N // 2)
+        a1q_perm = _resize_cache(
+            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, K)
         )
-        mm2_out = _resize_cache(workspace2, (M_sum, K))
-
         a1q, a1q_scale, expert_ids, inv_perm = deepgemm_moe_permute(
             aq=a1q,
             aq_scale=a1q_scale,
@@ -280,17 +230,19 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         )
         assert a1q.size(0) == M_sum
 
+        mm1_out = _resize_cache(workspace2, (M_sum, N))
         m_grouped_fp8_gemm_nt_contiguous(
             (a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids
         )
 
-        self.activation(activation, act_out, mm1_out.view(-1, N))
-
-        a2q_scale: torch.Tensor | None = None
-        a2q, a2q_scale = per_token_group_quant_fp8(
-            act_out, self.block_shape[1], column_major_scales=True, out_q=quant_out
+        quant_out = _resize_cache(
+            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, N // 2)
+        )
+        a2q, a2q_scale = self._act_mul_quant(
+            input=mm1_out.view(-1, N), output=quant_out, activation=activation
         )
 
+        mm2_out = _resize_cache(workspace2, (M_sum, K))
         m_grouped_fp8_gemm_nt_contiguous(
             (a2q, a2q_scale), (w2, self.w2_scale), mm2_out, expert_ids
         )
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index ae63b4a76..6e73833d1 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -492,6 +492,139 @@ def _per_token_group_quant_fp8(
     tl.store(y_s_ptr, y_s)
 
 
+@triton.jit
+def _silu_mul_per_token_group_quant_fp8_colmajor(
+    y_ptr,  # [M, N]
+    y_q_ptr,  # [M, N // 2]
+    y_s_ptr,  # [M, (N // 2) // GROUP_SIZE]
+    M,  # num tokens
+    N,  # intermediate size
+    # Stride
+    y_s_col_stride: tl.int64,
+    # Information for float8
+    eps,
+    fp8_min,
+    fp8_max,
+    use_ue8m0: tl.constexpr,
+    # Meta-parameters
+    GROUP_SIZE: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    # TODO(varun) : Add expert_ids so we may early-exit no-op thread blocks.
+    """
+    Each thread block (BLOCK_N) computes [BLOCK_M, GROUP_SIZE] act-mul outputs. Then
+    the thread block quantizes the [BLOCK_M, GROUP_SIZE] block of values and fills
+    the outputs tensors at the right positions.
+    """
+
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+    N_2 = N // 2
+
+    m_offset = pid_m * BLOCK_M
+    n_offset = pid_n * BLOCK_N
+    if m_offset >= M:
+        return
+
+    offs_n = tl.arange(0, BLOCK_N).to(tl.int64)
+    offs_m = tl.arange(0, BLOCK_M).to(tl.int64)
+
+    base_y_ptr = y_ptr + m_offset * N + n_offset
+
+    act_in_ptrs = base_y_ptr + offs_m[:, None] * N + offs_n[None, :]
+
+    act_in = tl.load(act_in_ptrs)
+    mul_in = tl.load(act_in_ptrs + N_2)
+
+    # silu & mul
+    act_in = act_in.to(tl.float32)
+    one_f32 = tl.cast(1, tl.float32)
+    silu_out = (act_in / (one_f32 + tl.exp(-act_in))).to(y_ptr.dtype.element_ty)
+    y = (silu_out * mul_in).to(tl.float32)
+
+    # quant
+    _absmax = tl.maximum(tl.max(tl.abs(y), axis=1), eps)
+    scale_raw = _absmax / fp8_max
+    y_s = tl.math.exp2(tl.ceil(tl.log2(scale_raw))) if use_ue8m0 else scale_raw
+    y_s = tl.reshape(y_s, (BLOCK_M, 1))
+    y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+    # store y_q
+    base_y_q_ptr = y_q_ptr + m_offset * N_2 + n_offset
+    y_q_ptrs = base_y_q_ptr + offs_m[:, None] * N_2 + offs_n[None, :]
+    tl.store(y_q_ptrs, y_q)
+
+    # store y_s
+    group_id = n_offset // GROUP_SIZE
+    base_y_s_ptr = y_s_ptr + group_id * y_s_col_stride + m_offset
+    y_s_ptrs = base_y_s_ptr + offs_m
+    y_s = tl.reshape(y_s, (BLOCK_M,))
+    tl.store(y_s_ptrs, y_s)
+
+
+def silu_mul_per_token_group_quant_fp8_colmajor(
+    input: torch.Tensor,  # [M, N]
+    output: torch.Tensor | None = None,  # [M, N // 2]
+    use_ue8m0: bool | None = None,
+    eps: float = 1e-10,
+):
+    """
+    silu+mul + block-fp8 quant with group size 128.
+    """
+    GROUP_SIZE = 128
+    assert input.ndim == 2
+    if output is not None:
+        assert output.ndim == 2
+    assert input.size(0) % GROUP_SIZE == 0
+    assert input.size(1) % (GROUP_SIZE * 2) == 0
+
+    if use_ue8m0 is None:
+        use_ue8m0 = is_deep_gemm_e8m0_used()
+
+    M, N = input.size()
+    N_2 = N // 2
+
+    if output is None:
+        output = torch.empty((M, N_2), dtype=torch.float8_e4m3fn, device=input.device)
+
+    output_scales = torch.empty(
+        ((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device
+    ).transpose(0, 1)
+
+    BLOCK_M = 8
+    BLOCK_N = GROUP_SIZE
+    assert M % BLOCK_M == 0
+    assert N_2 % BLOCK_N == 0
+
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    # Force even division so we can avoid edgecases within the kernel.
+    assert M % BLOCK_M == 0
+    assert N_2 % BLOCK_N == 0
+    grid = (M // BLOCK_M, N_2 // BLOCK_N)
+
+    _silu_mul_per_token_group_quant_fp8_colmajor[grid](
+        input,
+        output,
+        output_scales,
+        M,
+        N,
+        output_scales.stride(-1),
+        eps,
+        fp8_min,
+        fp8_max,
+        use_ue8m0,
+        GROUP_SIZE,
+        BLOCK_M,
+        BLOCK_N,
+    )
+
+    return output, output_scales
+
+
 @triton.jit
 def _per_token_group_quant_fp8_colmajor(
     # Pointers to inputs and output
-- 
GitLab


From afe9eb408ee1191cd57a68d46b6ce2860b1b41e1 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Thu, 4 Dec 2025 02:50:53 +0800
Subject: [PATCH 053/258] [Bugfix] Fix flashinfer ar+norm kernel not available
 issue (#29960)

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 vllm/compilation/fix_functionalization.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index 76068f86e..2625562aa 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -104,7 +104,8 @@ class FixFunctionalizationPass(VllmInductorPass):
                 mutated_args = {1: "result"}
                 self.defunctionalize(graph, node, mutated_args)
             elif (
-                at_target
+                hasattr(torch.ops.vllm, "flashinfer_trtllm_fused_allreduce_norm")
+                and at_target
                 == torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default
             ):
                 mutated_args = {
-- 
GitLab


From 2fc5d6e0d7596dd93dbf4e1ca776f17449bb2143 Mon Sep 17 00:00:00 2001
From: Yongtao Huang <yongtaoh2022@gmail.com>
Date: Thu, 4 Dec 2025 04:14:44 +0800
Subject: [PATCH 054/258] Fix LLMEngine.del dp_group cleanup condition (#29954)

Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
---
 vllm/v1/engine/llm_engine.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index d21cdf04e..8772f2e48 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -409,8 +409,6 @@ class LLMEngine:
         return self.collective_rpc("apply_model", args=(func,))
 
     def __del__(self):
-        if (
-            dp_group := getattr(self, "dp_group", None)
-            and not self.external_launcher_dp
-        ):
+        dp_group = getattr(self, "dp_group", None)
+        if dp_group is not None and not self.external_launcher_dp:
             stateless_destroy_torch_distributed_process_group(dp_group)
-- 
GitLab


From ac1886588fd8799ff874b860b6c266a84d5a2b2b Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:16:54 -0500
Subject: [PATCH 055/258] [CI] Fix re import error (#29973)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .buildkite/scripts/generate-nightly-index.py     | 3 ++-
 vllm/entrypoints/serve/instrumentator/metrics.py | 3 +--
 vllm/tokenizers/deepseek_v32_encoding.py         | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
index 8d09ba178..4d28ec961 100644
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -7,13 +7,14 @@
 
 import argparse
 import json
-import re
 import sys
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any
 from urllib.parse import quote
 
+import regex as re
+
 if not sys.version_info >= (3, 12):
     raise RuntimeError("This script requires Python 3.12 or higher.")
 
diff --git a/vllm/entrypoints/serve/instrumentator/metrics.py b/vllm/entrypoints/serve/instrumentator/metrics.py
index efe0c63a9..523145138 100644
--- a/vllm/entrypoints/serve/instrumentator/metrics.py
+++ b/vllm/entrypoints/serve/instrumentator/metrics.py
@@ -2,9 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 
-import re
-
 import prometheus_client
+import regex as re
 from fastapi import FastAPI, Response
 from prometheus_client import make_asgi_app
 from prometheus_fastapi_instrumentator import Instrumentator
diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py
index 72f43395b..fb8989e65 100644
--- a/vllm/tokenizers/deepseek_v32_encoding.py
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
@@ -5,9 +5,10 @@
 # copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
 import copy
 import json
-import re
 from typing import Any
 
+import regex as re
+
 # flake8: noqa: E501
 TOOLS_SYSTEM_TEMPLATE = """## Tools
 You have access to a set of tools you can use to answer the user's question.
-- 
GitLab


From 2902c348265639de300c95cbcae1c26486f57ac7 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:49:00 -0500
Subject: [PATCH 056/258] [Kernels] Remove BatchedTritonOrDeepGemmExperts and
 default fallback to Triton (#29929)

Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 docs/design/moe_kernel_features.md            |   3 +-
 .../moe/modular_kernel_tools/mk_objects.py    |  17 --
 .../layers/fused_moe/__init__.py              |   4 -
 .../batched_triton_or_deep_gemm_moe.py        | 180 ------------------
 .../compressed_tensors_moe.py                 |  59 ++++--
 5 files changed, 46 insertions(+), 217 deletions(-)
 delete mode 100644 vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 44aaa6521..48341d199 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -90,7 +90,6 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
 | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
 | flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
 | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
-| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
 | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
 | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
 | pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
@@ -114,5 +113,5 @@ The following table shows "families" of modular kernels that are intended to wor
 | backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
 |---------|-----------------------------------------|----------------------------------------------|
 | deepep_high_throughput | `DeepEPHTPrepareAndFinalize` |  `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
-| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` |  `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
+| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` |  `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
 | flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
index d79fdfbe0..99b168dc7 100644
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
     BatchedDeepGemmExperts,
 )
-from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
-    BatchedTritonOrDeepGemmExperts,
-)
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
@@ -286,16 +283,6 @@ if has_deep_gemm() and is_deep_gemm_supported():
         needs_matching_quant=False,
         needs_deep_gemm=True,
     )
-    register_experts(
-        BatchedTritonOrDeepGemmExperts,
-        batched_format,
-        common_float_and_int_types,
-        blocked_quantization_support=True,
-        supports_chunking=False,
-        supports_expert_map=False,
-        needs_matching_quant=True,
-        needs_deep_gemm=True,
-    )
     register_experts(
         TritonOrDeepGemmExperts,
         standard_format,
@@ -457,10 +444,6 @@ def make_fused_experts(
         kwargs = batch_kwargs | quant_kwargs
         print(f"Making BatchedTritonExperts {kwargs} ...")
         experts = BatchedTritonExperts(**kwargs)
-    elif fused_experts_type == BatchedTritonOrDeepGemmExperts:
-        kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs
-        print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...")
-        experts = BatchedTritonOrDeepGemmExperts(**kwargs)
     elif fused_experts_type == DeepGemmExperts:
         print(f"Making DeepGemmExperts {quant_config} ...")
         experts = DeepGemmExperts(quant_config)
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 669abcb3d..9103e84aa 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -60,9 +60,6 @@ if HAS_TRITON:
     from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
         BatchedDeepGemmExperts,
     )
-    from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
-        BatchedTritonOrDeepGemmExperts,
-    )
     from vllm.model_executor.layers.fused_moe.cutlass_moe import (
         CutlassBatchedExpertsFp8,
         CutlassExpertsFp8,
@@ -98,7 +95,6 @@ if HAS_TRITON:
         "DeepGemmExperts",
         "BatchedDeepGemmExperts",
         "TritonOrDeepGemmExperts",
-        "BatchedTritonOrDeepGemmExperts",
     ]
 else:
     # Some model classes directly use the custom ops. Add placeholders
diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
deleted file mode 100644
index e69e9fd30..000000000
--- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    BatchedDeepGemmExperts,
-)
-from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
-from vllm.utils.deep_gemm import get_mk_alignment_for_contiguous_layout
-
-
-class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-    def __init__(
-        self,
-        max_num_tokens: int,
-        num_dispatchers: int,
-        quant_config: FusedMoEQuantConfig,
-        allow_deep_gemm: bool = False,
-    ):
-        super().__init__(quant_config)
-
-        self.batched_triton_experts = BatchedTritonExperts(
-            max_num_tokens=max_num_tokens,
-            num_dispatchers=num_dispatchers,
-            quant_config=self.quant_config,
-        )
-
-        self.allow_deep_gemm = (
-            allow_deep_gemm
-            and self.quant_config.use_fp8_w8a8
-            and self.block_shape == get_mk_alignment_for_contiguous_layout()
-        )
-
-        self.batched_deep_gemm_experts = (
-            BatchedDeepGemmExperts(
-                max_num_tokens=max_num_tokens,
-                num_dispatchers=num_dispatchers,
-                quant_config=self.quant_config,
-            )
-            if self.allow_deep_gemm
-            else None
-        )
-
-        assert (
-            self.batched_deep_gemm_experts is not None
-            or self.batched_triton_experts is not None
-        )
-
-    @property
-    def activation_formats(
-        self,
-    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        if self.batched_triton_experts is not None:
-            assert (
-                self.batched_deep_gemm_experts is None
-                or self.batched_deep_gemm_experts.activation_formats
-                == self.batched_triton_experts.activation_formats
-            )
-            return self.batched_triton_experts.activation_formats
-        else:
-            assert self.batched_deep_gemm_experts is not None
-            return self.batched_deep_gemm_experts.activation_formats
-
-    def supports_chunking(self) -> bool:
-        bdge = self.batched_deep_gemm_experts
-        bte = self.batched_triton_experts
-        return (bdge is None or bdge.supports_chunking()) and (
-            bte is None or bte.supports_chunking()
-        )
-
-    def supports_expert_map(self) -> bool:
-        bdge = self.batched_deep_gemm_experts
-        bte = self.batched_triton_experts
-        return (bdge is None or bdge.supports_expert_map()) and (
-            bte is None or bte.supports_expert_map()
-        )
-
-    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
-        bdge = self.batched_deep_gemm_experts
-        bte = self.batched_triton_experts
-        bdge_war = bdge.finalize_weight_and_reduce_impl() if bdge else None
-        bte_war = bte.finalize_weight_and_reduce_impl() if bte else None
-        is_bdge_war = bdge_war is not None
-        is_bte_war = bte_war is not None
-
-        if is_bdge_war and is_bte_war:
-            assert bdge_war == bte_war, (
-                "Both implementations should agree on WeightAndReduce impls. "
-                f"Got bdge_war: {bdge_war}, and bte_war: {bte_war}"
-            )
-
-        if bdge_war is not None:
-            return bdge_war
-
-        assert bte_war is not None
-        return bte_war
-
-    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
-        return act_dtype
-
-    def workspace_shapes(
-        self,
-        M: int,
-        N: int,
-        K: int,
-        topk: int,
-        global_num_experts: int,
-        local_num_experts: int,
-        expert_tokens_metadata: mk.ExpertTokensMetadata | None,
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
-        # Note: the deep gemm workspaces are strictly larger than the triton
-        # workspaces so we can be pessimistic here and allocate for DeepGemm
-        # even if we fall back to triton later, e.g. if expert maps are set.
-        if self.allow_deep_gemm:
-            assert self.batched_deep_gemm_experts is not None
-            return self.batched_deep_gemm_experts.workspace_shapes(
-                M,
-                N,
-                K,
-                topk,
-                global_num_experts,
-                local_num_experts,
-                expert_tokens_metadata,
-            )
-        else:
-            assert self.batched_triton_experts is not None
-            return self.batched_triton_experts.workspace_shapes(
-                M,
-                N,
-                K,
-                topk,
-                global_num_experts,
-                local_num_experts,
-                expert_tokens_metadata,
-            )
-
-    def apply(
-        self,
-        output: torch.Tensor,
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: str,
-        global_num_experts: int,
-        expert_map: torch.Tensor | None,
-        a1q_scale: torch.Tensor | None,
-        a2_scale: torch.Tensor | None,
-        workspace13: torch.Tensor,
-        workspace2: torch.Tensor,
-        expert_tokens_meta: mk.ExpertTokensMetadata | None,
-        apply_router_weight_on_input: bool,
-    ):
-        experts = (
-            self.batched_deep_gemm_experts
-            if self.allow_deep_gemm
-            else self.batched_triton_experts
-        )
-        assert experts is not None
-        experts.apply(
-            output,
-            hidden_states,
-            w1,
-            w2,
-            topk_weights,
-            topk_ids,
-            activation,
-            global_num_experts,
-            expert_map,
-            a1q_scale,
-            a2_scale,
-            workspace13,
-            workspace2,
-            expert_tokens_meta,
-            apply_router_weight_on_input,
-        )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index c7368bf42..d7fb6d2ca 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -90,8 +90,10 @@ from vllm.platforms import CpuArchEnum, current_platform
 from vllm.scalar_type import scalar_types
 from vllm.utils.deep_gemm import (
     get_col_major_tma_aligned_tensor,
+    get_mk_alignment_for_contiguous_layout,
     is_deep_gemm_e8m0_used,
 )
+from vllm.utils.import_utils import has_deep_gemm
 
 logger = init_logger(__name__)
 
@@ -1088,9 +1090,11 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
             return experts
 
-        # triton path
-        from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
-            BatchedTritonOrDeepGemmExperts,
+        from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+            BatchedDeepGemmExperts,
+        )
+        from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+            BatchedTritonExperts,
         )
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
             TritonOrDeepGemmExperts,
@@ -1098,6 +1102,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
         assert not self.rocm_aiter_moe_enabled and not self.use_marlin
 
+        use_deep_gemm = envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
+
         if (
             prepare_finalize.activation_format
             == FusedMoEActivationFormat.BatchedExperts
@@ -1105,22 +1111,47 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
             assert max_num_tokens_per_rank is not None
 
-            logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
-            return BatchedTritonOrDeepGemmExperts(
-                max_num_tokens=max_num_tokens_per_rank,
-                num_dispatchers=prepare_finalize.num_dispatchers(),
-                quant_config=self.moe_quant_config,
-                allow_deep_gemm=(
-                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
-                ),
+            if use_deep_gemm and not has_deep_gemm():
+                raise RuntimeError(
+                    "DeepGEMM requested for MoE layer but not installed."
+                )
+
+            compatible_with_deep_gemm = (
+                self.moe_quant_config.use_fp8_w8a8
+                and self.moe_quant_config.block_shape
+                == get_mk_alignment_for_contiguous_layout()
             )
+
+            # If this MoE layer is compatible with DeepGEMM, the proper env
+            # vars are set and DeepGEMM is not installed, throw an error.
+            if use_deep_gemm and compatible_with_deep_gemm and not has_deep_gemm():
+                raise RuntimeError(
+                    f"MoE layer incompatible with DeepGEMM, expected "
+                    f"fp8==True, got {self.moe_quant_config.use_fp8_w8a8}"
+                    f"or block_shape {self.moe_quant_config.block_shape}"
+                    f"=={get_mk_alignment_for_contiguous_layout()}."
+                )
+
+            if use_deep_gemm and compatible_with_deep_gemm and has_deep_gemm():
+                logger.debug("BatchedDeepGemmExperts(%s)", self.__class__.__name__)
+                return BatchedDeepGemmExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+            else:
+                logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
+                return BatchedTritonExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+
         else:
             logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
             return TritonOrDeepGemmExperts(
                 self.moe_quant_config,
-                allow_deep_gemm=(
-                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
-                ),
+                allow_deep_gemm=use_deep_gemm,
             )
 
     def get_fused_moe_quant_config(
-- 
GitLab


From b5407869c8594d8e3c4ee3c09ff7cfe454be0798 Mon Sep 17 00:00:00 2001
From: Elizabeth Thomas <email2eliza@gmail.com>
Date: Wed, 3 Dec 2025 16:00:52 -0600
Subject: [PATCH 057/258] [Bugfix] Respect VLLM_CONFIGURE_LOGGING value
 (#28671)

Signed-off-by: Elizabeth Thomas <email2eliza@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Jane Xu <janeyx@meta.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Johnny Yang <johnnyyang@google.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: bruceszchen <bruceszchen@tencent.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Johnny Yang <24908445+jcyang43@users.noreply.github.com>
---
 tests/test_envs.py         | 51 ++++++++++++++++++++++++++++++++++++++
 vllm/envs.py               |  6 +++--
 vllm/utils/system_utils.py |  4 +++
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/tests/test_envs.py b/tests/test_envs.py
index 6a9835a68..11bbec382 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -365,3 +365,54 @@ class TestEnvSetWithChoices:
         with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}):
             env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
             assert env_func() == {"option1", "option2"}
+
+
+class TestVllmConfigureLogging:
+    """Test cases for VLLM_CONFIGURE_LOGGING environment variable."""
+
+    def test_configure_logging_defaults_to_true(self):
+        """Test that VLLM_CONFIGURE_LOGGING defaults to True when not set."""
+        # Ensure the env var is not set
+        with patch.dict(os.environ, {}, clear=False):
+            if "VLLM_CONFIGURE_LOGGING" in os.environ:
+                del os.environ["VLLM_CONFIGURE_LOGGING"]
+
+            # Clear cache if it exists
+            if hasattr(envs.__getattr__, "cache_clear"):
+                envs.__getattr__.cache_clear()
+
+            result = envs.VLLM_CONFIGURE_LOGGING
+            assert result is True
+            assert isinstance(result, bool)
+
+    def test_configure_logging_with_zero_string(self):
+        """Test that VLLM_CONFIGURE_LOGGING='0' evaluates to False."""
+        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "0"}):
+            # Clear cache if it exists
+            if hasattr(envs.__getattr__, "cache_clear"):
+                envs.__getattr__.cache_clear()
+
+            result = envs.VLLM_CONFIGURE_LOGGING
+            assert result is False
+            assert isinstance(result, bool)
+
+    def test_configure_logging_with_one_string(self):
+        """Test that VLLM_CONFIGURE_LOGGING='1' evaluates to True."""
+        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "1"}):
+            # Clear cache if it exists
+            if hasattr(envs.__getattr__, "cache_clear"):
+                envs.__getattr__.cache_clear()
+
+            result = envs.VLLM_CONFIGURE_LOGGING
+            assert result is True
+            assert isinstance(result, bool)
+
+    def test_configure_logging_with_invalid_value_raises_error(self):
+        """Test that invalid VLLM_CONFIGURE_LOGGING value raises ValueError."""
+        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "invalid"}):
+            # Clear cache if it exists
+            if hasattr(envs.__getattr__, "cache_clear"):
+                envs.__getattr__.cache_clear()
+
+            with pytest.raises(ValueError, match="invalid literal for int"):
+                _ = envs.VLLM_CONFIGURE_LOGGING
diff --git a/vllm/envs.py b/vllm/envs.py
index 4b594e54f..60d91e985 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -37,7 +37,7 @@ if TYPE_CHECKING:
     VLLM_DISABLE_FLASHINFER_PREFILL: bool = False
     VLLM_DO_NOT_TRACK: bool = False
     VLLM_USAGE_SOURCE: str = ""
-    VLLM_CONFIGURE_LOGGING: int = 1
+    VLLM_CONFIGURE_LOGGING: bool = True
     VLLM_LOGGING_LEVEL: str = "INFO"
     VLLM_LOGGING_PREFIX: str = ""
     VLLM_LOGGING_STREAM: str = "ext://sys.stdout"
@@ -623,7 +623,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set to 0, vllm will not configure logging
     # If set to 1, vllm will configure logging using the default configuration
     #    or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
-    "VLLM_CONFIGURE_LOGGING": lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
+    "VLLM_CONFIGURE_LOGGING": lambda: bool(
+        int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
+    ),
     "VLLM_LOGGING_CONFIG_PATH": lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
     # this is used for configuring the default logging level
     "VLLM_LOGGING_LEVEL": lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(),
diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py
index a4eb8f4d4..76cac59c1 100644
--- a/vllm/utils/system_utils.py
+++ b/vllm/utils/system_utils.py
@@ -204,6 +204,10 @@ def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
 
 def decorate_logs(process_name: str | None = None) -> None:
     """Decorate stdout/stderr with process name and PID prefix."""
+    # Respect VLLM_CONFIGURE_LOGGING environment variable
+    if not envs.VLLM_CONFIGURE_LOGGING:
+        return
+
     if process_name is None:
         process_name = get_mp_context().current_process().name
 
-- 
GitLab


From 1109f98288b4a77f10e6f3b520b07005a0143b13 Mon Sep 17 00:00:00 2001
From: Shengqi Chen <harry-chen@outlook.com>
Date: Thu, 4 Dec 2025 06:08:19 +0800
Subject: [PATCH 058/258] [CI] fix docker image build by specifying merge-base
 commit id when downloading pre-compiled wheels (#29930)

Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
---
 .buildkite/generate_index.py                  | 46 -------------------
 docker/Dockerfile                             |  3 ++
 setup.py                                      | 25 +++++-----
 tests/standalone_tests/python_only_compile.sh |  6 ++-
 vllm/envs.py                                  |  6 ---
 5 files changed, 22 insertions(+), 64 deletions(-)
 delete mode 100644 .buildkite/generate_index.py

diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
deleted file mode 100644
index bbed80ebe..000000000
--- a/.buildkite/generate_index.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """<!DOCTYPE html>
-<html>
-    <body>
-    <h1>Links for vLLM</h1/>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
-    </body>
-</html>
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
-    print(f"Generated index.html for {args.wheel}")
-    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
-    if "x86_64" in filename:
-        x86_wheel = filename
-        arm_wheel = filename.replace("x86_64", "aarch64").replace(
-            "manylinux1", "manylinux2014"
-        )
-    elif "aarch64" in filename:
-        x86_wheel = filename.replace("aarch64", "x86_64").replace(
-            "manylinux2014", "manylinux1"
-        )
-        arm_wheel = filename
-    else:
-        raise ValueError(f"Unsupported wheel: {filename}")
-    # cloudfront requires escaping the '+' character
-    f.write(
-        template.format(
-            x86_wheel=x86_wheel,
-            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
-            arm_wheel=arm_wheel,
-            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
-        )
-    )
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 8bcd7f118..73cb4d7e0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 # Flag to control whether to use pre-built vLLM wheels
 ARG VLLM_USE_PRECOMPILED=""
+ARG VLLM_MERGE_BASE_COMMIT=""
 ARG VLLM_MAIN_CUDA_VERSION=""
 
 # Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
@@ -216,6 +217,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && export SCCACHE_IDLE_TIMEOUT=0 \
         && export CMAKE_BUILD_TYPE=Release \
         && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+        && export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
         && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
         && export VLLM_DOCKER_BUILD_CONTEXT=1 \
         && sccache --show-stats \
@@ -233,6 +235,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         rm -rf .deps && \
         mkdir -p .deps && \
         export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
+        export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
diff --git a/setup.py b/setup.py
index 8b2b4f7e5..af7282d4f 100644
--- a/setup.py
+++ b/setup.py
@@ -346,10 +346,13 @@ class precompiled_wheel_utils:
         The order of preference is:
         1. user-specified wheel location (can be either local or remote, via
            VLLM_PRECOMPILED_WHEEL_LOCATION)
-        2. user-specified variant from nightly repo (current main commit via
-           VLLM_PRECOMPILED_WHEEL_VARIANT)
+        2. user-specified variant (VLLM_PRECOMPILED_WHEEL_VARIANT) from nightly repo
         3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
-        4. the default variant from nightly repo (current main commit)
+        4. the default variant from nightly repo
+
+        If downloading from the nightly repo, the commit can be specified via
+        VLLM_PRECOMPILED_WHEEL_COMMIT; otherwise, the head commit in the main branch
+        is used.
         """
         wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
         if wheel_location is not None:
@@ -362,10 +365,13 @@ class precompiled_wheel_utils:
             # try to fetch the wheel metadata from the nightly wheel repo
             main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
             variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
-            commit = os.getenv(
-                "VLLM_PRECOMPILED_WHEEL_COMMIT",
-                precompiled_wheel_utils.get_base_commit_in_main_branch(),
-            )
+            commit = os.getenv("VLLM_PRECOMPILED_WHEEL_COMMIT", "").lower()
+            if not commit or len(commit) != 40:
+                print(
+                    f"VLLM_PRECOMPILED_WHEEL_COMMIT not valid: {commit}"
+                    ", trying to fetch base commit in main branch"
+                )
+                commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
             print(f"Using precompiled wheel commit {commit} with variant {variant}")
             try_default = False
             wheels, repo_url, download_filename = None, None, None
@@ -502,10 +508,6 @@ class precompiled_wheel_utils:
 
     @staticmethod
     def get_base_commit_in_main_branch() -> str:
-        # Force to use the nightly wheel. This is mainly used for CI testing.
-        if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
-            return "nightly"
-
         try:
             # Get the latest commit hash of the upstream main branch.
             resp_json = subprocess.check_output(
@@ -516,6 +518,7 @@ class precompiled_wheel_utils:
                 ]
             ).decode("utf-8")
             upstream_main_commit = json.loads(resp_json)["sha"]
+            print(f"Upstream main branch latest commit: {upstream_main_commit}")
 
             # In Docker build context, .git may be immutable or missing.
             if envs.VLLM_DOCKER_BUILD_CONTEXT:
diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh
index 7cc5ef659..d29b9afcc 100644
--- a/tests/standalone_tests/python_only_compile.sh
+++ b/tests/standalone_tests/python_only_compile.sh
@@ -5,6 +5,10 @@
 set -e
 set -x
 
+merge_base_commit=$(git merge-base HEAD origin/main)
+echo "Current merge base commit with main: $merge_base_commit"
+git show --oneline -s $merge_base_commit
+
 cd /vllm-workspace/
 
 # uninstall vllm
@@ -18,7 +22,7 @@ apt autoremove -y
 
 echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
 
-VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
+VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
 
 # Run the script
 python3 -c 'import vllm'
diff --git a/vllm/envs.py b/vllm/envs.py
index 60d91e985..2ed5816b3 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -80,7 +80,6 @@ if TYPE_CHECKING:
     VLLM_USE_PRECOMPILED: bool = False
     VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
-    VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
     CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None
     VERBOSE: bool = False
@@ -473,11 +472,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     .strip()
     .lower()
     in ("1", "true"),
-    # Whether to force using nightly wheel in python build.
-    # This is used for testing the nightly wheel in python build.
-    "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
-        int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0"))
-    ),
     # CMake build type
     # If not set, defaults to "Debug" or "RelWithDebInfo"
     # Available options: "Debug", "Release", "RelWithDebInfo"
-- 
GitLab


From ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 Mon Sep 17 00:00:00 2001
From: Xieyang Xu <xieyang@meta.com>
Date: Wed, 3 Dec 2025 17:02:02 -0800
Subject: [PATCH 059/258] enable multi-node in external launcher mode (#29833)

---
 vllm/config/parallel.py            |  8 ++++--
 vllm/distributed/parallel_state.py | 39 ++++++++++++++++--------------
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 4a8c8bc17..20de67225 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -593,10 +593,14 @@ class ParallelConfig:
                 "max_parallel_loading_workers is currently "
                 "not supported and will be ignored."
             )
-        if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
+        allowed_backends = ("mp", "uni", "external_launcher")
+        if (
+            self.distributed_executor_backend not in allowed_backends
+            and self.nnodes > 1
+        ):
             raise ValueError(
                 "nnodes > 1 can only be set when distributed executor "
-                "backend is mp or uni."
+                "backend is mp, uni or external_launcher."
             )
 
     @property
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index c82a77c21..f910f1040 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1169,17 +1169,13 @@ def init_distributed_environment(
     from vllm.config import get_current_vllm_config
 
     config = get_current_vllm_config()
-    if config is not None and config.parallel_config.nnodes > 1:
-        parallel_config = config.parallel_config
-        ip = parallel_config.master_addr
-        rank = parallel_config.data_parallel_rank * world_size + rank
-        world_size = parallel_config.world_size_across_dp
-        port = parallel_config.master_port
-        distributed_init_method = get_distributed_init_method(ip, port)
-    elif (
+    if (
         config is not None
-        and config.parallel_config.data_parallel_size > 1
         and config.parallel_config.distributed_executor_backend != "external_launcher"
+        and (
+            config.parallel_config.nnodes > 1
+            or config.parallel_config.data_parallel_size > 1
+        )
     ):
         parallel_config = config.parallel_config
         # adjust to take into account data parallelism
@@ -1187,15 +1183,22 @@ def init_distributed_environment(
         rank = parallel_config.data_parallel_rank * world_size + rank
         # adjust the world size to take into account data parallelism
         world_size = parallel_config.world_size_across_dp
-        ip = parallel_config.data_parallel_master_ip
-        port = parallel_config.get_next_dp_init_port()
-        distributed_init_method = get_distributed_init_method(ip, port)
-        logger.debug(
-            "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
-            world_size,
-            rank,
-            distributed_init_method,
-        )
+
+        # Use appropriate IP and port based on configuration
+        if parallel_config.nnodes > 1:
+            ip = parallel_config.master_addr
+            port = parallel_config.master_port
+            distributed_init_method = get_distributed_init_method(ip, port)
+        else:
+            ip = parallel_config.data_parallel_master_ip
+            port = parallel_config.get_next_dp_init_port()
+            distributed_init_method = get_distributed_init_method(ip, port)
+            logger.debug(
+                "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
+                world_size,
+                rank,
+                distributed_init_method,
+            )
     if not torch.distributed.is_initialized():
         logger.info(
             "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s",
-- 
GitLab


From c493b9d0924b3810439fd3fcd17995f3bb93bb75 Mon Sep 17 00:00:00 2001
From: Zhewen Li <zhewenli@meta.com>
Date: Wed, 3 Dec 2025 19:21:45 -0800
Subject: [PATCH 060/258] [CI/Build] Add MM code path to Examples Test (#29986)

Signed-off-by: zhewenli <zhewenli@meta.com>
---
 .buildkite/test-pipeline.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f79e92665..a79f0b0c6 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -387,6 +387,7 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
-- 
GitLab


From 33a3d6c79826aa7938db45a4e24a213664859cc0 Mon Sep 17 00:00:00 2001
From: Iceber Gu <caiwei95@hotmail.com>
Date: Thu, 4 Dec 2025 11:48:30 +0800
Subject: [PATCH 061/258] fix LoRA-related examples (#29956)

Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
---
 .../lora_with_quantization_inference.py        | 16 ++++------------
 .../offline_inference/multilora_inference.py   | 18 +++---------------
 2 files changed, 7 insertions(+), 27 deletions(-)

diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index dc5c6202f..2f3564b59 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -23,31 +23,23 @@ def create_test_prompts(
         # this is an example of using quantization without LoRA
         (
             "My name is",
-            SamplingParams(
-                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
-            ),
+            SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
             None,
         ),
         # the next three examples use quantization with LoRA
         (
             "my name is",
-            SamplingParams(
-                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
-            ),
+            SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
             LoRARequest("lora-test-1", 1, lora_path),
         ),
         (
             "The capital of USA is",
-            SamplingParams(
-                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
-            ),
+            SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
             LoRARequest("lora-test-2", 1, lora_path),
         ),
         (
             "The capital of France is",
-            SamplingParams(
-                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
-            ),
+            SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
             LoRARequest("lora-test-3", 1, lora_path),
         ),
     ]
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 5e5da2c01..92021f9fb 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -27,9 +27,7 @@ def create_test_prompts(
     return [
         (
             "A robot may not injure a human being",
-            SamplingParams(
-                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
-            ),
+            SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
             None,
         ),
         (
@@ -41,22 +39,12 @@ def create_test_prompts(
         ),
         (
             "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-            SamplingParams(
-                temperature=0.0,
-                logprobs=1,
-                prompt_logprobs=1,
-                max_tokens=128,
-            ),
+            SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
             LoRARequest("sql-lora", 1, lora_path),
         ),
         (
             "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-            SamplingParams(
-                temperature=0.0,
-                logprobs=1,
-                prompt_logprobs=1,
-                max_tokens=128,
-            ),
+            SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
             LoRARequest("sql-lora2", 2, lora_path),
         ),
     ]
-- 
GitLab


From 5f91cdda75b24a3d9cdda8c82897db07b288b5c9 Mon Sep 17 00:00:00 2001
From: Li Wang <wangli858794774@gmail.com>
Date: Thu, 4 Dec 2025 11:53:00 +0800
Subject: [PATCH 062/258] [Misc] Add docker build env for Ascend NPU (#30015)

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .buildkite/scripts/hardware_ci/run-npu-test.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
index 29c8f5ed5..0db1abe37 100644
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
 
 RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
     pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-- 
GitLab


From dd38ba3a2682d6f73b02bc983a5b0157ed3e5498 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 4 Dec 2025 12:51:15 +0800
Subject: [PATCH 063/258] [Bugfix] Fix adapter_enabled IMA (#29977)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
index 413ee8ecb..34383cdf1 100644
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -96,10 +96,14 @@ def _fused_moe_lora_kernel(
     slice_id = tl.program_id(axis=1)
     lora_idx = tl.program_id(axis=2)
     lora_id = tl.load(lora_ids + lora_idx)
-    moe_enabled = tl.load(adapter_enabled + lora_id)
-    if lora_id == -1 or moe_enabled == 0:
+
+    if lora_id == -1:
         # Early exit for the no-lora case.
         return
+    moe_enabled = tl.load(adapter_enabled + lora_id)
+    if moe_enabled == 0:
+        # Early exit for the no moe lora case.
+        return
     max_loras = tl.num_programs(axis=2)
     grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)
 
-- 
GitLab


From 28097d5638cc695f4644c411edac8eb05a03b39b Mon Sep 17 00:00:00 2001
From: gausah01 <141038176+gausah01@users.noreply.github.com>
Date: Thu, 4 Dec 2025 05:01:15 +0000
Subject: [PATCH 064/258] [Bugfix][CPU] Fix CPU KV cache fallback memory
 allocation (#29604)

Signed-off-by: Gauri Sahnan <gauri.sahnan@arm.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
---
 vllm/platforms/cpu.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 2b2c2f9cd..a2518d5fd 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -10,6 +10,7 @@ import sys
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
+import psutil
 import regex as re
 import torch
 
@@ -147,11 +148,21 @@ class CpuPlatform(Platform):
         from vllm.utils.mem_constants import GiB_bytes
 
         kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+        node_dir = "/sys/devices/system/node"
         if kv_cache_space is None:
-            kv_cache_space = 4 * GiB_bytes  # type: ignore
+            nodes = (
+                [d for d in os.listdir(node_dir) if d.startswith("node")]
+                if os.path.exists(node_dir)
+                else []
+            )
+            num_numa_nodes = len(nodes) or 1
+            free_cpu_memory = psutil.virtual_memory().total // num_numa_nodes
+            DEFAULT_CPU_MEM_UTILIZATION = 0.5
+            kv_cache_space = int(free_cpu_memory * DEFAULT_CPU_MEM_UTILIZATION)
+            kv_cache_space_gib = kv_cache_space / GiB_bytes
             logger.warning_once(
-                "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
-                "for CPU backend is not set, using 4 by default."
+                "VLLM_CPU_KVCACHE_SPACE not set. Using "
+                f"{kv_cache_space_gib:.2f} GiB for KV cache."
             )
         else:
             kv_cache_space *= GiB_bytes
-- 
GitLab


From fca3f4665838605e268a8408bc7ca359f5d5c14b Mon Sep 17 00:00:00 2001
From: Benjamin Bartels <benjamin@bartels.dev>
Date: Thu, 4 Dec 2025 05:50:27 +0000
Subject: [PATCH 065/258] [Frontend] Fixes anthropic /v1/messages streaming not
 containing input_tokens on first chunk (#29971)

Signed-off-by: bbartels <benjamin@bartels.dev>
---
 tests/entrypoints/openai/test_messages.py      | 11 +++++++++++
 vllm/entrypoints/anthropic/serving_messages.py | 10 +++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py
index 3e390ad49..b804a1a7a 100644
--- a/tests/entrypoints/openai/test_messages.py
+++ b/tests/entrypoints/openai/test_messages.py
@@ -69,9 +69,20 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
         stream=True,
     )
 
+    first_chunk = None
+    chunk_count = 0
     async for chunk in resp:
+        chunk_count += 1
+        if first_chunk is None and chunk.type == "message_start":
+            first_chunk = chunk
         print(chunk.model_dump_json())
 
+    assert chunk_count > 0
+    assert first_chunk is not None, "message_start chunk was never observed"
+    assert first_chunk.usage is not None, "first chunk should include usage stats"
+    assert first_chunk.usage["output_tokens"] == 0
+    assert first_chunk.usage["input_tokens"] > 5
+
 
 @pytest.mark.asyncio
 async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py
index 340dabf0e..e7ea3bb59 100644
--- a/vllm/entrypoints/anthropic/serving_messages.py
+++ b/vllm/entrypoints/anthropic/serving_messages.py
@@ -183,7 +183,9 @@ class AnthropicServingMessages(OpenAIServingChat):
 
         if anthropic_request.stream:
             req.stream = anthropic_request.stream
-            req.stream_options = StreamOptions.validate({"include_usage": True})
+            req.stream_options = StreamOptions.validate(
+                {"include_usage": True, "continuous_usage_stats": True}
+            )
 
         if anthropic_request.tool_choice is None:
             req.tool_choice = None
@@ -323,6 +325,12 @@ class AnthropicServingMessages(OpenAIServingChat):
                                     content=[],
                                     model=origin_chunk.model,
                                 ),
+                                usage=AnthropicUsage(
+                                    input_tokens=origin_chunk.usage.prompt_tokens
+                                    if origin_chunk.usage
+                                    else 0,
+                                    output_tokens=0,
+                                ),
                             )
                             first_item = False
                             data = chunk.model_dump_json(exclude_unset=True)
-- 
GitLab


From 8aaa81b35f96a3b5c56d3dccf58f48129ee34126 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Dec 2025 14:00:52 +0800
Subject: [PATCH 066/258] [KVConnector] remove unused code (the model aware kv
 ops class) (#29709)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 .../kv_transfer/kv_connector/utils.py         | 87 +------------------
 1 file changed, 1 insertion(+), 86 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index b8eb5ea3b..b2c2c0e6b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -8,9 +8,7 @@ from typing import TYPE_CHECKING, Literal
 
 import torch
 
-import vllm.envs as envs
-from vllm import _custom_ops as ops
-from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.config import get_current_vllm_config
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.logger import init_logger
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
@@ -21,89 +19,6 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 
-class model_aware_kv_ops_helper:
-    def __init__(self, config: VllmConfig):
-        self.is_deepseek_mla = config.model_config.is_deepseek_mla
-        self.use_mla_opt = not envs.VLLM_MLA_DISABLE
-        self.tp_size = config.parallel_config.tensor_parallel_size
-
-    def get_model_args(self, model_executable: torch.nn.Module):
-        model_config = model_executable.model.config
-        self.model_executable = model_executable
-        num_heads = int(model_config.num_key_value_heads / self.tp_size)
-        hidden_size = model_config.hidden_size
-        num_attention_heads = model_config.num_attention_heads
-
-        # Deepseek's MLA (Multi-head Latent Attention) uses two different
-        # kv_cache shapes based on whether VLLM_MLA_DISABLE is set to 0.
-        # When VLLM_MLA_DISABLE=0 (default), forward absorb is applied,
-        # resulting in a kv_cache shape of [num_blks, blk_size, 1,
-        # kv_lora_rank + qk_rope_head_dim].
-        # When VLLM_MLA_DISABLE=1, standard FA is used instead, leading
-        # to a kv_cache shape of [2, num_blks, blk_size,
-        # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim].
-        # For more details, see vllm/v1/attention/backends/mla/common.py.
-        if self.is_deepseek_mla and self.use_mla_opt:
-            head_size = model_config.kv_lora_rank + model_config.qk_rope_head_dim
-            num_heads = 1
-        elif self.is_deepseek_mla and not self.use_mla_opt:
-            head_size = model_config.qk_nope_head_dim + model_config.qk_rope_head_dim
-        else:
-            head_size = getattr(model_config, "head_dim", None)
-            if head_size is None:
-                head_size = int(hidden_size // num_attention_heads)
-
-        return num_heads, head_size
-
-    def get_kv_from_cache(self, kv_cache, num_heads, head_size):
-        if self.is_deepseek_mla and self.use_mla_opt:
-            key_cache = kv_cache.reshape(-1, num_heads, head_size)
-            value_cache = kv_cache.reshape(-1, num_heads, head_size)
-        else:
-            key_cache = kv_cache[0].reshape(-1, num_heads, head_size)
-            value_cache = kv_cache[1].reshape(-1, num_heads, head_size)
-        return key_cache, value_cache
-
-    def put_kv_to_cache(
-        self,
-        model_executable: torch.nn.Module,
-        keys,
-        values,
-        layer,
-        kv_cache,
-        slot_mapping,
-        start_pos,
-        end_pos,
-    ):
-        model_config = model_executable.model.config
-
-        if self.is_deepseek_mla and self.use_mla_opt:
-            layer.self_attn.attn = layer.self_attn.mla_attn
-            k_c_normed_k_pe = keys.squeeze(1)
-            k_c_normed = k_c_normed_k_pe[:, : model_config.kv_lora_rank]
-            k_pe = k_c_normed_k_pe[:, model_config.kv_lora_rank :]
-            ops.concat_and_cache_mla(
-                k_c_normed.to(kv_cache.device),
-                k_pe.to(kv_cache.device),
-                kv_cache,
-                slot_mapping[start_pos:end_pos],
-                layer.self_attn.attn.kv_cache_dtype,
-                layer.self_attn.attn._k_scale,
-            )
-        else:
-            key_cache, value_cache = kv_cache[0], kv_cache[1]
-            ops.reshape_and_cache_flash(
-                keys.to(key_cache.device),
-                values.to(value_cache.device),
-                key_cache,
-                value_cache,
-                slot_mapping[start_pos:end_pos],
-                layer.self_attn.attn.kv_cache_dtype,
-                layer.self_attn.attn._k_scale,
-                layer.self_attn.attn._v_scale,
-            )
-
-
 def get_kv_connector_cache_layout():
     # NOTE (NickLucche) When running disaggregated PD with NIXL, HND layout is
     # used for faster transfer.
-- 
GitLab


From 80f8af4b2fadf85403290a38c8ae77f01b6b5378 Mon Sep 17 00:00:00 2001
From: Jianwei Mao <maojianwei2016@126.com>
Date: Thu, 4 Dec 2025 14:04:44 +0800
Subject: [PATCH 067/258] Fix error while downloading dependencies for CPU
 backend (#29797)

Signed-off-by: Jianwei Mao <maojianwei2016@126.com>
---
 requirements/cpu-build.txt | 1 -
 requirements/cpu.txt       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt
index e18e0825f..1ea401a04 100644
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
@@ -3,7 +3,6 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
---extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
 scons; platform_machine == "aarch64"    # needed to build Arm Compute Library (ACL)
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 21571be47..7a670812e 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -4,7 +4,6 @@
 numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
 
 # Dependencies for CPUs
---extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
 
-- 
GitLab


From 9ae2f603748446317c90fe40f1eb269e9a027815 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 4 Dec 2025 14:22:20 +0800
Subject: [PATCH 068/258] [Misc] Various cleanups for MM input processing
 (#29970)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/features/multimodal_inputs.md            |   8 +-
 ...ompt_embed_inference_with_openai_client.py |  12 +-
 .../entrypoints/openai/test_vision_embeds.py  |  75 +++++------
 tests/entrypoints/test_chat_utils.py          | 127 +-----------------
 .../test_completion_with_image_embeds.py      |  17 +--
 vllm/entrypoints/chat_utils.py                |  13 +-
 vllm/entrypoints/llm.py                       |   1 -
 vllm/entrypoints/openai/serving_engine.py     |   6 -
 vllm/entrypoints/score_utils.py               |   5 +-
 vllm/model_executor/models/hunyuan_vision.py  |   3 +-
 vllm/model_executor/models/keye.py            |   4 +-
 vllm/model_executor/models/keye_vl1_5.py      |   4 +-
 vllm/multimodal/audio.py                      |   7 +-
 vllm/utils/serial_utils.py                    |  10 ++
 14 files changed, 67 insertions(+), 225 deletions(-)

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 4656ee43e..2b25dc766 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
 ??? code
 
     ```python
+    from vllm.utils.serial_utils import tensor2base64
+
     image_embedding = torch.load(...)
     grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
 
-    buffer = io.BytesIO()
-    torch.save(image_embedding, buffer)
-    buffer.seek(0)
-    binary_data = buffer.read()
-    base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
+    base64_image_embedding = tensor2base64(image_embedding)
 
     client = OpenAI(
         # defaults to os.environ.get("OPENAI_API_KEY")
diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py
index 0bbe4b8f5..889be6820 100644
--- a/examples/online_serving/prompt_embed_inference_with_openai_client.py
+++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py
@@ -28,13 +28,11 @@ Dependencies:
 - openai
 """
 
-import base64
-import io
-
-import torch
 import transformers
 from openai import OpenAI
 
+from vllm.utils.serial_utils import tensor2base64
+
 
 def main():
     client = OpenAI(
@@ -58,11 +56,7 @@ def main():
     prompt_embeds = embedding_layer(token_ids).squeeze(0)
 
     # Prompt embeddings
-    buffer = io.BytesIO()
-    torch.save(prompt_embeds, buffer)
-    buffer.seek(0)
-    binary_data = buffer.read()
-    encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
+    encoded_embeds = tensor2base64(prompt_embeds)
 
     completion = client.completions.create(
         model=model_name,
diff --git a/tests/entrypoints/openai/test_vision_embeds.py b/tests/entrypoints/openai/test_vision_embeds.py
index a6593c5b0..42d9fe484 100644
--- a/tests/entrypoints/openai/test_vision_embeds.py
+++ b/tests/entrypoints/openai/test_vision_embeds.py
@@ -2,64 +2,47 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
-import io
 
 import numpy as np
 import pytest
 import requests
 import torch
 
-from ...utils import RemoteOpenAIServer
+from vllm.utils.serial_utils import tensor2base64
 
-MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
-DTYPE = "float16"
+from ...utils import RemoteOpenAIServer
 
 
-def _terratorch_dummy_inputs(model_name: str):
+def _terratorch_dummy_messages():
     pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
     location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
 
-    buffer_tiff = io.BytesIO()
-    torch.save(pixel_values, buffer_tiff)
-    buffer_tiff.seek(0)
-    binary_data = buffer_tiff.read()
-    base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8")
-
-    buffer_coord = io.BytesIO()
-    torch.save(location_coords, buffer_coord)
-    buffer_coord.seek(0)
-    binary_data = buffer_coord.read()
-    base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")
-
-    return {
-        "model": model_name,
-        "additional_data": {"prompt_token_ids": [1]},
-        "encoding_format": "base64",
-        "messages": [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image_embeds",
-                        "image_embeds": {
-                            "pixel_values": base64_tensor_embedding,
-                            "location_coords": base64_coord_embedding,
-                        },
-                    }
-                ],
-            }
-        ],
-    }
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_embeds",
+                    "image_embeds": {
+                        "pixel_values": tensor2base64(pixel_values),
+                        "location_coords": tensor2base64(location_coords),
+                    },
+                }
+            ],
+        }
+    ]
 
 
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_request(model_name: str):
+@pytest.mark.parametrize(
+    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
+)
+def test_single_request(model_name: str):
     args = [
         "--runner",
         "pooling",
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        DTYPE,
+        "float16",
         "--enforce-eager",
         "--trust-remote-code",
         "--max-num-seqs",
@@ -70,11 +53,15 @@ async def test_single_request(model_name: str):
         "--enable-mm-embeds",
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as server:
-        prompt = _terratorch_dummy_inputs(model_name)
-
-        # test single pooling
-        response = requests.post(server.url_for("pooling"), json=prompt)
+    with RemoteOpenAIServer(model_name, args) as server:
+        response = requests.post(
+            server.url_for("pooling"),
+            json={
+                "model": model_name,
+                "messages": _terratorch_dummy_messages(),
+                "encoding_format": "base64",
+            },
+        )
         response.raise_for_status()
 
         output = response.json()["data"][0]["data"]
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 03a0c058e..75be34820 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -29,6 +29,7 @@ from vllm.multimodal.utils import (
     encode_video_base64,
 )
 from vllm.tokenizers import MistralTokenizer, get_tokenizer
+from vllm.utils.serial_utils import tensor2base64
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import VLLM_PATH
@@ -85,11 +86,6 @@ def phi3v_model_config_image_embeds():
     )
 
 
-@pytest.fixture(scope="module")
-def phi3v_tokenizer():
-    return get_tokenizer(PHI3V_MODEL_ID)
-
-
 @pytest.fixture(scope="function")
 def qwen2_audio_model_config():
     return ModelConfig(
@@ -115,11 +111,6 @@ def audio_embeds_model_config():
     )
 
 
-@pytest.fixture(scope="module")
-def qwen2_audio_tokenizer():
-    return get_tokenizer(QWEN2AUDIO_MODEL_ID)
-
-
 @pytest.fixture(scope="function")
 def qwen25omni_model_config_mm_interleaved():
     return ModelConfig(
@@ -134,11 +125,6 @@ def qwen25omni_model_config_mm_interleaved():
     )
 
 
-@pytest.fixture(scope="module")
-def qwen25omni_tokenizer():
-    return get_tokenizer(QWEN25OMNI_MODEL_ID)
-
-
 @pytest.fixture(scope="function")
 def mistral_model_config():
     return ModelConfig(
@@ -150,11 +136,6 @@ def mistral_model_config():
     )
 
 
-@pytest.fixture(scope="module")
-def mistral_tokenizer():
-    return get_tokenizer(MISTRAL_MODEL_ID)
-
-
 @pytest.fixture(scope="module")
 def image_url():
     image = ImageAsset("cherry_blossom")
@@ -239,7 +220,6 @@ def _assert_mm_data_inputs(
 
 def test_parse_chat_messages_single_image(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -253,7 +233,6 @@ def test_parse_chat_messages_single_image(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -266,7 +245,6 @@ def test_parse_chat_messages_single_image(
 
 def test_parse_chat_messages_single_image_with_uuid(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -287,7 +265,6 @@ def test_parse_chat_messages_single_image_with_uuid(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -300,7 +277,6 @@ def test_parse_chat_messages_single_image_with_uuid(
 
 def test_parse_chat_messages_single_empty_image_with_uuid(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -319,7 +295,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -332,7 +307,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
 
 def test_parse_chat_messages_single_image_with_bad_uuid_format(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -354,7 +328,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -367,7 +340,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
 
 def test_parse_chat_messages_multiple_images_with_uuids(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid1 = "my_uuid_1"
@@ -397,7 +369,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -413,7 +384,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
 
 def test_parse_chat_messages_multiple_empty_images_with_uuids(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid1 = "my_uuid_1"
@@ -439,7 +409,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -455,7 +424,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
 
 def test_parse_chat_messages_mixed_empty_images_with_uuids(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid1 = "my_uuid_1"
@@ -483,7 +451,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -500,7 +467,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_single_image_with_uuid_async(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -519,7 +485,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -533,7 +498,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_empty_image_with_uuid_async(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -552,7 +516,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -566,7 +529,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_with_uuids_async(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid1 = "my_uuid_1"
@@ -592,7 +554,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -609,7 +570,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid1 = "my_uuid_1"
@@ -635,7 +595,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -652,7 +611,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid2 = "my_uuid_2"
@@ -676,7 +634,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -692,7 +649,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
 
 def test_parse_chat_messages_empty_system(
     mistral_model_config,
-    mistral_tokenizer,
 ):
     # Test string format
     conversation, _, _ = parse_chat_messages(
@@ -704,7 +660,6 @@ def test_parse_chat_messages_empty_system(
             },
         ],
         mistral_model_config,
-        mistral_tokenizer,
         content_format="string",
     )
     assert conversation == [
@@ -722,7 +677,6 @@ def test_parse_chat_messages_empty_system(
             },
         ],
         mistral_model_config,
-        mistral_tokenizer,
         content_format="openai",
     )
     assert conversation == [
@@ -734,7 +688,6 @@ def test_parse_chat_messages_empty_system(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_single_image_async(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@@ -748,7 +701,6 @@ async def test_parse_chat_messages_single_image_async(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -761,7 +713,6 @@ async def test_parse_chat_messages_single_image_async(
 
 def test_parse_chat_messages_multiple_images(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -779,7 +730,6 @@ def test_parse_chat_messages_multiple_images(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -795,7 +745,6 @@ def test_parse_chat_messages_multiple_images(
 
 def test_parse_chat_messages_empty_pil_image_with_uuid(
     phi3v_model_config,
-    phi3v_tokenizer,
 ):
     uuid = "abcd"
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -809,7 +758,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -825,7 +773,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
 
 def test_parse_chat_messages_empty_image_embeds_with_uuid(
     phi3v_model_config_image_embeds,
-    phi3v_tokenizer,
 ):
     uuid = "abcd"
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -839,7 +786,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
             }
         ],
         phi3v_model_config_image_embeds,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -857,7 +803,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
 
 def test_parse_chat_messages_empty_audio_embeds_with_uuid(
     audio_embeds_model_config,
-    qwen2_audio_tokenizer,
 ):
     """Test audio_embeds with UUID (no actual embeds data)."""
     uuid = "test-audio-uuid-123"
@@ -873,7 +818,6 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
             }
         ],
         audio_embeds_model_config,
-        qwen2_audio_tokenizer,
         content_format="string",
     )
 
@@ -889,11 +833,8 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
 
 def test_parse_chat_messages_audio_embeds_with_string(
     audio_embeds_model_config,
-    qwen2_audio_tokenizer,
 ):
     """Test audio_embeds with base64 string embedding data."""
-    import base64
-    import io
 
     import torch
 
@@ -901,11 +842,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
     audio_embedding = torch.randn(1, 128, 768)
 
     # Encode it as base64
-    buffer = io.BytesIO()
-    torch.save(audio_embedding, buffer)
-    buffer.seek(0)
-    binary_data = buffer.read()
-    base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
+    base64_audio_embedding = tensor2base64(audio_embedding)
 
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
@@ -921,7 +858,6 @@ def test_parse_chat_messages_audio_embeds_with_string(
             }
         ],
         audio_embeds_model_config,
-        qwen2_audio_tokenizer,
         content_format="string",
     )
 
@@ -939,11 +875,8 @@ def test_parse_chat_messages_audio_embeds_with_string(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_audio_embeds_async(
     audio_embeds_model_config,
-    qwen2_audio_tokenizer,
 ):
     """Test audio_embeds with async futures."""
-    import base64
-    import io
 
     import torch
 
@@ -951,11 +884,7 @@ async def test_parse_chat_messages_audio_embeds_async(
     audio_embedding = torch.randn(1, 128, 768)
 
     # Encode it as base64
-    buffer = io.BytesIO()
-    torch.save(audio_embedding, buffer)
-    buffer.seek(0)
-    binary_data = buffer.read()
-    base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
+    base64_audio_embedding = tensor2base64(audio_embedding)
 
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
         [
@@ -971,7 +900,6 @@ async def test_parse_chat_messages_audio_embeds_async(
             }
         ],
         audio_embeds_model_config,
-        qwen2_audio_tokenizer,
         content_format="string",
     )
 
@@ -990,7 +918,6 @@ async def test_parse_chat_messages_audio_embeds_async(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
     phi3v_model_config_image_embeds,
-    phi3v_tokenizer,
 ):
     uuid = "abcd"
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@@ -1004,7 +931,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
             }
         ],
         phi3v_model_config_image_embeds,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1024,7 +950,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_async(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@@ -1042,7 +967,6 @@ async def test_parse_chat_messages_multiple_images_async(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1058,7 +982,6 @@ async def test_parse_chat_messages_multiple_images_async(
 
 def test_parse_chat_messages_placeholder_already_in_prompt(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -1076,7 +999,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
     assert conversation == [
@@ -1091,7 +1013,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
 
 def test_parse_chat_messages_placeholder_one_already_in_prompt(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -1110,7 +1031,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1127,7 +1047,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
 
 def test_parse_chat_messages_multiple_images_across_messages(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -1149,7 +1068,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
             },
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1164,7 +1082,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
 
 def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -1195,7 +1112,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
             },
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1210,7 +1126,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
 
 def test_parse_chat_messages_context_text_format(
     phi3v_model_config,
-    phi3v_tokenizer,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
         [
@@ -1222,7 +1137,6 @@ def test_parse_chat_messages_context_text_format(
             {"role": "user", "content": "What about this one?"},
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="openai",
     )
 
@@ -1246,7 +1160,6 @@ def test_parse_chat_messages_context_text_format(
 
 def test_parse_chat_messages_rejects_too_many_images_in_one_message(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     with warnings.catch_warnings():
@@ -1277,14 +1190,12 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
                     }
                 ],
                 phi3v_model_config,
-                phi3v_tokenizer,
                 content_format="string",
             )
 
 
 def test_parse_chat_messages_rejects_too_many_images_across_messages(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     with warnings.catch_warnings():
@@ -1322,14 +1233,12 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
                     },
                 ],
                 phi3v_model_config,
-                phi3v_tokenizer,
                 content_format="string",
             )
 
 
 def test_parse_chat_messages_multiple_images_uncommon_input(
     phi3v_model_config,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -1344,7 +1253,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
             }
         ],
         phi3v_model_config,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1360,7 +1268,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
 
 def test_parse_chat_messages_multiple_images_interleave(
     phi3v_model_config_mm_interleaved,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -1380,7 +1287,6 @@ def test_parse_chat_messages_multiple_images_interleave(
             }
         ],
         phi3v_model_config_mm_interleaved,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1398,7 +1304,6 @@ def test_parse_chat_messages_multiple_images_interleave(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_interleave_async(
     phi3v_model_config_mm_interleaved,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages_futures(
@@ -1418,7 +1323,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
             }
         ],
         phi3v_model_config_mm_interleaved,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1436,7 +1340,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
     phi3v_model_config_mm_interleaved,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -1465,7 +1368,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
             }
         ],
         phi3v_model_config_mm_interleaved,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1482,7 +1384,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
 
 def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
     phi3v_model_config_mm_interleaved,
-    phi3v_tokenizer,
     image_url,
 ):
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -1505,7 +1406,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
             },
         ],
         phi3v_model_config_mm_interleaved,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1523,7 +1423,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
 
 def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
     phi3v_model_config_mm_interleaved,
-    phi3v_tokenizer,
     image_url,
 ):
     image_uuid = str(hash(image_url))
@@ -1555,7 +1454,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
             },
         ],
         phi3v_model_config_mm_interleaved,
-        phi3v_tokenizer,
         content_format="string",
     )
 
@@ -1573,7 +1471,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
 
 def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
     qwen25omni_model_config_mm_interleaved,
-    qwen25omni_tokenizer,
     image_url,
     video_url,
     audio_url,
@@ -1601,7 +1498,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
             },
         ],
         qwen25omni_model_config_mm_interleaved,
-        qwen25omni_tokenizer,
         content_format="string",
     )
 
@@ -1627,7 +1523,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
 
 def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
     qwen25omni_model_config_mm_interleaved,
-    qwen25omni_tokenizer,
     image_url,
     video_url,
     audio_url,
@@ -1671,7 +1566,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
             },
         ],
         qwen25omni_model_config_mm_interleaved,
-        qwen25omni_tokenizer,
         content_format="string",
     )
 
@@ -1699,7 +1593,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
 
 def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave(  # noqa: E501
     qwen25omni_model_config_mm_interleaved,
-    qwen25omni_tokenizer,
     image_url,
     video_url,
     audio_url,
@@ -1743,7 +1636,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
             },
         ],
         qwen25omni_model_config_mm_interleaved,
-        qwen25omni_tokenizer,
         content_format="string",
     )
 
@@ -1775,7 +1667,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
 
 def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
     qwen25omni_model_config_mm_interleaved,
-    qwen25omni_tokenizer,
     image_url,
     video_url,
     audio_url,
@@ -1811,7 +1702,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
             },
         ],
         qwen25omni_model_config_mm_interleaved,
-        qwen25omni_tokenizer,
         content_format="string",
     )
 
@@ -1837,7 +1727,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
 
 def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
     phi3v_model_config_mm_interleaved,
-    phi3v_tokenizer,
     image_url,
 ):
     with pytest.raises(
@@ -1861,7 +1750,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
                 }
             ],
             phi3v_model_config_mm_interleaved,
-            phi3v_tokenizer,
             content_format="string",
         )
 
@@ -2237,9 +2125,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
     assert resolved_format == expected_format
 
 
-def test_parse_chat_messages_include_thinking_chunk(
-    mistral_model_config, mistral_tokenizer
-):
+def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
     messages = [
         {
             "role": "system",
@@ -2269,7 +2155,6 @@ def test_parse_chat_messages_include_thinking_chunk(
     conversation_with_thinking, _, _ = parse_chat_messages(
         messages,
         mistral_model_config,
-        mistral_tokenizer,
         content_format="openai",
     )
 
@@ -2353,7 +2238,6 @@ def test_apply_mistral_chat_template_thinking_chunk():
 
 def test_parse_chat_messages_single_empty_audio_with_uuid(
     qwen2_audio_model_config,
-    qwen2_audio_tokenizer,
 ):
     audio_uuid = "abcd"
     conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -2371,7 +2255,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
             }
         ],
         qwen2_audio_model_config,
-        qwen2_audio_tokenizer,
         content_format="string",
     )
 
@@ -2389,7 +2272,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
 @pytest.mark.asyncio
 async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
     qwen2_audio_model_config,
-    qwen2_audio_tokenizer,
 ):
     audio_uuid = "abcd"
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@@ -2407,7 +2289,6 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
             }
         ],
         qwen2_audio_model_config,
-        qwen2_audio_tokenizer,
         content_format="string",
     )
 
diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
index 276de2ff8..b30556fbc 100644
--- a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
+++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import base64
-import io
 import json
 
 import openai  # use the official client for correctness check
@@ -13,6 +11,7 @@ from transformers import AutoConfig
 
 from tests.conftest import ImageTestAssets
 from tests.utils import RemoteOpenAIServer
+from vllm.utils.serial_utils import tensor2base64
 
 # any model with a chat template should work here
 MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
@@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds):
         yield async_client
 
 
-def encode_image_embedding_to_base64(image_embedding) -> str:
-    """
-    Encode image embedding to base64 string
-    """
-    buffer = io.BytesIO()
-    torch.save(image_embedding, buffer)
-    buffer.seek(0)
-    binary_data = buffer.read()
-    base64_image_embedding = base64.b64encode(binary_data).decode("utf-8")
-    return base64_image_embedding
-
-
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32])
@@ -73,7 +60,7 @@ async def test_completions_with_image_embeds(
 ):
     # Test case: Single image embeds input
     image_embeds = image_assets[0].image_embeds.to(dtype=dtype)
-    base64_image_embedding = encode_image_embedding_to_base64(image_embeds)
+    base64_image_embedding = tensor2base64(image_embeds)
     chat_completion = await client_with_image_embeds.chat.completions.create(
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 1b3a7d266..077fe681b 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -536,7 +536,7 @@ def resolve_hf_chat_template(
 def _resolve_chat_template_content_format(
     chat_template: str | None,
     tools: list[dict[str, Any]] | None,
-    tokenizer: TokenizerLike,
+    tokenizer: TokenizerLike | None,
     *,
     model_config: ModelConfig,
 ) -> _ChatTemplateContentFormat:
@@ -593,7 +593,7 @@ def resolve_chat_template_content_format(
     chat_template: str | None,
     tools: list[dict[str, Any]] | None,
     given_format: ChatTemplateContentFormatOption,
-    tokenizer: TokenizerLike,
+    tokenizer: TokenizerLike | None,
     *,
     model_config: ModelConfig,
 ) -> _ChatTemplateContentFormat:
@@ -627,11 +627,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     maximum per prompt.
     """
 
-    def __init__(self, model_config: ModelConfig, tokenizer: TokenizerLike):
+    def __init__(self, model_config: ModelConfig):
         super().__init__()
 
         self._model_config = model_config
-        self._tokenizer = tokenizer
 
         self._items_by_modality = defaultdict[str, list[_T | None]](list)
         self._uuids_by_modality = defaultdict[str, list[str | None]](list)
@@ -1612,7 +1611,6 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
 def parse_chat_messages(
     messages: list[ChatCompletionMessageParam],
     model_config: ModelConfig,
-    tokenizer: TokenizerLike,
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
@@ -1620,7 +1618,7 @@ def parse_chat_messages(
     MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
-    mm_tracker = MultiModalItemTracker(model_config, tokenizer)
+    mm_tracker = MultiModalItemTracker(model_config)
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
@@ -1644,7 +1642,6 @@ def parse_chat_messages(
 def parse_chat_messages_futures(
     messages: list[ChatCompletionMessageParam],
     model_config: ModelConfig,
-    tokenizer: TokenizerLike,
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
@@ -1652,7 +1649,7 @@ def parse_chat_messages_futures(
     MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
-    mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
+    mm_tracker = AsyncMultiModalItemTracker(model_config)
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index c121fa71f..481a47a97 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -834,7 +834,6 @@ class LLM:
             conversation, mm_data, mm_uuids = parse_chat_messages(
                 msgs,
                 model_config,
-                tokenizer,
                 content_format=resolved_content_format,
             )
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 9642024dd..bfa98f29a 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1088,11 +1088,6 @@ class OpenAIServing:
         Sequence[RequestPrompt],
         list[EngineTokensPrompt],
     ]:
-        if tokenizer is None:
-            raise ValueError(
-                "Unable to get tokenizer because `skip_tokenizer_init=True`"
-            )
-
         model_config = self.model_config
 
         resolved_content_format = resolve_chat_template_content_format(
@@ -1105,7 +1100,6 @@ class OpenAIServing:
         conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
             messages,
             model_config,
-            tokenizer,
             content_format=resolved_content_format,
         )
 
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 8819c85af..072ddd4c9 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -89,12 +89,10 @@ def parse_score_data(
     data_1: str | ScoreContentPartParam,
     data_2: str | ScoreContentPartParam,
     model_config: ModelConfig,
-    tokenizer: TokenizerLike,
 ) -> tuple[str, str, MultiModalDataDict | None]:
-    mm_tracker = MultiModalItemTracker(model_config, tokenizer)
+    mm_tracker = MultiModalItemTracker(model_config)
 
     content_1 = _parse_score_content(data_1, mm_tracker)
-
     content_2 = _parse_score_content(data_2, mm_tracker)
 
     def ensure_str(content: _ContentPart | None) -> str:
@@ -188,7 +186,6 @@ def get_score_prompt(
         data_1,
         data_2,
         model_config,
-        tokenizer,
     )
     from vllm.model_executor.model_loader import get_model_cls
 
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 6537b6df8..5aef09ca9 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -62,6 +62,7 @@ from vllm.multimodal.inputs import (
 from vllm.multimodal.parse import (
     DictEmbeddingItems,
     ImageSize,
+    ModalityDataItems,
     MultiModalDataItems,
     MultiModalDataParser,
 )
@@ -570,7 +571,7 @@ class HunYuanVLMultiModalDataParser(MultiModalDataParser):
     def _parse_image_data(
         self,
         data: dict[str, torch.Tensor] | ModalityData[ImageItem],
-    ):
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 881760155..09acf8372 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1000,7 +1000,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
     def _parse_image_data(
         self,
         data: dict[str, torch.Tensor] | ModalityData[ImageItem],
-    ) -> ModalityDataItems[Any, Any]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
@@ -1017,7 +1017,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
     def _parse_video_data(
         self,
         data: dict[str, torch.Tensor] | ModalityData[VideoItem],
-    ) -> ModalityDataItems[Any, Any]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py
index 124e9c2af..2b04e3bd4 100644
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -333,7 +333,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
     def _parse_image_data(
         self,
         data: dict[str, torch.Tensor] | ModalityData[ImageItem],
-    ) -> ModalityDataItems[Any, Any]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
@@ -350,7 +350,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
     def _parse_video_data(
         self,
         data: dict[str, torch.Tensor] | ModalityData[VideoItem],
-    ) -> ModalityDataItems[Any, Any]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index b93a42ffd..062547401 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -11,6 +11,7 @@ import pybase64
 import torch
 
 from vllm.utils.import_utils import PlaceholderModule
+from vllm.utils.serial_utils import tensor2base64
 
 from .base import MediaIO
 
@@ -135,8 +136,4 @@ class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
         return torch.load(filepath, weights_only=True)
 
     def encode_base64(self, media: torch.Tensor) -> str:
-        buffer = BytesIO()
-        torch.save(media, buffer)
-        buffer.seek(0)
-        binary_data = buffer.read()
-        return pybase64.b64encode(binary_data).decode("utf-8")
+        return tensor2base64(media)
diff --git a/vllm/utils/serial_utils.py b/vllm/utils/serial_utils.py
index b89fa6ce4..a6d717e03 100644
--- a/vllm/utils/serial_utils.py
+++ b/vllm/utils/serial_utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
+import io
 import sys
 from dataclasses import dataclass
 from typing import Literal
@@ -52,6 +53,15 @@ Endianness = Literal["native", "big", "little"]
 EncodingFormat = Literal["float", "base64", "bytes"]
 
 
+def tensor2base64(x: torch.Tensor) -> str:
+    with io.BytesIO() as buf:
+        torch.save(x, buf)
+        buf.seek(0)
+        binary_data = buf.read()
+
+    return base64.b64encode(binary_data).decode("utf-8")
+
+
 def tensor2binary(
     tensor: torch.Tensor, embed_dtype: EmbedDType, endianness: Endianness
 ) -> bytes:
-- 
GitLab


From 82a64b3d8f93521d39569078d4ac56992a50a640 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Thu, 4 Dec 2025 15:12:12 +0800
Subject: [PATCH 069/258] [Bugfix] fixed deepseekv32 tool calling error
 (#30025)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 vllm/tokenizers/deepseek_v32_encoding.py | 6 ++++--
 vllm/tokenizers/deepseekv32.py           | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py
index fb8989e65..521bd9295 100644
--- a/vllm/tokenizers/deepseek_v32_encoding.py
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
@@ -95,8 +95,10 @@ def tool_calls_to_openai_format(tool_calls):
 def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
     p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
     P_dsml_strs = []
-
-    arguments = json.loads(tool_call["arguments"])
+    if isinstance(tool_call["arguments"], str):
+        arguments = json.loads(tool_call["arguments"])
+    else:
+        arguments = tool_call["arguments"]
 
     for k, v in arguments.items():
         p_dsml_str = p_dsml_template.format(
diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index 7466ad407..1140357cf 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -43,7 +43,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
         thinking_mode = "thinking"
         if not thinking:
             thinking_mode = "chat"
-        messages = messages.copy()
+        conversation = kwargs.get("conversation", messages)
+        messages = conversation.copy()
         drop_thinking = True
         if tools is not None and len(tools) > 0:
             messages.insert(0, {"role": "system"})
-- 
GitLab


From 404fc4bfc049fc86cc1ddd1d975ecfc72609db4f Mon Sep 17 00:00:00 2001
From: daniel-salib <danielsalib@meta.com>
Date: Wed, 3 Dec 2025 23:36:57 -0800
Subject: [PATCH 070/258] [Frontend] refactor harmony utils output message
 parsing (#29820)

Signed-off-by: Daniel Salib <danielsalib@meta.com>
---
 vllm/entrypoints/harmony_utils.py | 216 ++++++++++++++++--------------
 1 file changed, 117 insertions(+), 99 deletions(-)

diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index 47a252348..bb932e39e 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -328,6 +328,105 @@ def render_for_completion(messages: list[Message]) -> list[int]:
     return token_ids
 
 
+def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem:
+    """Parse browser tool calls (search, open, find) into web search items."""
+    if len(message.content) != 1:
+        raise ValueError("Invalid number of contents in browser message")
+    content = message.content[0]
+
+    # Parse JSON args (with retry detection)
+    try:
+        browser_call = json.loads(content.text)
+    except json.JSONDecodeError:
+        json_retry_output_message = (
+            f"Invalid JSON args, caught and retried: {content.text}"
+        )
+        browser_call = {
+            "query": json_retry_output_message,
+            "url": json_retry_output_message,
+            "pattern": json_retry_output_message,
+        }
+
+    # Create appropriate action based on recipient
+    if recipient == "browser.search":
+        action = ActionSearch(
+            query=f"cursor:{browser_call.get('query', '')}", type="search"
+        )
+    elif recipient == "browser.open":
+        action = ActionOpenPage(
+            url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+        )
+    elif recipient == "browser.find":
+        action = ActionFind(
+            pattern=browser_call.get("pattern", ""),
+            url=f"cursor:{browser_call.get('url', '')}",
+            type="find",
+        )
+    else:
+        raise ValueError(f"Unknown browser action: {recipient}")
+
+    return ResponseFunctionWebSearch(
+        id=f"ws_{random_uuid()}",
+        action=action,
+        status="completed",
+        type="web_search_call",
+    )
+
+
+def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
+    """Parse function calls into function tool call items."""
+    function_name = recipient.split(".")[-1]
+    output_items = []
+    for content in message.content:
+        random_id = random_uuid()
+        response_item = ResponseFunctionToolCall(
+            arguments=content.text,
+            call_id=f"call_{random_id}",
+            type="function_call",
+            name=function_name,
+            id=f"fc_{random_id}",
+        )
+        output_items.append(response_item)
+    return output_items
+
+
+def _parse_reasoning_content(message: Message) -> list[ResponseOutputItem]:
+    """Parse reasoning/analysis content into reasoning items."""
+    output_items = []
+    for content in message.content:
+        reasoning_item = ResponseReasoningItem(
+            id=f"rs_{random_uuid()}",
+            summary=[],
+            type="reasoning",
+            content=[
+                ResponseReasoningTextContent(text=content.text, type="reasoning_text")
+            ],
+            status=None,
+        )
+        output_items.append(reasoning_item)
+    return output_items
+
+
+def _parse_final_message(message: Message) -> ResponseOutputItem:
+    """Parse final channel messages into output message items."""
+    contents = []
+    for content in message.content:
+        output_text = ResponseOutputText(
+            text=content.text,
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        contents.append(output_text)
+    return ResponseOutputMessage(
+        id=f"msg_{random_uuid()}",
+        content=contents,
+        role=message.author.role,
+        status="completed",
+        type="message",
+    )
+
+
 def parse_output_message(message: Message) -> list[ResponseOutputItem]:
     """
     Parse a Harmony message into a list of output response items.
@@ -340,119 +439,38 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
 
     output_items: list[ResponseOutputItem] = []
     recipient = message.recipient
+
+    # Browser tool calls
     if recipient is not None and recipient.startswith("browser."):
-        if len(message.content) != 1:
-            raise ValueError("Invalid number of contents in browser message")
-        content = message.content[0]
-        # We do not need to check the VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY
-        # env variable since if it is not set, we are certain the json is valid
-        # The use of Actions for web search will be removed entirely in
-        # the future, so this is only necessary temporarily
-        try:
-            browser_call = json.loads(content.text)
-        except json.JSONDecodeError:
-            # If the content is not valid JSON, then it was
-            # caught and retried by vLLM, which means we
-            # need to make note of that so the user is aware
-            json_retry_output_message = (
-                f"Invalid JSON args, caught and retried: {content.text}"
-            )
-            browser_call = {
-                "query": json_retry_output_message,
-                "url": json_retry_output_message,
-                "pattern": json_retry_output_message,
-            }
-        # TODO: translate to url properly!
-        if recipient == "browser.search":
-            action = ActionSearch(
-                query=f"cursor:{browser_call.get('query', '')}", type="search"
-            )
-        elif recipient == "browser.open":
-            action = ActionOpenPage(
-                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
-            )
-        elif recipient == "browser.find":
-            action = ActionFind(
-                pattern=browser_call["pattern"],
-                url=f"cursor:{browser_call.get('url', '')}",
-                type="find",
-            )
-        else:
-            raise ValueError(f"Unknown browser action: {recipient}")
-        web_search_item = ResponseFunctionWebSearch(
-            id=f"ws_{random_uuid()}",
-            action=action,
-            status="completed",
-            type="web_search_call",
-        )
-        output_items.append(web_search_item)
+        output_items.append(_parse_browser_tool_call(message, recipient))
+
+    # Analysis channel (reasoning/chain-of-thought)
     elif message.channel == "analysis":
-        for content in message.content:
-            reasoning_item = ResponseReasoningItem(
-                id=f"rs_{random_uuid()}",
-                summary=[],
-                type="reasoning",
-                content=[
-                    ResponseReasoningTextContent(
-                        text=content.text, type="reasoning_text"
-                    )
-                ],
-                status=None,
-            )
-            output_items.append(reasoning_item)
+        output_items.extend(_parse_reasoning_content(message))
+
+    # Commentary channel
     elif message.channel == "commentary":
+        # Function calls
         if recipient is not None and recipient.startswith("functions."):
-            function_name = recipient.split(".")[-1]
-            for content in message.content:
-                random_id = random_uuid()
-                response_item = ResponseFunctionToolCall(
-                    arguments=content.text,
-                    call_id=f"call_{random_id}",
-                    type="function_call",
-                    name=function_name,
-                    id=f"fc_{random_id}",
-                )
-                output_items.append(response_item)
+            output_items.extend(_parse_function_call(message, recipient))
+
+        # Built-in tools on commentary channel are treated as reasoning for now
         elif recipient is not None and (
             recipient.startswith("python")
             or recipient.startswith("browser")
             or recipient.startswith("container")
         ):
-            for content in message.content:
-                reasoning_item = ResponseReasoningItem(
-                    id=f"rs_{random_uuid()}",
-                    summary=[],
-                    type="reasoning",
-                    content=[
-                        ResponseReasoningTextContent(
-                            text=content.text, type="reasoning_text"
-                        )
-                    ],
-                    status=None,
-                )
-                output_items.append(reasoning_item)
+            output_items.extend(_parse_reasoning_content(message))
         else:
             raise ValueError(f"Unknown recipient: {recipient}")
+
+    # Final output message
     elif message.channel == "final":
-        contents = []
-        for content in message.content:
-            output_text = ResponseOutputText(
-                text=content.text,
-                annotations=[],  # TODO
-                type="output_text",
-                logprobs=None,  # TODO
-            )
-            contents.append(output_text)
-        text_item = ResponseOutputMessage(
-            id=f"msg_{random_uuid()}",
-            content=contents,
-            role=message.author.role,
-            status="completed",
-            type="message",
-        )
-        output_items.append(text_item)
+        output_items.append(_parse_final_message(message))
+
     else:
         raise ValueError(f"Unknown channel: {message.channel}")
+
     return output_items
 
 
-- 
GitLab


From fd68e909db1804f211707bb027a49b82bb5c2d8f Mon Sep 17 00:00:00 2001
From: CYJiang <86391540+googs1025@users.noreply.github.com>
Date: Thu, 4 Dec 2025 15:46:15 +0800
Subject: [PATCH 071/258] [docs] Remove _total from counter metrics names
 (#30028)

In Prometheus Counters always expose their actual numeric value with a metric name that ends in _total. We should document the base name, as this what appears in the get_metrics() API.

Signed-off-by: CYJiang <86391540+googs1025@users.noreply.github.com>
---
 docs/design/metrics.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index 59cb6ba46..13264f686 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -57,15 +57,15 @@ vLLM also provides [a reference example](../../examples/online_serving/prometheu
 The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
 
 - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
-- `vllm:prompt_tokens_total` - Prompt tokens.
-- `vllm:generation_tokens_total` - Generation tokens.
+- `vllm:prompt_tokens` - Prompt tokens.
+- `vllm:generation_tokens` - Generation tokens.
 - `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
 - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
 - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
 - `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
 - `vllm:request_prompt_tokens` - Request prompt length.
 - `vllm:request_generation_tokens` - Request generation length.
-- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
+- `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
 - `vllm:request_queue_time_seconds` - Queue time.
 - `vllm:request_prefill_time_seconds` - Requests prefill time.
 - `vllm:request_decode_time_seconds` - Requests decode time.
@@ -571,9 +571,9 @@ model and then validate those tokens with the larger model.
 
 - `vllm:spec_decode_draft_acceptance_rate` (Gauge)
 - `vllm:spec_decode_efficiency` (Gauge)
-- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
-- `vllm:spec_decode_num_draft_tokens_total` (Counter)
-- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
+- `vllm:spec_decode_num_accepted_tokens` (Counter)
+- `vllm:spec_decode_num_draft_tokens` (Counter)
+- `vllm:spec_decode_num_emitted_tokens` (Counter)
 
 There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
 speculative decoding to v1. Other techniques will follow. We should
-- 
GitLab


From 9aa33a74b00b2db7d7da22a59ed64b44ebbabe14 Mon Sep 17 00:00:00 2001
From: Charlie Fu <charlifu@amd.com>
Date: Thu, 4 Dec 2025 01:52:28 -0600
Subject: [PATCH 072/258] [Rocm][CI] Fix test_speculator_eagle3 by skipping the
 CompressedTensorw4a16 Model (#30001)

Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
---
 tests/v1/spec_decode/test_speculators_eagle3.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/v1/spec_decode/test_speculators_eagle3.py b/tests/v1/spec_decode/test_speculators_eagle3.py
index 5ce6e1593..9a252cfff 100644
--- a/tests/v1/spec_decode/test_speculators_eagle3.py
+++ b/tests/v1/spec_decode/test_speculators_eagle3.py
@@ -5,6 +5,7 @@ import torch
 
 from vllm.config import SpeculativeConfig
 from vllm.model_executor.models.interfaces import supports_eagle3
+from vllm.platforms import current_platform
 
 
 @pytest.mark.parametrize(
@@ -21,6 +22,10 @@ from vllm.model_executor.models.interfaces import supports_eagle3
         pytest.param(
             "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16",
             id="qwen3-eagle3-speculator-w4a16-verifier",
+            marks=pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="The tests are skipped on rocm platform.",
+            ),
         ),
     ],
 )
-- 
GitLab


From 3f1b03739ae1422361446d3d23bed970bd549ebc Mon Sep 17 00:00:00 2001
From: TJian <tunjian.tan@embeddedllm.com>
Date: Thu, 4 Dec 2025 16:20:24 +0800
Subject: [PATCH 073/258] [ROCm] [Bugfix] `compute_attn_mask_seqlen` for qwen3
 omni (#29974)

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 vllm/model_executor/models/qwen3_omni_moe_thinker.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index fe825198d..e6979211b 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -494,7 +494,10 @@ class Qwen3Omni_VisionTransformer(nn.Module):
         cu_seqlens: torch.Tensor,
     ) -> torch.Tensor:
         max_seqlen = torch.zeros([], device=cu_seqlens.device)
-        if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
+        if self.attn_backend in {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        }:
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
         return max_seqlen
 
-- 
GitLab


From 5430e110c099fdc6c8c80f443bf5adffe67aa30b Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Thu, 4 Dec 2025 02:20:54 -0600
Subject: [PATCH 074/258] [CI][AMD] Match Main CI Behavior By Skipping
 test_eplb_spec_decode In AMD CI (#30006)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 tests/distributed/test_eplb_spec_decode.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
index 868cc7028..22977ce94 100644
--- a/tests/distributed/test_eplb_spec_decode.py
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -6,6 +6,7 @@ import lm_eval
 import pytest
 
 from tests.utils import large_gpu_mark
+from vllm.platforms import current_platform
 
 
 def get_model_args(
@@ -45,6 +46,12 @@ def get_model_args(
     return model_args
 
 
+pytestmark = pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="EPLB with Spec Decode is a work in progress on ROCm.",
+)
+
+
 @pytest.mark.parametrize(
     "model_setup",
     [
-- 
GitLab


From 68eb5c8d970a453a440776211f8dbff215fb40c3 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 4 Dec 2025 16:21:19 +0800
Subject: [PATCH 075/258] [Misc] Move functions into `PoolingMetadata` (#30027)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/layers/pooler.py | 50 ++++------------------------
 vllm/model_executor/models/gritlm.py |  6 ++--
 vllm/v1/pool/metadata.py             | 21 ++++++++++++
 3 files changed, 30 insertions(+), 47 deletions(-)

diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 7dd02e32f..185e03e5f 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -64,42 +64,6 @@ class PoolingParamsUpdate:
         params.requires_token_ids = self.requires_token_ids
 
 
-def get_prompt_lens(
-    hidden_states: torch.Tensor | list[torch.Tensor],
-    pooling_metadata: PoolingMetadata,
-) -> torch.Tensor:
-    return pooling_metadata.prompt_lens
-
-
-def get_prompt_token_ids(pooling_metadata: PoolingMetadata) -> list[torch.Tensor]:
-    assert pooling_metadata.prompt_token_ids is not None, (
-        "Please set `requires_token_ids=True` in `get_pooling_updates`"
-    )
-
-    return [
-        pooling_metadata.prompt_token_ids[i, :num]
-        for i, num in enumerate(pooling_metadata.prompt_lens)
-    ]
-
-
-def get_pooling_params(pooling_metadata: PoolingMetadata) -> list[PoolingParams]:
-    pooling_params = pooling_metadata.pooling_params
-    return pooling_params
-
-
-def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]:
-    pooling_params = get_pooling_params(pooling_metadata)
-
-    tasks: list[PoolingTask] = [
-        task
-        for pooling_param in pooling_params
-        if (task := pooling_param.task) is not None
-    ]
-    assert len(pooling_params) == len(tasks)
-
-    return tasks
-
-
 def get_classification_activation_function(config: PretrainedConfig):
     # Implement alignment with transformers ForSequenceClassificationLoss
     # https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92
@@ -466,7 +430,7 @@ class EmbeddingPoolerHead(PoolerHead):
             pooled_data = self.projector(pooled_data)
         # pooled_data shape: [batchsize, embedding_dimension]
 
-        pooling_params = get_pooling_params(pooling_metadata)
+        pooling_params = pooling_metadata.pooling_params
 
         # for matryoshka representation
         dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params]
@@ -606,7 +570,7 @@ class ClassifierPooler(Pooler):
         if self.logit_bias is not None:
             pooled_data -= self.logit_bias
 
-        pooling_params = get_pooling_params(pooling_metadata)
+        pooling_params = pooling_metadata.pooling_params
         flags = [p.use_activation for p in pooling_params]
 
         if len(set(flags)) == 1:
@@ -704,7 +668,7 @@ class AllPooler(Pooler):
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         pooled_data = self.pooling(hidden_states, pooling_metadata)
-        pooling_params = get_pooling_params(pooling_metadata)
+        pooling_params = pooling_metadata.pooling_params
         assert len(pooled_data) == len(pooling_params)
 
         pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
@@ -724,11 +688,11 @@ class StepPooler(Pooler):
         pooling_metadata: PoolingMetadata,
     ) -> torch.Tensor | list[torch.Tensor]:
         pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
-        prompt_token_ids = get_prompt_token_ids(pooling_metadata)
+        prompt_token_ids = pooling_metadata.get_prompt_token_ids()
 
         pooled_data = list[torch.Tensor]()
 
-        pooling_params = get_pooling_params(pooling_metadata)
+        pooling_params = pooling_metadata.pooling_params
 
         for data, token_id, pooling_param in zip(
             pooled_data_lst, prompt_token_ids, pooling_params
@@ -757,7 +721,7 @@ class StepPooler(Pooler):
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         pooled_data = self.extract_states(hidden_states, pooling_metadata)
-        pooling_params = get_pooling_params(pooling_metadata)
+        pooling_params = pooling_metadata.pooling_params
         assert len(pooled_data) == len(pooling_params)
 
         pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
@@ -794,7 +758,7 @@ class DispatchPooler(Pooler):
 
         outputs = list[torch.Tensor]()
         offset = 0
-        for task, group in groupby(get_tasks(pooling_metadata)):
+        for task, group in groupby(pooling_metadata.tasks):
             if not (pooler := poolers_by_task.get(task)):
                 raise ValueError(
                     f"Unsupported task: {task} "
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 550e8b014..2aba626a7 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -14,8 +14,6 @@ from vllm.model_executor.layers.pooler import (
     PoolerHead,
     PoolerNormalize,
     PoolingParamsUpdate,
-    get_prompt_lens,
-    get_prompt_token_ids,
 )
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.tasks import PoolingTask
@@ -153,11 +151,11 @@ class GritLMMeanPool(nn.Module):
         hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> list[torch.Tensor] | torch.Tensor:
-        prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
+        prompt_lens = pooling_metadata.prompt_lens
         instr_lens = torch.tensor(
             [
                 self._get_instruction_len(token_ids.cpu().numpy())
-                for token_ids in get_prompt_token_ids(pooling_metadata)
+                for token_ids in pooling_metadata.get_prompt_token_ids()
             ],
             device="cpu",
         )
diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py
index 7bd2c7415..9ee588ea4 100644
--- a/vllm/v1/pool/metadata.py
+++ b/vllm/v1/pool/metadata.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass
 import torch
 
 from vllm.pooling_params import PoolingParams
+from vllm.tasks import PoolingTask
 from vllm.utils.platform_utils import is_pin_memory_available
 
 pin_memory = is_pin_memory_available()
@@ -40,6 +41,18 @@ class PoolingMetadata:
     pooling_params: list[PoolingParams]
     pooling_cursor: PoolingCursor | None = None
 
+    def __post_init__(self) -> None:
+        pooling_params = self.pooling_params
+
+        tasks: list[PoolingTask] = [
+            task
+            for pooling_param in pooling_params
+            if (task := pooling_param.task) is not None
+        ]
+        assert len(pooling_params) == len(tasks)
+
+        self.tasks = tasks
+
     def __getitem__(self, indices: slice):
         return PoolingMetadata(
             prompt_lens=self.prompt_lens[indices],
@@ -52,6 +65,14 @@ class PoolingMetadata:
             else self.pooling_cursor[indices],
         )
 
+    def get_prompt_token_ids(self) -> list[torch.Tensor]:
+        prompt_token_ids = self.prompt_token_ids
+        assert prompt_token_ids is not None, (
+            "Please set `requires_token_ids=True` in `get_pooling_updates`"
+        )
+
+        return [prompt_token_ids[i, :num] for i, num in enumerate(self.prompt_lens)]
+
     def build_pooling_cursor(
         self, num_scheduled_tokens: list[int], device: torch.device
     ):
-- 
GitLab


From 899e2ef558e7345b99bc0d53c2e1c60ffdca7470 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 4 Dec 2025 08:22:03 +0000
Subject: [PATCH 076/258] [Core] Fix standalone runs of
 test_reset_prefix_cache_e2e (#29899)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/v1/core/test_reset_prefix_cache_e2e.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/v1/core/test_reset_prefix_cache_e2e.py b/tests/v1/core/test_reset_prefix_cache_e2e.py
index e543c30a1..083fc3f34 100644
--- a/tests/v1/core/test_reset_prefix_cache_e2e.py
+++ b/tests/v1/core/test_reset_prefix_cache_e2e.py
@@ -11,7 +11,9 @@ PROMPTS = [
 ]
 
 
-def test_reset_prefix_cache_e2e():
+def test_reset_prefix_cache_e2e(monkeypatch):
+    # "spawn" is required for test to be deterministic
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
     engine_args = EngineArgs(
         model="Qwen/Qwen3-0.6B",
         gpu_memory_utilization=0.2,
-- 
GitLab


From b8a6ae415859bf8c9a3509cbd714695866e35d66 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Thu, 4 Dec 2025 00:45:57 -0800
Subject: [PATCH 077/258] [ROCm] add fallback for aiter fp8 decode mla (#30005)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
---
 vllm/_aiter_ops.py | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index a8f472d14..35920d826 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -283,6 +283,28 @@ def _rocm_aiter_grouped_topk_fake(
     pass
 
 
+# Cache whether aiter supports FP8 MLA parameters
+_AITER_MLA_SUPPORTS_FP8: bool | None = None
+
+
+def _check_aiter_mla_fp8_support() -> bool:
+    """Check if aiter.mla.mla_decode_fwd supports q_scale and kv_scale parameters."""
+    global _AITER_MLA_SUPPORTS_FP8
+    if _AITER_MLA_SUPPORTS_FP8 is None:
+        try:
+            import inspect
+
+            from aiter.mla import mla_decode_fwd
+
+            sig = inspect.signature(mla_decode_fwd)
+            _AITER_MLA_SUPPORTS_FP8 = (
+                "q_scale" in sig.parameters and "kv_scale" in sig.parameters
+            )
+        except Exception:
+            _AITER_MLA_SUPPORTS_FP8 = False
+    return _AITER_MLA_SUPPORTS_FP8
+
+
 def _rocm_aiter_mla_decode_fwd_impl(
     q: torch.Tensor,
     kv_buffer: torch.Tensor,
@@ -299,6 +321,16 @@ def _rocm_aiter_mla_decode_fwd_impl(
 ) -> None:
     from aiter.mla import mla_decode_fwd
 
+    kwargs = {
+        "sm_scale": sm_scale,
+        "logit_cap": logit_cap,
+    }
+
+    # Only pass q_scale and kv_scale if the aiter library supports them
+    if _check_aiter_mla_fp8_support():
+        kwargs["q_scale"] = q_scale
+        kwargs["kv_scale"] = kv_scale
+
     mla_decode_fwd(
         q,
         kv_buffer.view(-1, 1, 1, q.shape[-1]),
@@ -308,10 +340,7 @@ def _rocm_aiter_mla_decode_fwd_impl(
         kv_indices,
         kv_last_page_lens,
         max_seqlen_qo,
-        sm_scale=sm_scale,
-        logit_cap=logit_cap,
-        q_scale=q_scale,
-        kv_scale=kv_scale,
+        **kwargs,
     )
 
 
-- 
GitLab


From ffdd18111b767d271786b982378d51b51ce151fe Mon Sep 17 00:00:00 2001
From: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Date: Thu, 4 Dec 2025 16:46:34 +0800
Subject: [PATCH 078/258] Add DeepSeek-V3.2 tool parser. (#29848)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
---
 .../openai/tool_parsers/__init__.py           |   4 +
 .../tool_parsers/deepseekv32_tool_parser.py   | 591 ++++++++++++++++++
 2 files changed, 595 insertions(+)
 create mode 100644 vllm/entrypoints/openai/tool_parsers/deepseekv32_tool_parser.py

diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 89e439dd5..ed43ea7ee 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -30,6 +30,10 @@ _TOOL_PARSERS_TO_REGISTER = {
         "deepseekv31_tool_parser",
         "DeepSeekV31ToolParser",
     ),
+    "deepseek_v32": (
+        "deepseekv32_tool_parser",
+        "DeepSeekV32ToolParser",
+    ),
     "ernie45": (
         "ernie45_tool_parser",
         "Ernie45ToolParser",
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv32_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv32_tool_parser.py
new file mode 100644
index 000000000..4973deb7c
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv32_tool_parser.py
@@ -0,0 +1,591 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+import uuid
+from collections.abc import Sequence
+from typing import Any
+
+import regex as re
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+)
+from vllm.logger import init_logger
+from vllm.tokenizers import TokenizerLike
+
+logger = init_logger(__name__)
+
+
+class DeepSeekV32ToolParser(ToolParser):
+    """
+    example tool call content:
+    <｜DSML｜function_calls>
+    <｜DSML｜invoke name="get_weather">
+    <｜DSML｜parameter name="location" string="true">杭州</｜DSML｜parameter>
+    <｜DSML｜parameter name="date" string="true">2024-01-16</｜DSML｜parameter>
+    </｜DSML｜invoke>
+    <｜DSML｜invoke name="get_weather">
+    <｜DSML｜parameter name="location" string="true">北京</｜DSML｜parameter>
+    <｜DSML｜parameter name="date" string="true">2024-01-16</｜DSML｜parameter>
+    </｜DSML｜invoke>
+    </｜DSML｜function_calls>
+    """
+
+    def __init__(self, tokenizer: TokenizerLike):
+        super().__init__(tokenizer)
+
+        self.prev_tool_call_arr: list[dict] = []
+
+        # Sentinel tokens
+        self.dsml_token: str = "｜DSML｜"
+        self.dsml_start_check: str = "<" + self.dsml_token
+        self.tool_call_start_token: str = "<｜DSML｜function_calls>"
+        self.tool_call_end_token: str = "</｜DSML｜function_calls>"
+        self.invoke_start_prefix: str = "<｜DSML｜invoke name="
+        self.invoke_end_token: str = "</｜DSML｜invoke>"
+        self.parameter_prefix: str = "<｜DSML｜parameter name="
+        self.parameter_end_token: str = "</｜DSML｜parameter>"
+
+        # Streaming state variables
+        self.current_tool_name_sent: bool = False
+        # Override base class type - we use string IDs for tool calls
+        self.current_tool_id: str | None = None  # type: ignore
+        self.streamed_args_for_tool: list[str] = []
+        self.is_tool_call_started: bool = False
+        self.failed_count: int = 0
+
+        # Initialize streaming state variables
+        self.current_tool_index: int = 0
+        self.invoke_index: int = 0
+        self.header_sent: bool = False
+        self.current_function_name: str | None = None
+        self.current_param_name: str | None = None
+        self.current_param_value: str = ""
+        self.param_count: int = 0
+        self.in_param: bool = False
+        self.in_function: bool = False
+        self.json_started: bool = False
+        self.json_closed: bool = False
+        self.accumulated_params: dict = {}
+        self.streaming_request: ChatCompletionRequest | None = None
+
+        # Enhanced streaming state - reset for each new message
+        self._reset_streaming_state()
+
+        # Regex patterns for complete parsing
+        self.tool_call_complete_regex = re.compile(
+            r"<｜DSML｜function_calls>(.*?)</｜DSML｜function_calls>", re.DOTALL
+        )
+        self.invoke_complete_regex = re.compile(
+            r'<｜DSML｜invoke\s+name="([^"]+)"\s*>(.*?)</｜DSML｜invoke>', re.DOTALL
+        )
+        self.parameter_complete_regex = re.compile(
+            r'<｜DSML｜parameter\s+name="([^"]+)"\s+string="(?:true|false)"\s*>(.*?)</｜DSML｜parameter>',
+            re.DOTALL,
+        )
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction."
+            )
+
+        logger.debug(
+            "vLLM Successfully import tool parser %s !", self.__class__.__name__
+        )
+
+    def _generate_tool_call_id(self) -> str:
+        """Generate a unique tool call ID."""
+        return f"call_{uuid.uuid4().hex[:24]}"
+
+    def _reset_streaming_state(self):
+        """Reset all streaming state."""
+        self.current_tool_index = 0
+        self.invoke_index = 0
+        self.is_tool_call_started = False
+        self.header_sent = False
+        self.current_tool_id = None
+        self.current_function_name = None
+        self.current_param_name = None
+        self.current_param_value = ""
+        self.param_count = 0
+        self.in_param = False
+        self.in_function = False
+        self.json_started = False
+        self.json_closed = False
+        # Store accumulated parameters for type conversion
+        self.accumulated_params = {}
+        self.streaming_request = None
+        # Clear previous tool call history to avoid state pollution
+        self.prev_tool_call_arr.clear()
+
+    def _parse_invoke_params(self, invoke_str: str) -> dict | None:
+        param_dict = dict()
+        for param_name, param_val in self.parameter_complete_regex.findall(invoke_str):
+            param_dict[param_name] = param_val
+        return param_dict
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        """Extract tool calls from complete model output (non-streaming)."""
+        # Quick check
+        if self.tool_call_start_token not in model_output:
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+        try:
+            tool_calls = []
+
+            # Find all complete tool_call blocks
+            for tool_call_match in self.tool_call_complete_regex.findall(model_output):
+                # Find all invokes within this tool_call
+                for invoke_name, invoke_content in self.invoke_complete_regex.findall(
+                    tool_call_match
+                ):
+                    param_dict = self._parse_invoke_params(invoke_content)
+                    tool_calls.append(
+                        ToolCall(
+                            type="function",
+                            function=FunctionCall(
+                                name=invoke_name,
+                                arguments=json.dumps(param_dict, ensure_ascii=False),
+                            ),
+                        )
+                    )
+
+            if not tool_calls:
+                return ExtractedToolCallInformation(
+                    tools_called=False, tool_calls=[], content=model_output
+                )
+
+            # Extract content before first tool call
+            first_tool_idx = model_output.find(self.tool_call_start_token)
+            content = model_output[:first_tool_idx] if first_tool_idx > 0 else None
+
+            return ExtractedToolCallInformation(
+                tools_called=True, tool_calls=tool_calls, content=content
+            )
+
+        except Exception:
+            logger.exception("Error extracting tool calls")
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+
+    def _extract_name(self, name_str: str) -> str:
+        """Extract name from quoted string."""
+        name_str = name_str.strip()
+        if (
+            name_str.startswith('"')
+            and name_str.endswith('"')
+            or name_str.startswith("'")
+            and name_str.endswith("'")
+        ):
+            return name_str[1:-1]
+        return name_str
+
+    def _extract_param_name(self, input_str: str) -> str:
+        """Extract param name"""
+        start = input_str.find('"') + 1
+        end = input_str.find('"', start)
+        return input_str[start:end] if start > 0 and end > start else input_str
+
+    def _convert_param_value(self, value: str, param_type: str) -> Any:
+        """Convert parameter value to the correct type."""
+        if value.lower() == "null":
+            return None
+
+        param_type = param_type.lower()
+        if param_type in ["string", "str", "text"]:
+            return value
+        elif param_type in ["integer", "int"]:
+            try:
+                return int(value)
+            except (ValueError, TypeError):
+                return value
+        elif param_type in ["number", "float"]:
+            try:
+                val = float(value)
+                return val if val != int(val) else int(val)
+            except (ValueError, TypeError):
+                return value
+        elif param_type in ["boolean", "bool"]:
+            return value.lower() in ["true", "1"]
+        elif param_type in ["object", "array"]:
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError:
+                return value
+        else:
+            # Try JSON parse first, fallback to string
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError:
+                return value
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],  # pylint: disable=unused-argument
+        current_token_ids: Sequence[int],  # pylint: disable=unused-argument
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        """Extract tool calls from streaming model output."""
+
+        # Store request for type conversion
+        if not previous_text:
+            self._reset_streaming_state()
+            self.streaming_request = request
+
+        # If no delta text, return None unless it's an EOS token after tools
+        if not delta_text:
+            # Check if this is an EOS token after all tool calls are complete
+            if delta_token_ids:
+                # Count complete tool calls
+                complete_calls = len(
+                    self.tool_call_complete_regex.findall(current_text)
+                )
+
+                # If we have completed tool calls and populated prev_tool_call_arr
+                if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
+                    # Check if all tool calls are closed
+                    open_calls = current_text.count(
+                        self.tool_call_start_token
+                    ) - current_text.count(self.tool_call_end_token)
+                    if open_calls == 0:
+                        # Return empty delta for finish_reason processing
+                        return DeltaMessage(content="")
+                elif not self.is_tool_call_started and current_text:
+                    # This is a regular content response that's now complete
+                    return DeltaMessage(content="")
+            return None
+
+        # Check if we need to advance to next tool
+        if self.json_closed and not self.in_function:
+            # Check if this tool call has ended
+            invoke_ends = current_text.count(self.invoke_end_token)
+            if invoke_ends > self.current_tool_index:
+                # This tool has ended, advance to next
+                self.current_tool_index += 1
+                self.header_sent = False
+                self.param_count = 0
+                self.json_started = False
+                self.json_closed = False
+                self.in_function = False  # Now we can safely set this to False
+                self.accumulated_params = {}
+                # Continue processing next tool
+                return None
+
+        # Handle normal content before tool calls
+        if not self.is_tool_call_started:
+            # Check if tool call is starting
+            if self.dsml_token in current_text:
+                self.is_tool_call_started = True
+                # Return any content before the tool call
+                if self.dsml_start_check in delta_text:
+                    content_before = delta_text[
+                        : delta_text.index(self.dsml_start_check)
+                    ]
+                    if content_before:
+                        return DeltaMessage(content=content_before)
+                return None
+            else:
+                # Check if we're between tool calls - skip whitespace
+                if (
+                    current_text.rstrip().endswith(self.tool_call_end_token)
+                    and delta_text.strip() == ""
+                ):
+                    # We just ended a tool call, skip whitespace
+                    return None
+                # Normal content, no tool call
+                if delta_text.endswith("<"):
+                    return DeltaMessage(content=delta_text[:-1])
+                if previous_text and previous_text.endswith("<"):
+                    return DeltaMessage(content="<" + delta_text)
+                return DeltaMessage(content=delta_text)
+
+        # Check if we're between tool calls (waiting for next one)
+        invoke_starts_count = current_text.count(self.invoke_start_prefix)
+        if self.current_tool_index >= invoke_starts_count:
+            # We're past all tool calls, shouldn't be here
+            return None
+
+        # Find the current tool call portion
+        invoke_start_positions: list[int] = []
+        idx = 0
+        while True:
+            idx = current_text.find(self.invoke_start_prefix, idx)
+            if idx == -1:
+                break
+            invoke_start_positions.append(idx)
+            idx += len(self.invoke_start_prefix)
+
+        if self.current_tool_index >= len(invoke_start_positions):
+            # No more tool calls to process yet
+            return None
+
+        invoke_start_idx = invoke_start_positions[self.current_tool_index]
+        # Find where this tool call ends (or current position if not ended yet)
+        invoke_end_idx = current_text.find(self.invoke_end_token, invoke_start_idx)
+        if invoke_end_idx == -1:
+            tool_text = current_text[invoke_start_idx:]
+        else:
+            tool_text = current_text[
+                invoke_start_idx : invoke_end_idx + len(self.invoke_end_token)
+            ]
+
+        # Looking for function header
+        if not self.header_sent:
+            if self.invoke_start_prefix in tool_text:
+                func_start = tool_text.find(self.invoke_start_prefix) + len(
+                    self.invoke_start_prefix
+                )
+                # Find the end quote for the function name
+                func_end = tool_text.find(">", func_start)
+
+                if func_end != -1:
+                    # Found complete function name
+                    function_name_raw = tool_text[func_start:func_end]
+                    self.current_function_name = self._extract_name(function_name_raw)
+                    self.current_tool_id = self._generate_tool_call_id()
+                    self.header_sent = True
+                    self.in_function = True
+
+                    # Add to prev_tool_call_arr immediately when we detect a tool call
+                    # Each tool call should be recorded regardless of function name
+                    # Ensure we don't add the same tool call index multiple times
+                    if len(self.prev_tool_call_arr) <= self.current_tool_index:
+                        self.prev_tool_call_arr.append(
+                            {
+                                "name": self.current_function_name,
+                                "arguments": "{}",  # Placeholder, will be updated later
+                            }
+                        )
+
+                    # Send header with function info
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                id=self.current_tool_id,
+                                function=DeltaFunctionCall(
+                                    name=self.current_function_name, arguments=""
+                                ),
+                                type="function",
+                            )
+                        ]
+                    )
+            return None
+
+        # We've sent header, now handle function body
+        if self.in_function:
+            # Send opening brace if not sent yet
+            if self.in_function and not self.json_started:
+                self.json_started = True
+                return DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_index,
+                            function=DeltaFunctionCall(arguments="{"),
+                        )
+                    ]
+                )
+
+            # Make sure json_started is set if we're processing parameters
+            if not self.json_started:
+                self.json_started = True
+
+            # Check for function end in accumulated text
+            if not self.json_closed and self.invoke_end_token in tool_text:
+                # Count total parameters in the tool text
+                total_param_count = tool_text.count(self.parameter_prefix)
+
+                # Only close JSON if all parameters have been processed
+                if self.param_count >= total_param_count:
+                    # Close JSON
+                    self.json_closed = True
+
+                    # Extract complete tool call
+                    # Find the invoke content
+                    invoke_start = tool_text.find(self.invoke_start_prefix) + len(
+                        self.invoke_start_prefix
+                    )
+                    invoke_content_end = tool_text.find(
+                        self.invoke_end_token, invoke_start
+                    )
+                    if invoke_content_end != -1:
+                        invoke_content = tool_text[invoke_start:invoke_content_end]
+                        # Parse to get the complete arguments
+                        try:
+                            invoke_params = self._parse_invoke_params(invoke_content)
+                            if invoke_params and self.current_tool_index < len(
+                                self.prev_tool_call_arr
+                            ):
+                                # Update existing entry in prev_tool_call_arr
+                                self.prev_tool_call_arr[self.current_tool_index][
+                                    "arguments"
+                                ] = json.dumps(invoke_params, ensure_ascii=False)
+                        except Exception:
+                            pass  # Ignore parsing errors during streaming
+
+                    result = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_index,
+                                function=DeltaFunctionCall(arguments="}"),
+                            )
+                        ]
+                    )
+
+                    # Reset state for next tool
+                    self.json_closed = True
+                    self.in_function = False
+                    self.accumulated_params = {}
+
+                    logger.debug("[M2_STREAMING] Tool call completed")
+
+                    return result
+                else:
+                    # Don't close JSON yet, continue processing parameters
+                    return None
+
+            # Look for parameters
+            # Find all parameter starts
+            param_starts = []
+            idx = 0
+            while True:
+                idx = tool_text.find(self.parameter_prefix, idx)
+                if idx == -1:
+                    break
+                param_starts.append(idx)
+                idx += len(self.parameter_prefix)
+
+            # Check if we should start a new parameter
+            if (
+                not self.in_param
+                and self.param_count < len(param_starts)
+                and len(param_starts) > self.param_count
+            ):
+                # Process the next parameter
+                param_idx = param_starts[self.param_count]
+                param_start = param_idx + len(self.parameter_prefix)
+                remaining = tool_text[param_start:]
+
+                if ">" in remaining:
+                    # We have the complete parameter name
+                    name_end = remaining.find(">")
+                    param_name_raw = remaining[:name_end]
+                    self.current_param_name = self._extract_param_name(param_name_raw)
+
+                    # Find the parameter value
+                    value_start = param_start + name_end + 1
+                    value_text = tool_text[value_start:]
+                    if value_text.startswith("\n"):
+                        value_text = value_text[1:]
+
+                    # Find where this parameter ends
+                    param_end_idx = value_text.find(self.parameter_end_token)
+                    if param_end_idx == -1:
+                        # No closing tag, look for next parameter or function end
+                        next_param_idx = value_text.find(self.parameter_prefix)
+                        func_end_idx = value_text.find(self.invoke_end_token)
+
+                        if next_param_idx != -1 and (
+                            func_end_idx == -1 or next_param_idx < func_end_idx
+                        ):
+                            param_end_idx = next_param_idx
+                        elif func_end_idx != -1:
+                            param_end_idx = func_end_idx
+                        else:
+                            # Neither found, check if tool call is complete
+                            if self.invoke_end_token in tool_text:
+                                # Tool call and parameter is complete
+                                param_end_idx = len(value_text)
+                            else:
+                                # Still streaming, wait for more content
+                                return None
+
+                    if param_end_idx != -1:
+                        # Complete parameter found
+                        param_value = value_text[:param_end_idx]
+                        if param_value.endswith("\n"):
+                            param_value = param_value[:-1]
+
+                        # Store raw value for later processing
+                        self.accumulated_params[self.current_param_name] = param_value
+
+                        # Get parameter configuration for type conversion
+                        param_config = {}
+                        if self.streaming_request and self.streaming_request.tools:
+                            for tool in self.streaming_request.tools:
+                                if (
+                                    hasattr(tool, "function")
+                                    and tool.function.name == self.current_function_name
+                                    and hasattr(tool.function, "parameters")
+                                ):
+                                    params = tool.function.parameters
+                                    if (
+                                        isinstance(params, dict)
+                                        and "properties" in params
+                                    ):
+                                        param_config = params["properties"]
+                                    break
+
+                        # Get parameter type
+                        param_type = "string"
+                        if (
+                            self.current_param_name in param_config
+                            and isinstance(param_config[self.current_param_name], dict)
+                            and "type" in param_config[self.current_param_name]
+                        ):
+                            param_type = param_config[self.current_param_name]["type"]
+
+                        # Convert param value to appropriate type
+                        converted_value = self._convert_param_value(
+                            param_value, param_type
+                        )
+
+                        # Build JSON fragment based on the converted type
+                        # Use json.dumps to properly serialize the value
+                        serialized_value = json.dumps(
+                            converted_value, ensure_ascii=False
+                        )
+
+                        if self.param_count == 0:
+                            json_fragment = (
+                                f'"{self.current_param_name}": {serialized_value}'
+                            )
+                        else:
+                            json_fragment = (
+                                f', "{self.current_param_name}": {serialized_value}'
+                            )
+
+                        self.param_count += 1
+
+                        return DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_index,
+                                    function=DeltaFunctionCall(arguments=json_fragment),
+                                )
+                            ]
+                        )
+
+        return None
-- 
GitLab


From dfdda96747c4d06e96355ea96c7207c9f7dd3816 Mon Sep 17 00:00:00 2001
From: Arpit Khandelwal <60464796+arpitkh101@users.noreply.github.com>
Date: Thu, 4 Dec 2025 04:15:04 -0500
Subject: [PATCH 079/258] [Core] Remove forced None assignment for deprecated
 PassConfig flags (#29994)

Signed-off-by: arpitkh101 <arpit5khandelwal@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 tests/compile/test_config.py | 21 +++++++++++++++------
 vllm/config/compilation.py   | 23 ++++++++++++-----------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 9e912c6d8..8dd6959a0 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -392,39 +392,48 @@ def test_pass_config_deprecation(caplog_vllm):
     assert "enable_fusion is deprecated" in caplog_vllm.text
     assert config.fuse_norm_quant is True
     assert config.fuse_act_quant is True
-    assert config.enable_fusion is None
+    assert config.enable_fusion is True
 
     # Test enable_attn_fusion -> fuse_attn_quant
     caplog_vllm.clear()
     config = PassConfig(enable_attn_fusion=True)
     assert "enable_attn_fusion is deprecated" in caplog_vllm.text
     assert config.fuse_attn_quant is True
-    assert config.enable_attn_fusion is None
+    assert config.enable_attn_fusion is True
 
     # Test enable_noop -> eliminate_noops
     caplog_vllm.clear()
     config = PassConfig(enable_noop=True)
     assert "enable_noop is deprecated" in caplog_vllm.text
     assert config.eliminate_noops is True
-    assert config.enable_noop is None
+    assert config.enable_noop is True
 
     # Test enable_sequence_parallelism -> enable_sp
     caplog_vllm.clear()
     config = PassConfig(enable_sequence_parallelism=True)
     assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
     assert config.enable_sp is True
-    assert config.enable_sequence_parallelism is None
+    assert config.enable_sequence_parallelism is True
 
     # Test enable_async_tp -> fuse_gemm_comms
     caplog_vllm.clear()
     config = PassConfig(enable_async_tp=True)
     assert "enable_async_tp is deprecated" in caplog_vllm.text
     assert config.fuse_gemm_comms is True
-    assert config.enable_async_tp is None
+    assert config.enable_async_tp is True
 
     # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
     caplog_vllm.clear()
     config = PassConfig(enable_fi_allreduce_fusion=True)
     assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
     assert config.fuse_allreduce_rms is True
-    assert config.enable_fi_allreduce_fusion is None
+    assert config.enable_fi_allreduce_fusion is True
+
+    # Test hash consistency
+    config_old = PassConfig(enable_fusion=True)
+    config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True)
+    assert config_old.compute_hash() == config_new.compute_hash()
+
+    config_old = PassConfig(enable_async_tp=True)
+    config_new = PassConfig(fuse_gemm_comms=True)
+    assert config_old.compute_hash() == config_new.compute_hash()
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 963b09193..d3d50e6ae 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -4,7 +4,7 @@
 import enum
 from collections import Counter
 from collections.abc import Callable
-from dataclasses import asdict, field
+from dataclasses import field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
@@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
-from vllm.config.utils import config, handle_deprecated
+from vllm.config.utils import config, get_hash_factors, handle_deprecated, hash_factors
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -196,7 +196,16 @@ class PassConfig:
         Any new fields that affect compilation should be added to the hash.
         Any future fields that don't affect compilation should be excluded.
         """
-        return InductorPass.hash_dict(asdict(self))
+
+        ignored_fields = [
+            "enable_fusion",
+            "enable_attn_fusion",
+            "enable_noop",
+            "enable_sequence_parallelism",
+            "enable_async_tp",
+            "enable_fi_allreduce_fusion",
+        ]
+        return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields))
 
     @field_validator(
         "fuse_norm_quant",
@@ -267,14 +276,6 @@ class PassConfig:
             "v0.13.0 or v1.0.0, whichever is sooner",
         )
 
-        # Force old flags to None to ensure they are not used
-        self.enable_fusion = None
-        self.enable_attn_fusion = None
-        self.enable_noop = None
-        self.enable_sequence_parallelism = None
-        self.enable_async_tp = None
-        self.enable_fi_allreduce_fusion = None
-
         if not self.eliminate_noops:
             if self.fuse_norm_quant or self.fuse_act_quant:
                 logger.warning_once(
-- 
GitLab


From f2f4cea6ccaad20becb6f02e253ae673f8a249ae Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Thu, 4 Dec 2025 03:30:22 -0600
Subject: [PATCH 080/258] [CI/Build][AMD] Skip test on
 test_hybrid_attention_mamba_tensor_shapes on ROCm, requires FLASHINFER
 (#29995)

Signed-off-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
---
 tests/v1/worker/test_gpu_model_runner.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 0439bef12..459abcfdd 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -761,6 +761,10 @@ def test_init_kv_cache_with_kv_sharing_valid():
     assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[1] == layer_1
 
 
+@pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="Attention backend FLASHINFER is not supported on ROCm.",
+)
 def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     """
     The GPU model runner creates different views into the
-- 
GitLab


From 842aba501d92ac77874d45612b7e3c6fed2ca243 Mon Sep 17 00:00:00 2001
From: dtc <790567447@qq.com>
Date: Thu, 4 Dec 2025 17:51:36 +0800
Subject: [PATCH 081/258] [P/D] Introduce Mooncake Transfer Engine as
 kv_connector (#24718)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Signed-off-by: dtc <dtcccc@linux.alibaba.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
---
 docs/features/mooncake_connector_usage.md     |  58 ++
 .../kv_transfer/kv_connector/factory.py       |   5 +
 .../kv_transfer/kv_connector/utils.py         | 124 +++
 .../kv_connector/v1/mooncake_connector.py     | 914 ++++++++++++++++++
 .../kv_connector/v1/nixl_connector.py         | 128 +--
 vllm/envs.py                                  |  10 +
 6 files changed, 1114 insertions(+), 125 deletions(-)
 create mode 100644 docs/features/mooncake_connector_usage.md
 create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py

diff --git a/docs/features/mooncake_connector_usage.md b/docs/features/mooncake_connector_usage.md
new file mode 100644
index 000000000..653ea29ad
--- /dev/null
+++ b/docs/features/mooncake_connector_usage.md
@@ -0,0 +1,58 @@
+# MooncakeConnector Usage Guide
+
+## About Mooncake
+
+Mooncake aims to enhance the inference efficiency of large language models (LLMs), especially in slow object storage environments, by constructing a multi-level caching pool on high-speed interconnected DRAM/SSD resources. Compared to traditional caching systems, Mooncake utilizes (GPUDirect) RDMA technology to transfer data directly in a zero-copy manner, while maximizing the use of multi-NIC resources on a single machine.
+
+For more details about Mooncake, please refer to [Mooncake project](https://github.com/kvcache-ai/Mooncake) and [Mooncake documents](https://kvcache-ai.github.io/Mooncake/).
+
+## Prerequisites
+
+### Installation
+
+Install mooncake through pip: `uv pip install mooncake-transfer-engine`.
+
+Refer to [Mooncake official repository](https://github.com/kvcache-ai/Mooncake) for more installation instructions
+
+## Usage
+
+### Prefiller Node (192.168.0.2)
+
+```bash
+vllm serve Qwen/Qwen2.5-7B-Instruct --port 8010 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}'
+```
+
+### Decoder Node (192.168.0.3)
+
+```bash
+vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer"}'
+```
+
+### Proxy
+
+```bash
+python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --prefiller-host 192.168.0.2 --prefiller-port 8010 --decoder-host 192.168.0.3 --decoder-port 8020
+```
+
+> NOTE: The Mooncake Connector currently uses the proxy from nixl_integration. This will be replaced with a self-developed proxy in the future.
+
+Now you can send requests to the proxy server through port 8000.
+
+## Environment Variables
+
+- `VLLM_MOONCAKE_BOOTSTRAP_PORT`: Port for Mooncake bootstrap server
+    - Default: 8998
+    - Required only for prefiller instances
+    - Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
+    - For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank
+    - Used for the decoder notifying the prefiller
+
+- `VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
+    - Default: 480
+    - If a request is aborted and the decoder has not yet notified the prefiller, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
+
+## KV Role Options
+
+- **kv_producer**: For prefiller instances that generate KV caches
+- **kv_consumer**: For decoder instances that consume KV caches from prefiller
+- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index df871dd7c..02f51a1dc 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -190,3 +190,8 @@ KVConnectorFactory.register_connector(
     "vllm.distributed.kv_transfer.kv_connector.v1.decode_bench_connector",
     "DecodeBenchConnector",
 )
+KVConnectorFactory.register_connector(
+    "MooncakeConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.mooncake_connector",
+    "MooncakeConnector",
+)
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index b2c2c0e6b..99d3be57c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -4,10 +4,13 @@
 KV cache helper for store.
 """
 
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Literal
 
 import torch
 
+from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import get_current_vllm_config
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.logger import init_logger
@@ -181,3 +184,124 @@ def copy_kv_blocks(
         src_tensor = src_kv_caches[layer_name]
         dst_tensor = dst_kv_caches[layer_name]
         copy_fn(src_tensor, dst_tensor, src_indices, dst_indices)
+
+
+@dataclass
+class TpKVTopology:
+    """
+    Helper class for tensor parallel and KV topology information for
+    mapping between local and remote TP workers.
+    """
+
+    tp_rank: int
+    remote_tp_size: dict[str, int]
+    is_mla: bool
+    total_num_kv_heads: int
+    attn_backend: type[AttentionBackend]
+    engine_id: str
+    remote_block_size: dict[str, int]
+
+    def __post_init__(self):
+        # Figure out whether the first dimension of the cache is K/V
+        # or num_blocks. This is used to register the memory regions correctly.
+        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
+            num_blocks=1, block_size=16, num_kv_heads=1, head_size=1
+        )
+        # Non-MLA backends caches have 5 dims [2, num_blocks, H,N,D],
+        # we just mock num_blocks to 1 for the dimension check below.
+        self._is_kv_layout_blocks_first = (
+            len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
+        )
+
+        attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
+        self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
+
+    @property
+    def is_kv_layout_blocks_first(self) -> bool:
+        return self._is_kv_layout_blocks_first
+
+    @property
+    def split_k_and_v(self) -> bool:
+        # Whether to register regions for K and V separately (when present).
+        return not (self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first)
+
+    @property
+    def tp_size(self) -> int:
+        return self.remote_tp_size[self.engine_id]
+
+    @property
+    def block_size(self) -> int:
+        return self.remote_block_size[self.engine_id]
+
+    def tp_ratio(
+        self,
+        remote_tp_size: int,
+    ) -> int:
+        """
+        Calculate the tensor parallel ratio between local and remote TP.
+        We can think of it as the number of local TP workers-per-remote TP
+        workers. Local workers will read from the same remote TP worker in
+        groups of size `tp_ratio`.
+        """
+        assert self.tp_size % remote_tp_size == 0, (
+            f"Local tensor parallel size {self.tp_size} is not divisible "
+            f"by remote tensor parallel size {remote_tp_size}."
+        )
+        return self.tp_size // remote_tp_size
+
+    def block_size_ratio(
+        self,
+        remote_block_size: int,
+    ) -> float:
+        """
+        Calculate the block size ratio between local and remote TP.
+        """
+        assert self.block_size % remote_block_size == 0, (
+            f"Local block size {self.block_size} is not divisible "
+            f"by remote block size {remote_block_size} or vice versa."
+        )
+        return self.block_size // remote_block_size
+
+    def tp_ratio_from_engine_id(
+        self,
+        remote_engine_id: str,
+    ) -> int:
+        remote_tp_size = self.remote_tp_size[remote_engine_id]
+        return self.tp_ratio(remote_tp_size)
+
+    def block_size_ratio_from_engine_id(
+        self,
+        remote_engine_id: str,
+    ) -> float:
+        remote_block_size = self.remote_block_size[remote_engine_id]
+        return self.block_size_ratio(remote_block_size)
+
+    def is_kv_replicated(self, engine_id: str) -> bool:
+        """
+        Whether the KV cache is replicated across TP workers due to the
+        number of TP workers being greater than the number of KV heads.
+        """
+        tp_size = self.remote_tp_size[engine_id]
+        return tp_size // self.total_num_kv_heads >= 1
+
+    def replicates_kv_cache(self, remote_engine_id: str) -> bool:
+        # MLA is always replicated as the hidden dim can't be split.
+        return self.is_mla or self.is_kv_replicated(remote_engine_id)
+
+    def get_target_remote_rank(
+        self,
+        remote_tp_size: int,
+    ) -> int:
+        """
+        Get the remote TP rank (on P) that the current local TP rank
+        (on D) will read from.
+        """
+        tp_ratio = self.tp_ratio(remote_tp_size)
+        return self.tp_rank // tp_ratio
+
+    def get_target_remote_rank_from_engine_id(
+        self,
+        remote_engine_id: str,
+    ) -> int:
+        remote_tp_size = self.remote_tp_size[remote_engine_id]
+        return self.get_target_remote_rank(remote_tp_size)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
new file mode 100644
index 000000000..705960aeb
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
@@ -0,0 +1,914 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import threading
+import time
+import uuid
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+import msgspec
+import numpy as np
+import torch
+import zmq
+import zmq.asyncio
+
+from vllm import envs
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.selector import get_attn_backend
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.utils import TpKVTopology
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+)
+from vllm.forward_context import ForwardContext
+from vllm.logger import init_logger
+from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
+from vllm.v1.attention.backends.utils import get_kv_cache_layout
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.request import RequestStatus
+
+try:
+    from mooncake.engine import TransferEngine
+except ImportError as e:
+    raise ImportError(
+        "Please install mooncake by following the instructions at "
+        "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
+        "to run VLLM with MooncakeTransferEngine."
+    ) from e
+
+if TYPE_CHECKING:
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.request import Request
+
+EngineId = str
+ReqId = str
+
+TRANS_DONE = b"trans_done"
+TRANS_ERROR = b"trans_error"
+
+logger = init_logger(__name__)
+
+
+class MooncakeAgentMetadata(
+    msgspec.Struct,
+    omit_defaults=True,  # type: ignore[call-arg]
+    # required for @cached_property.
+    dict=True,
+):
+    remote_hostname: str
+    remote_port: int
+    request_ids: list[ReqId]
+    kv_caches_base_addr: list[int]
+    block_ids: list[list[int]]
+
+
+@dataclass
+class RecvReqMeta:
+    local_block_ids: list[int]
+    remote_host: str
+    remote_port: int
+
+
+@dataclass
+class SendBlockMeta:
+    local_block_ids: list[int]
+    ready: threading.Event
+    expire_time: float = float("inf")
+
+
+@dataclass
+class SendReqMeta:
+    reqs: dict[ReqId, SendBlockMeta]
+    lock: threading.Lock
+
+
+@dataclass
+class FinishedSendReqSet:
+    set: set[ReqId]
+    lock: threading.Lock
+
+
+@dataclass
+class FinishedReceiveReqSet:
+    set: set[ReqId]
+    lock: asyncio.Lock
+
+
+class MooncakeConnectorMetadata(KVConnectorMetadata):
+    def __init__(self):
+        self.reqs_to_recv: dict[ReqId, RecvReqMeta] = {}
+        self.reqs_to_send: dict[ReqId, list[int]] = {}
+
+    def add_new_req(
+        self,
+        request_id: ReqId,
+        local_block_ids: list[int],
+        kv_transfer_params: dict[str, Any],
+        load_remote_cache: bool = True,
+    ):
+        if load_remote_cache:
+            self.reqs_to_recv[request_id] = RecvReqMeta(
+                local_block_ids=local_block_ids,
+                remote_host=kv_transfer_params["remote_host"],
+                remote_port=kv_transfer_params["remote_port"],
+            )
+        else:
+            self.reqs_to_send[request_id] = local_block_ids
+
+
+class MooncakeConnector(KVConnectorBase_V1):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
+
+        assert vllm_config.kv_transfer_config is not None
+        assert vllm_config.kv_transfer_config.engine_id is not None
+        self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id
+
+        if role == KVConnectorRole.SCHEDULER:
+            self.connector_scheduler: MooncakeConnectorScheduler | None = (
+                MooncakeConnectorScheduler(vllm_config, self.engine_id)
+            )
+            self.connector_worker: MooncakeConnectorWorker | None = None
+        elif role == KVConnectorRole.WORKER:
+            self.connector_scheduler = None
+            self.connector_worker = MooncakeConnectorWorker(vllm_config, self.engine_id)
+
+    ############################################################
+    # Scheduler Side Methods
+    ############################################################
+
+    def get_num_new_matched_tokens(
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int, bool]:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.get_num_new_matched_tokens(
+            request, num_computed_tokens
+        )
+
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.update_state_after_alloc(
+            request, blocks, num_external_tokens
+        )
+
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> KVConnectorMetadata:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.build_connector_meta(scheduler_output)
+
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.request_finished(request, block_ids)
+
+    ############################################################
+    # Worker Side Methods
+    ############################################################
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        assert self.connector_worker is not None
+        self.connector_worker.register_kv_caches(kv_caches)
+
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        """Get the finished recving and sending requests."""
+        assert self.connector_worker is not None
+        return self.connector_worker.get_finished()
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+        assert self.connector_worker is not None
+        assert isinstance(self._connector_metadata, MooncakeConnectorMetadata)
+        self.connector_worker.start_load_kv(self._connector_metadata)
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        """MooncakeConnector does not do layerwise saving."""
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        **kwargs,
+    ) -> None:
+        """MooncakeConnector does not save explicitly."""
+        pass
+
+    def wait_for_save(self):
+        pass
+
+
+class MooncakeConnectorScheduler:
+    """Implementation of Scheduler side methods"""
+
+    def __init__(self, vllm_config: VllmConfig, engine_id: str):
+        self.vllm_config = vllm_config
+        self.engine_id: EngineId = engine_id
+        self.side_channel_host = get_ip()
+        self.side_channel_port = get_mooncake_side_channel_port(vllm_config)
+
+        assert vllm_config.kv_transfer_config
+        self.kv_role = vllm_config.kv_transfer_config.kv_role
+        logger.info("Initializing Mooncake Transfer Engine Scheduler %s", engine_id)
+
+        # Requests that need to start recv/send.
+        # New requests are added by update_state_after_alloc in
+        # the scheduler. Used to make metadata passed to Worker.
+        self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
+        self._reqs_need_send: dict[ReqId, list[int]] = {}
+
+    def get_num_new_matched_tokens(
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int, bool]:
+        """
+        For remote prefill, pull all prompt blocks from remote
+        asynchronously relative to engine execution.
+
+        Args:
+            request (Request): the request object.
+            num_computed_tokens (int): the number of locally
+                computed tokens for this request
+        Returns:
+            * the number of tokens that can be loaded from the
+              external KV cache beyond what is already computed.
+            * true if the external KV cache tokens will be loaded
+              asynchronously (between scheduler steps).
+        """
+
+        params = request.kv_transfer_params
+        logger.debug(
+            "MooncakeConnector get_num_new_matched_tokens: "
+            "num_computed_tokens=%s, kv_transfer_params=%s",
+            num_computed_tokens,
+            params,
+        )
+
+        if params is not None and params.get("do_remote_prefill"):
+            # Remote prefill: get all prompt blocks from remote.
+            token_ids = request.prompt_token_ids or []
+            count = len(token_ids) - num_computed_tokens
+            if count > 0:
+                return count, True
+
+        # No remote prefill for this request.
+        return 0, False
+
+    def update_state_after_alloc(
+        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+    ):
+        params = request.kv_transfer_params
+        logger.debug(
+            "MooncakeConnector update_state_after_alloc: "
+            "num_external_tokens=%s, kv_transfer_params=%s",
+            num_external_tokens,
+            params,
+        )
+
+        if not params:
+            return
+
+        if params.get("do_remote_prefill"):
+            assert self.kv_role != "kv_producer"
+            if all(p in params for p in ("remote_host", "remote_port")):
+                # If remote_blocks and num_external_tokens = 0, we have
+                # a full prefix cache hit on the D worker. We need to call
+                # send_notif in _read_blocks to free the memory on the P.
+                local_block_ids = (
+                    blocks.get_unhashed_block_ids() if num_external_tokens > 0 else []
+                )
+                # Get unhashed blocks to pull from remote.
+                self._reqs_need_recv[request.request_id] = (request, local_block_ids)
+            else:
+                logger.warning(
+                    "Got invalid KVTransferParams: %s. This "
+                    "request will not utilize KVTransfer",
+                    params,
+                )
+            # Only trigger 1 KV transfer per request.
+            params["do_remote_prefill"] = False
+
+        elif params.get("do_remote_decode"):
+            # Add an empty list to worker to create event.
+            self._reqs_need_send[request.request_id] = []
+
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> KVConnectorMetadata:
+        meta = MooncakeConnectorMetadata()
+
+        # Loop through scheduled reqs and convert to RecvReqMeta.
+        if self.kv_role != "kv_producer":
+            for req_id, (req, block_ids) in self._reqs_need_recv.items():
+                assert req.kv_transfer_params is not None
+                meta.add_new_req(
+                    request_id=req_id,
+                    local_block_ids=block_ids,
+                    kv_transfer_params=req.kv_transfer_params,
+                )
+            self._reqs_need_recv.clear()
+
+        if self.kv_role != "kv_consumer":
+            for req_id, block_ids in self._reqs_need_send.items():
+                meta.add_new_req(
+                    request_id=req_id,
+                    local_block_ids=block_ids,
+                    kv_transfer_params={},
+                    load_remote_cache=False,
+                )
+            self._reqs_need_send.clear()
+
+        return meta
+
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Once a request is finished, determine whether request blocks
+        should be freed now or will be sent asynchronously and freed later.
+        """
+
+        params = request.kv_transfer_params
+        logger.debug(
+            "MooncakeConnector request_finished, request_status=%s, "
+            "kv_transfer_params=%s",
+            request.status,
+            params,
+        )
+        if not params:
+            return False, None
+
+        if params.get("do_remote_prefill"):
+            # If do_remote_prefill is still True when the request is finished,
+            # update_state_after_alloc must not have been called (the request
+            # must have been aborted before it was scheduled).
+            # To avoid stranding the prefill blocks in the prefill instance,
+            # we must add empty block_ids to _reqs_need_recv so that our
+            # worker side will notify and free blocks in the prefill instance.
+            assert self.kv_role != "kv_producer"
+            self._reqs_need_recv[request.request_id] = (request, [])
+            params["do_remote_prefill"] = False
+            return False, None
+
+        if (
+            not params.get("do_remote_decode")
+            or request.status != RequestStatus.FINISHED_LENGTH_CAPPED
+        ):
+            return False, None
+
+        assert self.kv_role != "kv_consumer"
+
+        # TODO: check whether block_ids actually ever be 0. If not we could
+        # remove the conditional below
+        delay_free_blocks = len(block_ids) > 0
+
+        if delay_free_blocks:
+            self._reqs_need_send[request.request_id] = block_ids
+
+        return delay_free_blocks, dict(
+            do_remote_prefill=True,
+            do_remote_decode=False,
+            remote_host=self.side_channel_host,
+            remote_port=self.side_channel_port,
+        )
+
+
+class MooncakeConnectorWorker:
+    """Implementation of Worker side methods"""
+
+    def __init__(self, vllm_config: VllmConfig, engine_id: str):
+        logger.info("Initializing Mooncake Transfer Engine worker %s", engine_id)
+
+        self.vllm_config = vllm_config
+
+        self.engine = TransferEngine()
+        self.hostname = get_ip()
+        ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", "rdma", "")
+        if ret_value != 0:
+            raise RuntimeError("Mooncake Transfer Engine initialization failed.")
+
+        self.rpc_port = self.engine.get_rpc_port()
+
+        logger.debug(
+            "Mooncake Transfer Engine initialized at %s:%d",
+            self.hostname,
+            self.rpc_port,
+        )
+
+        # Mooncake handshake port.
+        self.side_channel_port: int = get_mooncake_side_channel_port(vllm_config)
+
+        self.engine_id: EngineId = engine_id
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.world_size = get_tensor_model_parallel_world_size()
+        self.tp_group = get_tp_group()
+        self.num_blocks = 0
+
+        assert vllm_config.kv_transfer_config
+        self.kv_role = vllm_config.kv_transfer_config.kv_role
+        self.num_workers = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
+            "num_workers", 10
+        )
+
+        self.kv_caches_base_addr: list[int] = []
+        self.device_kv_caches: dict[str, torch.Tensor] = {}
+        self.reqs_need_send: SendReqMeta = SendReqMeta(reqs={}, lock=threading.Lock())
+
+        # For kv_both, we will act both prefiller and decoder.
+        if self.kv_role != "kv_consumer":
+            # Background thread for sending kvcaches to D.
+            self._mooncake_sender_t: threading.Thread | None = None
+            # Background thread for processing new sending requests.
+            self._sender_executor = ThreadPoolExecutor(
+                max_workers=self.num_workers, thread_name_prefix="vllm-mooncake-sender"
+            )
+            logger.debug(
+                "Mooncake Prefiller: use %d workers to send kvcaches", self.num_workers
+            )
+        if self.kv_role != "kv_producer":
+            self.receiver_loop = asyncio.new_event_loop()
+            self._mooncake_receiver_t = threading.Thread(
+                target=self._receiver_loop, args=(self.receiver_loop,), daemon=True
+            )
+            self._mooncake_receiver_t.start()
+            logger.debug("Mooncake Decoder: start receiver thread")
+
+        self.finished_sending_reqs: FinishedSendReqSet = FinishedSendReqSet(
+            set(), threading.Lock()
+        )
+        self.finished_recving_reqs: FinishedReceiveReqSet = FinishedReceiveReqSet(
+            set(), asyncio.Lock()
+        )
+
+        self.block_size = vllm_config.cache_config.block_size
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.use_mla = self.model_config.use_mla
+
+        backend = get_attn_backend(
+            self.model_config.get_head_size(),
+            self.model_config.dtype,
+            self.cache_config.cache_dtype,
+            self.block_size,
+            use_mla=self.use_mla,
+        )
+        self.backend_name = backend.get_name()
+        self.kv_cache_layout = get_kv_cache_layout()
+        logger.debug("Detected attention backend %s", self.backend_name)
+        logger.debug("Detected kv cache layout %s", self.kv_cache_layout)
+
+        self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
+        self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size}
+        self.kv_topo = TpKVTopology(
+            tp_rank=self.tp_rank,
+            engine_id=self.engine_id,
+            remote_tp_size=self._tp_size,  # shared state
+            remote_block_size=self._block_size,  # shared state
+            is_mla=self.use_mla,
+            total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
+            attn_backend=backend,
+        )
+        self._use_pallas = self.kv_topo._use_pallas
+
+        self.zmq_ctx = zmq.Context()
+        self.async_zmq_ctx = zmq.asyncio.Context()
+        self._encoder = msgspec.msgpack.Encoder()
+        self._decoder = msgspec.msgpack.Decoder(MooncakeAgentMetadata)
+
+    def __del__(self):
+        self.shutdown()
+
+    def shutdown(self):
+        """Cleanup background threads on destruction."""
+        self.zmq_ctx.term()
+        self.async_zmq_ctx.term()
+        if self.kv_role != "kv_consumer":
+            self._sender_executor.shutdown(wait=False)
+            if self._mooncake_sender_t:
+                self._mooncake_sender_t.join()
+        if self.kv_role != "kv_producer" and self.receiver_loop.is_running():
+            self.receiver_loop.call_soon_threadsafe(self.receiver_loop.stop)
+            self._mooncake_receiver_t.join()
+
+    def _receiver_loop(self, loop: asyncio.AbstractEventLoop):
+        asyncio.set_event_loop(loop)
+        loop.run_forever()
+
+    def _mooncake_sender(
+        self, ready_event: threading.Event, base_port: int, tp_rank: int
+    ):
+        """
+        Background thread that listens for Mooncake requests, dispatches them
+        to a thread pool, and sends acknowledgments upon completion.
+        """
+
+        frontend_path = make_zmq_path("tcp", self.hostname, base_port + tp_rank)
+        frontend = make_zmq_socket(self.zmq_ctx, frontend_path, zmq.ROUTER)
+        logger.debug("Mooncake sender starting listening on path: %s", frontend_path)
+
+        backend_path = make_zmq_path("inproc", str(uuid.uuid4()))
+        backend = make_zmq_socket(self.zmq_ctx, backend_path, zmq.PULL)
+
+        poller = zmq.Poller()
+        poller.register(frontend, zmq.POLLIN)
+        poller.register(backend, zmq.POLLIN)
+
+        ready_event.set()
+
+        try:
+            while True:
+                sockets = dict(poller.poll())
+
+                if frontend in sockets:
+                    identity, _, metadata_bytes = frontend.recv_multipart()
+                    self._sender_executor.submit(
+                        self._sender_worker,
+                        identity,
+                        metadata_bytes,
+                        backend_path,
+                    )
+
+                if backend in sockets:
+                    identity, status = backend.recv_multipart()
+                    frontend.send_multipart((identity, b"", status))
+
+        except zmq.ContextTerminated:
+            logger.debug("ZMQ context terminated, exiting Mooncake sender thread.")
+        except Exception as e:
+            logger.error("Error in Mooncake sender thread: %s. Exiting thread.", str(e))
+        finally:
+            frontend.close()
+            backend.close()
+
+    def _sender_worker(
+        self, identity: bytes, metadata_bytes: bytes, worker_channel_path: str
+    ):
+        status = TRANS_ERROR
+
+        try:
+            metadata = self._decoder.decode(metadata_bytes)
+            self.send_kv_to_decode(metadata)
+            status = TRANS_DONE
+        except Exception as e:
+            logger.error("Error processing Mooncake handshake: %s", e)
+        finally:
+            pusher = make_zmq_socket(self.zmq_ctx, worker_channel_path, zmq.PUSH)
+            try:
+                pusher.send_multipart((identity, status))
+            except zmq.ZMQError as e:
+                logger.warning(
+                    "Internal error, maybe the server is shutting down. Error: %s",
+                    e,
+                )
+            finally:
+                pusher.close()
+
+    def send_kv_to_decode(self, meta: MooncakeAgentMetadata):
+        send_reqs: list[tuple[ReqId, SendBlockMeta]] = []
+        with self.reqs_need_send.lock:
+            for req_id in meta.request_ids:
+                send_meta = self.reqs_need_send.reqs.get(req_id)
+                if send_meta is None:
+                    logger.warning("Request %s not found in reqs_need_send", req_id)
+                    return
+                # Mark it as not expired. We will send it now.
+                send_meta.expire_time = float("inf")
+                send_reqs.append((req_id, send_meta))
+
+        self._send_blocks(send_reqs, meta)
+
+        with self.reqs_need_send.lock:
+            for req_id in meta.request_ids:
+                del self.reqs_need_send.reqs[req_id]
+
+        with self.finished_sending_reqs.lock:
+            self.finished_sending_reqs.set.update(meta.request_ids)
+
+    def _send_blocks(
+        self,
+        send_reqs: list[tuple[ReqId, SendBlockMeta]],
+        agent_meta: MooncakeAgentMetadata,
+    ):
+        src_ptrs = []
+        dst_ptrs = []
+        lengths = []
+        local_base_addr = self.kv_caches_base_addr
+        remote_base_addr = agent_meta.kv_caches_base_addr
+        block_len = self.block_len
+        remote_session = f"{agent_meta.remote_hostname}:{agent_meta.remote_port}"
+
+        assert len(send_reqs) == len(agent_meta.block_ids)
+        for (req_id, send_meta), remote_block_ids in zip(
+            send_reqs, agent_meta.block_ids
+        ):
+            send_meta.ready.wait()
+
+            num_remote_blocks = len(remote_block_ids)
+            if num_remote_blocks == 0:
+                continue
+
+            local_block_ids = send_meta.local_block_ids
+            # Partial prefix cache hit: just read uncomputed blocks.
+            num_local_blocks = len(local_block_ids)
+            assert num_local_blocks >= num_remote_blocks
+            if num_local_blocks > num_remote_blocks:
+                local_block_ids = local_block_ids[-num_remote_blocks:]
+
+            # Group by indices
+            group_local_block_ids, group_remote_block_ids = group_concurrent_contiguous(
+                local_block_ids, remote_block_ids
+            )
+
+            for local_layer_addr, remote_layer_addr in zip(
+                local_base_addr, remote_base_addr
+            ):
+                for group_local_block_id, group_remote_block_id in zip(
+                    group_local_block_ids, group_remote_block_ids
+                ):
+                    src_ptrs.append(
+                        local_layer_addr + group_local_block_id[0] * block_len
+                    )
+                    dst_ptrs.append(
+                        remote_layer_addr + group_remote_block_id[0] * block_len
+                    )
+                    lengths.append(block_len * len(group_local_block_id))
+
+            logger.debug(
+                "Sending kv_caches for request %s (%d blocks) to %s",
+                req_id,
+                num_remote_blocks,
+                remote_session,
+            )
+
+        start_time = time.perf_counter()
+        ret_value = self.engine.batch_transfer_sync_write(
+            remote_session, src_ptrs, dst_ptrs, lengths
+        )
+        if ret_value != 0:
+            raise RuntimeError(f"Error in batch_transfer_sync_write: {ret_value}")
+
+        logger.debug(
+            "Sending to %s done, took %s",
+            remote_session,
+            time.perf_counter() - start_time,
+        )
+
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        """Register the KV Cache data in mooncake."""
+
+        logger.info("Registering KV_Caches. use_mla: %s", self.use_mla)
+
+        kv_data_ptrs = []
+        kv_data_lens = []
+        seen_base_addresses = []
+
+        split_k_and_v = self.kv_topo.split_k_and_v
+        tensor_size_bytes = None
+        for layer_name, cache_or_caches in kv_caches.items():
+            logger.debug(
+                "registering layer %s with shape %s", layer_name, cache_or_caches.shape
+            )
+            cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
+
+            for cache in cache_list:
+                base_addr = cache.data_ptr()
+                if base_addr in seen_base_addresses:
+                    continue
+
+                seen_base_addresses.append(base_addr)
+                curr_tensor_size_bytes = cache.nbytes
+
+                if tensor_size_bytes is None:
+                    tensor_size_bytes = curr_tensor_size_bytes
+                    self.num_blocks = cache.shape[0]
+
+                assert tensor_size_bytes == curr_tensor_size_bytes, (
+                    "All kv cache tensors must have the same size"
+                )
+                kernel_block_size = cache.shape[-2 if self.use_mla else -3]
+                assert self.block_size == kernel_block_size
+                kv_data_ptrs.append(base_addr)
+                kv_data_lens.append(tensor_size_bytes)
+
+        self.kv_caches_base_addr = seen_base_addresses
+
+        ret_value = self.engine.batch_register_memory(kv_data_ptrs, kv_data_lens)
+        if ret_value != 0:
+            raise RuntimeError("Mooncake batch memory registration failed.")
+
+        assert tensor_size_bytes is not None
+        assert self.num_blocks != 0
+        assert tensor_size_bytes % self.num_blocks == 0
+        self.block_len = tensor_size_bytes // self.num_blocks
+        self.device_kv_caches = kv_caches
+        logger.debug(
+            "registered num_blocks=%d block_len=%d", self.num_blocks, self.block_len
+        )
+
+        # No need to launch server for D node.
+        if self.kv_role == "kv_consumer":
+            return
+
+        ready_event = threading.Event()
+        self._mooncake_sender_t = threading.Thread(
+            target=self._mooncake_sender,
+            args=(ready_event, self.side_channel_port, self.tp_rank),
+            daemon=True,
+            name="mooncake_sender",
+        )
+        self._mooncake_sender_t.start()
+        ready_event.wait()  # Wait for listener ZMQ socket to be ready.
+
+    async def fetch_finished_recving_reqs(self) -> set[ReqId]:
+        async with self.finished_recving_reqs.lock:
+            finished_recving_reqs = self.finished_recving_reqs.set
+            self.finished_recving_reqs.set = set()
+        return finished_recving_reqs
+
+    def get_finished(self) -> tuple[set[str] | None, set[str] | None]:
+        """
+        Get requests that are done sending or recving on this specific worker.
+        The scheduler process (via the MultiprocExecutor) will use this output
+        to track which workers are done.
+        """
+        fut = None
+        if self.kv_role != "kv_producer":
+            fut = asyncio.run_coroutine_threadsafe(
+                self.fetch_finished_recving_reqs(), self.receiver_loop
+            )
+
+        if self.kv_role != "kv_consumer":
+            with self.finished_sending_reqs.lock:
+                finished_sending_reqs = self.finished_sending_reqs.set
+                self.finished_sending_reqs.set = set()
+        else:
+            finished_sending_reqs = set()
+
+        finished_recving_reqs = fut.result() if fut else set()
+
+        if finished_sending_reqs or finished_recving_reqs:
+            logger.debug(
+                "Rank %s, get_finished: %s requests done sending "
+                "and %s requests done recving",
+                self.tp_rank,
+                len(finished_sending_reqs),
+                len(finished_recving_reqs),
+            )
+
+        # Handle timeout to avoid stranding blocks on remote.
+        now = time.perf_counter()
+        with self.reqs_need_send.lock:
+            expired_reqs = [
+                req_id
+                for req_id, send_meta in self.reqs_need_send.reqs.items()
+                if send_meta.expire_time < now
+            ]
+            for req_id in expired_reqs:
+                logger.warning(
+                    "Request %s timed out after %d seconds without "
+                    "being sent. Freeing its blocks on the producer side.",
+                    req_id,
+                    envs.VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT,
+                )
+                del self.reqs_need_send.reqs[req_id]
+            if expired_reqs:
+                finished_sending_reqs.update(expired_reqs)
+
+        return finished_sending_reqs or None, finished_recving_reqs or None
+
+    async def receive_kv(self, path: str, req_blocks: list[tuple[str, list[int]]]):
+        req_ids, block_ids = map(list, zip(*req_blocks))
+        metadata = MooncakeAgentMetadata(
+            remote_hostname=self.hostname,
+            remote_port=self.rpc_port,
+            request_ids=req_ids,
+            kv_caches_base_addr=self.kv_caches_base_addr,
+            block_ids=block_ids,
+        )
+
+        encoded_data = self._encoder.encode(metadata)
+        logger.debug(
+            "Size of encoded MooncakeAgentMetadata: %d bytes", len(encoded_data)
+        )
+        logger.debug("Sending kv transfer request for %s on path: %s", req_ids, path)
+
+        # Send query for the request.
+        sock: zmq.asyncio.Socket = make_zmq_socket(
+            self.async_zmq_ctx, path, zmq.REQ, bind=False, linger=0
+        )
+        sock.setsockopt(zmq.RCVTIMEO, 60000)
+        try:
+            await sock.send(encoded_data)
+            ret_msg = await sock.recv()
+            if ret_msg != TRANS_DONE:
+                logger.error(
+                    "Error happens during tranfering kvcache for %s, see logs in prefiller.",  # noqa: E501
+                    req_ids,
+                )
+                return
+        except zmq.ContextTerminated:
+            logger.debug("ZMQ context terminated, exiting Mooncake receiver thread.")
+        except Exception as e:
+            logger.error("MooncakeAgentMetadata transfer failed for %s: %s", req_ids, e)
+            return
+        finally:
+            sock.close()
+
+        async with self.finished_recving_reqs.lock:
+            self.finished_recving_reqs.set.update(req_ids)
+
+        logger.debug("pulling kv_caches for %s finished", req_ids)
+
+    def group_kv_pull(self, metadata: MooncakeConnectorMetadata):
+        kv_pulls = defaultdict(list)
+        for req_id, meta in metadata.reqs_to_recv.items():
+            logger.debug(
+                "start_load_kv for request %s from remote engine. "
+                "Num local_block_ids: %s.",
+                req_id,
+                len(meta.local_block_ids),
+            )
+            path = make_zmq_path(
+                "tcp", meta.remote_host, meta.remote_port + self.tp_rank
+            )
+            kv_pulls[path].append((req_id, meta.local_block_ids))
+
+        return kv_pulls
+
+    def start_load_kv(self, metadata: MooncakeConnectorMetadata):
+        if self.kv_role != "kv_producer":
+            kv_pulls = self.group_kv_pull(metadata)
+            for path, req_blocks in kv_pulls.items():
+                asyncio.run_coroutine_threadsafe(
+                    self.receive_kv(path, req_blocks), self.receiver_loop
+                )
+
+        if self.kv_role != "kv_consumer":
+            with self.reqs_need_send.lock:
+                for req_id, block_ids in metadata.reqs_to_send.items():
+                    if block_ids:
+                        # Already gone through request_finished()
+                        send_meta = self.reqs_need_send.reqs[req_id]
+                        send_meta.local_block_ids = block_ids
+                        send_meta.ready.set()
+                        send_meta.expire_time = (
+                            time.perf_counter()
+                            + envs.VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT
+                        )
+                    else:
+                        # From update_state_after_alloc(),
+                        # but not reach request_finished() yet
+                        self.reqs_need_send.reqs[req_id] = SendBlockMeta(
+                            local_block_ids=[], ready=threading.Event()
+                        )
+
+
+def group_concurrent_contiguous(
+    src_indices: list[int], dst_indices: list[int]
+) -> tuple[list[list[int]], list[list[int]]]:
+    """Vectorised NumPy implementation."""
+    if len(src_indices) == 0:
+        return [], []
+
+    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+    src_groups = np.split(src_indices, brk)
+    dst_groups = np.split(dst_indices, brk)
+
+    src_groups = [g.tolist() for g in src_groups]
+    dst_groups = [g.tolist() for g in dst_groups]
+
+    return src_groups, dst_groups
+
+
+def get_mooncake_side_channel_port(vllm_config: VllmConfig) -> int:
+    # This logic is now centralized
+    return (
+        envs.VLLM_MOONCAKE_BOOTSTRAP_PORT
+        + vllm_config.parallel_config.data_parallel_rank
+        * vllm_config.parallel_config.tensor_parallel_size
+    )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 41e32bb73..24b7599a4 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -20,10 +20,10 @@ import torch
 import zmq
 
 from vllm import envs
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
-from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.selector import get_attn_backend
 from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.utils import TpKVTopology
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     CopyBlocksOp,
     KVConnectorBase_V1,
@@ -668,128 +668,6 @@ class NixlConnectorScheduler:
 class NixlConnectorWorker:
     """Implementation of Worker side methods"""
 
-    @dataclass
-    class TpKVTopology:
-        """
-        Helper class for tensor parallel and KV topology information for
-        mapping between local and remote TP workers.
-        """
-
-        tp_rank: int
-        remote_tp_size: dict[EngineId, int]
-        is_mla: bool
-        total_num_kv_heads: int
-        attn_backend: type[AttentionBackend]
-        engine_id: EngineId
-        remote_block_size: dict[EngineId, int]
-
-        def __post_init__(self):
-            # Figure out whether the first dimension of the cache is K/V
-            # or num_blocks. This is used to register the memory regions correctly.
-            kv_cache_shape = self.attn_backend.get_kv_cache_shape(
-                num_blocks=1, block_size=16, num_kv_heads=1, head_size=1
-            )
-            # Non-MLA backends caches have 5 dims [2, num_blocks, H,N,D],
-            # we just mock num_blocks to 1 for the dimension check below.
-            self._is_kv_layout_blocks_first = (
-                len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
-            )
-
-            attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
-            self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
-
-        @property
-        def is_kv_layout_blocks_first(self) -> bool:
-            return self._is_kv_layout_blocks_first
-
-        @property
-        def split_k_and_v(self) -> bool:
-            # Whether to register regions for K and V separately (when present).
-            return not (
-                self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first
-            )
-
-        @property
-        def tp_size(self) -> int:
-            return self.remote_tp_size[self.engine_id]
-
-        @property
-        def block_size(self) -> int:
-            return self.remote_block_size[self.engine_id]
-
-        def tp_ratio(
-            self,
-            remote_tp_size: int,
-        ) -> int:
-            """
-            Calculate the tensor parallel ratio between local and remote TP.
-            We can think of it as the number of local TP workers-per-remote TP
-            workers. Local workers will read from the same remote TP worker in
-            groups of size `tp_ratio`.
-            """
-            assert self.tp_size % remote_tp_size == 0, (
-                f"Local tensor parallel size {self.tp_size} is not divisible "
-                f"by remote tensor parallel size {remote_tp_size}."
-            )
-            return self.tp_size // remote_tp_size
-
-        def block_size_ratio(
-            self,
-            remote_block_size: int,
-        ) -> float:
-            """
-            Calculate the block size ratio between local and remote TP.
-            """
-            assert self.block_size % remote_block_size == 0, (
-                f"Local block size {self.block_size} is not divisible "
-                f"by remote block size {remote_block_size} or vice versa."
-            )
-            return self.block_size // remote_block_size
-
-        def tp_ratio_from_engine_id(
-            self,
-            remote_engine_id: EngineId,
-        ) -> int:
-            remote_tp_size = self.remote_tp_size[remote_engine_id]
-            return self.tp_ratio(remote_tp_size)
-
-        def block_size_ratio_from_engine_id(
-            self,
-            remote_engine_id: EngineId,
-        ) -> float:
-            remote_block_size = self.remote_block_size[remote_engine_id]
-            return self.block_size_ratio(remote_block_size)
-
-        def is_kv_replicated(self, engine_id: EngineId) -> bool:
-            """
-            Whether the KV cache is replicated across TP workers due to the
-            number of TP workers being greater than the number of KV heads.
-            """
-            tp_size = self.remote_tp_size[engine_id]
-            return tp_size // self.total_num_kv_heads >= 1
-
-        def replicates_kv_cache(self, remote_engine_id: EngineId) -> bool:
-            # MLA is always replicated as the hidden dim can't be split.
-            return self.is_mla or self.is_kv_replicated(remote_engine_id)
-
-        def get_target_remote_rank(
-            self,
-            remote_tp_size: int,
-        ) -> int:
-            """
-            Get the remote TP rank (on P) that the current local TP rank
-            (on D) will read from.
-            """
-            tp_ratio = self.tp_ratio(remote_tp_size)
-            return self.tp_rank // tp_ratio
-
-        def get_target_remote_rank_from_engine_id(
-            self,
-            remote_engine_id: EngineId,
-        ) -> int:
-            remote_tp_size = self.remote_tp_size[remote_engine_id]
-            return self.get_target_remote_rank(remote_tp_size)
-
     def __init__(self, vllm_config: VllmConfig, engine_id: str):
         if NixlWrapper is None:
             logger.error("NIXL is not available")
@@ -958,7 +836,7 @@ class NixlConnectorWorker:
         self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
         self.xfer_stats = NixlKVConnectorStats()
 
-        self.kv_topo = self.TpKVTopology(
+        self.kv_topo = TpKVTopology(
             tp_rank=self.tp_rank,
             engine_id=self.engine_id,
             remote_tp_size=self._tp_size,  # shared state
diff --git a/vllm/envs.py b/vllm/envs.py
index 2ed5816b3..37711dece 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -175,6 +175,7 @@ if TYPE_CHECKING:
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600
+    VLLM_MOONCAKE_BOOTSTRAP_PORT: int = 8998
     VLLM_ALL2ALL_BACKEND: Literal[
         "naive",
         "pplx",
@@ -197,6 +198,7 @@ if TYPE_CHECKING:
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: int | None = None
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
+    VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
@@ -1260,6 +1262,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
         os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5600")
     ),
+    # Port used for Mooncake handshake between remote agents.
+    "VLLM_MOONCAKE_BOOTSTRAP_PORT": lambda: int(
+        os.getenv("VLLM_MOONCAKE_BOOTSTRAP_PORT", "8998")
+    ),
     # all2all backend for vllm's expert parallel communication
     # Available options:
     # - "naive": naive all2all implementation using broadcasts
@@ -1369,6 +1375,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
         os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")
     ),
+    # Timeout (in seconds) for MooncakeConnector in PD disaggregated setup.
+    "VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT": lambda: int(
+        os.getenv("VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT", "480")
+    ),
     # Controls whether or not to use cudnn prefill
     "VLLM_USE_CUDNN_PREFILL": lambda: bool(
         int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))
-- 
GitLab


From 6366c098d7c76120b6a55a6829a2649c727a2862 Mon Sep 17 00:00:00 2001
From: Noa Neria <noa@run.ai>
Date: Thu, 4 Dec 2025 12:04:43 +0200
Subject: [PATCH 082/258] Validating Runai Model Streamer Integration with S3
 Object Storage (#29320)

Signed-off-by: Noa Neria <noa@run.ai>
---
 docker/Dockerfile                             |  2 +-
 requirements/nightly_torch_test.txt           |  2 +-
 requirements/rocm.txt                         |  2 +-
 requirements/test.in                          |  2 +-
 requirements/test.txt                         |  6 +--
 setup.py                                      |  2 +-
 .../__init__.py                               |  0
 .../runai_streamer_loader/conftest.py         | 39 ++++++++++++++
 .../test_runai_model_streamer_loader.py       |  0
 .../test_runai_model_streamer_s3.py           | 52 +++++++++++++++++++
 .../test_runai_utils.py                       |  0
 .../test_weight_utils.py                      |  0
 vllm/transformers_utils/runai_utils.py        |  4 +-
 13 files changed, 100 insertions(+), 11 deletions(-)
 rename tests/model_executor/model_loader/{runai_model_streamer => runai_streamer_loader}/__init__.py (100%)
 create mode 100644 tests/model_executor/model_loader/runai_streamer_loader/conftest.py
 rename tests/model_executor/model_loader/{runai_model_streamer => runai_streamer_loader}/test_runai_model_streamer_loader.py (100%)
 create mode 100644 tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_s3.py
 rename tests/model_executor/model_loader/{runai_model_streamer => runai_streamer_loader}/test_runai_utils.py (100%)
 rename tests/model_executor/model_loader/{runai_model_streamer => runai_streamer_loader}/test_weight_utils.py (100%)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 73cb4d7e0..0d50d97e5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -580,7 +580,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     else \
         BITSANDBYTES_VERSION="0.46.1"; \
     fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
+    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index 53b012372..7b2c66544 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -42,6 +42,6 @@ tritonclient==2.51.0
 
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
-runai-model-streamer[s3,gcs]==0.15.0
+runai-model-streamer[s3,gcs]==0.15.3
 fastsafetensors>=0.1.10
 pydantic>=2.12 # 2.11 leads to error on python 3.13
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index abbd33d6e..05b9a2179 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -12,7 +12,7 @@ tensorizer==2.10.1
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-runai-model-streamer[s3,gcs]==0.15.0
+runai-model-streamer[s3,gcs]==0.15.3
 conch-triton-kernels==1.2.1
 timm>=1.0.17
 fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
diff --git a/requirements/test.in b/requirements/test.in
index da7a7db1f..dfae5b758 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -51,7 +51,7 @@ tritonclient==2.51.0
 arctic-inference == 0.1.1 # Required for suffix decoding test
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
-runai-model-streamer[s3,gcs]==0.15.0
+runai-model-streamer[s3,gcs]==0.15.3
 fastsafetensors>=0.1.10
 pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
diff --git a/requirements/test.txt b/requirements/test.txt
index c5f103b8b..571194e05 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -965,11 +965,11 @@ rsa==4.9.1
     # via google-auth
 rtree==1.4.0
     # via torchgeo
-runai-model-streamer==0.15.0
+runai-model-streamer==0.15.3
     # via -r requirements/test.in
-runai-model-streamer-gcs==0.15.0
+runai-model-streamer-gcs==0.15.3
     # via runai-model-streamer
-runai-model-streamer-s3==0.15.0
+runai-model-streamer-s3==0.15.3
     # via runai-model-streamer
 s3transfer==0.10.3
     # via boto3
diff --git a/setup.py b/setup.py
index af7282d4f..6fcb6653b 100644
--- a/setup.py
+++ b/setup.py
@@ -797,7 +797,7 @@ setup(
         "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
         "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
-        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
+        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
         "audio": [
             "librosa",
             "soundfile",
diff --git a/tests/model_executor/model_loader/runai_model_streamer/__init__.py b/tests/model_executor/model_loader/runai_streamer_loader/__init__.py
similarity index 100%
rename from tests/model_executor/model_loader/runai_model_streamer/__init__.py
rename to tests/model_executor/model_loader/runai_streamer_loader/__init__.py
diff --git a/tests/model_executor/model_loader/runai_streamer_loader/conftest.py b/tests/model_executor/model_loader/runai_streamer_loader/conftest.py
new file mode 100644
index 000000000..9a022f6bb
--- /dev/null
+++ b/tests/model_executor/model_loader/runai_streamer_loader/conftest.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.v1.executor import UniProcExecutor
+from vllm.v1.worker.worker_base import WorkerWrapperBase
+
+
+# This is a dummy executor for patching in test_runai_model_streamer_s3.py.
+# We cannot use vllm_runner fixture here, because it spawns worker process.
+# The worker process reimports the patched entities, and the patch is not applied.
+class RunaiDummyExecutor(UniProcExecutor):
+    def _init_executor(self) -> None:
+        distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
+
+        local_rank = 0
+        rank = 0
+        is_driver_worker = True
+
+        device_info = self.vllm_config.device_config.device.__str__().split(":")
+        if len(device_info) > 1:
+            local_rank = int(device_info[1])
+
+        worker_rpc_kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=is_driver_worker,
+        )
+
+        wrapper_kwargs = {
+            "vllm_config": self.vllm_config,
+        }
+
+        self.driver_worker = WorkerWrapperBase(**wrapper_kwargs)
+
+        self.collective_rpc("init_worker", args=([worker_rpc_kwargs],))
+        self.collective_rpc("init_device")
diff --git a/tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_loader.py
similarity index 100%
rename from tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py
rename to tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_loader.py
diff --git a/tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_s3.py b/tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_s3.py
new file mode 100644
index 000000000..d60c9ba64
--- /dev/null
+++ b/tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_s3.py
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+from runai_model_streamer.safetensors_streamer.streamer_mock import StreamerPatcher
+
+from vllm.engine.arg_utils import EngineArgs
+
+from .conftest import RunaiDummyExecutor
+
+load_format = "runai_streamer"
+test_model = "openai-community/gpt2"
+
+
+def test_runai_model_loader_download_files_s3_mocked_with_patch(
+    vllm_runner,
+    tmp_path: Path,
+    monkeypatch,
+):
+    patcher = StreamerPatcher(str(tmp_path))
+
+    test_mock_s3_model = "s3://my-mock-bucket/gpt2/"
+
+    # Download model from HF
+    mock_model_dir = f"{tmp_path}/gpt2"
+    snapshot_download(repo_id=test_model, local_dir=mock_model_dir)
+
+    monkeypatch.setattr(
+        "vllm.transformers_utils.runai_utils.runai_list_safetensors",
+        patcher.shim_list_safetensors,
+    )
+    monkeypatch.setattr(
+        "vllm.transformers_utils.runai_utils.runai_pull_files",
+        patcher.shim_pull_files,
+    )
+    monkeypatch.setattr(
+        "vllm.model_executor.model_loader.weight_utils.SafetensorsStreamer",
+        patcher.create_mock_streamer,
+    )
+
+    engine_args = EngineArgs(
+        model=test_mock_s3_model,
+        load_format=load_format,
+        tensor_parallel_size=1,
+    )
+
+    vllm_config = engine_args.create_engine_config()
+
+    executor = RunaiDummyExecutor(vllm_config)
+    executor.driver_worker.load_model()
diff --git a/tests/model_executor/model_loader/runai_model_streamer/test_runai_utils.py b/tests/model_executor/model_loader/runai_streamer_loader/test_runai_utils.py
similarity index 100%
rename from tests/model_executor/model_loader/runai_model_streamer/test_runai_utils.py
rename to tests/model_executor/model_loader/runai_streamer_loader/test_runai_utils.py
diff --git a/tests/model_executor/model_loader/runai_model_streamer/test_weight_utils.py b/tests/model_executor/model_loader/runai_streamer_loader/test_weight_utils.py
similarity index 100%
rename from tests/model_executor/model_loader/runai_model_streamer/test_weight_utils.py
rename to tests/model_executor/model_loader/runai_streamer_loader/test_weight_utils.py
diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py
index eac4294bb..041056720 100644
--- a/vllm/transformers_utils/runai_utils.py
+++ b/vllm/transformers_utils/runai_utils.py
@@ -18,9 +18,7 @@ SUPPORTED_SCHEMES = ["s3://", "gs://"]
 try:
     from runai_model_streamer import list_safetensors as runai_list_safetensors
     from runai_model_streamer import pull_files as runai_pull_files
-except (ImportError, OSError):
-    # see https://github.com/run-ai/runai-model-streamer/issues/26
-    # OSError will be raised on arm64 platform
+except ImportError:
     runai_model_streamer = PlaceholderModule("runai_model_streamer")  # type: ignore[assignment]
     runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
     runai_list_safetensors = runai_model_streamer.placeholder_attr("list_safetensors")
-- 
GitLab


From e96a6a6dca930d00902852ea6937a214a584b89b Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 4 Dec 2025 05:00:16 -0600
Subject: [PATCH 083/258] [ROCm][CI][Bugfix] Fixing the `Multi-Modal Models
 Test (Extended) 1` group (#30013)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/test-amd.yaml                      |  6 ++-
 .../models/multimodal/generation/conftest.py  | 16 +++++++
 .../multimodal/generation/test_common.py      | 12 ++++-
 .../generation/test_granite_speech.py         | 15 ++++++-
 .../multimodal/generation/test_pixtral.py     | 10 +++++
 .../generation/vlm_utils/custom_inputs.py     |  2 +-
 .../generation/vlm_utils/model_utils.py       | 45 ++++++++++++++++++-
 tests/models/multimodal/pooling/conftest.py   | 24 ++++++++++
 tests/models/registry.py                      |  4 ++
 vllm/v1/attention/backends/flex_attention.py  | 14 +++++-
 10 files changed, 139 insertions(+), 9 deletions(-)
 create mode 100644 tests/models/multimodal/pooling/conftest.py

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ee4fdebae..022b6ea23 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -987,7 +987,8 @@ steps:
   commands:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 
-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+  timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -1011,7 +1012,8 @@ steps:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 
-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+  timeout_in_minutes: 150
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
diff --git a/tests/models/multimodal/generation/conftest.py b/tests/models/multimodal/generation/conftest.py
index ee3ecdb10..26f858674 100644
--- a/tests/models/multimodal/generation/conftest.py
+++ b/tests/models/multimodal/generation/conftest.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM tests."""
 
+import warnings
+
 import torch
 
 from vllm.platforms import current_platform
@@ -14,6 +16,20 @@ def pytest_configure(config):
     if not current_platform.is_rocm():
         return
 
+    skip_patterns = ["test_granite_speech.py"]
+    if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
+        # Skip disabling SDP for Granite Speech tests on ROCm
+        return
+
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
     torch.backends.cuda.enable_flash_sdp(False)
     torch.backends.cuda.enable_mem_efficient_sdp(False)
     torch.backends.cuda.enable_math_sdp(True)
+    warnings.warn(
+        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+        "to avoid HuggingFace Transformers accuracy issues",
+        UserWarning,
+        stacklevel=1,
+    )
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 0eaf7198f..f896126a4 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -403,12 +403,13 @@ VLM_TEST_SETTINGS = {
         # So, we need to reduce the number of tokens for the test to pass.
         max_tokens=8,
         num_logprobs=10,
+        auto_cls=AutoModelForCausalLM,
         marks=[large_gpu_mark(min_gb=32)],
     ),
     "glm4_1v": VLMTestInfo(
         models=["zai-org/GLM-4.1V-9B-Thinking"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
+        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
         video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
         max_model_len=2048,
@@ -423,6 +424,7 @@ VLM_TEST_SETTINGS = {
         models=["zai-org/GLM-4.1V-9B-Thinking"],
         # GLM4.1V require include video metadata for input
         test_type=VLMTestType.CUSTOM_INPUTS,
+        prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -737,7 +739,13 @@ VLM_TEST_SETTINGS = {
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
-        marks=[large_gpu_mark(min_gb=48)],
+        marks=[
+            large_gpu_mark(min_gb=48),
+            pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="Model produces a vector of <UNK> output in HF on ROCm",
+            ),
+        ],
     ),
     "qwen_vl": VLMTestInfo(
         models=["Qwen/Qwen-VL"],
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index e39dfc888..f528a993f 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
 
 from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
 from ...registry import HF_EXAMPLE_MODELS
@@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
 models = [MODEL_NAME]
 
 
+@pytest.fixture(autouse=True)
+def set_attention_backend_for_rocm(monkeypatch):
+    if current_platform.is_rocm():
+        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+
+
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
@@ -111,8 +118,12 @@ def run_test(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_model_len", [2048])
+@pytest.mark.parametrize(
+    "dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
+)
+@pytest.mark.parametrize(
+    "max_model_len", [512] if current_platform.is_rocm() else [2048]
+)
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_models(
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index 3cad2c43d..375099f43 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -15,6 +15,7 @@ from transformers import AutoProcessor
 from vllm import SamplingParams, TextPrompt, TokensPrompt
 from vllm.logprobs import Logprob, SampleLogprobs
 from vllm.multimodal import MultiModalDataBuiltins
+from vllm.platforms import current_platform
 
 from ....utils import VLLM_PATH, large_gpu_test
 from ...utils import check_logprobs_close
@@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
 def test_chat(
     vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
 ) -> None:
+    if (
+        model == MISTRAL_SMALL_3_1_ID
+        and max_model_len == 65536
+        and current_platform.is_rocm()
+    ):
+        pytest.skip(
+            "OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
+        )
+
     EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
     with vllm_runner(
         model,
diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
index 8c9c39091..841092336 100644
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
     metadata = VIDEO_ASSETS[0].metadata
     question = "Describe the video."
     video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
-    formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
+    formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
 
     scales = [0.1, 0.2, 0.25]
     video_input = [
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index 87cd5c3cd..b2c62fbd1 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -25,6 +25,7 @@ from transformers import (
 from transformers.video_utils import VideoMetadata
 
 from vllm.logprobs import SampleLogprobs
+from vllm.platforms import current_platform
 from vllm.utils.collection_utils import is_list_of
 
 from .....conftest import HfRunner, ImageAsset, ImageTestAssets
@@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
 
 def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for GLM4V."""
+    if current_platform.is_rocm():
+        import types
+
+        config = hf_model.model.config
+        if hasattr(config, "num_layers") and not hasattr(config, "num_hidden_layers"):
+            config.num_hidden_layers = config.num_layers
+        config.output_hidden_states = True
+
+        def patched_prepare_cache(
+            self, generation_config, model_kwargs, *args, **kwargs
+        ):
+            model_kwargs["past_key_values"] = None
+            model_kwargs["use_cache"] = False
+            return model_kwargs
+
+        hf_model.model._prepare_cache_for_generation = types.MethodType(
+            patched_prepare_cache, hf_model.model
+        )
+        original_generate = hf_model.model.generate
+
+        def patched_generate(*args, **kwargs):
+            kwargs["output_hidden_states"] = True
+            kwargs["return_dict_in_generate"] = True
+            return original_generate(*args, **kwargs)
+
+        hf_model.model.generate = patched_generate
+        original_forward = hf_model.model.forward
+
+        def patched_forward(*args, **kwargs):
+            kwargs["output_hidden_states"] = True
+            return original_forward(*args, **kwargs)
+
+        hf_model.model.forward = patched_forward
+
     hf_processor = hf_model.processor
 
     def processor(*args, text="", images=None, **kwargs):
@@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         if videos is not None and is_list_of(videos, tuple):
             # If videos is a list of tuples, we assume each tuple contains
             # (video_array, metadata) as in the case of GLM4.1V.
-            video_metadata = [[VideoMetadata(**video[1])] for video in videos]
+            # Filter out 'do_sample_frames' as it's not a valid VideoMetadata arg
+            video_metadata = [
+                [
+                    VideoMetadata(
+                        **{k: v for k, v in video[1].items() if k != "do_sample_frames"}
+                    )
+                ]
+                for video in videos
+            ]
             videos = [[video[0]] for video in videos]
         else:
             video_metadata = None
diff --git a/tests/models/multimodal/pooling/conftest.py b/tests/models/multimodal/pooling/conftest.py
new file mode 100644
index 000000000..c5f40cb42
--- /dev/null
+++ b/tests/models/multimodal/pooling/conftest.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM pooling tests."""
+
+import os
+import warnings
+
+from vllm.platforms import current_platform
+
+
+def pytest_collection_modifyitems(config, items):
+    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
+    if not current_platform.is_rocm():
+        return
+
+    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
+
+    if siglip_tests:
+        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
+        warnings.warn(
+            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
+            UserWarning,
+            stacklevel=1,
+        )
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 6b1d24b1c..bf88bac20 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -667,6 +667,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         "moonshotai/Kimi-VL-A3B-Instruct",
         extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
         trust_remote_code=True,
+        max_transformers_version="4.53.3",
+        transformers_version_reason="HF model uses deprecated transformers API "
+        "(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
+        "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
     ),
     "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
         "lightonai/LightOnOCR-1B",
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index fe92f6570..a2a6eeeb1 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -31,6 +31,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
+from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 from vllm.v1.attention.backends.utils import (
@@ -927,7 +928,18 @@ def get_kernel_options(
 
         if torch.cuda.is_available():
             device_props = torch.cuda.get_device_properties()
-            max_shared_memory = device_props.shared_memory_per_block_optin
+            # ROCm doesn't expose shared_memory_per_block_optin attribute
+            # AMD GPUs typically have 64KB LDS (Local Data Share) per workgroup
+            if hasattr(device_props, "shared_memory_per_block_optin"):
+                max_shared_memory = device_props.shared_memory_per_block_optin
+            elif current_platform.is_rocm():
+                # ROCm fallback: use 64KB
+                max_shared_memory = 65536
+            else:
+                raise RuntimeError(
+                    "Unable to determine shared memory size on this hardware."
+                )
+
             if max_shared_memory < 144 * 1024:
                 block_m_candidate = ensure_divisible(
                     max(1, block_m_candidate // 2), block_m
-- 
GitLab


From 6796ce8bdbf29f5624fcdc03792626574c919b41 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Thu, 4 Dec 2025 19:11:59 +0800
Subject: [PATCH 084/258] [Bugfix] Fix the issue with interleaved thinking when
 using streaming (#30033)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .../reasoning/test_base_thinking_reasoning_parser.py | 12 +++++++++++-
 vllm/reasoning/basic_parsers.py                      |  9 ++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py
index d31b1c7d1..34e9483de 100644
--- a/tests/reasoning/test_base_thinking_reasoning_parser.py
+++ b/tests/reasoning/test_base_thinking_reasoning_parser.py
@@ -112,7 +112,7 @@ class TestBaseThinkingReasoningParserMethods:
         """Test the is_reasoning_end method."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         end_token_id = parser.end_token_id
-
+        start_token_id = parser.start_token_id
         # Test with end token present
         assert parser.is_reasoning_end([1, 2, end_token_id, 4]) is True
 
@@ -122,6 +122,16 @@ class TestBaseThinkingReasoningParserMethods:
         # Test with empty list
         assert parser.is_reasoning_end([]) is False
 
+        # Test with interleaved thinking
+        assert parser.is_reasoning_end([1, start_token_id, 2, end_token_id]) is True
+        assert parser.is_reasoning_end([1, start_token_id, 2, 3]) is False
+        assert (
+            parser.is_reasoning_end(
+                [1, start_token_id, 2, end_token_id, 2, 2, start_token_id]
+            )
+            is False
+        )
+
     def test_extract_content_ids(self, test_tokenizer):
         """Test the extract_content_ids method."""
         parser = TestThinkingReasoningParser(test_tokenizer)
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index 35084c0e7..e78ac4a5e 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -64,8 +64,15 @@ class BaseThinkingReasoningParser(ReasoningParser):
             )
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        start_token_id = self.start_token_id
         end_token_id = self.end_token_id
-        return any(input_id == end_token_id for input_id in reversed(input_ids))
+
+        for i in range(len(input_ids) - 1, -1, -1):
+            if input_ids[i] == start_token_id:
+                return False
+            if input_ids[i] == end_token_id:
+                return True
+        return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
-- 
GitLab


From 1b7c7f5159484063af28cb47809d79e83d3301ec Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <khluu000@gmail.com>
Date: Thu, 4 Dec 2025 03:18:29 -0800
Subject: [PATCH 085/258] [release] install regex (#30008)

Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .buildkite/scripts/upload-wheels.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 2eaa91c04..0ac8fdd45 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -81,7 +81,7 @@ else
     alias_arg=""
 fi
 
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
+$PYTHON pip install regex && .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
 
 # copy indices to /<commit>/ unconditionally
 echo "Uploading indices to $S3_COMMIT_PREFIX"
-- 
GitLab


From 74c4d80c6ca8160578b6e812079cb11dfd8b3d22 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <yuqi.wang@daocloud.io>
Date: Thu, 4 Dec 2025 21:44:15 +0800
Subject: [PATCH 086/258] [Model][6/N] Improve all pooling task | Support
 chunked prefill with ALL pooling (#27145)

Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 docs/features/README.md                       |  2 +-
 .../pooling/classify/test_offline.py          |  7 +-
 .../pooling/classify/test_online.py           | 12 +--
 .../entrypoints/pooling/embed/test_offline.py |  2 +-
 .../pooling/reward/test_offline.py            |  7 ++
 .../test_all_pooling_plus_chunked_prefill.py  | 53 +++++++++++
 .../pooling/test_extract_hidden_states.py     |  1 -
 tests/test_config.py                          |  8 +-
 vllm/config/model.py                          | 24 +++--
 vllm/model_executor/layers/pooler.py          | 93 ++++++++++++++-----
 vllm/model_executor/models/terratorch.py      |  4 +-
 vllm/v1/outputs.py                            |  2 +-
 vllm/v1/pool/metadata.py                      | 29 +++++-
 vllm/v1/worker/gpu_input_batch.py             | 23 ++++-
 vllm/v1/worker/gpu_model_runner.py            | 50 +++-------
 15 files changed, 224 insertions(+), 93 deletions(-)
 create mode 100644 tests/models/language/pooling/test_all_pooling_plus_chunked_prefill.py

diff --git a/docs/features/README.md b/docs/features/README.md
index 5faf3768f..684802301 100644
--- a/docs/features/README.md
+++ b/docs/features/README.md
@@ -54,7 +54,7 @@ th:not(:first-child) {
 | beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
 | [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
 
-\* Chunked prefill and prefix caching are only applicable to last-token pooling.  
+\* Chunked prefill and prefix caching are only applicable to last-token or all pooling with causal attention.  
 <sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
 
 ### Feature x Hardware
diff --git a/tests/entrypoints/pooling/classify/test_offline.py b/tests/entrypoints/pooling/classify/test_offline.py
index 1063c3b6b..a07fcd372 100644
--- a/tests/entrypoints/pooling/classify/test_offline.py
+++ b/tests/entrypoints/pooling/classify/test_offline.py
@@ -61,11 +61,8 @@ def test_pooling_params(llm: LLM):
 
 
 @pytest.mark.skip_global_cleanup
-def test_encode_api(llm: LLM):
-    # chunked prefill does not support all pooling
-    err_msg = "pooling_task must be one of.+"
-    with pytest.raises(ValueError, match=err_msg):
-        llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
+def test_token_classify(llm: LLM):
+    llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
 
 
 def test_score_api(llm: LLM):
diff --git a/tests/entrypoints/pooling/classify/test_online.py b/tests/entrypoints/pooling/classify/test_online.py
index 6fef68858..1a6c33b45 100644
--- a/tests/entrypoints/pooling/classify/test_online.py
+++ b/tests/entrypoints/pooling/classify/test_online.py
@@ -255,21 +255,21 @@ async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
-    # token_classify uses ALL pooling, which does not support chunked prefill.
     task = "token_classify"
+    input_text = ["This product was excellent and exceeded my expectations"]
     response = requests.post(
         server.url_for("pooling"),
         json={
             "model": model_name,
-            "input": "test",
+            "input": input_text,
             "encoding_format": "float",
             "task": task,
         },
     )
-    assert response.json()["error"]["type"] == "BadRequestError"
-    assert response.json()["error"]["message"].startswith(
-        f"Task {task} is not supported"
-    )
+    poolings = PoolingResponse.model_validate(response.json())
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 8
+    assert len(poolings.data[0].data[0]) == 2
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/pooling/embed/test_offline.py b/tests/entrypoints/pooling/embed/test_offline.py
index f5eab4c29..12b47b1a0 100644
--- a/tests/entrypoints/pooling/embed/test_offline.py
+++ b/tests/entrypoints/pooling/embed/test_offline.py
@@ -42,7 +42,7 @@ def llm():
 
 
 @pytest.mark.skip_global_cleanup
-def test_encode_api(llm: LLM):
+def test_token_embed(llm: LLM):
     outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
     multi_vector = outputs[0].outputs.data
     assert multi_vector.shape == (11, 384)
diff --git a/tests/entrypoints/pooling/reward/test_offline.py b/tests/entrypoints/pooling/reward/test_offline.py
index 0255704ce..b061b5514 100644
--- a/tests/entrypoints/pooling/reward/test_offline.py
+++ b/tests/entrypoints/pooling/reward/test_offline.py
@@ -36,6 +36,13 @@ def llm():
     cleanup_dist_env_and_memory()
 
 
+@pytest.mark.skip_global_cleanup
+def test_config(llm: LLM):
+    vllm_config = llm.llm_engine.vllm_config
+    assert vllm_config.cache_config.enable_prefix_caching
+    assert vllm_config.scheduler_config.enable_chunked_prefill
+
+
 def test_pooling_params(llm: LLM):
     def get_outputs(use_activation):
         outputs = llm.reward(
diff --git a/tests/models/language/pooling/test_all_pooling_plus_chunked_prefill.py b/tests/models/language/pooling/test_all_pooling_plus_chunked_prefill.py
new file mode 100644
index 000000000..c259c5322
--- /dev/null
+++ b/tests/models/language/pooling/test_all_pooling_plus_chunked_prefill.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModel
+
+from tests.models.utils import check_embeddings_close
+from vllm import TokensPrompt
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["Qwen/Qwen3-Embedding-0.6B"],
+)
+@torch.inference_mode
+def test_embed_models(hf_runner, vllm_runner, model: str):
+    chunk_size = 10
+    n_prompt_tokens = [55, 56, 57]
+    token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]
+
+    with vllm_runner(
+        model,
+        runner="pooling",
+        max_model_len=128,
+        max_num_batched_tokens=chunk_size,
+        enforce_eager=True,
+        # `enable_chunked_prefill`: Set to `False` instead of `None` in VllmRunner
+        enable_chunked_prefill=True,
+        enable_prefix_caching=True,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.token_embed(
+            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
+        )
+
+    with hf_runner(
+        model,
+        auto_cls=AutoModel,
+    ) as hf_model:
+        hf_outputs = []
+        for token_prompt in token_prompts:
+            inputs = hf_model.wrap_device({"input_ids": torch.tensor([token_prompt])})
+            input_ids = inputs["input_ids"]
+            output = hf_model.model(input_ids)
+            hf_outputs.append(output.last_hidden_state.cpu().float()[0])
+
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        check_embeddings_close(
+            embeddings_0_lst=hf_output,
+            embeddings_1_lst=vllm_output,
+            name_0="hf",
+            name_1="vllm",
+            tol=1e-2,
+        )
diff --git a/tests/models/language/pooling/test_extract_hidden_states.py b/tests/models/language/pooling/test_extract_hidden_states.py
index 0d41b9323..488b27e2d 100644
--- a/tests/models/language/pooling/test_extract_hidden_states.py
+++ b/tests/models/language/pooling/test_extract_hidden_states.py
@@ -20,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
         max_model_len=128,
         enforce_eager=True,
         runner="pooling",
-        enable_chunked_prefill=False,
         enable_prefix_caching=True,
     ) as vllm_model:
         pooling_outputs = vllm_model.llm.encode(
diff --git a/tests/test_config.py b/tests/test_config.py
index 019c0d6d8..203447cd5 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -629,8 +629,8 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
         (
             "internlm/internlm2-1_8b-reward",
             "decoder",
-            False,
-            "Pooling models with all pooling does not support chunked prefill.",
+            True,
+            "Pooling models with causal attn and all pooling support chunked prefill.",
         ),
         (
             "BAAI/bge-base-en",
@@ -748,8 +748,8 @@ def test_is_chunked_prefill_supported(
         (
             "internlm/internlm2-1_8b-reward",
             "decoder",
-            False,
-            "Pooling models with all pooling does not support prefix caching.",
+            True,
+            "Pooling models with causal attn and all pooling support prefix caching.",
         ),
         (
             "BAAI/bge-base-en",
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 655b7c995..ae5189ce6 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1780,20 +1780,22 @@ class ModelConfig:
                 return False
             elif attn_type == "decoder":
                 pooling_type = self.pooler_config.pooling_type.lower()
-                if pooling_type in ["all", "mean", "step", "cls"]:
+                if pooling_type in ["mean", "step", "cls"]:
                     logger.debug(
                         "Pooling models with %s pooling does not "
                         "support chunked prefill.",
                         pooling_type,
                     )
                     return False
-                else:
-                    # pooling_type == "last"
+                elif pooling_type in ["all", "last"]:
                     logger.debug(
-                        "Pooling models with causal attn and last pooling support "
-                        "chunked prefill."
+                        "Pooling models with causal attn and %s pooling support "
+                        "chunked prefill.",
+                        pooling_type,
                     )
                     return True
+                else:
+                    raise ValueError(f"{pooling_type=} not supported.")
             # vllm currently does not have pooling models using hybrid,
             # attention_free or encoder_decoder attn types.
             return attn_type != "encoder_decoder"
@@ -1817,20 +1819,22 @@ class ModelConfig:
                 return False
             elif attn_type == "decoder":
                 pooling_type = self.pooler_config.pooling_type.lower()
-                if pooling_type in ["all", "mean", "step", "cls"]:
+                if pooling_type in ["mean", "step", "cls"]:
                     logger.debug(
                         "Pooling models with %s pooling does not "
                         "support prefix caching.",
                         pooling_type,
                     )
                     return False
-                else:
-                    # pooling_type == "last"
+                elif pooling_type in ["all", "last"]:
                     logger.debug(
-                        "Pooling models with causal attn and last pooling support "
-                        "prefix caching."
+                        "Pooling models with causal attn and %s pooling support "
+                        "prefix caching.",
+                        pooling_type,
                     )
                     return True
+                else:
+                    raise ValueError(f"{pooling_type=} not supported.")
             # vllm currently does not have pooling models using hybrid,
             # attention_free or encoder_decoder attn types.
             return False
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 185e03e5f..d1942689d 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -127,14 +127,14 @@ class PoolingMethod(nn.Module, ABC):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> list[torch.Tensor] | torch.Tensor:
+    ) -> PoolerOutput:
         raise NotImplementedError
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         pooling_metadata: PoolingMetadata,
-    ) -> list[torch.Tensor] | torch.Tensor:
+    ) -> PoolerOutput:
         pooling_cursor = pooling_metadata.pooling_cursor
         return self.forward_all(hidden_states, pooling_cursor)
 
@@ -147,7 +147,7 @@ class CLSPool(PoolingMethod):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> list[torch.Tensor] | torch.Tensor:
+    ) -> PoolerOutput:
         assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with CLS pooling"
         )
@@ -163,27 +163,65 @@ class LastPool(PoolingMethod):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> list[torch.Tensor] | torch.Tensor:
+    ) -> PoolerOutput:
         return hidden_states[pooling_cursor.last_token_indices_gpu]
 
 
 class AllPool(PoolingMethod):
+    def __init__(self):
+        super().__init__()
+
+        vllm_config = get_current_vllm_config()
+        self.enable_chunked_prefill = (
+            vllm_config.scheduler_config.enable_chunked_prefill
+        )
+
     def get_supported_tasks(self) -> Set[PoolingTask]:
         return {"token_embed", "token_classify"}
 
     def forward_all(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_cursor: PoolingCursor,
-    ) -> list[torch.Tensor] | torch.Tensor:
-        assert not pooling_cursor.is_partial_prefill(), (
-            "partial prefill not supported with ALL pooling"
+        self, hidden_states: torch.Tensor, pooling_cursor: PoolingCursor
+    ) -> PoolerOutput:
+        raise NotImplementedError(
+            "forward_all is not implemented for AllPool. Use forward instead."
         )
 
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        pooling_cursor = pooling_metadata.pooling_cursor
+        is_finished = pooling_cursor.is_finished()
         hidden_states_lst = list(
             hidden_states.split(pooling_cursor.num_scheduled_tokens_cpu.tolist())
         )
-        return [hidden_states_lst[i] for i in pooling_cursor.index]
+        hidden_states_lst = [hidden_states_lst[i] for i in pooling_cursor.index]
+
+        if not self.enable_chunked_prefill:
+            return hidden_states_lst
+
+        pooling_states = pooling_metadata.pooling_states
+
+        # If chunked_prefill is enabled
+        # 1. first store the chunked hidden_states in pooling_states.hidden_states_cache
+        for p, hs_chunk in zip(pooling_states, hidden_states_lst):
+            p.hidden_states_cache.append(hs_chunk)
+
+        # 2. Once prefill is finished, send hidden_states_cache to PoolerHead
+        output_list: PoolerOutput = []
+        for p, finished in zip(pooling_states, is_finished):
+            if finished:
+                hidden_states_cache = p.hidden_states_cache
+                if len(hidden_states_cache) == 1:
+                    output_list.append(hidden_states_cache[0])
+                else:
+                    output_list.append(torch.concat(hidden_states_cache, dim=0))
+                p.clean()
+            else:
+                output_list.append(None)
+
+        return output_list
 
 
 class MeanPool(PoolingMethod):
@@ -194,7 +232,7 @@ class MeanPool(PoolingMethod):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> list[torch.Tensor] | torch.Tensor:
+    ) -> PoolerOutput:
         assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with MEAN pooling"
         )
@@ -399,7 +437,7 @@ class PoolerHead(nn.Module):
         self,
         pooled_data: list[torch.Tensor] | torch.Tensor,
         pooling_metadata: PoolingMetadata,
-    ):
+    ) -> PoolerOutput:
         return self.activation(pooled_data)
 
 
@@ -418,7 +456,7 @@ class EmbeddingPoolerHead(PoolerHead):
         self,
         pooled_data: list[torch.Tensor] | torch.Tensor,
         pooling_metadata: PoolingMetadata,
-    ):
+    ) -> PoolerOutput:
         if isinstance(pooled_data, list):
             pooled_data = torch.stack(pooled_data)
         # pooled_data shape: [batchsize, hidden_dimension]
@@ -586,8 +624,12 @@ class ClassifierPooler(Pooler):
 
 class TokenEmbeddingPoolerHead(EmbeddingPoolerHead):
     def forward(
-        self, pooled_data: torch.Tensor, pooling_param: PoolingParams
-    ) -> torch.Tensor:
+        self, pooled_data: torch.Tensor | None, pooling_param: PoolingParams
+    ) -> PoolerOutput:
+        # for unfinished chunked prefill
+        if pooled_data is None:
+            return None
+
         pooled_data = pooled_data.to(self.head_dtype)
         # pooled_data shape: [n_tokens, hidden_dimension]
 
@@ -630,9 +672,13 @@ class TokenClassifierPoolerHead(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.Tensor,
+        hidden_states: torch.Tensor | None,
         pooling_param: PoolingParams,
-    ) -> torch.Tensor:
+    ) -> PoolerOutput:
+        # for unfinished chunked prefill
+        if hidden_states is None:
+            return None
+
         hidden_states = hidden_states.to(self.head_dtype)
         # hidden_states shape: [n_token, hidden_size]
 
@@ -686,17 +732,20 @@ class StepPooler(Pooler):
         self,
         hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> torch.Tensor | list[torch.Tensor]:
+    ) -> PoolerOutput:
         pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
         prompt_token_ids = pooling_metadata.get_prompt_token_ids()
-
-        pooled_data = list[torch.Tensor]()
-
         pooling_params = pooling_metadata.pooling_params
 
+        pooled_data: PoolerOutput = []
         for data, token_id, pooling_param in zip(
             pooled_data_lst, prompt_token_ids, pooling_params
         ):
+            # for unfinished chunked prefill
+            if data is None:
+                pooled_data.append(data)
+                continue
+
             step_tag_id = pooling_param.step_tag_id
             returned_token_ids = pooling_param.returned_token_ids
 
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index 19052c8d4..9f34090e3 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -64,7 +64,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import IsAttentionFree, MultiModalEmbeddings, SupportsMultiModal
-from .interfaces_base import default_pooling_type
+from .interfaces_base import attn_type
 
 logger = init_logger(__name__)
 
@@ -220,7 +220,7 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
         )
 
 
-@default_pooling_type("All")
+@attn_type("attention_free")
 @MULTIMODAL_REGISTRY.register_processor(
     TerratorchMultiModalProcessor,
     info=TerratorchProcessingInfo,
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index 88ac6b4ae..546eacebf 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -89,7 +89,7 @@ class LogprobsTensors(NamedTuple):
 
 # [num_reqs, <dynamic>]
 # The shape of each element depends on the pooler used
-PoolerOutput = torch.Tensor | list[torch.Tensor]
+PoolerOutput = list[torch.Tensor | None] | torch.Tensor | None
 
 
 @dataclass
diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py
index 9ee588ea4..acd1a00e8 100644
--- a/vllm/v1/pool/metadata.py
+++ b/vllm/v1/pool/metadata.py
@@ -17,6 +17,7 @@ class PoolingCursor:
     first_token_indices_gpu: torch.Tensor
     last_token_indices_gpu: torch.Tensor
     prompt_lens_cpu: torch.Tensor
+    seq_lens_cpu: torch.Tensor
     num_scheduled_tokens_cpu: torch.Tensor
 
     def __getitem__(self, indices: slice):
@@ -25,12 +26,25 @@ class PoolingCursor:
             first_token_indices_gpu=self.first_token_indices_gpu[indices],
             last_token_indices_gpu=self.last_token_indices_gpu[indices],
             prompt_lens_cpu=self.prompt_lens_cpu[indices],
+            seq_lens_cpu=self.seq_lens_cpu[indices],
             num_scheduled_tokens_cpu=self.num_scheduled_tokens_cpu[indices],
         )
 
     def is_partial_prefill(self):
         return not torch.all(self.prompt_lens_cpu == self.num_scheduled_tokens_cpu)
 
+    def is_finished(self):
+        return self.prompt_lens_cpu == self.seq_lens_cpu
+
+
+class PoolingStates:
+    def __init__(self):
+        # for chunked prefill with ALL pooling
+        self.hidden_states_cache: list[torch.Tensor] = []
+
+    def clean(self):
+        self.hidden_states_cache.clear()
+
 
 @dataclass
 class PoolingMetadata:
@@ -39,6 +53,7 @@ class PoolingMetadata:
     prompt_lens: torch.Tensor  # CPU Tensor
     prompt_token_ids: torch.Tensor | None
     pooling_params: list[PoolingParams]
+    pooling_states: list[PoolingStates]
     pooling_cursor: PoolingCursor | None = None
 
     def __post_init__(self) -> None:
@@ -60,6 +75,7 @@ class PoolingMetadata:
             if self.prompt_token_ids is None
             else self.prompt_token_ids[indices],
             pooling_params=self.pooling_params[indices],
+            pooling_states=self.pooling_states[indices],
             pooling_cursor=None
             if self.pooling_cursor is None
             else self.pooling_cursor[indices],
@@ -74,15 +90,21 @@ class PoolingMetadata:
         return [prompt_token_ids[i, :num] for i, num in enumerate(self.prompt_lens)]
 
     def build_pooling_cursor(
-        self, num_scheduled_tokens: list[int], device: torch.device
+        self,
+        num_scheduled_tokens: list[int],
+        seq_lens_cpu: torch.Tensor,
+        device: torch.device,
     ):
         self.pooling_cursor = build_pooling_cursor(
-            num_scheduled_tokens, self.prompt_lens, device
+            num_scheduled_tokens, seq_lens_cpu, self.prompt_lens, device
         )
 
 
 def build_pooling_cursor(
-    num_scheduled_tokens: list[int], prompt_lens: torch.Tensor, device: torch.device
+    num_scheduled_tokens: list[int],
+    seq_lens_cpu: torch.Tensor,
+    prompt_lens: torch.Tensor,
+    device: torch.device,
 ):
     assert len(prompt_lens) == len(num_scheduled_tokens)
 
@@ -99,5 +121,6 @@ def build_pooling_cursor(
         first_token_indices_gpu=cumsum[:n_seq],
         last_token_indices_gpu=cumsum[1:] - 1,
         prompt_lens_cpu=prompt_lens,
+        seq_lens_cpu=seq_lens_cpu,
         num_scheduled_tokens_cpu=num_scheduled_tokens_cpu,
     )
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 516c76a5e..ead7a3619 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -15,7 +15,7 @@ from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.utils.collection_utils import swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
-from vllm.v1.pool.metadata import PoolingMetadata
+from vllm.v1.pool.metadata import PoolingMetadata, PoolingStates
 from vllm.v1.sample.logits_processor import (
     BatchUpdateBuilder,
     LogitsProcessors,
@@ -33,7 +33,6 @@ class CachedRequestState:
     prompt_token_ids: list[int] | None
     mm_features: list[MultiModalFeatureSpec]
     sampling_params: SamplingParams | None
-    pooling_params: PoolingParams | None
     generator: torch.Generator | None
 
     block_ids: tuple[list[int], ...]
@@ -51,11 +50,18 @@ class CachedRequestState:
     # Used when both async_scheduling and spec_decode are enabled.
     prev_num_draft_len: int = 0
 
+    # for pooling models
+    pooling_params: PoolingParams | None = None
+    pooling_states: PoolingStates | None = None
+
     def __post_init__(self):
         self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
             self.prompt_token_ids, self.prompt_embeds
         )
 
+        if self.pooling_params is not None:
+            self.pooling_states = PoolingStates()
+
     @property
     def num_tokens(self) -> int:
         return self.num_prompt_tokens + len(self.output_token_ids)
@@ -255,7 +261,9 @@ class InputBatch:
         # This is updated each time the batch constituents change.
         self.sampling_metadata = self._make_sampling_metadata()
 
+        # for pooling models
         self.pooling_params: dict[str, PoolingParams] = {}
+        self.pooling_states: dict[str, PoolingStates] = {}
 
         # Cached reference to the GPU tensor of previously sampled tokens
         self.prev_sampled_token_ids: torch.Tensor | None = None
@@ -413,7 +421,11 @@ class InputBatch:
                     sampling_params.bad_words_token_ids
                 )
         elif pooling_params := request.pooling_params:
+            pooling_states = request.pooling_states
+            assert pooling_states is not None
+
             self.pooling_params[req_id] = pooling_params
+            self.pooling_states[req_id] = pooling_states
             self.logits_processing_needs_token_ids[req_index] = (
                 pooling_params.requires_token_ids
             )
@@ -469,6 +481,7 @@ class InputBatch:
 
         if self.is_pooling_model:
             self.pooling_params.pop(req_id, None)
+            self.pooling_states.pop(req_id, None)
             return req_index
 
         self.greedy_reqs.discard(req_id)
@@ -837,13 +850,19 @@ class InputBatch:
         assert len(self.req_ids) == len(self.pooling_params)
         return [self.pooling_params[req_id] for req_id in self.req_ids]
 
+    def get_pooling_states(self) -> list[PoolingStates]:
+        assert len(self.req_ids) == len(self.pooling_states)
+        return [self.pooling_states[req_id] for req_id in self.req_ids]
+
     def get_pooling_metadata(self) -> PoolingMetadata:
         pooling_params = self.get_pooling_params()
+        pooling_states = self.get_pooling_states()
 
         return PoolingMetadata(
             prompt_lens=torch.from_numpy(self.num_prompt_tokens[: self.num_reqs]),
             prompt_token_ids=self.sampling_metadata.prompt_token_ids,
             pooling_params=pooling_params,
+            pooling_states=pooling_states,
         )
 
     def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 3f043e3b2..a7eb9cdae 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -131,7 +131,7 @@ from vllm.v1.outputs import (
     SamplerOutput,
     make_empty_encoder_model_runner_output,
 )
-from vllm.v1.pool.metadata import PoolingMetadata
+from vllm.v1.pool.metadata import PoolingMetadata, PoolingStates
 from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs
 from vllm.v1.sample.logits_processor.interface import LogitsProcessor
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -2291,20 +2291,6 @@ class GPUModelRunner(
 
         supported_tasks = list(model.pooler.get_supported_tasks())
 
-        if self.scheduler_config.enable_chunked_prefill:
-            if "token_embed" in supported_tasks:
-                supported_tasks.remove("token_embed")
-            if "token_classify" in supported_tasks:
-                supported_tasks.remove("token_classify")
-
-            logger.debug_once(
-                "Chunked prefill is not supported with "
-                "token_embed and token_classify tasks "
-                "which using ALL pooling. "
-                "Please turn off chunked prefill by "
-                "`--no-enable-chunked-prefill` before using it."
-            )
-
         if "score" in supported_tasks:
             num_labels = getattr(self.model_config.hf_config, "num_labels", 0)
             if num_labels != 1:
@@ -2381,11 +2367,12 @@ class GPUModelRunner(
         )
 
         hidden_states = hidden_states[:num_scheduled_tokens]
+        seq_lens_cpu = self.seq_lens.cpu[: self.input_batch.num_reqs]
+
         pooling_metadata = self.input_batch.get_pooling_metadata()
         pooling_metadata.build_pooling_cursor(
-            num_scheduled_tokens_np.tolist(), device=hidden_states.device
+            num_scheduled_tokens_np.tolist(), seq_lens_cpu, device=hidden_states.device
         )
-        seq_lens_cpu = self.seq_lens.cpu[: self.input_batch.num_reqs]
 
         model = cast(VllmModelForPooling, self.model)
         raw_pooler_output: PoolerOutput = model.pooler(
@@ -2393,7 +2380,7 @@ class GPUModelRunner(
             pooling_metadata=pooling_metadata,
         )
         raw_pooler_output = json_map_leaves(
-            lambda x: x.to("cpu", non_blocking=True),
+            lambda x: x.to("cpu", non_blocking=True) if x is not None else x,
             raw_pooler_output,
         )
         self._sync_device()
@@ -4248,10 +4235,13 @@ class GPUModelRunner(
             prompt_lens=dummy_prompt_lens,
             prompt_token_ids=dummy_token_ids,
             pooling_params=[dummy_pooling_params] * num_reqs,
+            pooling_states=[PoolingStates() for i in range(num_reqs)],
         )
 
         dummy_metadata.build_pooling_cursor(
-            num_scheduled_tokens_list, device=hidden_states.device
+            num_scheduled_tokens_list,
+            seq_lens_cpu=dummy_prompt_lens,
+            device=hidden_states.device,
         )
 
         try:
@@ -4278,22 +4268,12 @@ class GPUModelRunner(
         supported_pooling_tasks = self.get_supported_pooling_tasks()
 
         if not supported_pooling_tasks:
-            if self.scheduler_config.enable_chunked_prefill:
-                raise RuntimeError(
-                    f"Model {self.model_config.model} does not support "
-                    "any pooling tasks with chunked prefill enabled. "
-                    "Please add --no-enable-chunked-prefill to your "
-                    "config or CLI args. See "
-                    "https://docs.vllm.ai/en/latest/models/pooling_models.html "
-                    "to learn more."
-                )
-            else:
-                raise RuntimeError(
-                    f"Model {self.model_config.model} does not support "
-                    "any pooling tasks. See "
-                    "https://docs.vllm.ai/en/latest/models/pooling_models.html "
-                    "to learn more."
-                )
+            raise RuntimeError(
+                f"Model {self.model_config.model} does not support "
+                "any pooling tasks. See "
+                "https://docs.vllm.ai/en/latest/models/pooling_models.html "
+                "to learn more."
+            )
 
         output_size = dict[PoolingTask, float]()
         for task in supported_pooling_tasks:
-- 
GitLab


From 9998ea5b576972a508c854227f57829dd4bca940 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 4 Dec 2025 13:44:50 +0000
Subject: [PATCH 087/258] Delete HF version of Phi 4 MM (#30049)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/models/supported_models.md               |    1 -
 .../generation/test_phi4_multimodal.py        |  281 ----
 .../multimodal/processing/test_common.py      |   22 -
 tests/models/registry.py                      |    4 -
 vllm/model_executor/models/phi4_multimodal.py | 1447 -----------------
 vllm/model_executor/models/registry.py        |    2 +-
 6 files changed, 1 insertion(+), 1756 deletions(-)
 delete mode 100644 tests/models/multimodal/generation/test_phi4_multimodal.py
 delete mode 100644 vllm/model_executor/models/phi4_multimodal.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 040107c11..96d5ec25c 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -711,7 +711,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
-| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
 | `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
diff --git a/tests/models/multimodal/generation/test_phi4_multimodal.py b/tests/models/multimodal/generation/test_phi4_multimodal.py
deleted file mode 100644
index 624562217..000000000
--- a/tests/models/multimodal/generation/test_phi4_multimodal.py
+++ /dev/null
@@ -1,281 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-from collections.abc import Sequence
-
-import librosa
-import pytest
-from huggingface_hub import snapshot_download
-
-from vllm.assets.image import ImageAsset
-from vllm.lora.request import LoRARequest
-from vllm.multimodal.image import rescale_image_size
-
-from ....conftest import (
-    IMAGE_ASSETS,
-    HfRunner,
-    PromptAudioInput,
-    PromptImageInput,
-    VllmRunner,
-)
-from ....utils import large_gpu_test
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
-    {
-        "stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-        "cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
-    }
-)
-HF_MULTIIMAGE_IMAGE_PROMPT = (
-    "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
-)
-
-model_path = snapshot_download(
-    "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
-)
-# Since the vision-lora and speech-lora co-exist with the base model,
-# we have to manually specify the path of the lora weights.
-vision_lora_path = os.path.join(model_path, "vision-lora")
-speech_question = os.path.join(
-    model_path, "examples", "what_is_shown_in_this_image.wav"
-)
-models = [model_path]
-
-target_dtype = "half"
-
-
-def run_test(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
-    model: str,
-    *,
-    max_model_len: int,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    mm_limit: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: str | None = None,
-):
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test are from IMAGE_ASSETS.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(
-        model,
-        task="generate",
-        max_model_len=max_model_len,
-        max_num_seqs=2,
-        dtype=dtype,
-        limit_mm_per_prompt={"image": mm_limit},
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-        enable_lora=True,
-        max_lora_rank=320,
-        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
-        enforce_eager=True,
-        trust_remote_code=False,
-    ) as vllm_model:
-        lora_request = LoRARequest("vision", 1, vision_lora_path)
-        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images,
-                audios=audios,
-                lora_request=lora_request,
-            )
-            for prompts, images, audios in inputs
-        ]
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_model.model.load_adapter(
-            vision_lora_path,
-            adapter_name="vision",
-        )
-        hf_processor = hf_model.processor
-        eos_token_id = hf_processor.tokenizer.eos_token_id
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images,
-                audios=audios,
-                eos_token_id=eos_token_id,
-            )
-            for prompts, images, audios in inputs
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [12800])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    image_assets,
-    model,
-    size_factors,
-    dtype: str,
-    max_model_len: int,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [
-        (
-            [prompt for _ in size_factors],
-            [rescale_image_size(image, factor) for factor in size_factors],
-            None,
-        )
-        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
-    ]
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_image,
-        model,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
-
-
-@large_gpu_test(min_gb=48)
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        # [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [25600])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_multi_images_models(
-    hf_runner,
-    vllm_runner,
-    image_assets,
-    model,
-    size_factors,
-    dtype: str,
-    max_model_len: int,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_case = [
-        (
-            [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-            [
-                [rescale_image_size(image, factor) for image in images]
-                for factor in size_factors
-            ],
-            None,
-        ),
-    ]
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_case,
-        model,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=2,
-        tensor_parallel_size=1,
-    )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [12800])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_vision_speech_models(
-    hf_runner,
-    vllm_runner,
-    model,
-    dtype: str,
-    max_model_len: int,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    # use the example speech question so that the model outputs are reasonable
-    audio = librosa.load(speech_question, sr=16000)
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-
-    inputs_vision_speech = [
-        (
-            ["<|user|><|image|><|audio|><|end|><|assistant|>"],
-            [image],
-            [audio],
-        ),
-    ]
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_vision_speech,
-        model,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 8ef1fba8d..6b9d388f2 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -396,28 +396,6 @@ def test_processing_correctness(
     )
 
 
-# Phi4MultimodalForCausalLM share same model repo with original format
-# Phi4MMForCausalLM, so we add it as a separate test case
-# Remove this test after conversion PR merged:
-# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
-@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-def test_processing_correctness_phi4_multimodal(
-    model_arch: str,
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    _test_processing_correctness(
-        model_arch,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-    )
-
-
 def _assert_inputs_equal(
     a: MultiModalInputs,
     b: MultiModalInputs,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index bf88bac20..b9f9945eb 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -771,10 +771,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "Phi4MMForCausalLM": _HfExamplesInfo(
         "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
     ),
-    "Phi4MultimodalForCausalLM": _HfExamplesInfo(
-        "microsoft/Phi-4-multimodal-instruct",
-        revision="refs/pr/70",
-    ),
     "PixtralForConditionalGeneration": _HfExamplesInfo(
         "mistralai/Pixtral-12B-2409",
         extras={
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
deleted file mode 100644
index 0f1230a55..000000000
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ /dev/null
@@ -1,1447 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import math
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, TypeAlias
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from transformers import (
-    BatchFeature,
-    Phi4MultimodalAudioConfig,
-    Phi4MultimodalConfig,
-    Phi4MultimodalFeatureExtractor,
-    Phi4MultimodalImageProcessorFast,
-)
-from transformers import Phi4MultimodalProcessor as Phi4MMProcessor
-from transformers.models.phi4_multimodal.modeling_phi4_multimodal import (
-    Phi4MultimodalAudioConvModule,
-    Phi4MultimodalAudioNemoConvSubsampling,
-    Phi4MultimodalAudioRelativeAttentionBias,
-    adaptive_enc_mask,
-    unfold_tensor,
-)
-
-from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
-from vllm.distributed import (
-    divide,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (
-    MultiModalDataDict,
-    MultiModalFieldConfig,
-    MultiModalKwargsItems,
-    NestedTensors,
-)
-from vllm.multimodal.parse import (
-    AudioProcessorItems,
-    ImageEmbeddingItems,
-    ImageProcessorItems,
-    ImageSize,
-    MultiModalDataItems,
-    MultiModalDataParser,
-)
-from vllm.multimodal.processing import (
-    BaseMultiModalProcessor,
-    BaseProcessingInfo,
-    PromptReplacement,
-    PromptUpdate,
-)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.sequence import IntermediateTensors
-from vllm.utils.tensor_schema import TensorSchema, TensorShape
-
-from .idefics2_vision_model import Idefics2VisionTransformer
-from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
-from .utils import (
-    AutoWeightsLoader,
-    WeightsMapper,
-    init_vllm_registered_model,
-    maybe_prefix,
-)
-
-_AUDIO_MAX_SOUNDFILE_SIZE = 241_000
-
-
-def _get_padding_size(
-    orig_width: int, orig_height: int, target_height: int, target_width: int
-):
-    ratio_width = target_width / orig_width
-    ratio_height = target_height / orig_height
-
-    if ratio_width < ratio_height:
-        padding_width = 0
-        padding_height = target_height - int(orig_height * ratio_width)
-    else:
-        padding_width = target_width - int(orig_width * ratio_height)
-        padding_height = 0
-    return padding_height, padding_width
-
-
-class Phi4MMProjector(nn.Module):
-    def __init__(self, input_size: int, hidden_size: int):
-        super().__init__()
-        self.up = ColumnParallelLinear(input_size, hidden_size)
-        self.down = RowParallelLinear(hidden_size, hidden_size)
-        self.act = get_act_fn("gelu")
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x, _ = self.up(x)
-        x = self.act(x)
-        x, _ = self.down(x)
-        return x
-
-
-class Phi4MMImageEmbedding(nn.Module):
-    """Image embedding."""
-
-    def __init__(self, config: Phi4MultimodalConfig):
-        super().__init__()
-        self.config = config
-        self.layer_idx = config.vision_config.feature_layer
-        self.crop_size = config.vision_config.crop_size
-        self.image_dim_out = config.vision_config.hidden_size
-
-        n_patches = config.vision_config.image_size // config.vision_config.patch_size
-        if n_patches % 2 != 0:
-            self.img_processor_padding = nn.ReflectionPad2d((0, 1, 0, 1))
-            n_patches += 1
-        self.num_img_tokens = (n_patches // 2) ** 2
-
-        num_hidden_layers = (
-            config.vision_config.num_hidden_layers + self.layer_idx + 1
-            if self.layer_idx < 0
-            else self.layer_idx + 1
-        )
-        self.img_processor = Idefics2VisionTransformer(
-            config.vision_config,
-            require_post_norm=False,
-            num_hidden_layers_override=num_hidden_layers,
-        )
-        self.image_token_compression = nn.AvgPool2d(kernel_size=2, stride=2)
-        self.img_projection = Phi4MMProjector(self.image_dim_out, config.hidden_size)
-        self.global_img_feature_extensor = nn.Parameter(
-            torch.zeros([1, 1, self.image_dim_out])
-        )
-        self.sub_img_feature_extensor = nn.Parameter(
-            torch.zeros([1, 1, 1, self.image_dim_out])
-        )
-
-    def get_img_features(
-        self,
-        img_embeds: torch.FloatTensor,
-        attention_mask: torch.Tensor | None = None,
-    ) -> torch.FloatTensor:
-        img_feature = self.img_processor(
-            img_embeds, patch_attention_mask=attention_mask
-        )
-
-        patch_feature = img_feature
-        # reshape to 2D tensor
-        width = int(math.sqrt(patch_feature.size(1)))
-        patch_feature = patch_feature.view(-1, width, width, patch_feature.size(-1))
-        # convert to NCHW
-        patch_feature = patch_feature.permute(0, 3, 1, 2)
-        if getattr(self, "img_processor_padding", None) is not None:
-            patch_feature = self.img_processor_padding(patch_feature)
-        patch_feature = self.image_token_compression(patch_feature)
-        # convert to NHWC
-        patch_feature = patch_feature.permute(0, 2, 3, 1)
-        patch_feature = patch_feature.view(
-            -1, patch_feature.size(1) * patch_feature.size(2), patch_feature.size(-1)
-        )
-        return patch_feature
-
-    def forward(
-        self,
-        image_pixel_values: torch.FloatTensor,
-        image_sizes: torch.Tensor | None = None,
-        image_attention_mask: torch.Tensor | None = None,
-    ) -> torch.FloatTensor:
-        image_pixel_values = image_pixel_values.to(
-            self.img_processor.embeddings.patch_embedding.weight.dtype
-        )
-
-        target_device = self.img_projection.up.bias.device
-        target_dtype = self.img_projection.up.bias.dtype
-
-        batch_size = image_pixel_values.shape[0]
-
-        img_features = self.get_img_features(
-            image_pixel_values.flatten(0, 1),
-            attention_mask=image_attention_mask.flatten(0, 1).to(
-                dtype=bool, device=target_device
-            ),
-        )
-        base_feat_size = int(np.sqrt(img_features.shape[1]))
-        img_features = img_features.view(
-            batch_size, -1, base_feat_size**2, self.image_dim_out
-        )
-        image_sizes = image_sizes.view(-1, 2)
-
-        output_imgs = []
-        for idx in range(batch_size):
-            height, width = image_sizes[idx]
-            height_ratio = height // self.crop_size
-            width_ratio = width // self.crop_size
-            area_ratio = height_ratio * width_ratio
-
-            global_img = img_features[idx, :1]
-            global_img = global_img.reshape(
-                1, base_feat_size, base_feat_size, self.image_dim_out
-            ).contiguous()
-            temporary_extensor = self.sub_img_feature_extensor.repeat(
-                1, base_feat_size, 1, 1
-            )
-            global_img = torch.cat([global_img, temporary_extensor], dim=2).reshape(
-                1, -1, self.image_dim_out
-            )
-
-            sub_img = img_features[idx, 1:]
-            sub_img = sub_img[:area_ratio]
-            sub_img = (
-                sub_img.reshape(
-                    height_ratio,
-                    width_ratio,
-                    base_feat_size,
-                    base_feat_size,
-                    self.image_dim_out,
-                )
-                .transpose(1, 2)
-                .reshape(
-                    1,
-                    height_ratio * base_feat_size,
-                    width_ratio * base_feat_size,
-                    self.image_dim_out,
-                )
-                .contiguous()
-            )
-
-            if image_attention_mask is not None:
-                reshaped_image_attention_mask = (
-                    image_attention_mask[idx, 1 : area_ratio + 1, 0::2, 0::2]
-                    .reshape(height_ratio, width_ratio, base_feat_size, base_feat_size)
-                    .transpose(1, 2)
-                    .reshape(
-                        1, height_ratio * base_feat_size, width_ratio * base_feat_size
-                    )
-                )
-                useful_height = int(reshaped_image_attention_mask[0, :, 0].sum().item())
-                useful_width = int(reshaped_image_attention_mask[0, 0, :].sum().item())
-                sub_img = sub_img[:, :useful_height, :useful_width]
-                temporary_extensor = self.sub_img_feature_extensor.repeat(
-                    1, useful_height, 1, 1
-                )
-            else:
-                temporary_extensor = self.sub_img_feature_extensor.repeat(
-                    1, height_ratio * base_feat_size, 1, 1
-                )
-
-            sub_img = torch.cat([sub_img, temporary_extensor], dim=2).reshape(
-                1, -1, self.image_dim_out
-            )
-
-            # Merge global and sub
-            output_imgs.append(
-                torch.cat(
-                    [sub_img, self.global_img_feature_extensor, global_img], dim=1
-                )
-            )
-
-        img_set_tensor = []
-        for output_img in output_imgs:
-            output_img = output_img.to(device=target_device, dtype=target_dtype)
-            img_feature_proj = self.img_projection(output_img)
-            img_set_tensor.append(img_feature_proj.flatten(0, 1))
-
-        return img_set_tensor
-
-
-class Phi4MultimodalAudioMLP(nn.Module):
-    def __init__(
-        self,
-        config: Phi4MultimodalAudioConfig,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ):
-        super().__init__()
-        self.layer_norm = nn.LayerNorm(config.hidden_size)
-        self.act_fn = MulAndSilu()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            config.hidden_size,
-            [config.intermediate_size] * 2,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj",
-        )
-        self.down_proj = RowParallelLinear(
-            config.intermediate_size,
-            config.hidden_size,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.down_proj",
-        )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.layer_norm(hidden_states)
-        hidden_states, _ = self.gate_up_proj(hidden_states)
-        hidden_states = self.act_fn(hidden_states)
-        hidden_states, _ = self.down_proj(hidden_states)
-        return hidden_states
-
-
-class Phi4MultimodalAudioAttention(nn.Module):
-    def __init__(
-        self,
-        config: Phi4MultimodalAudioConfig,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ):
-        super().__init__()
-        self.config = config
-        self.embed_dim = config.hidden_size
-        self.total_num_heads = config.num_attention_heads
-        self.head_dim = self.embed_dim // self.total_num_heads
-        if self.head_dim * self.total_num_heads != self.embed_dim:
-            raise ValueError(
-                "embed_dim must be divisible by num_heads "
-                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads})."
-            )
-        self.scale = self.head_dim**-0.5
-
-        self.qkv_proj = QKVParallelLinear(
-            hidden_size=self.embed_dim,
-            head_size=self.head_dim,
-            total_num_heads=self.total_num_heads,
-            quant_config=quant_config,
-            prefix=f"{prefix}.qkv_proj",
-        )
-
-        self.o_proj = RowParallelLinear(
-            input_size=self.embed_dim,
-            output_size=self.embed_dim,
-            quant_config=quant_config,
-            prefix=f"{prefix}.out_proj",
-        )
-
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
-        self.num_heads = divide(self.total_num_heads, self.tp_size)
-
-    def split_attn_mask(self, attention_mask: torch.Tensor) -> torch.Tensor:
-        start_idx = self.num_heads * self.tp_rank
-        end_idx = self.num_heads * (self.tp_rank + 1)
-        return attention_mask[:, start_idx:end_idx]
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        qkv_states, _ = self.qkv_proj(hidden_states)
-        query, key, value = qkv_states.chunk(3, dim=-1)
-
-        bsz, seq_len, _ = query.size()
-        query = query.view(bsz, seq_len, self.num_heads, self.head_dim)
-        key = key.view(bsz, seq_len, self.num_heads, self.head_dim)
-        value = value.view(bsz, seq_len, self.num_heads, self.head_dim)
-        query, key, value = (x.transpose(1, 2) for x in (query, key, value))
-
-        attention_mask = self.split_attn_mask(attention_mask)
-        out = F.scaled_dot_product_attention(
-            query,
-            key,
-            value,
-            scale=self.scale,
-            attn_mask=attention_mask,
-        )
-        out = out.transpose(1, 2).reshape(bsz, seq_len, -1)
-
-        attn_output, _ = self.o_proj(out)
-
-        return attn_output
-
-
-class Phi4MultimodalAudioConformerEncoderLayer(nn.Module):
-    def __init__(self, config: Phi4MultimodalAudioConfig):
-        super().__init__()
-
-        self.feed_forward_in = Phi4MultimodalAudioMLP(config)
-        self.self_attn = Phi4MultimodalAudioAttention(config)
-        self.conv = Phi4MultimodalAudioConvModule(config)
-        self.feed_forward_out = Phi4MultimodalAudioMLP(config)
-        self.layer_norm_att = nn.LayerNorm(config.hidden_size)
-        self.layer_norm = nn.LayerNorm(config.hidden_size)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        residual = hidden_states + 0.5 * self.feed_forward_in(hidden_states)
-        hidden_states = self.layer_norm_att(residual)
-
-        hidden_states = residual + self.self_attn(hidden_states, attention_mask)
-        hidden_states = hidden_states + self.conv(hidden_states)
-        hidden_states = hidden_states + 0.5 * self.feed_forward_out(hidden_states)
-
-        out = self.layer_norm(hidden_states)
-
-        return out
-
-
-class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
-    """Mean/variance normalization layer.
-
-    Will subtract mean and multiply input by inverted standard deviation.
-    Typically used as a very first layer in a model.
-
-    Args:
-        config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig)
-            object containing model parameters.
-    """
-
-    def __init__(self, config: Phi4MultimodalAudioConfig):
-        super().__init__()
-        self.global_mean = nn.Parameter(torch.zeros(config.input_size))
-        self.global_invstd = nn.Parameter(torch.ones(config.input_size))
-
-    def forward(self, input_: torch.Tensor) -> torch.Tensor:
-        """MeanVarianceNormLayer Forward
-
-        Args:
-            input_: torch.Tensor
-                input tensor.
-        """
-        return (input_ - self.global_mean) * self.global_invstd
-
-
-class Phi4MultimodalAudioModel(nn.Module):
-    def __init__(self, config: Phi4MultimodalAudioConfig):
-        super().__init__()
-        self.config = config
-
-        self.encoder_embedding = Phi4MMAudioMeanVarianceNormLayer(config)
-        self.embed = Phi4MultimodalAudioNemoConvSubsampling(config)
-        self.relative_attention_bias_layer = Phi4MultimodalAudioRelativeAttentionBias(
-            config
-        )
-        self.encoders = nn.ModuleList(
-            [
-                Phi4MultimodalAudioConformerEncoderLayer(config)
-                for _ in range(config.num_blocks)
-            ]
-        )
-
-    def _streaming_mask(
-        self,
-        seq_len: int,
-        batch_size: int,
-        chunk_size: int,
-        left_chunk: int,
-    ):
-        # Create mask matrix for streaming
-        # S stores start index. if chunksize is 18, s is [0,18,36,....]
-        chunk_start_idx = np.arange(0, seq_len, chunk_size)
-
-        enc_streaming_mask = (
-            adaptive_enc_mask(seq_len, chunk_start_idx, left_window=left_chunk)
-            .unsqueeze(0)
-            .expand([batch_size, -1, -1])
-        )
-        return enc_streaming_mask
-
-    def forward_embeddings(
-        self,
-        hidden_states: torch.Tensor,
-        masks: torch.Tensor,
-    ):
-        """Forwarding the inputs through the top embedding layers"""
-        seq_len = math.ceil(hidden_states.shape[1] / self.config.time_reduction)
-        if seq_len <= 0:
-            raise ValueError(
-                f"Sequence length after time reduction is invalid: {seq_len}."
-                "Your input feature is too short."
-            )
-
-        batch_size = hidden_states.shape[0]
-
-        enc_streaming_mask = self._streaming_mask(
-            seq_len, batch_size, self.config.chunk_size, self.config.left_chunk
-        )
-        enc_streaming_mask = enc_streaming_mask.to(hidden_states.device)
-
-        hidden_states, masks = self.embed(hidden_states, masks)
-
-        streaming_mask = enc_streaming_mask
-        if streaming_mask is not None and masks is not None:
-            hs_mask = masks & streaming_mask
-        elif masks is not None:
-            hs_mask = masks
-        else:
-            hs_mask = streaming_mask
-
-        return hidden_states, hs_mask, masks
-
-    def calculate_hs_mask(
-        self, hidden_states: torch.Tensor, device: torch.device, mask: torch.Tensor
-    ):
-        max_audio_length = hidden_states.shape[1]
-        batch_size = hidden_states.shape[0]
-        enc_streaming_mask = self._streaming_mask(
-            max_audio_length, batch_size, self.config.chunk_size, self.config.left_chunk
-        )
-        enc_streaming_mask = enc_streaming_mask.to(device)
-        if mask is None:
-            return enc_streaming_mask
-
-        feature_lens = mask.sum(1)
-        padding_length = feature_lens
-        pad_mask = torch.arange(0, max_audio_length, device=device).expand(
-            padding_length.size(0), -1
-        ) < padding_length.unsqueeze(1)
-        pad_mask = pad_mask.unsqueeze(1)
-        pad_mask = pad_mask & enc_streaming_mask
-        return pad_mask
-
-    def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor | None = None):
-        hidden_states = self.encoder_embedding(hidden_states)
-        hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)
-
-        unfolded = False
-        bs, seq_len, _ = hidden_states.shape
-        max_seq_len = 500  # maximum position for absolute positional encoding
-        if seq_len > max_seq_len:
-            # audio sequence is longer than max_seq_len,
-            # unfold it into chunks of max_seq_len
-            unfolded = True
-            # the unfold op will drop residual frames,
-            # pad it to the multiple of max_seq_len
-            if seq_len % max_seq_len > 0:
-                chunk_pad_size = max_seq_len - (seq_len % max_seq_len)
-            else:
-                chunk_pad_size = 0
-            if chunk_pad_size > 0:
-                hidden_states_pad = F.pad(
-                    hidden_states, (0, 0, 0, chunk_pad_size), "constant", 0
-                )
-                hidden_states = hidden_states_pad.to(hidden_states.device)
-
-            hidden_states = unfold_tensor(hidden_states, max_seq_len)
-            masks_unfold = None
-            if mask is not None:
-                # revise hs_mask here because the previous calculated hs_mask
-                # did not consider extra pad
-                subsampled_pad_mask = mask.squeeze(1)  # [bz, subsampled_unmask_seq_len]
-                extra_padded_subsamlped_pad_mask = F.pad(
-                    subsampled_pad_mask, (0, chunk_pad_size), "constant", False
-                )  # extra padding to the pad mask
-                extra_padded_subsamlped_pad_mask = (
-                    extra_padded_subsamlped_pad_mask.unsqueeze(-1).float()
-                )
-                masks_unfold = unfold_tensor(
-                    extra_padded_subsamlped_pad_mask, max_seq_len
-                )  # unfold the pad mask like we did to the input tensor
-                masks_unfold = masks_unfold.squeeze(
-                    -1
-                ).bool()  # unfold op does not support bool tensor
-            hs_mask = self.calculate_hs_mask(
-                hidden_states, hidden_states.device, masks_unfold
-            )  # calculate hs_mask based on the unfolded pad mask
-
-        relative_attention_bias = self.relative_attention_bias_layer(hidden_states)
-        attention_mask = hs_mask.unsqueeze(1) + relative_attention_bias
-
-        for layer in self.encoders:
-            hidden_states = layer(hidden_states, attention_mask)
-
-        if unfolded:
-            embed_dim = hidden_states.shape[-1]
-            hidden_states = hidden_states.reshape(bs, -1, embed_dim)
-            # if we ever padded before unfolding, we need to remove the padding
-            if chunk_pad_size > 0:
-                hidden_states = hidden_states[:, :-chunk_pad_size, :]
-
-        return hidden_states
-
-
-class Phi4MMAudioEmbedding(nn.Module):
-    def __init__(self, config: Phi4MultimodalConfig):
-        super().__init__()
-        self.config = config
-        self.layer_idx = config.audio_config.feature_layer
-
-        self.encoder = Phi4MultimodalAudioModel(config.audio_config)
-
-        audio_config = config.audio_config
-        proj_input_size = audio_config.hidden_size * audio_config.downsample_rate
-        self.vision_speech_projection = Phi4MMProjector(
-            proj_input_size, config.hidden_size
-        )
-        self.speech_projection = Phi4MMProjector(proj_input_size, config.hidden_size)
-
-    def get_projection(
-        self,
-        audio_projection_mode: Literal["speech", "vision"],
-    ) -> Phi4MMProjector:
-        if audio_projection_mode == "speech":
-            return self.speech_projection
-        elif audio_projection_mode == "vision":
-            return self.vision_speech_projection
-
-    def forward(
-        self,
-        audio_input_features: torch.FloatTensor,
-        audio_embed_sizes=None,
-        audio_attention_mask=None,
-        audio_projection_mode="speech",
-    ) -> torch.FloatTensor:
-        audio_projection = self.get_projection(audio_projection_mode)
-
-        target_device = audio_projection.up.bias.device
-        target_dtype = audio_projection.up.bias.dtype
-
-        audio_input_features = audio_input_features.to(
-            device=target_device, dtype=target_dtype
-        )
-
-        audio_encoder_hidden_states = self.encoder(
-            audio_input_features, audio_attention_mask
-        )
-        audio_embeds = audio_projection(audio_encoder_hidden_states)
-
-        return audio_embeds.flatten(0, 1)
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
-
-class Phi4MMImagePixelInputs(TensorSchema):
-    """
-    Dimensions:
-        - bn: Batch size * number of images
-        - p: Number of patches (1 + num_patches)
-        - c: Number of channels (3)
-        - h: Height of each image patch
-        - w: Width of each image patch
-        - nc: Number of crops
-        - H_mask: Height of attention mask
-        - W_mask: Width of attention mask
-    """
-
-    type: Literal["pixel_values"]
-
-    pixel_values: Annotated[
-        torch.Tensor | list[torch.Tensor],
-        TensorShape(
-            "bn", "p", 3, "h", "w", dynamic_dims={"p"}
-        ),  # may be different per batch and image
-    ]
-
-    image_sizes: Annotated[
-        torch.Tensor,
-        TensorShape("bn", 2),  # (height, width)
-    ]
-
-    num_img_tokens: Annotated[
-        list[int],
-        TensorShape("bn"),
-    ]
-
-    image_attention_mask: Annotated[
-        torch.Tensor,
-        TensorShape("bn", "nc", 32, 32),  # H_mask, W_mask
-    ]
-
-
-class Phi4MMImageEmbeddingInputs(TensorSchema):
-    """
-    Dimensions:
-        - bn: Batch size * number of images
-        - f: Image feature size
-        - h: Hidden size (must match language model backbone)
-    """
-
-    type: Literal["image_embeds"]
-
-    data: Annotated[
-        torch.Tensor | list[torch.Tensor],
-        TensorShape("bn", "f", "h"),
-    ]
-
-
-class Phi4MMAudioFeatureInputs(TensorSchema):
-    """
-    Dimensions:
-        - bn: Batch size * number of audios
-        - f: Number of Mel filterbank bins (80)
-        - t: Time frames (M)
-    """
-
-    type: Literal["audio_features"]
-
-    audio_features: Annotated[
-        torch.Tensor | list[torch.Tensor],
-        TensorShape("bn", "t", 80, dynamic_dims={"t"}),
-    ]
-
-
-class Phi4MMAudioEmbeddingInputs(TensorSchema):
-    """
-    Dimensions:
-        - b: Batch size
-        - n: Number of audios
-        - f: Audio feature size
-        - h: Hidden size (must match language model backbone)
-    """
-
-    type: Literal["audio_embeds"]
-
-    data: Annotated[
-        NestedTensors,
-        TensorShape("b", "n", "f", "h"),
-    ]
-
-
-Phi4MMImageInput: TypeAlias = Phi4MMImagePixelInputs | Phi4MMImageEmbeddingInputs
-Phi4MMAudioInputs: TypeAlias = Phi4MMAudioFeatureInputs | Phi4MMAudioEmbeddingInputs
-
-
-def cat_with_pad(tensors, dim, padding_value=0):
-    """
-    cat along dim, while pad to max for all other dims
-    """
-    ndim = tensors[0].dim()
-    assert all(t.dim() == ndim for t in tensors[1:]), (
-        "All tensors must have the same number of dimensions"
-    )
-
-    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
-    out_size[dim] = sum(t.shape[dim] for t in tensors)
-    output = tensors[0].new_full(out_size, padding_value)
-
-    index = 0
-    for t in tensors:
-        # Create a slice list where every dimension except dim is full slice
-        slices = [slice(0, t.shape[d]) for d in range(ndim)]
-        # Update only the concat dimension slice
-        slices[dim] = slice(index, index + t.shape[dim])
-
-        output[slices] = t
-        index += t.shape[dim]
-
-    return output
-
-
-class Phi4MMProcessingInfo(BaseProcessingInfo):
-    def get_hf_config(self) -> Phi4MultimodalConfig:
-        return self.ctx.get_hf_config(Phi4MultimodalConfig)
-
-    def get_hf_processor(self, **kwargs: object) -> Phi4MMProcessor:
-        return self.ctx.get_hf_processor(Phi4MMProcessor, **kwargs)
-
-    def get_feature_extractor(self, **kwargs: object) -> Phi4MultimodalFeatureExtractor:
-        return self.get_hf_processor(**kwargs).audio_processor
-
-    def get_image_processor(
-        self,
-        processor: Phi4MMProcessor | None = None,
-    ) -> Phi4MultimodalImageProcessorFast:
-        if processor is None:
-            processor = self.get_hf_processor()
-        return processor.image_processor
-
-    def get_dynamic_hd(
-        self,
-        processor: Phi4MMProcessor | None = None,
-    ) -> int:
-        return self.get_image_processor(processor).dynamic_hd
-
-    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
-        return {"audio": None, "image": None}
-
-    def _find_target_aspect_ratio(
-        self,
-        orig_width: int,
-        orig_height: int,
-        image_size: int,
-        max_num: int,
-        min_num: int,
-    ):
-        w_crop_num = math.ceil(orig_width / float(image_size))
-        h_crop_num = math.ceil(orig_height / float(image_size))
-        if w_crop_num * h_crop_num > max_num:
-            aspect_ratio = orig_width / orig_height
-
-            # calculate the existing image aspect ratio
-            target_ratios = set(
-                (i, j)
-                for i in range(1, max_num + 1)
-                for j in range(1, max_num + 1)
-                if i * j <= max_num and i * j >= min_num
-            )
-            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-            # find the closest aspect ratio to the target
-            image_processor = self.get_image_processor()
-            target_aspect_ratio = image_processor.find_closest_aspect_ratio(
-                aspect_ratio,
-                target_ratios,
-                orig_width,
-                orig_height,
-                image_size,
-            )
-
-            # calculate the target width and height
-            target_width = image_size * target_aspect_ratio[0]
-            target_height = image_size * target_aspect_ratio[1]
-        else:
-            target_width = image_size * w_crop_num
-            target_height = image_size * h_crop_num
-            target_aspect_ratio = (w_crop_num, h_crop_num)
-        return target_aspect_ratio, target_height, target_width
-
-    def _compute_num_image_tokens(
-        self,
-        orig_width: int,
-        orig_height: int,
-        dynamic_hd_size: int,
-        vit_image_size: int,
-        vit_patch_size: int,
-        token_compression_factor: int = 2,
-    ):
-        """
-        compute the number of tokens an image is expected to take up considering
-        the image encoder architecture and exclude output features containing
-        only padding pixels
-
-        for siglip, vit_image_size=448, vit_patch_size=14, so output will be
-        32x32 feature map
-        NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
-        """
-        assert vit_image_size % vit_patch_size == 0, (
-            "vit_image_size must be divisible by vit_patch_size"
-        )
-        assert vit_image_size // vit_patch_size % token_compression_factor == 0, (
-            "vit_image_size // vit_patch_size must be divisible by "
-            "token_compression_factor"
-        )
-
-        target_aspect_ratio, target_height, target_width = (
-            self._find_target_aspect_ratio(
-                orig_width, orig_height, vit_image_size, dynamic_hd_size, min_num=1
-            )
-        )
-        assert target_aspect_ratio[0] * vit_image_size == target_width, (
-            f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
-        )
-        assert target_aspect_ratio[1] * vit_image_size == target_height, (
-            f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
-        )
-        assert (
-            target_height % vit_image_size == 0 and target_width % vit_image_size == 0
-        )
-
-        padding_height, padding_width = _get_padding_size(
-            orig_width, orig_height, target_height, target_width
-        )
-        assert padding_width == 0 or padding_height == 0, (
-            "padding_width or padding_height must be 0"
-        )
-
-        target_feat_width = target_width // vit_patch_size
-        target_feat_height = target_height // vit_patch_size
-        if padding_width >= vit_patch_size:
-            assert padding_height == 0, "padding_height not 0"
-            non_pad_feat_width = target_feat_width - math.floor(
-                padding_width / vit_patch_size
-            )
-            non_pad_feat_height = target_feat_height
-        elif padding_height >= vit_patch_size:
-            assert padding_width == 0, "padding_width not 0"
-            non_pad_feat_height = target_feat_height - math.floor(
-                padding_height / vit_patch_size
-            )
-            non_pad_feat_width = target_feat_width
-        else:
-            # small padding shorter than a vit patch
-            non_pad_feat_width = target_feat_width
-            non_pad_feat_height = target_feat_height
-
-        feat_width = non_pad_feat_width // token_compression_factor
-        feat_height = non_pad_feat_height // token_compression_factor
-        # NOTE it's possible that the non-padding feature is not divisible
-        if non_pad_feat_width % token_compression_factor != 0:
-            feat_width += 1
-        if non_pad_feat_height % token_compression_factor != 0:
-            feat_height += 1
-        num_hd_patch_tokens = feat_width * feat_height
-        num_hd_newline_tokens = feat_height
-        vit_feature_size = vit_image_size // vit_patch_size
-        num_global_image_tokens = (vit_feature_size // token_compression_factor) ** 2
-        num_sep_tokens = 1
-        num_global_image_newline_tokens = vit_feature_size // token_compression_factor
-
-        return (
-            num_global_image_tokens
-            + num_sep_tokens
-            + num_hd_patch_tokens
-            + num_hd_newline_tokens
-            + num_global_image_newline_tokens
-        )
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-        processor: Phi4MMProcessor | None = None,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        vit_image_size = vision_config.image_size
-        vit_patch_size = vision_config.patch_size
-
-        dynamic_hd_size = self.get_dynamic_hd(processor=processor)
-
-        # we use default `token_compression_factor=2`,
-        # since it's not in HF vision config.
-        image_num_tokens = self._compute_num_image_tokens(
-            image_width,
-            image_height,
-            dynamic_hd_size=dynamic_hd_size,
-            vit_image_size=vit_image_size,
-            vit_patch_size=vit_patch_size,
-        )
-
-        return image_num_tokens
-
-    def get_image_size_with_most_features(
-        self,
-        processor: Phi4MMProcessor | None = None,
-    ) -> ImageSize:
-        vit_image_size = self.get_hf_config().vision_config.image_size
-
-        max_side = vit_image_size * self.get_dynamic_hd(processor=processor)
-        return ImageSize(height=max_side, width=vit_image_size)
-
-    def get_audio_num_frames(self, audio_len: int, sr: float) -> int:
-        """
-        Compute the output size of the `extract_features` method.
-
-        Args:
-            audio_len (int): Length of the input waveform in samples.
-            sr (float): Sampling rate of the waveform, either 16000 or 8000.
-
-        Returns:
-            tuple (int, int): Output size as (T, D), where:
-                T: Number of time frames.
-                D: Number of Mel filterbank bins (80).
-        """
-
-        # Resample to 16000 or 8000 if needed
-        if sr > 16000:
-            audio_len //= sr // 16000
-        elif 8000 <= sr < 16000:
-            # We'll resample to 16K from 8K
-            audio_len *= 2
-        elif sr < 8000:
-            raise RuntimeError(f"Unsupported sample rate {sr}")
-
-        # Spectrogram parameters for 16 kHz
-        win_length = 400  # Frame length in samples
-        hop_length = 160  # Frame shift in samples
-
-        # Calculate number of frames (T)
-        num_frames = (audio_len - win_length) // hop_length + 1
-        if num_frames < 1:
-            raise ValueError("Waveform too short for given parameters.")
-
-        # Return time frames (T)
-        return num_frames
-
-    def _compute_audio_embed_size(self, audio_frames: int) -> int:
-        """
-        Compute the size of audio embeddings from the number of audio frames.
-        """
-        # `_compute_audio_embed_size` in audio_processor use torch for
-        # computation, therefore we re-implement it to use pythonic
-        # numeric computation to avoid extra tensor conversion.
-        audio_processor = self.get_feature_extractor()
-        audio_compression_rate = audio_processor.audio_compression_rate
-        audio_downsample_rate = audio_processor.audio_downsample_rate
-
-        integer = audio_frames // audio_compression_rate
-        remainder = audio_frames % audio_compression_rate
-        result = integer + int(remainder > 0)
-
-        integer = result // audio_downsample_rate
-        remainder = result % audio_downsample_rate
-        result = integer + int(remainder > 0)  # qformer compression
-
-        return result
-
-
-class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
-    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
-        num_audios = mm_counts.get("audio", 0)
-        num_images = mm_counts.get("image", 0)
-
-        tokenizer = self.info.get_tokenizer()
-        image_tokens: str = tokenizer.image_token * num_images
-        audio_tokens: str = tokenizer.audio_token * num_audios
-
-        return image_tokens + audio_tokens
-
-    def get_dummy_mm_data(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-        mm_options: Mapping[str, BaseDummyOptions] | None = None,
-    ) -> MultiModalDataDict:
-        num_audios = mm_counts.get("audio", 0)
-        num_images = mm_counts.get("image", 0)
-
-        target_width, target_height = self.info.get_image_size_with_most_features()
-
-        image_overrides = mm_options.get("image") if mm_options else None
-        audio_overrides = mm_options.get("audio") if mm_options else None
-
-        mm_data = {
-            "image": self._get_dummy_images(
-                width=target_width,
-                height=target_height,
-                num_images=num_images,
-                overrides=image_overrides,
-            ),
-            "audio": self._get_dummy_audios(
-                length=_AUDIO_MAX_SOUNDFILE_SIZE,
-                num_audios=num_audios,
-                overrides=audio_overrides,
-            ),
-        }
-
-        return mm_data
-
-
-class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
-    def _get_data_parser(self) -> MultiModalDataParser:
-        feature_extractor = self.info.get_feature_extractor()
-        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
-
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-        tok_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        if not mm_data:
-            prompt_ids = self.info.get_tokenizer().encode(prompt)
-            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
-            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
-
-        audio_data = mm_data.pop("audios", [])
-        if audio_data:
-            mm_data["audio"] = audio_data
-
-        processed_outputs = super()._call_hf_processor(
-            prompt, mm_data, mm_kwargs, tok_kwargs
-        )
-
-        if "image_pixel_values" in processed_outputs:
-            num_img_tokens = [
-                self.info.get_num_image_tokens(
-                    image_width=img_size[0], image_height=img_size[1]
-                )
-                for img_size in processed_outputs["image_sizes"]
-            ]
-            processed_outputs["num_img_tokens"] = num_img_tokens
-
-        if audio_data:
-            audio_features = processed_outputs["audio_input_features"]
-            sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate
-            feature_sizes = [
-                self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data
-            ]
-            processed_outputs["audio_input_features"] = [
-                audio_features[idx, :size] for idx, size in enumerate(feature_sizes)
-            ]
-
-        return processed_outputs
-
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            image_pixel_values=MultiModalFieldConfig.batched("image"),
-            image_attention_mask=MultiModalFieldConfig.batched("image"),
-            image_sizes=MultiModalFieldConfig.batched("image"),
-            num_img_tokens=MultiModalFieldConfig.batched("image"),
-            audio_input_features=MultiModalFieldConfig.batched("audio"),
-        )
-
-    def _get_prompt_updates(
-        self,
-        mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargsItems,
-    ) -> Sequence[PromptUpdate]:
-        tokenizer = self.info.get_tokenizer()
-        image_token_id: int = tokenizer.vocab[tokenizer.image_token]
-        audio_token_id: int = tokenizer.vocab[tokenizer.audio_token]
-
-        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        audio_processor = self.info.get_feature_extractor(**hf_processor_mm_kwargs)
-
-        def get_image_replacement_phi4mm(item_idx: int):
-            images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems)
-            )
-
-            if isinstance(images, ImageEmbeddingItems):
-                num_image_tokens = images.get_feature_size(item_idx)
-            else:
-                image_size = images.get_image_size(item_idx)
-                num_image_tokens = self.info.get_num_image_tokens(
-                    image_width=image_size.width,
-                    image_height=image_size.height,
-                    processor=hf_processor,
-                )
-
-            return [image_token_id] * num_image_tokens
-
-        def get_audio_replacement_phi4mm(item_idx: int):
-            audios = mm_items.get_items("audio", AudioProcessorItems)
-            # TODO(Isotr0py): support embedding inputs
-            audio_len = audios.get_audio_length(item_idx)
-            audio_frames = self.info.get_audio_num_frames(
-                audio_len, audio_processor.sampling_rate
-            )
-            audio_embed_size = self.info._compute_audio_embed_size(audio_frames)
-
-            return [audio_token_id] * audio_embed_size
-
-        return [
-            PromptReplacement(
-                modality="audio",
-                target=[audio_token_id],
-                replacement=get_audio_replacement_phi4mm,
-            ),
-            PromptReplacement(
-                modality="image",
-                target=[image_token_id],
-                replacement=get_image_replacement_phi4mm,
-            ),
-        ]
-
-
-@MULTIMODAL_REGISTRY.register_processor(
-    Phi4MMMultiModalProcessor,
-    info=Phi4MMProcessingInfo,
-    dummy_inputs=Phi4MMDummyInputsBuilder,
-)
-class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
-    """
-    Implements the Phi-4-multimodal-instruct model in vLLM.
-    """
-
-    merge_by_field_config = True
-
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "qkv_proj",
-        ],
-        "gate_up_proj": [
-            "gate_up_proj",
-        ],
-    }
-
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={
-            # Multimodal embedding
-            "model.embed_tokens_extend.": "",
-            # LLM backbone
-            "model.": "language_model.model.",
-        },
-        orig_to_new_substr={
-            # projection
-            ".img_projection_": ".img_projection.",
-            ".up_proj_for_speech.": ".speech_projection.up.",
-            ".up_proj_for_vision_speech.": ".vision_speech_projection.up.",
-            ".down_proj_for_speech.": ".speech_projection.down.",
-            ".down_proj_for_vision_speech.": ".vision_speech_projection.down.",
-        },
-    )
-
-    @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
-        if modality.startswith("image"):
-            return "<|image|>"
-        if modality.startswith("audio"):
-            return "<|audio|>"
-
-        raise ValueError("Only image or audio modality is supported")
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        multimodal_config = vllm_config.model_config.multimodal_config
-        self.config = config
-        self.multimodal_config = multimodal_config
-
-        # TODO: Optionally initializes these for supporting input embeddings.
-        self.image_embed = Phi4MMImageEmbedding(
-            config,
-            # prefix=maybe_prefix(prefix, "image_embed"),
-        )
-        self.audio_embed = Phi4MMAudioEmbedding(
-            config,
-            # prefix=maybe_prefix(prefix, "audio_embed"),
-        )
-
-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-            architectures=["Phi3ForCausalLM"],
-        )
-
-        self.make_empty_intermediate_tensors = (
-            self.language_model.make_empty_intermediate_tensors
-        )
-
-    def _parse_and_validate_audio_input(
-        self, **kwargs: object
-    ) -> Phi4MMAudioInputs | None:
-        """
-        Parse and validate the audio input to the model.  This handles both
-        audio features and audio embeddings, but only the former is used for
-        now.
-
-        Args:
-            kwargs (object): Keyword arguments.
-
-        Returns:
-            Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
-        """
-        audio_features = kwargs.pop("audio_input_features", None)
-        audio_embeds = kwargs.pop("audio_embeds", None)
-
-        if audio_features is None and audio_embeds is None:
-            return None
-
-        if audio_features is not None:
-            return Phi4MMAudioFeatureInputs(
-                type="audio_features",
-                audio_features=audio_features,
-            )
-
-        if audio_embeds is not None:
-            return Phi4MMAudioEmbeddingInputs(type="audio_embeds", data=audio_embeds)
-
-        raise AssertionError("This line should be unreachable.")
-
-    def _process_audio_input(
-        self, audio_input: Phi4MMAudioInputs, audio_projection_mode: str
-    ) -> NestedTensors:
-        """
-        Create the audio embeddings from the audio input, where the audio input
-        is pairs of audio features and audio embed lengths.  The audio input is
-        created by `input_mapper_for_phi4mm_audio`.
-
-        Args:
-            audio_input (Phi4MMAudioInputs): Audio input.
-
-        Returns:
-            NestedTensors: Audio embeddings
-        """
-        if audio_input["type"] == "audio_embeds":
-            return audio_input["data"]
-
-        audio_features = audio_input["audio_features"]
-        # (e.g. multiple examples) and the second dim is the multi-audio dim
-        # (e.g. multiple audios in the same example)
-
-        dtype = next(self.audio_embed.parameters()).dtype
-        audio_embeds = [
-            self.audio_embed(
-                features.unsqueeze(0).to(dtype),
-                audio_projection_mode=audio_projection_mode,
-            )
-            for features in audio_features
-        ]
-        return audio_embeds
-
-    def _parse_and_validate_image_input(
-        self, **kwargs: object
-    ) -> Phi4MMImagePixelInputs | None:
-        pixel_values = kwargs.get("image_pixel_values")
-        if pixel_values is None:
-            return None
-
-        image_sizes = kwargs.get("image_sizes")
-        image_attention_mask = kwargs.get("image_attention_mask")
-        num_img_tokens = kwargs.get("num_img_tokens")
-        assert (
-            image_sizes is not None
-            and image_attention_mask is not None
-            and num_img_tokens is not None
-        ), "Missing image inputs"
-
-        return Phi4MMImagePixelInputs(
-            type="pixel_values",
-            pixel_values=pixel_values,
-            image_sizes=image_sizes,
-            image_attention_mask=image_attention_mask,
-            num_img_tokens=num_img_tokens,
-        )
-
-    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
-        modalities = {}
-
-        # Preserve the order of modalities if there are multiple of them
-        # from the order of kwargs.
-        for input_key in kwargs:
-            if (
-                input_key in ("image_pixel_values", "image_embeds")
-                and "images" not in modalities
-            ):
-                modalities["images"] = self._parse_and_validate_image_input(**kwargs)
-            if (
-                input_key in ("audio_input_features", "audio_embeds")
-                and "audios" not in modalities
-            ):
-                modalities["audios"] = self._parse_and_validate_audio_input(**kwargs)
-
-        return modalities
-
-    def _process_image_input(
-        self, image_input: Phi4MMImagePixelInputs
-    ) -> list[torch.Tensor]:
-        if image_input["type"] == "image_embeds":
-            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
-        else:
-            dtype = next(self.image_embed.parameters()).dtype
-            pixel_values = image_input["pixel_values"].to(dtype)
-            image_sizes = image_input["image_sizes"]
-            image_attention_mask = image_input["image_attention_mask"]
-            image_embeds = self.image_embed(
-                pixel_values, image_sizes, image_attention_mask
-            )
-        return image_embeds
-
-    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
-        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
-        if not modalities:
-            return []
-
-        # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor corresponding to a multimodal data item (image or video).
-        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
-
-        # NOTE: It is important to iterate over the keys in this dictionary
-        # to preserve the order of the modalities.
-        audio_projection_mode = "speech"
-        for modality in modalities:
-            # make sure process images first
-            if modality == "images":
-                audio_projection_mode = "vision"
-                image_input = modalities["images"]
-                image_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += tuple(image_embeddings)
-            if modality == "audios":
-                audio_input = modalities["audios"]
-                audio_embeddings = self._process_audio_input(
-                    audio_input, audio_projection_mode=audio_projection_mode
-                )
-                multimodal_embeddings += tuple(audio_embeddings)
-
-        return multimodal_embeddings
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-        **kwargs: object,
-    ) -> torch.Tensor:
-        if intermediate_tensors is not None:
-            inputs_embeds = None
-
-        hidden_states = self.language_model(
-            input_ids,
-            positions,
-            intermediate_tensors,
-            inputs_embeds=inputs_embeds,
-        )
-
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor | None:
-        return self.language_model.compute_logits(hidden_states)
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
-
-    def get_mm_mapping(self) -> MultiModelKeys:
-        """
-        Get the module prefix in multimodal models
-        """
-        return MultiModelKeys.from_string_field(
-            language_model="language_model.",
-            connector=[
-                "img_projection",
-                "vision_speech_projection",
-                "speech_projection",
-            ],
-            tower_model=["image_embed", "audio_embed"],
-        )
-
-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index d3b6268e7..a4a964bc7 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -374,7 +374,6 @@ _MULTIMODAL_MODELS = {
     ),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
-    "Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"),  # noqa: E501
     "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
     "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
@@ -507,6 +506,7 @@ _PREVIOUSLY_SUPPORTED_MODELS = {
     "MotifForCausalLM": "0.10.2",
     "Phi3SmallForCausalLM": "0.9.2",
     "Phi4FlashForCausalLM": "0.10.2",
+    "Phi4MultimodalForCausalLM": "0.12.0",
     # encoder-decoder models except whisper
     # have been removed for V0 deprecation.
     "BartModel": "0.10.2",
-- 
GitLab


From dd97e047e03f5218bddf05fa71f0592df0b8e30e Mon Sep 17 00:00:00 2001
From: Yongtao Huang <yongtaoh2022@gmail.com>
Date: Thu, 4 Dec 2025 22:04:42 +0800
Subject: [PATCH 088/258] Fix broken multiline assert in
 `LoRAModelManager.register_module` (#30032)

Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
---
 vllm/lora/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index f568b8b9b..ada30da60 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -574,9 +574,9 @@ class LoRAModelManager:
 
     def register_module(self, module_name: str, module: "BaseLayerWithLoRA"):
         assert isinstance(module, BaseLayerWithLoRA), (
-            f"Module {module_name} must be a BaseLayerWithLoRA instance,"
+            f"Module {module_name} must be a BaseLayerWithLoRA instance, "
+            f"got {type(module)}"
         )
-        f" got {type(module)}"
         self.modules[module_name] = module
 
     def create_dummy_lora(
-- 
GitLab


From 5c32a06a049829c59300929627002998d1cc34ba Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 4 Dec 2025 14:54:28 +0000
Subject: [PATCH 089/258] Use Transformers v5 RoPE standardisation and
 validation (#30046)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/transformers_utils/config.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 2911dcff2..1075bc244 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -304,14 +304,19 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
 
 def patch_rope_parameters(config: PretrainedConfig) -> None:
     """Provide backwards compatibility for RoPE."""
+    rope_theta = getattr(config, "rope_theta", None)
     if Version(version("transformers")) < Version("5.0.0.dev0"):
         # Transformers v4 installed, legacy config fields may be present
         if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
             config.rope_parameters = rope_scaling
-        if (rope_theta := getattr(config, "rope_theta", None)) is not None:
+        if rope_theta is not None:
             if not hasattr(config, "rope_parameters"):
                 config.rope_parameters = {"rope_type": "default"}
             config.rope_parameters["rope_theta"] = rope_theta
+    elif rope_theta is not None or hasattr(config, "rope_parameters"):
+        # Transformers v5 installed
+        config.standardize_rope_params()
+        config.validate_rope()
 
     # No RoPE parameters to patch
     if getattr(config, "rope_parameters", None) is None:
-- 
GitLab


From cc050558f424714f9548774cc2c661b3916d96ca Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 4 Dec 2025 07:19:42 -0800
Subject: [PATCH 090/258] [Model Runner V2] Implement
 get_num_sampled_and_rejected kernel (#30029)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu/input_batch.py             | 49 +++++++++++++++++++
 vllm/v1/worker/gpu/model_runner.py            | 33 ++++++-------
 .../gpu/spec_decode/rejection_sample.py       | 12 -----
 3 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 8ae887fe8..1b78734fb 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -354,6 +354,55 @@ def combine_sampled_and_draft_tokens(
     return logits_indices
 
 
+@triton.jit
+def _get_num_sampled_and_rejected_kernel(
+    num_sampled_ptr,
+    num_rejected_ptr,
+    seq_lens_ptr,
+    cu_num_logits_ptr,
+    idx_mapping_ptr,
+    prefill_len_ptr,
+):
+    batch_idx = tl.program_id(0)
+    req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
+
+    seq_len = tl.load(seq_lens_ptr + batch_idx)
+    prefill_len = tl.load(prefill_len_ptr + req_state_idx)
+    is_chunked_prefilling = seq_len < prefill_len
+
+    num_sampled = tl.load(num_sampled_ptr + batch_idx)
+    num_sampled = tl.where(is_chunked_prefilling, 0, num_sampled)
+    tl.store(num_sampled_ptr + batch_idx, num_sampled)
+
+    logits_start = tl.load(cu_num_logits_ptr + batch_idx)
+    logits_end = tl.load(cu_num_logits_ptr + batch_idx + 1)
+    num_logits = logits_end - logits_start
+
+    num_rejected = num_logits - num_sampled
+    num_rejected = tl.where(is_chunked_prefilling, 0, num_rejected)
+    tl.store(num_rejected_ptr + batch_idx, num_rejected)
+
+
+def get_num_sampled_and_rejected(
+    num_sampled: torch.Tensor,
+    seq_lens: torch.Tensor,
+    cu_num_logits: torch.Tensor,
+    idx_mapping: torch.Tensor,
+    prefill_len: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    num_reqs = idx_mapping.shape[0]
+    num_rejected = torch.empty_like(num_sampled)
+    _get_num_sampled_and_rejected_kernel[(num_reqs,)](
+        num_sampled,
+        num_rejected,
+        seq_lens,
+        cu_num_logits,
+        idx_mapping,
+        prefill_len,
+    )
+    return num_sampled, num_rejected
+
+
 @triton.jit
 def _post_update_kernel(
     idx_mapping_ptr,
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 9bf345053..464f7b7bd 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -43,6 +43,7 @@ from vllm.v1.worker.gpu.input_batch import (
     InputBatch,
     InputBuffers,
     combine_sampled_and_draft_tokens,
+    get_num_sampled_and_rejected,
     post_update,
     prepare_pos_seq_lens,
     prepare_prefill_inputs,
@@ -54,10 +55,7 @@ from vllm.v1.worker.gpu.sample.metadata import (
 )
 from vllm.v1.worker.gpu.sample.sampler import Sampler
 from vllm.v1.worker.gpu.spec_decode import init_speculator
-from vllm.v1.worker.gpu.spec_decode.rejection_sample import (
-    get_num_rejected,
-    rejection_sample,
-)
+from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
 from vllm.v1.worker.gpu.states import RequestState
 from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask
 from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
@@ -621,16 +619,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Sample tokens and compute logprobs (if needed).
         sampler_output = self.sampler(logits, sampling_metadata)
 
-        # Get the number of sampled tokens.
-        prefill_len = self.req_states.prefill_len.gpu[input_batch.idx_mapping]
-        is_chunked_prefilling = input_batch.seq_lens < prefill_len
         if input_batch.num_draft_tokens == 0:
             # No draft tokens (common case).
-            # 0 if chunked-prefilling, 1 if not.
-            num_sampled = (~is_chunked_prefilling).int()
-            num_rejected = torch.zeros_like(num_sampled)
+            num_sampled = torch.ones(
+                input_batch.num_reqs, dtype=torch.int32, device=self.device
+            )
         else:
-            # Draft tokens for spec decoding.
+            # Rejection sampling for spec decoding.
             input_ids = input_batch.input_ids[input_batch.logits_indices]
             sampled_tokens, num_sampled = rejection_sample(
                 sampler_output.sampled_token_ids,
@@ -638,13 +633,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 input_batch.cu_num_logits,
                 self.num_speculative_steps,
             )
-            num_sampled *= ~is_chunked_prefilling
-            num_rejected = get_num_rejected(
-                input_batch.cu_num_logits,
-                num_sampled,
-            )
             sampler_output.sampled_token_ids = sampled_tokens
-            # TODO(woosuk): Support logprobs with spec decoding.
+
+        # Get the number of sampled and rejected tokens.
+        # For chunked prefills, num_sampled and num_rejected are both 0.
+        num_sampled, num_rejected = get_num_sampled_and_rejected(
+            num_sampled,
+            input_batch.seq_lens,
+            input_batch.cu_num_logits,
+            input_batch.idx_mapping,
+            self.req_states.prefill_len.gpu,
+        )
         return sampler_output, num_sampled, num_rejected
 
     def compute_prompt_logprobs(
diff --git a/vllm/v1/worker/gpu/spec_decode/rejection_sample.py b/vllm/v1/worker/gpu/spec_decode/rejection_sample.py
index 43c6ac518..8a7bf28ba 100644
--- a/vllm/v1/worker/gpu/spec_decode/rejection_sample.py
+++ b/vllm/v1/worker/gpu/spec_decode/rejection_sample.py
@@ -69,15 +69,3 @@ def rejection_sample(
         num_warps=1,
     )
     return sampled, num_sampled
-
-
-@torch.compile(dynamic=True)
-def get_num_rejected(
-    cu_num_logits: torch.Tensor,
-    num_sampled: torch.Tensor,
-) -> torch.Tensor:
-    num_logits = cu_num_logits[1:] - cu_num_logits[:-1]
-    num_rejected = num_logits - num_sampled
-    # No token is rejected for chunked prefills.
-    num_rejected *= num_sampled > 0
-    return num_rejected
-- 
GitLab


From 5b4b42c0b6e190dacbf6dbfed3506c9e58bfc8de Mon Sep 17 00:00:00 2001
From: Doug Smith <dosmith@redhat.com>
Date: Thu, 4 Dec 2025 10:38:03 -0500
Subject: [PATCH 091/258] Mark DBO test as flaky on b200 for Distributed B200
 test (#29913)

Signed-off-by: dougbtv <dosmith@redhat.com>
---
 tests/v1/distributed/test_dbo.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py
index 16f154d19..f3a159762 100644
--- a/tests/v1/distributed/test_dbo.py
+++ b/tests/v1/distributed/test_dbo.py
@@ -9,10 +9,22 @@ correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
 """
 
 import pytest
+import torch
 
 from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
 from tests.utils import RemoteOpenAIServer
 
+# Detect Blackwell / B200 (compute capability 10.x)
+try:
+    if torch.cuda.is_available():
+        cap = torch.cuda.get_device_capability(0)
+        IS_BLACKWELL = cap[0] >= 10
+    else:
+        IS_BLACKWELL = False
+except Exception:
+    # Be conservative: if we can't detect, don't xfail by default
+    IS_BLACKWELL = False
+
 MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
 DP_SIZE = 2
 
@@ -33,6 +45,13 @@ DEEPEP_BACKENDS = [
 
 
 @pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
+@pytest.mark.xfail(
+    IS_BLACKWELL,
+    reason=(
+        "Temporary: DBO accuracy unstable on Blackwell "
+        "(doesn't meet expectation of MIN_ACCURACY = 0.62)"
+    ),
+)
 def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
     """
     Test DBO with DP+EP using GSM8K evaluation.
-- 
GitLab


From 990f806473888451ef6590f85a6ed8436db7801c Mon Sep 17 00:00:00 2001
From: Shengqi Chen <harry-chen@outlook.com>
Date: Fri, 5 Dec 2025 00:28:37 +0800
Subject: [PATCH 092/258] [Doc] clarify nightly builds in developer docs
 (#30019)

Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
---
 docs/contributing/ci/nightly_builds.md | 160 +++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 docs/contributing/ci/nightly_builds.md

diff --git a/docs/contributing/ci/nightly_builds.md b/docs/contributing/ci/nightly_builds.md
new file mode 100644
index 000000000..a07b9c1c2
--- /dev/null
+++ b/docs/contributing/ci/nightly_builds.md
@@ -0,0 +1,160 @@
+# Nightly Builds of vLLM Wheels
+
+vLLM maintains a per-commit wheel repository (commonly referred to as "nightly") at `https://wheels.vllm.ai` that provides pre-built wheels for every commit on the `main` branch since `v0.5.3`. This document explains how the nightly wheel index mechanism works.
+
+## Build and Upload Process on CI
+
+### Wheel Building
+
+Wheels are built in the `Release` pipeline (`.buildkite/release-pipeline.yaml`) after a PR is merged into the main branch, with multiple variants:
+
+- **Backend variants**: `cpu` and `cuXXX` (e.g., `cu129`, `cu130`).
+- **Architecture variants**: `x86_64` and `aarch64`.
+
+Each build step:
+
+1. Builds the wheel in a Docker container.
+2. Renames the wheel filename to use the correct manylinux tag (currently `manylinux_2_31`) for PEP 600 compliance.
+3. Uploads the wheel to S3 bucket `vllm-wheels` under `/{commit_hash}/`.
+
+### Index Generation
+
+After uploading each wheel, the `.buildkite/scripts/upload-wheels.sh` script:
+
+1. **Lists all existing wheels** in the commit directory from S3
+2. **Generates indices** using `.buildkite/scripts/generate-nightly-index.py`:
+    - Parses wheel filenames to extract metadata (version, variant, platform tags).
+    - Creates HTML index files (`index.html`) for PyPI compatibility.
+    - Generates machine-readable `metadata.json` files.
+3. **Uploads indices** to multiple locations (overriding existing ones):
+    - `/{commit_hash}/` - Always uploaded for commit-specific access.
+    - `/nightly/` - Only for commits on `main` branch (not PRs).
+    - `/{version}/` - Only for release wheels (no `dev` in its version).
+
+!!! tip "Handling Concurrent Builds"
+    The index generation script can handle multiple variants being built concurrently by always listing all wheels in the commit directory before generating indices, avoiding race conditions.
+
+## Directory Structure
+
+The S3 bucket structure follows this pattern:
+
+```text
+s3://vllm-wheels/
+├── {commit_hash}/              # Commit-specific wheels and indices
+│   ├── vllm-*.whl              # All wheel files
+│   ├── index.html              # Project list (default variant)
+│   ├── vllm/
+│   │   ├── index.html          # Package index (default variant)
+│   │   └── metadata.json       # Metadata (default variant)
+│   ├── cu129/                  # Variant subdirectory
+│   │   ├── index.html          # Project list (cu129 variant)
+│   │   └── vllm/
+│   │       ├── index.html      # Package index (cu129 variant)
+│   │       └── metadata.json   # Metadata (cu129 variant)
+│   ├── cu130/                  # Variant subdirectory
+│   ├── cpu/                    # Variant subdirectory
+│   └── .../                    # More variant subdirectories
+├── nightly/                    # Latest main branch wheels (mirror of latest commit)
+└── {version}/                  # Release version indices (e.g., 0.11.2)
+```
+
+All built wheels are stored in `/{commit_hash}/`, while different indices are generated and reference them.
+This avoids duplication of wheel files.
+
+For example, you can specify the following URLs to use different indices:
+
+- `https://wheels.vllm.ai/nightly/cu130` for the latest main branch wheels built with CUDA 13.0.
+- `https://wheels.vllm.ai/{commit_hash}` for wheels built at a specific commit (default variant).
+- `https://wheels.vllm.ai/0.12.0/cpu` for 0.12.0 release wheels built for CPU variant.
+
+Please note that not all variants are present on every commit. The available variants are subject to change over time, e.g., changing cu130 to cu131.
+
+### Variant Organization
+
+Indices are organized by variant:
+
+- **Default variant**: Wheels without variant suffix (i.e., built with the current `VLLM_MAIN_CUDA_VERSION`) are placed in the root.
+- **Variant subdirectories**: Wheels with variant suffixes (e.g., `+cu130`, `.cpu`) are organized in subdirectories.
+- **Alias to default**: The default variant can have an alias (e.g., `cu129` for now) for consistency and convenience.
+
+The variant is extracted from the wheel filename (as described in the [file name convention](https://packaging.python.org/en/latest/specifications/binary-distribution-format/#file-name-convention)):
+
+- The variant is encoded in the local version identifier (e.g. `+cu129` or `dev<N>+g<hash>.cu130`).
+- Examples:
+    - `vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl` → default variant
+    - `vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl` → `cu129` variant
+    - `vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl` → `cu130` variant
+
+## Index Generation Details
+
+The `generate-nightly-index.py` script performs the following:
+
+1. **Parses wheel filenames** using regex to extract:
+    - Package name
+    - Version (with variant extracted)
+    - Python tag, ABI tag, platform tag
+    - Build tag (if present)
+2. **Groups wheels by variant**, then by package name:
+    - Currently only `vllm` is built, but the structure supports multiple packages in the future.
+3. **Generates HTML indices** (compliant with the [Simple repository API](https://packaging.python.org/en/latest/specifications/simple-repository-api/#simple-repository-api)):
+    - Top-level `index.html`: Lists all packages and variant subdirectories
+    - Package-level `index.html`: Lists all wheel files for that package
+    - Uses relative paths to wheel files for portability
+4. **Generates metadata.json**:
+    - Machine-readable JSON containing all wheel metadata
+    - Includes `path` field with URL-encoded relative path to wheel file
+    - Used by `setup.py` to locate compatible pre-compiled wheels during Python-only builds
+
+### Special Handling for AWS Services
+
+The wheels and indices are directly stored on AWS S3, and we use AWS CloudFront as a CDN in front of the S3 bucket.
+
+Since S3 does not provide proper directory listing, to support PyPI-compatible simple repository API behavior, we deploy a CloudFront Function that:
+
+- redirects any URL that does not end with `/` and does not look like a file (i.e., does not contain a dot `.` in the last path segment) to the same URL with a trailing `/`
+- appends `/index.html` to any URL that ends with `/`
+
+For example, the following requests would be handled as:
+
+- `/nightly` -> `/nightly/index.html`
+- `/nightly/cu130/` -> `/nightly/cu130/index.html`
+- `/nightly/index.html` or `/nightly/vllm.whl` -> unchanged
+
+!!! note "AWS S3 Filename Escaping"
+
+    S3 will automatically escape filenames upon upload according to its [naming rule](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html). The direct impact on vllm is that `+` in filenames will be converted to `%2B`. We take special care in the index generation script to escape filenames properly when generating the HTML indices and JSON metadata, to ensure the URLs are correct and can be directly used.
+
+## Usage of precompiled wheels in `setup.py` {#precompiled-wheels-usage}
+
+When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
+
+1. **Determines wheel location** via `precompiled_wheel_utils.determine_wheel_url()`:
+    - Env var `VLLM_PRECOMPILED_WHEEL_LOCATION` (user-specified URL/path) always takes precedence and skips all other steps.
+    - Determines the variant from `VLLM_MAIN_CUDA_VERSION` (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_VARIANT`); the default variant will also be tried as a fallback.
+    - Determines the _base commit_ (explained later) of this branch (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_COMMIT`).
+2. **Fetches metadata** from `https://wheels.vllm.ai/{commit}/vllm/metadata.json` (for the default variant) or `https://wheels.vllm.ai/{commit}/{variant}/vllm/metadata.json` (for a specific variant).
+3. **Selects compatible wheel** based on:
+    - Package name (`vllm`)
+    - Platform tag (architecture match)
+4. **Downloads and extracts** precompiled binaries from the wheel:
+    - C++ extension modules (`.so` files)
+    - Flash Attention Python modules
+    - Triton kernel Python files
+5. **Patches package_data** to include extracted files in the installation
+
+!!! note "What is the base commit?"
+
+    The base commit is determined by finding the merge-base
+    between the current branch and upstream `main`, ensuring
+    compatibility between source code and precompiled binaries.
+
+_Note: it's users' responsibility to ensure there is no native code (e.g., C++ or CUDA) changes before using precompiled wheels._
+
+## Implementation Files
+
+Key files involved in the nightly wheel mechanism:
+
+- **`.buildkite/release-pipeline.yaml`**: CI pipeline that builds wheels
+- **`.buildkite/scripts/upload-wheels.sh`**: Script that uploads wheels and generates indices
+- **`.buildkite/scripts/generate-nightly-index.py`**: Python script that generates PyPI-compatible indices
+- **`setup.py`**: Contains `precompiled_wheel_utils` class for fetching and using precompiled wheels
-- 
GitLab


From b286a311c2bab639ce6edc998a9a67c0affc9cbf Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 5 Dec 2025 01:21:24 +0800
Subject: [PATCH 093/258] [Chore] Deprecate `merge_by_field_config` arg
 (#30035)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../multimodal/processing/test_common.py      |   4 +-
 .../multimodal/processing/test_glm4_1v.py     |   7 +-
 .../processing/test_tensor_schema.py          |   5 +-
 tests/multimodal/test_cache.py                |   9 +-
 tests/multimodal/test_inputs.py               |  91 -----------
 vllm/model_executor/models/deepseek_ocr.py    |   4 +-
 vllm/model_executor/models/interfaces.py      |   2 +-
 vllm/model_executor/models/lightonocr.py      |   4 +-
 .../model_executor/models/nano_nemotron_vl.py |  12 +-
 vllm/model_executor/models/opencua.py         |   4 +-
 vllm/model_executor/models/paddleocr_vl.py    |   4 +-
 vllm/model_executor/models/paligemma.py       |   8 +-
 vllm/model_executor/models/qwen2_5_vl.py      |   4 +-
 vllm/multimodal/cache.py                      |   9 +-
 vllm/multimodal/inputs.py                     | 142 ++++++------------
 vllm/multimodal/utils.py                      |  59 ++------
 vllm/v1/serial_utils.py                       |  19 ---
 vllm/v1/worker/gpu_model_runner.py            |   3 -
 vllm/v1/worker/tpu_model_runner.py            |   2 -
 19 files changed, 90 insertions(+), 302 deletions(-)
 delete mode 100644 tests/multimodal/test_inputs.py

diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 6b9d388f2..2e032ac4c 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -20,7 +20,7 @@ from vllm.config.multimodal import (
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
-from vllm.multimodal.inputs import MultiModalInputs
+from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.tokenizers import (
     MistralTokenizer,
@@ -418,4 +418,4 @@ def _assert_inputs_equal(
         a_data.pop(key, None)
         b_data.pop(key, None)
 
-    assert a_data == b_data, msg
+    assert batched_tensors_equal(a_data, b_data), msg
diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index 553a5f719..51071c935 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -5,6 +5,7 @@ import pytest
 
 from vllm.assets.video import VideoAsset
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import batched_tensors_equal
 from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
 
 from ...utils import build_model_context
@@ -103,7 +104,7 @@ def test_video_loader_consistency(
     dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
 
     assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
-    assert (
-        static_outputs["mm_kwargs"].get_data()
-        == dynamic_outputs["mm_kwargs"].get_data()
+    assert batched_tensors_equal(
+        static_outputs["mm_kwargs"].get_data(),
+        dynamic_outputs["mm_kwargs"].get_data(),
     )
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 7628ab4fe..5d489549c 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -130,10 +130,9 @@ def create_batched_mm_kwargs(
         hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         tokenization_kwargs=processor_inputs.tokenization_kwargs,
     )["mm_kwargs"].require_data()
-    items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
+
     return group_mm_kwargs_by_modality(
-        items,
-        merge_by_field_config=model_cls.merge_by_field_config,
+        [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
     )
 
 
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index 2ddc93f8d..e4fcc3474 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -85,12 +85,6 @@ def _dummy_items(
         (_dummy_item("a", {"a1": 100}), 100),
         (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
         (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
-        (
-            _dummy_items(
-                {"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
-            ).get_data(),
-            460,
-        ),  # noqa: E501
     ],
 )
 def test_cache_item_size(item, expected_size):
@@ -107,6 +101,9 @@ def test_cache_item_size(item, expected_size):
     cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update])
     assert cache.currsize == expected_size
 
+    cache[""] = item.get_data()
+    assert cache.currsize == expected_size
+
 
 def _create_vllm_config(
     *,
diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py
deleted file mode 100644
index 88e92bee3..000000000
--- a/tests/multimodal/test_inputs.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-
-from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-
-pytestmark = pytest.mark.cpu_test
-
-
-def assert_nested_tensors_equal(expected: NestedTensors, actual: NestedTensors):
-    assert type(expected) == type(actual)  # noqa: E721
-    if isinstance(expected, torch.Tensor):
-        assert torch.equal(expected, actual)
-    else:
-        for expected_item, actual_item in zip(expected, actual):
-            assert_nested_tensors_equal(expected_item, actual_item)
-
-
-def assert_multimodal_inputs_equal(
-    expected: MultiModalKwargs, actual: MultiModalKwargs
-):
-    assert set(expected.keys()) == set(actual.keys())
-    for key in expected:
-        assert_nested_tensors_equal(expected[key], actual[key])
-
-
-def test_multimodal_input_batch_single_tensor():
-    t = torch.rand([1, 2])
-    result = MultiModalKwargs.batch([{"image": t}])
-    assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)})
-
-
-def test_multimodal_input_batch_multiple_tensors():
-    a = torch.rand([1, 1, 2])
-    b = torch.rand([1, 1, 2])
-    c = torch.rand([1, 1, 2])
-    result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
-    assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])})
-
-
-def test_multimodal_input_batch_multiple_heterogeneous_tensors():
-    a = torch.rand([1, 2, 2])
-    b = torch.rand([1, 3, 2])
-    c = torch.rand([1, 4, 2])
-    result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
-    assert_multimodal_inputs_equal(result, {"image": [a, b, c]})
-
-
-def test_multimodal_input_batch_nested_tensors():
-    a = torch.rand([2, 3])
-    b = torch.rand([2, 3])
-    c = torch.rand([2, 3])
-    result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b]}, {"image": [c]}])
-    assert_multimodal_inputs_equal(
-        result, {"image": torch.stack([a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)])}
-    )
-
-
-def test_multimodal_input_batch_heterogeneous_lists():
-    a = torch.rand([1, 2, 3])
-    b = torch.rand([1, 2, 3])
-    c = torch.rand([1, 2, 3])
-    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
-    assert_multimodal_inputs_equal(
-        result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]}
-    )
-
-
-def test_multimodal_input_batch_multiple_batchable_lists():
-    a = torch.rand([1, 2, 3])
-    b = torch.rand([1, 2, 3])
-    c = torch.rand([1, 2, 3])
-    d = torch.rand([1, 2, 3])
-    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
-    assert_multimodal_inputs_equal(
-        result, {"image": torch.stack([torch.stack([a, b]), torch.stack([c, d])])}
-    )
-
-
-def test_multimodal_input_batch_mixed_stacking_depths():
-    a = torch.rand([1, 2, 3])
-    b = torch.rand([1, 3, 3])
-    c = torch.rand([1, 4, 3])
-
-    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
-    assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})
-
-    result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}])
-    assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index 019fb3e29..a612ebd95 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
-    MultiModalKwargs,
+    MultiModalKwargsItems,
     NestedTensors,
 )
 from vllm.multimodal.parse import (
@@ -305,7 +305,7 @@ class DeepseekOCRMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 0f65683cf..01b3e7827 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -78,7 +78,7 @@ class SupportsMultiModal(Protocol):
     `multimodal_config.mm_encoder_tp_mode="data"`.
     """
 
-    merge_by_field_config: ClassVar[bool] = False
+    merge_by_field_config: ClassVar[bool] = True
     """
     A flag that indicates which implementation of
     `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
diff --git a/vllm/model_executor/models/lightonocr.py b/vllm/model_executor/models/lightonocr.py
index 9839e4f8f..353ee7806 100644
--- a/vllm/model_executor/models/lightonocr.py
+++ b/vllm/model_executor/models/lightonocr.py
@@ -28,7 +28,7 @@ from vllm.model_executor.models.utils import (
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
 from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (
     BaseMultiModalProcessor,
@@ -103,7 +103,7 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 891a9ce08..c4198d36b 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -52,7 +52,6 @@ from vllm.multimodal.evs import (
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
-    MultiModalKwargs,
     MultiModalKwargsItems,
     VideoItem,
 )
@@ -849,17 +848,18 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_kwargs:
+        elif "image_embeds" in out_mm_data:
             # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+            image_num_patches = [None] * len(out_mm_data["image_embeds"])
         else:
             image_num_patches = []
 
diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py
index 433891866..b92f0c9da 100644
--- a/vllm/model_executor/models/opencua.py
+++ b/vllm/model_executor/models/opencua.py
@@ -23,7 +23,7 @@ from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalFieldConfig,
-    MultiModalKwargs,
+    MultiModalKwargsItems,
 )
 from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
 from vllm.multimodal.processing import (
@@ -153,7 +153,7 @@ class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo])
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 5256d8ba7..1df5ff62f 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -62,7 +62,7 @@ from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFeatureSpec,
     MultiModalFieldConfig,
-    MultiModalKwargs,
+    MultiModalKwargsItems,
 )
 from vllm.multimodal.parse import (
     ImageProcessorItems,
@@ -307,7 +307,7 @@ class PaddleOCRVLMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index ec5d0fa62..9fa32f01d 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -40,7 +40,6 @@ from .siglip import SiglipVisionModel
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
-    flatten_bn,
     init_vllm_registered_model,
     maybe_prefix,
 )
@@ -252,6 +251,8 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
     dummy_inputs=PaliGemmaDummyInputsBuilder,
 )
 class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -327,9 +328,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
             return None
 
         if pixel_values is not None:
-            pixel_values = flatten_bn(pixel_values, concat=True)
-
             h = w = self.config.vision_config.image_size
+
             return PaliGemmaImagePixelInputs(
                 type="pixel_values",
                 data=pixel_values,
@@ -337,8 +337,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
             )
 
         if image_embeds is not None:
-            image_embeds = flatten_bn(image_embeds, concat=True)
-
             return PaliGemmaImageEmbeddingInputs(
                 type="image_embeds",
                 data=image_embeds,
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 6ca490f46..cb521ebdf 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -77,7 +77,7 @@ from vllm.multimodal.evs import (
 from vllm.multimodal.inputs import (
     MultiModalFeatureSpec,
     MultiModalFieldConfig,
-    MultiModalKwargs,
+    MultiModalKwargsItems,
 )
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
@@ -973,7 +973,7 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 97f6aa461..67bdf5e15 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -25,7 +25,6 @@ from .inputs import (
     MultiModalBatchedField,
     MultiModalFeatureSpec,
     MultiModalFieldElem,
-    MultiModalKwargs,
     MultiModalKwargsItem,
     MultiModalKwargsItems,
     NestedTensors,
@@ -90,7 +89,6 @@ MultiModalCacheValue: TypeAlias = (
     | MultiModalProcessorCacheItemMetadata
     | MultiModalKwargsItems
     | MultiModalKwargsItem
-    | MultiModalKwargs
     | Mapping[str, NestedTensors]
 )
 
@@ -108,12 +106,7 @@ class MultiModalCache:
         # These are not subclasses of dict
         if isinstance(
             leaf,
-            (
-                MultiModalKwargs,
-                MultiModalKwargsItems,
-                MultiModalKwargsItem,
-                MultiModalFieldElem,
-            ),
+            (MultiModalKwargsItems, MultiModalKwargsItem, MultiModalFieldElem),
         ):
             return cls.get_item_size(leaf.data)  # type: ignore
 
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 397684fa2..32f15240c 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -3,7 +3,7 @@
 
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
-from collections.abc import Mapping, Sequence
+from collections.abc import Mapping, Sequence, Set
 from dataclasses import dataclass
 from functools import partial
 from itertools import accumulate
@@ -201,8 +201,10 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
 
 
 def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
-    """Equality check between
-    [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects."""
+    """
+    Equality check between
+    [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.
+    """
     if isinstance(a, torch.Tensor):
         return isinstance(b, torch.Tensor) and torch.equal(a, b)
     elif isinstance(b, torch.Tensor):
@@ -224,10 +226,24 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
 BatchedTensorInputs: TypeAlias = dict[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
-[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch].
+[`MultiModalKwargsItems.get_data`][vllm.multimodal.inputs.MultiModalKwargsItems.get_data].
 """
 
 
+def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> bool:
+    """
+    Equality check between
+    [`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
+    """
+    for k in a:
+        if k not in b:
+            return False
+        if not nested_tensors_equal(a[k], b[k]):
+            return False
+
+    return True
+
+
 @dataclass
 class MultiModalFeatureSpec:
     """
@@ -823,7 +839,14 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
 
         return self  # type: ignore[return-value]
 
-    def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs":
+    def get_data(
+        self,
+        *,
+        device: torch.types.Device = None,
+        pin_memory: bool = False,
+        cpu_fields: Set[str] = frozenset(),
+    ) -> BatchedTensorInputs:
+        """Construct a dictionary of keyword arguments to pass to the model."""
         elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
         for modality, items in self.items():
             for i, item in enumerate(items):
@@ -835,12 +858,23 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
                 for key, elem in item.items():
                     elems_by_key[key].append(elem)
 
-        return MultiModalKwargs(
-            {
-                key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
-                for key, elems in elems_by_key.items()
-            }
-        )
+        data = {
+            key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
+            for key, elems in elems_by_key.items()
+        }
+
+        if device is not None:
+            for k in data.keys() - cpu_fields:
+                data[k] = json_map_leaves(
+                    (
+                        lambda x: x.to(device=device, non_blocking=True)
+                        if isinstance(x, torch.Tensor)
+                        else x
+                    ),
+                    data[k],
+                )
+
+        return data
 
 
 MultiModalKwargsOptionalItems: TypeAlias = (
@@ -849,6 +883,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
 )
 
 
+@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
 class MultiModalKwargs(UserDict[str, NestedTensors]):
     """
     A dictionary that represents the keyword arguments to
@@ -882,91 +917,6 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
     ):
         return MultiModalKwargsItems.from_seq(items).get_data(pin_memory=pin_memory)
 
-    @staticmethod
-    def _try_stack(
-        nested_tensors: NestedTensors, pin_memory: bool = False
-    ) -> NestedTensors:
-        """
-        Stack the inner dimensions that have the same shape in
-        a nested list of tensors.
-
-        Thus, a dimension represented by a list means that the inner
-        dimensions are different for each element along that dimension.
-        """
-        if isinstance(nested_tensors, torch.Tensor):
-            return nested_tensors
-
-        # TODO: Remove these once all models have been migrated
-        if isinstance(nested_tensors, np.ndarray):
-            return torch.from_numpy(nested_tensors)
-        if isinstance(nested_tensors, (int, float)):
-            return torch.tensor(nested_tensors)
-
-        stacked = [MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors]
-        if not is_list_of(stacked, torch.Tensor, check="all"):
-            # Only tensors (not lists) can be stacked.
-            return stacked
-
-        tensors_ = cast(list[torch.Tensor], stacked)
-        if len(tensors_) == 1:
-            # An optimization when `tensors_` contains only one tensor:
-            # - produce exactly same result as `torch.stack(tensors_)`
-            # - will achieve zero-copy if the tensor is contiguous
-            return tensors_[0].unsqueeze(0).contiguous()
-
-        if any(t.shape != tensors_[0].shape for t in tensors_):
-            # The tensors have incompatible shapes and can't be stacked.
-            return tensors_
-
-        outputs = torch.empty(
-            len(tensors_),
-            *tensors_[0].shape,
-            dtype=tensors_[0].dtype,
-            device=tensors_[0].device,
-            pin_memory=pin_memory,
-        )
-        return torch.stack(tensors_, out=outputs)
-
-    @staticmethod
-    def batch(
-        inputs_list: list["MultiModalKwargs"], pin_memory: bool = False
-    ) -> BatchedTensorInputs:
-        """
-        Batch multiple inputs together into a dictionary.
-
-        The resulting dictionary has the same keys as the inputs.
-        If the corresponding value from each input is a tensor and they all
-        share the same shape, the output value is a single batched tensor;
-        otherwise, the output value is a list containing the original value
-        from each input.
-        """
-        if len(inputs_list) == 0:
-            return {}
-
-        # We need to consider the case where each item in the batch
-        # contains different modalities (i.e. different keys).
-        item_lists = defaultdict[str, list[NestedTensors]](list)
-
-        for inputs in inputs_list:
-            for k, v in inputs.items():
-                item_lists[k].append(v)
-
-        return {
-            k: MultiModalKwargs._try_stack(item_list, pin_memory)
-            for k, item_list in item_lists.items()
-        }
-
-    @staticmethod
-    def as_kwargs(
-        batched_inputs: BatchedTensorInputs,
-        *,
-        device: torch.types.Device,
-    ) -> BatchedTensorInputs:
-        return json_map_leaves(
-            lambda x: x.to(device=device, non_blocking=True),
-            batched_inputs,
-        )
-
     def __getitem__(self, key: str):
         if key not in self:
             raise KeyError(
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 184022085..f8e8847e8 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -19,7 +19,6 @@ from PIL import Image, UnidentifiedImageError
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
-from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.registry import ExtensionManager
 
 from .audio import AudioEmbeddingMediaIO, AudioMediaIO
@@ -427,59 +426,25 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
-    if merge_by_field_config is None:
-        raise RuntimeError(
-            "`group_mm_kwargs_by_modality` now requires "
-            "`merge_by_field_config` arg, please update your model runner "
-            "according to https://github.com/vllm-project/vllm/pull/25676."
-        )
-    if merge_by_field_config is False:
+    # TODO: After v0.13, remove merge_by_field_config attribute from model impls
+    if merge_by_field_config is not None:
         logger.warning_once(
-            "The legacy code for batching multi-modal kwargs is deprecated and "
-            "will be removed in v0.12. Please update your model with "
-            "`merge_by_field_config=True` to use the new code defined by "
-            "`MultiModalFieldConfig`. You can refer to "
-            "https://github.com/vllm-project/vllm/issues/26149 "
-            "for some examples on how to do this."
+            "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
+            "is deprecated and will be removed in v0.13."
         )
 
-    from vllm.multimodal.inputs import MultiModalKwargs, MultiModalKwargsItems
+    from vllm.multimodal.inputs import MultiModalKwargsItems
 
     for modality, items in groupby(mm_kwargs, key=lambda item: item.modality):
         items_lst = list(items)
+        mm_kwargs_items = MultiModalKwargsItems.from_seq(items_lst)
+        mm_kwargs_data = mm_kwargs_items.get_data(
+            device=device,
+            pin_memory=pin_memory,
+            cpu_fields=multimodal_cpu_fields,
+        )
 
-        if merge_by_field_config:
-            mm_kwargs_group: BatchedTensorInputs = dict(
-                MultiModalKwargsItems.from_seq(items_lst).get_data(
-                    pin_memory=pin_memory
-                )
-            )
-
-            if device is not None:
-                mm_kwargs_group = {
-                    k: json_map_leaves(
-                        lambda x: x.to(device=device, non_blocking=True)
-                        if isinstance(x, torch.Tensor)
-                        else x,
-                        v,
-                    )
-                    if k not in multimodal_cpu_fields
-                    else v
-                    for k, v in mm_kwargs_group.items()
-                }
-        else:
-            mm_kwargs_group = MultiModalKwargs.as_kwargs(
-                MultiModalKwargs.batch(
-                    [
-                        MultiModalKwargsItems.from_seq([item]).get_data()
-                        for item in items_lst
-                    ],
-                    pin_memory=pin_memory,
-                ),
-                device=device,
-            )
-
-        yield modality, len(items_lst), mm_kwargs_group
+        yield modality, len(items_lst), mm_kwargs_data
 
 
 def fetch_audio(
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index 0a6806390..14ae487f3 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -27,7 +27,6 @@ from vllm.multimodal.inputs import (
     MultiModalFieldConfig,
     MultiModalFieldElem,
     MultiModalFlatField,
-    MultiModalKwargs,
     MultiModalKwargsItem,
     MultiModalKwargsItems,
     MultiModalSharedField,
@@ -176,9 +175,6 @@ class MsgpackEncoder:
         if isinstance(obj, MultiModalKwargsItems):
             return self._encode_mm_items(obj)
 
-        if isinstance(obj, MultiModalKwargs):
-            return self._encode_mm_kwargs(obj)
-
         if isinstance(obj, UtilityResult):
             result = obj.result
             if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
@@ -259,11 +255,6 @@ class MsgpackEncoder:
             "field": self._encode_mm_field(elem.field),
         }
 
-    def _encode_mm_kwargs(self, kw: MultiModalKwargs) -> dict[str, Any]:
-        return {
-            modality: self._encode_nested_tensors(data) for modality, data in kw.items()
-        }
-
     def _encode_nested_tensors(self, nt: NestedTensors) -> Any:
         if isinstance(nt, torch.Tensor):
             return self._encode_tensor(nt)
@@ -325,8 +316,6 @@ class MsgpackDecoder:
                 return self._decode_mm_item(obj)
             if issubclass(t, MultiModalKwargsItems):
                 return self._decode_mm_items(obj)
-            if issubclass(t, MultiModalKwargs):
-                return self._decode_mm_kwargs(obj)
             if t is UtilityResult:
                 return self._decode_utility_result(obj)
         return obj
@@ -414,14 +403,6 @@ class MsgpackDecoder:
         obj["field"] = factory_meth(None, *field_args).field
         return MultiModalFieldElem(**obj)
 
-    def _decode_mm_kwargs(self, obj: dict[str, Any]) -> MultiModalKwargs:
-        return MultiModalKwargs(
-            {
-                modality: self._decode_nested_tensors(data)
-                for modality, data in obj.items()
-            }
-        )
-
     def _decode_nested_tensors(self, obj: Any) -> NestedTensors:
         if isinstance(obj, (int, float)):
             # Although it violates NestedTensors type, MultiModalKwargs
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a7eb9cdae..58043a42d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2106,7 +2106,6 @@ class GPUModelRunner(
             mm_kwargs,
             device=self.device,
             pin_memory=self.pin_memory,
-            merge_by_field_config=model.merge_by_field_config,
             multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             curr_group_outputs: list[torch.Tensor] = []
@@ -2133,7 +2132,6 @@ class GPUModelRunner(
                             [video_mm_kwargs_item],
                             device=self.device,
                             pin_memory=self.pin_memory,
-                            merge_by_field_config=model.merge_by_field_config,
                             multimodal_cpu_fields=model.multimodal_cpu_fields,
                         )
                     )
@@ -3849,7 +3847,6 @@ class GPUModelRunner(
                 dummy_mm_items,
                 device=self.device,
                 pin_memory=self.pin_memory,
-                merge_by_field_config=model.merge_by_field_config,
                 multimodal_cpu_fields=model.multimodal_cpu_fields,
             )
         )
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index f3dd9aa96..292f12969 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -969,7 +969,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             mm_kwargs,
             device=self.device,
             pin_memory=self.pin_memory,
-            merge_by_field_config=model.merge_by_field_config,
             multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             # Run the encoder.
@@ -2058,7 +2057,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 dummy_mm_items,
                 device=self.device,
                 pin_memory=self.pin_memory,
-                merge_by_field_config=model.merge_by_field_config,
                 multimodal_cpu_fields=model.multimodal_cpu_fields,
             )
         )
-- 
GitLab


From 46cbbca05c31372f672476f5fc3f37b8bbdd5457 Mon Sep 17 00:00:00 2001
From: Qiu <qiuchunshuo@huawei.com>
Date: Fri, 5 Dec 2025 01:28:21 +0800
Subject: [PATCH 094/258] [CI][DCP][Perf] reduce DCP CI execution time (#29858)

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
---
 tests/distributed/test_context_parallel.py | 188 +++++++++++----------
 tests/models/registry.py                   |   6 +-
 2 files changed, 100 insertions(+), 94 deletions(-)

diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index 7e4713b8a..3cb533dcc 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -16,16 +16,35 @@ from typing import Literal, NamedTuple
 import pytest
 import torch
 
+from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from tests.utils import RemoteOpenAIServer, create_new_process_for_each_test
 from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
 
 from ..models.registry import HF_EXAMPLE_MODELS
-from ..utils import compare_two_settings, create_new_process_for_each_test
 
 logger = init_logger("test_context_parallel")
 
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
 
+CP_TEST_MODELS = [
+    # TODO support other models
+    # [LANGUAGE GENERATION]
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
+    "Qwen/Qwen2.5-1.5B-Instruct",
+]
+
+# GSM8K eval configuration
+NUM_QUESTIONS = 256  # Fast eval for CI
+NUM_SHOTS = 5  # Few-shot examples
+# tp accuracy with 2% buffer
+MIN_ACCURACY = {
+    # .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": 0.64,
+    # .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+    "Qwen/Qwen2.5-1.5B-Instruct": 0.52,
+}
+
 
 class ParallelSetup(NamedTuple):
     tp_size: int
@@ -38,7 +57,6 @@ class ParallelSetup(NamedTuple):
 
 class CPTestOptions(NamedTuple):
     multi_node_only: bool
-    load_format: str | None = None
     attn_backend: str | None = None
 
 
@@ -54,17 +72,20 @@ class CPTestSettings:
         *,
         tp_base: int = 4,
         pp_base: int = 1,
-        dcp_base: int = 1,
+        dcp_multipliers: list[float] | None = None,
         cp_kv_cache_interleave_size: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
-        load_format: str | None = None,
         attn_backend: str | None = None,
     ):
         parallel_setups = []
+        if dcp_multipliers is None:
+            dcp_multipliers = [
+                0.5,
+            ]
         for eager_mode_val in [False]:
             for pp_multiplier in [1]:
-                for dcp_multiplier in [0.5, 1]:
+                for dcp_multiplier in dcp_multipliers:
                     for chunked_prefill_val in [True]:
                         parallel_setups.append(
                             ParallelSetup(
@@ -82,7 +103,6 @@ class CPTestSettings:
             runner=runner,
             test_options=CPTestOptions(
                 multi_node_only=multi_node_only,
-                load_format=load_format,
                 attn_backend=attn_backend,
             ),
         )
@@ -101,7 +121,24 @@ class CPTestSettings:
                 )
 
 
-def _compare_cp_with_tp(
+CP_TEXT_GENERATION_MODELS = {
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
+        CPTestSettings.detailed(
+            dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64
+        ),
+    ],
+    "Qwen/Qwen2.5-1.5B-Instruct": [
+        CPTestSettings.detailed(
+            cp_kv_cache_interleave_size=16, attn_backend="FLASH_ATTN"
+        ),
+        CPTestSettings.detailed(
+            cp_kv_cache_interleave_size=16, attn_backend="FLASHINFER"
+        ),
+    ],
+}
+
+
+def _test_cp_gsm8k(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
@@ -121,7 +158,7 @@ def _compare_cp_with_tp(
         chunked_prefill,
     ) = parallel_setup
 
-    multi_node_only, load_format, attn_backend = test_options
+    multi_node_only, attn_backend = test_options
 
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_transformers_version(on_fail="skip")
@@ -130,22 +167,7 @@ def _compare_cp_with_tp(
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
 
-    if load_format == "dummy":
-        # Avoid OOM
-        text_overrides = {
-            "num_hidden_layers": 4,
-            "hidden_size": 512,
-            "intermediate_size": 800,
-            "num_attention_heads": 4,
-            "num_key_value_heads": 1,
-        }
-
-        if is_multimodal:
-            hf_overrides.update({"text_config": text_overrides})
-        else:
-            hf_overrides.update(text_overrides)
-    else:
-        model_info.check_available_online(on_fail="skip")
+    model_info.check_available_online(on_fail="skip")
 
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
@@ -157,90 +179,70 @@ def _compare_cp_with_tp(
     if multi_node_only and not VLLM_MULTI_NODE:
         pytest.skip("Not in multi-node setting")
 
-    common_args = [
+    server_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
         "--max-model-len",
-        "2048",
+        "4096",
         "--max-num-seqs",
-        "8",
+        "64",
     ]
     if chunked_prefill:
-        common_args.append("--enable-chunked-prefill")
+        server_args.append("--enable-chunked-prefill")
     if eager_mode:
-        common_args.append("--enforce-eager")
+        server_args.append("--enforce-eager")
     if runner != "auto":
-        common_args.extend(["--runner", runner])
+        server_args.extend(["--runner", runner])
     if trust_remote_code:
-        common_args.append("--trust-remote-code")
+        server_args.append("--trust-remote-code")
     if tokenizer_mode:
-        common_args.extend(["--tokenizer-mode", tokenizer_mode])
-    if load_format:
-        common_args.extend(["--load-format", load_format])
+        server_args.extend(["--tokenizer-mode", tokenizer_mode])
     if hf_overrides:
-        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
-
-    if not attn_backend:
-        cp_env = tp_env = {}
-    else:
-        cp_env = tp_env = {
-            "VLLM_ATTENTION_BACKEND": attn_backend,
-        }
-
-    cp_args = [
-        *common_args,
-        "--tensor-parallel-size",
-        str(tp_size),
-        "--pipeline-parallel-size",
-        str(pp_size),
-        "--decode-context-parallel-size",
-        str(dcp_size),
-        "--dcp-kv-cache-interleave-size",
-        str(cp_kv_cache_interleave_size),
-        "--distributed-executor-backend",
-        distributed_backend,
-    ]
+        server_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+
+    server_args.extend(
+        [
+            "--tensor-parallel-size",
+            str(tp_size),
+            "--pipeline-parallel-size",
+            str(pp_size),
+            "--decode-context-parallel-size",
+            str(dcp_size),
+            "--dcp-kv-cache-interleave-size",
+            str(cp_kv_cache_interleave_size),
+            "--distributed-executor-backend",
+            distributed_backend,
+        ]
+    )
 
-    tp_args = [
-        *common_args,
-        "--tensor-parallel-size",
-        str(tp_size),
-        "--pipeline-parallel-size",
-        str(pp_size),
-        "--distributed-executor-backend",
-        distributed_backend,
-    ]
+    server_env = {}
+    if attn_backend:
+        server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
 
-    compare_two_settings(
+    with RemoteOpenAIServer(
         model_id,
-        cp_args,
-        tp_args,
-        cp_env,
-        tp_env,
-        method=method,
+        server_args,
+        env_dict=server_env,
         max_wait_seconds=720,
-    )
-
-
-CP_TEXT_GENERATION_MODELS = {
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
-        CPTestSettings.detailed(),
-        CPTestSettings.detailed(tp_base=2),
-        CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
-    ],
-    "bigcode/gpt_bigcode-santacoder": [
-        CPTestSettings.detailed(),
-        CPTestSettings.detailed(tp_base=2),
-    ],
-}
+    ) as remote_server:
+        host = f"http://{remote_server.host}"
+        port = remote_server.port
+
+        # Run GSM8K evaluation
+        results = evaluate_gsm8k(
+            num_questions=NUM_QUESTIONS,
+            num_shots=NUM_SHOTS,
+            host=host,
+            port=port,
+        )
 
-CP_TEST_MODELS = [
-    # TODO support other models
-    # [LANGUAGE GENERATION]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat",
-    "bigcode/gpt_bigcode-santacoder",
-]
+        # Validate accuracy is reasonable
+        accuracy = results["accuracy"]
+        min_accuracy = MIN_ACCURACY[model_id]
+        assert accuracy >= min_accuracy, (
+            f"TP+DCP accuracy too low: {accuracy:.3f} < {min_accuracy:.3f}"
+        )
 
 
 @pytest.mark.parametrize(
@@ -274,12 +276,12 @@ def test_cp_generation(
     ):
         pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
     if (
-        model_id == "bigcode/gpt_bigcode-santacoder"
+        model_id == "Qwen/Qwen2.5-1.5B-Instruct"
         and torch.cuda.get_device_capability() != (9, 0)
     ):
         pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
 
-    _compare_cp_with_tp(
+    _test_cp_gsm8k(
         model_id,
         parallel_setup,
         distributed_backend,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b9f9945eb..352abdd2d 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -416,7 +416,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         trust_remote_code=True,
     ),
     "Qwen2ForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}
+        "Qwen/Qwen2-0.5B-Instruct",
+        extras={
+            "2.5": "Qwen/Qwen2.5-0.5B-Instruct",
+            "2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
+        },
     ),
     "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
-- 
GitLab


From 6dcb07f676ae473703b4ab69ef15d93f51d24ce8 Mon Sep 17 00:00:00 2001
From: Tao Yun <30410832+taoyun951753@users.noreply.github.com>
Date: Fri, 5 Dec 2025 01:34:06 +0800
Subject: [PATCH 095/258] support qwen3-vl handle requests with embeddings
 (#30037)

Signed-off-by: taoyun <1069423820@qq.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/features/multimodal_inputs.md     | 2 ++
 vllm/model_executor/models/qwen3_vl.py | 7 +++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 2b25dc766..0adb32a7a 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -443,6 +443,8 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
         print(generated_text)
     ```
 
+For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features.
+
 #### Audio Embeddings
 
 You can pass pre-computed audio embeddings similar to image embeddings:
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 1d3929b93..58721303d 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -103,7 +103,7 @@ from .qwen2_5_vl import (
     Qwen2_5_VLVideoInputs,
     Qwen2_5_VLVideoPixelInputs,
 )
-from .qwen2_vl import Qwen2VLProcessingInfo
+from .qwen2_vl import Qwen2VLMultiModalDataParser, Qwen2VLProcessingInfo
 from .qwen3 import Qwen3ForCausalLM, Qwen3Model
 from .utils import (
     AutoWeightsLoader,
@@ -884,7 +884,10 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
 
 class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
-        return MultiModalDataParser(video_needs_metadata=True)
+        return Qwen2VLMultiModalDataParser(
+            self.info.get_hf_config().vision_config.spatial_merge_size,
+            video_needs_metadata=True,
+        )
 
     def _call_hf_processor(
         self,
-- 
GitLab


From 652ba93da36d793e7f3ff8a0ecdb5d6b00107e68 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 5 Dec 2025 02:17:49 +0800
Subject: [PATCH 096/258] [Bugfix] Fix FP8 MoE LoRA (#29890)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/layers/quantization/fp8.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 48223c9f1..0e3e13f59 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -124,12 +124,16 @@ class Fp8MoeBackend(Enum):
 
 
 def get_fp8_moe_backend(
-    block_quant: bool, moe_parallel_config: FusedMoEParallelConfig
+    block_quant: bool,
+    moe_parallel_config: FusedMoEParallelConfig,
+    with_lora_support: bool,
 ) -> Fp8MoeBackend:
     """
     Select the primary FP8 MoE backend
     Note: Shape-specific fallbacks may still occur at runtime.
     """
+    if with_lora_support:
+        return Fp8MoeBackend.TRITON
     # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
     if (
         current_platform.is_cuda()
@@ -665,7 +669,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant: bool = self.weight_block_size is not None
         self.fp8_backend = get_fp8_moe_backend(
-            self.block_quant, layer.moe_parallel_config
+            self.block_quant, layer.moe_parallel_config, self.moe.is_lora_enabled
         )
 
         self.marlin_input_dtype = None
@@ -1084,6 +1088,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         from vllm.model_executor.layers.fused_moe import (
             BatchedDeepGemmExperts,
             BatchedTritonExperts,
+            TritonExperts,
             TritonOrDeepGemmExperts,
         )
 
@@ -1116,7 +1121,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 num_dispatchers=prepare_finalize.num_dispatchers(),
                 quant_config=self.moe_quant_config,
             )
-
+        elif self.moe.is_lora_enabled:
+            return TritonExperts(quant_config=self.moe_quant_config)
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
             # Select GEMM experts with block-scale when weights are block-quantized
             experts = select_cutlass_fp8_gemm_impl(
-- 
GitLab


From ece2825a29e6b54ce6b114c27ec7ea498c66b416 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 5 Dec 2025 02:20:48 +0800
Subject: [PATCH 097/258] [KVConnector] Remove v0-related kv connector
 components such as kv pipe and kv lookup buffer (#29705)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 tests/kv_transfer/test_lookup_buffer.py       | 160 ----------
 tests/kv_transfer/test_lookup_buffer.sh       |   8 -
 tests/kv_transfer/test_module.py              |  62 ----
 tests/kv_transfer/test_send_recv.py           | 154 ---------
 tests/kv_transfer/test_send_recv.sh           |   9 -
 .../kv_transfer/kv_lookup_buffer/__init__.py  |   0
 .../kv_transfer/kv_lookup_buffer/base.py      | 179 -----------
 .../kv_lookup_buffer/mooncake_store.py        | 164 ----------
 .../kv_lookup_buffer/simple_buffer.py         | 242 --------------
 .../kv_transfer/kv_pipe/__init__.py           |   0
 vllm/distributed/kv_transfer/kv_pipe/base.py  |  66 ----
 .../kv_transfer/kv_pipe/mooncake_pipe.py      | 295 ------------------
 .../kv_transfer/kv_pipe/pynccl_pipe.py        | 285 -----------------
 13 files changed, 1624 deletions(-)
 delete mode 100644 tests/kv_transfer/test_lookup_buffer.py
 delete mode 100644 tests/kv_transfer/test_lookup_buffer.sh
 delete mode 100644 tests/kv_transfer/test_module.py
 delete mode 100644 tests/kv_transfer/test_send_recv.py
 delete mode 100644 tests/kv_transfer/test_send_recv.sh
 delete mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py
 delete mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
 delete mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
 delete mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
 delete mode 100644 vllm/distributed/kv_transfer/kv_pipe/__init__.py
 delete mode 100644 vllm/distributed/kv_transfer/kv_pipe/base.py
 delete mode 100644 vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
 delete mode 100644 vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py

diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
deleted file mode 100644
index a61ccef70..000000000
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-import random
-
-import torch
-from tqdm import tqdm
-
-from vllm.config import KVTransferConfig
-from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer
-from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
-
-# TODO: the test depends on a lot of fields in the current implementation.
-# We should have standard interface instead direct field access
-
-
-def test_run(my_rank, buffer, device):
-    # buffer should be empty in the beginning
-    if my_rank == 0:
-        assert buffer.buffer_size == 0
-        assert len(buffer.buffer) == 0
-
-    print(f"My rank: {my_rank}, device: {device}")
-
-    # insert
-    tokens = torch.tensor([1, 2, 3]).to(device)
-    roi = tokens > 0
-    if my_rank == 0:
-        key = 2.0 * torch.ones([5, 6]).to(device)
-        value = 3.0 * torch.ones([5, 6]).to(device)
-
-        placeholder = torch.tensor([1]).to(device)
-
-        buffer.insert(tokens, roi, key, value, placeholder)
-
-    torch.distributed.barrier()
-
-    # drop_select
-    if my_rank == 1:
-        tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi)
-        assert torch.allclose(tokens, tok)
-        assert torch.allclose(roi, roi_)
-        assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device))
-        assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device))
-    torch.distributed.barrier()
-
-    if my_rank == 0:
-        assert buffer.buffer_size == 0
-        assert len(buffer.buffer) == 0
-
-    print(f"My rank: {my_rank}, Test run passed!")
-
-
-def stress_test(my_rank, buf, device):
-    torch.distributed.barrier()
-    torch.manual_seed(100)
-
-    reqs = [
-        (
-            torch.rand(100).to(device),  # tokens
-            torch.ones(100).bool().to(device),  # roi
-            torch.rand(100).to(device),  # key
-            torch.rand(100).to(device),  # value
-            torch.rand(100).to(device),  # hidden
-        )
-        for i in tqdm(range(200))
-    ]
-
-    random.seed(my_rank)
-    random.shuffle(reqs)
-
-    torch.distributed.barrier()
-
-    n = 0
-
-    # the buffer size can only store 100 reqs
-    # so the sender will occasionally block to wait for the receiver.
-    for req in tqdm(reqs):
-        if my_rank == 0:
-            buf.insert(*req)
-        else:
-            tok, roi, k, v, h = req
-            tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi)
-
-            if tok_ is None:
-                assert roi_ is None
-                assert k_ is None
-                assert v_ is None
-                assert h_ is None
-                n += 1
-            else:
-                assert torch.allclose(tok, tok_)
-                assert torch.allclose(roi, roi_)
-                assert torch.allclose(k, k_)
-                assert torch.allclose(v, v_)
-                assert torch.allclose(h, h_)
-    print(f"Rank {my_rank} done")
-    torch.distributed.barrier()
-
-    if my_rank == 0:
-        x = torch.tensor([0])
-        torch.distributed.recv(x, 1)
-        # the # of None received is the kv that are not selected
-        assert x.item() == len(buf.buffer)
-        # and the size of the buffer should be 2000 * buffer len
-        print(buf.buffer_size)
-        assert buf.buffer_size == 1700 * len(buf.buffer)
-    else:
-        torch.distributed.send(torch.tensor([n]), 0)
-
-    print(f"My rank: {my_rank}, Passed stress test!")
-
-
-if __name__ == "__main__":
-    my_rank = int(os.environ["RANK"])
-
-    torch.distributed.init_process_group(
-        backend="gloo",
-        init_method="tcp://localhost:12398",
-        world_size=2,
-        rank=my_rank,
-    )
-
-    print(f"initialized! My rank is {my_rank}")
-
-    config = KVTransferConfig(
-        kv_connector="P2pNcclConnector",
-        kv_buffer_device="cuda",
-        kv_buffer_size=1e9,
-        kv_rank=my_rank,
-        kv_role="kv_both",  # this arg doesn't matter in this test
-        kv_parallel_size=2,
-        kv_ip="127.0.0.1",
-        kv_port=12345,
-    )
-
-    data_pipe = PyNcclPipe(
-        local_rank=my_rank,
-        config=config,
-        device="cuda",
-        port_offset=0,
-    )
-    cpu_pipe = PyNcclPipe(
-        local_rank=my_rank,
-        config=config,
-        device="cpu",
-        port_offset=1,
-    )
-
-    buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000)
-
-    test_run(my_rank, buffer, data_pipe.device)
-
-    stress_test(my_rank, buffer, data_pipe.device)
-
-    buffer.close()
-    data_pipe.close()
-    cpu_pipe.close()
-    print("Done")
diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh
deleted file mode 100644
index f2aeaee9c..000000000
--- a/tests/kv_transfer/test_lookup_buffer.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-RANK=0 python3 test_lookup_buffer.py &
-PID0=$!
-RANK=1 python3 test_lookup_buffer.py &
-PID1=$!
-
-wait $PID0
-wait $PID1
diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py
deleted file mode 100644
index b9a28e4bc..000000000
--- a/tests/kv_transfer/test_module.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import subprocess
-import sys
-
-import pytest
-import torch
-
-
-def run_python_script(script_name, timeout):
-    script_name = f"kv_transfer/{script_name}"
-    try:
-        # Start both processes asynchronously using Popen
-        process0 = subprocess.Popen(
-            [sys.executable, script_name],
-            env={"RANK": "0"},  # Set the RANK environment variable for process 0
-            stdout=sys.stdout,  # Pipe stdout to current stdout
-            stderr=sys.stderr,  # Pipe stderr to current stderr
-        )
-
-        process1 = subprocess.Popen(
-            [sys.executable, script_name],
-            env={"RANK": "1"},  # Set the RANK environment variable for process 1
-            stdout=sys.stdout,  # Pipe stdout to current stdout
-            stderr=sys.stderr,  # Pipe stderr to current stderr
-        )
-
-        # Wait for both processes to complete, with a timeout
-        process0.wait(timeout=timeout)
-        process1.wait(timeout=timeout)
-
-        # Check the return status of both processes
-        if process0.returncode != 0:
-            pytest.fail(f"Test {script_name} failed for RANK=0, {process0.returncode}")
-        if process1.returncode != 0:
-            pytest.fail(f"Test {script_name} failed for RANK=1, {process1.returncode}")
-
-    except subprocess.TimeoutExpired:
-        # If either process times out, terminate both and fail the test
-        process0.terminate()
-        process1.terminate()
-        pytest.fail(f"Test {script_name} timed out")
-    except Exception as e:
-        pytest.fail(f"Test {script_name} failed with error: {str(e)}")
-
-
-# Define the test cases using pytest's parametrize
-@pytest.mark.parametrize(
-    "script_name,timeout",
-    [
-        ("test_lookup_buffer.py", 60),  # Second test case with a 60-second timeout
-        ("test_send_recv.py", 120),  # First test case with a 120-second timeout
-    ],
-)
-def test_run_python_script(script_name, timeout):
-    # Check the number of GPUs
-    if torch.cuda.device_count() < 2:
-        pytest.skip(f"Skipping test {script_name} because <2 GPUs are available")
-
-    # Run the test if there are at least 2 GPUs
-    run_python_script(script_name, timeout)
diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
deleted file mode 100644
index 5762224ef..000000000
--- a/tests/kv_transfer/test_send_recv.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-import time
-
-import torch
-from tqdm import tqdm
-
-from vllm.config import KVTransferConfig
-from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
-
-
-def test_run(my_rank, pipe):
-    print(f"rank {my_rank} test_run starts....")
-    # test run
-    x = torch.tensor([1]).to(pipe.device)
-    y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device)
-    if my_rank == 0:
-        pipe.send_tensor(x)
-        print(f"rank {my_rank} sent tensor x")
-        pipe.send_tensor(y)
-        print(f"rank {my_rank} sent tensor y")
-        x2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received x2 = ", x2)
-        y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", y2)
-
-    else:
-        x2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received x2 = ", x2)
-        y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", y2)
-        pipe.send_tensor(x)
-        print(f"rank {my_rank} sent tensor x")
-        pipe.send_tensor(y)
-        print(f"rank {my_rank} sent tensor y")
-
-    assert torch.allclose(x, x2)
-    assert torch.allclose(y, y2)
-
-    print(f"rank {my_rank} test_run passed!")
-
-
-def stress_test(my_rank, pipe):
-    print(f"rank {my_rank} stress_test starts....")
-
-    tensors: list[torch.Tensor] = []
-
-    torch.distributed.barrier()
-    torch.manual_seed(0)
-
-    for i in tqdm(range(500)):
-        mean = torch.rand(1).item() * 100
-        std = torch.rand(1).item() * 100
-        size = torch.randint(900, 1000, (2,))
-        x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device)
-
-        # 5% probability of sending a None
-        if torch.rand(1).item() < 0.05:
-            tensors.append(None)
-            tensors.append(None)
-            tensors.append(None)
-        else:
-            tensors.append(x)
-            tensors.append(x.mean().unsqueeze(0))
-            tensors.append(x.std().unsqueeze(0))
-
-    torch.distributed.barrier()
-
-    for i in tqdm(range(500)):
-        if my_rank == int((i % 10) > 3):
-            pipe.send_tensor(tensors[3 * i])
-            pipe.send_tensor(tensors[3 * i + 1])
-            pipe.send_tensor(tensors[3 * i + 2])
-        else:
-            x = pipe.recv_tensor()
-            mean = pipe.recv_tensor()
-            std = pipe.recv_tensor()
-
-            if x is None:
-                assert mean is None
-                assert std is None
-            else:
-                assert torch.allclose(x, tensors[3 * i])
-                assert x.mean() == mean[0]
-                assert x.std() == std[0]
-
-        torch.distributed.barrier()
-
-
-def latency_test(my_rank, pipe, nelement, ntensor):
-    latencies = []
-
-    torch.distributed.barrier()
-
-    for i in tqdm(range(500)):
-        tensors = []
-
-        if my_rank == 0:
-            # create tensor
-            tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)]
-
-        torch.distributed.barrier()
-
-        if my_rank == 0:
-            t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device)
-            for tensor in tensors:
-                pipe.send_tensor(tensor)
-            pipe.send_tensor(t)
-        else:
-            for _ in range(ntensor):
-                pipe.recv_tensor()
-            t = pipe.recv_tensor()
-            latencies.append(time.time() - t.item())
-
-    torch.distributed.barrier()
-
-    print("Latency test passed.")
-    print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms")
-
-
-if __name__ == "__main__":
-    my_rank = int(os.environ["RANK"])
-
-    torch.distributed.init_process_group(
-        backend="gloo",
-        init_method="tcp://localhost:12398",
-        world_size=2,
-        rank=my_rank,
-    )
-
-    config = KVTransferConfig(
-        kv_connector="P2pNcclConnector",
-        kv_buffer_device="cuda",
-        kv_buffer_size=1e9,
-        kv_rank=my_rank,
-        kv_role="kv_both",  # this arg doesn't matter in this test
-        kv_parallel_size=2,
-        kv_ip="127.0.0.1",
-        kv_port=12345,
-    )
-
-    pipe = PyNcclPipe(
-        local_rank=my_rank,
-        config=config,
-    )
-
-    test_run(my_rank, pipe)
-
-    stress_test(my_rank, pipe)
-
-    # Use this function if you want to test the latency of pipe impl.
-    # latency_test(my_rank, pipe, 1024 * 8 * 128, 80)
diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh
deleted file mode 100644
index 54e060480..000000000
--- a/tests/kv_transfer/test_send_recv.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-RANK=0 python3 test_send_recv.py &
-PID0=$!
-RANK=1 python3 test_send_recv.py &
-PID1=$!
-
-wait $PID0
-wait $PID1
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
deleted file mode 100644
index f48d03d0b..000000000
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This file contains a new class `KVLookupBufferBase` that allows developers to
-think of KV cache operations as inserting new KV cache entries (`insert`)
-into the lookup buffer and querying existing KV caches (`drop_select`)
-from the lookup buffer.
-
-This file also contains a new class `KVStoreBufferBase` that allows developers
-to manage the KVCache buffer as a simple key-value storage buffer with basic
-put/get operations.
-
-These classes above are abstracted behind class `KVCacheBufferBase`.
-"""
-
-from abc import ABC, abstractmethod
-
-import torch
-
-
-class KVCacheBufferBase(ABC):
-    """
-    Abstract base class for a KVCache buffer.
-    """
-
-    @abstractmethod
-    def close(self) -> None:
-        """Close the buffer and release resources.
-
-        This method is responsible for cleaning up resources related to the
-        KVCache buffer when it is no longer needed.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
-
-
-class KVLookupBufferBase(KVCacheBufferBase):
-    """
-    Abstract base class for a KVCache lookup buffer.
-
-    This class provides an abstraction for a key-value (KV) cache lookup buffer.
-
-    The key of the lookup buffer:
-    - input_tokens: token IDs of the request
-    - roi: a binary mask on top of input_tokens.
-      - Purpose of roi: Since KV cache may only be available for a subset of
-        tokens in the input (for example, when vLLM is connected to an external
-        KV cache service), roi specifies the subset of tokens that the KV cache
-        is associated with.
-      - NOTE: roi can be further extended to describe which part of KV the
-        current process is holding (each process may only hold a part of KV
-        due to TP and PP). This is not implemented for now.
-
-    The value of the lookup buffer:
-    - key: the key tensor in the KV cache
-    - value: the value tensor in the KV cache
-    - hidden: the final hidden state generated by model forwarding. This allows
-      vLLM to bypass further model forwarding by transmitting the hidden state.
-    """
-
-    @abstractmethod
-    def insert(
-        self,
-        input_tokens: torch.Tensor,
-        roi: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        hidden: torch.Tensor,
-    ) -> None:
-        """Insert into the lookup buffer.
-
-        The functionality is similar to the following python statement
-        ```
-        buffer[input_tokens, roi] = [key, value, hidden]
-        ```
-
-        FIXME: in the future, we should only have two arguments, key and value,
-        where key is a tensor dict and value is a tensor dict.
-
-        FIXME: we should transmit both sampler outputs and the hidden states.
-
-        Args:
-            input_tokens (torch.Tensor): token IDs.
-            roi (torch.Tensor): A binary mask on top of the input tokens
-            key (torch.Tensor): The key tensor in the KV cache.
-            value (torch.Tensor): The value tensor in the KV cache.
-            hidden (torch.Tensor): The final hidden state tensor generated
-                                   during model forwarding to bypass model
-                                   forwarding.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def drop_select(
-        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
-    ) -> list[torch.Tensor | None]:
-        """Select and *drop* KV cache entries from the lookup buffer.
-
-        The functionality is similar to the following python statements
-        ```
-        ret = buffer.pop(input_tokens, roi)
-        return ret
-        ```
-
-        If `input_tokens` and `roi` is `None`, it means selecting any of the
-        KV caches in the buffer, return, and remove it from the buffer, useful
-        when offloading KV cache to KV cache storage service.
-
-        Args:
-            input_tokens (torch.Tensor): token IDs.
-            roi (torch.Tensor): A binary mask on top of the input tokens
-
-        Returns:
-            list[Optional[torch.Tensor]]: A list of tensors. Can be None.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
-
-
-class KVStoreBufferBase(KVCacheBufferBase):
-    """
-    Abstract base class for a KVCache storage buffer with key-value semantics.
-    This class provides a simple key-value storage buffer abstract with basic
-    put/get operations, which enables flexible KVCache transfer granular
-    control.
-
-    The functionality is similar to a distributed key-value store, where:
-    - Key: A unique string identifier for the cached entry
-    - Value:
-        - Tensor to be stored and retrieved
-        - None (indicating deletion or empty value)
-    """
-
-    @abstractmethod
-    def put(
-        self,
-        key: str,
-        value: torch.Tensor | None,
-    ) -> None:
-        """Store a key-value pair in the buffer.
-
-        Args:
-            key (str): Unique identifier for a tensor, this tensor could be the
-                key cache tensor, value cache tensor, or hidden state tensor
-                generated during model forwarding.
-
-            value (Optional[torch.Tensor]): Tensor to be stored.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get(
-        self,
-        key: str,
-    ) -> torch.Tensor | None:
-        """Retrieve a value from the buffer by key.
-
-        Args:
-            key (str): Unique identifier for a tensor, this tensor could be the
-                key cache tensor, value cache tensor, or hidden state tensor
-                generated during model forwarding.
-
-        Returns:
-            Optional[torch.Tensor]: Stored tensor if exists, None otherwise.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
deleted file mode 100644
index 7861bea1f..000000000
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This file contains a new class `MooncakeStore` that allows developers to
-think of KV cache transfer operations as putting new KV cache entries
-into a remote KVStore-based lookup buffer and getting existing KV caches
-from this remote lookup buffer.
-"""
-
-import json
-import os
-from dataclasses import dataclass
-
-import torch
-from safetensors.torch import load as safetensors_load
-from safetensors.torch import save as safetensors_save
-
-from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase
-from vllm.logger import init_logger
-
-DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200  # 3.125 GiB
-DEFAULT_LOCAL_BUFFER_SIZE = 1073741824  # 1.0 GiB
-
-logger = init_logger(__name__)
-
-
-@dataclass
-class MooncakeStoreConfig:
-    local_hostname: str
-    metadata_server: str
-    global_segment_size: int
-    local_buffer_size: int
-    protocol: str
-    device_name: str
-    master_server_address: str
-
-    @staticmethod
-    def from_file(file_path: str) -> "MooncakeStoreConfig":
-        """Load the config from a JSON file."""
-        with open(file_path) as fin:
-            config = json.load(fin)
-        return MooncakeStoreConfig(
-            local_hostname=config.get("local_hostname"),
-            metadata_server=config.get("metadata_server"),
-            global_segment_size=config.get(
-                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
-            ),
-            local_buffer_size=config.get(
-                "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE
-            ),
-            protocol=config.get("protocol", "tcp"),
-            device_name=config.get("device_name", ""),
-            master_server_address=config.get("master_server_address"),
-        )
-
-    @staticmethod
-    def load_from_env() -> "MooncakeStoreConfig":
-        """Load config from a file specified in the environment variable."""
-        config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
-        if config_file_path is None:
-            raise ValueError(
-                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
-            )
-        return MooncakeStoreConfig.from_file(config_file_path)
-
-
-class MooncakeStore(KVStoreBufferBase):
-    def __init__(
-        self,
-        config: VllmConfig,
-    ):
-        try:
-            from mooncake.store import MooncakeDistributedStore
-        except ImportError as e:
-            raise ImportError(
-                "Please install mooncake by following the instructions at "
-                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
-                "to run vLLM with MooncakeConnector."
-            ) from e
-
-        try:
-            self.store = MooncakeDistributedStore()
-            self.config = MooncakeStoreConfig.load_from_env()
-            logger.info("Mooncake Configuration loaded successfully.")
-
-            self.store.setup(
-                self.config.local_hostname,
-                self.config.metadata_server,
-                self.config.global_segment_size,
-                self.config.local_buffer_size,
-                self.config.protocol,
-                self.config.device_name,
-                self.config.master_server_address,
-            )
-
-        except ValueError as e:
-            logger.error("Configuration loading failed: %s", e)
-            raise
-        except Exception as exc:
-            logger.error("An error occurred while loading the configuration: %s", exc)
-            raise
-
-    def close(self):
-        # MooncakeDistributedStore will automatically call the destructor, so
-        # it is unnecessary to close it manually.
-        pass
-
-    def put(
-        self,
-        key: str,
-        value: torch.Tensor | None,
-    ) -> None:
-        # A message queue needs to be introduced before making it asynchronous.
-        if value is not None:
-            self._put_impl(key, value)
-
-    def get(
-        self,
-        key: str,
-    ) -> torch.Tensor | None:
-        # A message queue needs to be introduced before making it asynchronous.
-        value = self._get_impl(key)
-        return value
-
-    def _put_impl(
-        self,
-        key: str,
-        value: torch.Tensor,
-    ) -> None:
-        """Put KVCache to Mooncake Store"""
-        device_id = value.device.index if value.device.type == "cuda" else -1
-        device_tensor = torch.tensor(device_id, dtype=torch.int32)
-        value_bytes = safetensors_save({"tensor": value, "device_id": device_tensor})
-        try:
-            self.store.put(key, value_bytes)
-        except TypeError as err:
-            logger.error("Failed to put value into Mooncake Store: %s", err)
-            raise TypeError("Mooncake Store Put Type Error.") from err
-
-    def _get_impl(
-        self,
-        key: str,
-    ) -> torch.Tensor | None:
-        """Get KVCache from Mooncake Store"""
-        try:
-            data = self.store.get(key)
-        except TypeError as err:
-            logger.error("Failed to get value from Mooncake Store: %s", err)
-            raise TypeError("Mooncake Store Get Type Error.") from err
-
-        if data:
-            loaded_tensors = safetensors_load(data)
-            tensor = loaded_tensors["tensor"]
-            device_id_tensor = loaded_tensors["device_id"]
-            device_id = int(device_id_tensor.item())
-            device = (
-                torch.device("cuda", device_id)
-                if device_id >= 0
-                else torch.device("cpu")
-            )
-            return tensor.to(device)
-
-        return None
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
deleted file mode 100644
index f046a3498..000000000
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Implements a distributed key-value (KV) cache transfer mechanism.
-
-Key Features:
-- Distributed KV cache transmission using PyNccl pipes.
-- Non-blocking `insert`, blocking `drop_select`.
-- Use CPU signal pipe to avoid racing condition
-- Handles buffer size constraints and provide backpressure mechanism to
-  stop the prefill instance when the decode instance is slow.
-"""
-
-import threading
-from collections import deque
-
-import torch
-
-from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVLookupBufferBase
-from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-class SimpleBuffer(KVLookupBufferBase):
-    def __init__(
-        self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float
-    ):
-        """
-        signal_pipe: on CPU
-
-        NOTE: on-device recv will block all threads in the process, making the
-        KV cache producer unable to listen to new request while transmitting
-        KV cache. Luckily CPU recv only blocks the current thread so we use
-        CPU recv to listen to new request.
-
-        data_pipe: on device (e.g. GPU)
-        """
-
-        self.buffer: deque[list[torch.Tensor]] = deque()
-
-        self.buffer_size = 0
-        self.buffer_size_threshold = buffer_size_thresh
-        self.buffer_cv = threading.Condition()
-        self.signal_pipe = signal_pipe
-        self.data_pipe = data_pipe
-        self.request_handling_thread: threading.Thread | None = None
-
-        self.normal_signal = torch.tensor([0], device="cpu")
-        self.end_signal = None
-
-    def _matches(
-        self,
-        tokens_roi_sender: list[torch.Tensor],
-        tokens_roi_recver: list[torch.Tensor],
-    ):
-        # tokens_roi_sender: tokens and roi of the producer (in the buffer)
-        # tokens_roi_recver: tokens and roi of the consumer (query)
-
-        tokens_sender = tokens_roi_sender[0]
-        tokens_recver = tokens_roi_recver[0]
-        roi_sender = tokens_roi_sender[1]
-        roi_recver = tokens_roi_recver[1]
-
-        if tokens_recver is None:
-            # consumer sends an empty request
-            # semantics: DROP SELECT * LIMIT 1
-            # so any of the data in the buffer can be drop-selected
-            return True
-
-        # Assuming that roi is a binary mask on tokens
-        tokens_sender = tokens_sender[roi_sender]
-        tokens_recver = tokens_recver[roi_recver]
-
-        # simple common prefix matching
-        min_length = min(len(tokens_sender), len(tokens_recver))
-        if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]):
-            return min_length
-
-        return 0
-
-    def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None:
-        assert tensor is not None, "Use self.data_pipe.send(None) instead"
-        self.buffer_size -= tensor.element_size() * tensor.numel()
-        if tensor.dtype == torch.bool:
-            tensor = tensor.float()
-        self.data_pipe.send_tensor(tensor)
-
-    def _get_element_size(self, data: list | torch.Tensor | None):
-        if isinstance(data, torch.Tensor):
-            return data.element_size() * data.numel()
-        if not data:
-            # cannot perform `not data` on a tensor
-            # so this check needs to go after the check above
-            return 0
-
-        raise AssertionError(f"Unknown data type {type(data)}")
-
-    def _add_to_buffer(
-        self,
-        input_tokens: torch.Tensor,
-        roi: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        hidden: torch.Tensor,
-    ):
-        if isinstance(input_tokens, torch.Tensor):
-            input_tokens = input_tokens.clone()
-        if isinstance(roi, torch.Tensor):
-            roi = roi.clone()
-        if isinstance(key, torch.Tensor):
-            key = key.clone()
-        if isinstance(value, torch.Tensor):
-            value = value.clone()
-        if isinstance(hidden, torch.Tensor):
-            hidden = hidden.clone()
-
-        buffer_item = [input_tokens, roi, key, value, hidden]
-        data_size = sum([self._get_element_size(data) for data in buffer_item])
-
-        with self.buffer_cv:
-            if self.buffer_size + data_size > self.buffer_size_threshold:
-                # log outside the while loop to avoid this message being logged
-                # repeatedly.
-                logger.debug("KV transfer buffer is full. Handling...")
-                while self.buffer_size + data_size > self.buffer_size_threshold:
-                    self.buffer_cv.wait()
-
-            self.buffer_size += data_size
-            self.buffer.append(buffer_item)
-            self.buffer_cv.notify()
-
-    def _is_end_signal(self, signal):
-        return signal is None
-
-    def drop_select_handler(self):
-        try:
-            while True:
-                signal = self.signal_pipe.recv_tensor()
-                if self._is_end_signal(signal):
-                    logger.info("Received end signal!")
-                    break
-
-                input_tokens = self.data_pipe.recv_tensor()
-
-                roi = self.data_pipe.recv_tensor()
-                assert roi is not None, (
-                    "Please provide the roi when sending drop-select request"
-                )
-                roi = roi > 0.5
-                tokens_roi_recver = [input_tokens, roi]
-
-                def is_buffer_available(
-                    tokens_roi_recver: list[torch.Tensor],
-                ) -> bool:
-                    # perform input tokens and roi matching
-                    # FIXME: this matching is O(n), ideally it should be O(1)
-                    # but this buffer size won't (and shouldn't) be too large so
-                    # the fix is not urgent.
-                    for _ in range(len(self.buffer)):
-                        if self._matches(self.buffer[0], tokens_roi_recver) > 0:
-                            return True
-                        # rotate the element we just accessed to the end
-                        self.buffer.rotate(-1)
-                    return False
-
-                with self.buffer_cv:
-                    while not is_buffer_available(tokens_roi_recver):
-                        logger.debug("KV transfer buffer is not available. Waiting...")
-                        self.buffer_cv.wait()
-                    # need to clone the tensor
-                    # in case the tensor is freed before sending finishes
-                    matched_item = self.buffer.popleft()
-                    for tensor in matched_item:
-                        self._send_tensor_and_dec_size(tensor)
-                    self.buffer_cv.notify()
-
-        except RuntimeError as e:
-            if "Connection closed by peer" not in str(e):
-                raise e
-
-        logger.debug("Closing drop_select_handler")
-
-    def drop_select(
-        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
-    ) -> list[torch.Tensor | None]:
-        assert self.request_handling_thread is None, (
-            "drop_select should be called by the KV cache consumer "
-            "(e.g. the decode vLLM instance)"
-        )
-
-        if isinstance(input_tokens, torch.Tensor):
-            input_tokens = input_tokens.clone()
-        if isinstance(roi, torch.Tensor):
-            roi = roi.clone().float()
-
-        self.signal_pipe.send_tensor(self.normal_signal)
-        self.data_pipe.send_tensor(input_tokens)
-        self.data_pipe.send_tensor(roi)
-
-        input_tokens = self.data_pipe.recv_tensor()
-        roi = self.data_pipe.recv_tensor()
-        if roi is not None:
-            # convert from float tensor to bool tensor
-            # as PyNccl does not support sending bool tensor
-            roi = roi > 0.5
-        key = self.data_pipe.recv_tensor()
-        value = self.data_pipe.recv_tensor()
-        hidden = self.data_pipe.recv_tensor()
-
-        return [input_tokens, roi, key, value, hidden]
-
-    def insert(
-        self,
-        input_tokens: torch.Tensor,
-        roi: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        hidden: torch.Tensor,
-    ) -> None:
-        self._add_to_buffer(input_tokens, roi, key, value, hidden)
-
-        # when calling the insert, the current process is a sender
-        # need to launch the request handler and start listening to request.
-        if self.request_handling_thread is None:
-            self.request_handling_thread = threading.Thread(
-                target=self.drop_select_handler
-            )
-            self.request_handling_thread.start()
-
-    def close(self):
-        if (
-            hasattr(self, "request_handling_thread")
-            and self.request_handling_thread is not None
-        ):
-            self.request_handling_thread.join()
-
-        else:
-            # TODO: have a explicit close signal and have a explicit way to
-            # check if it's requester
-            self.signal_pipe.send_tensor(self.end_signal)
diff --git a/vllm/distributed/kv_transfer/kv_pipe/__init__.py b/vllm/distributed/kv_transfer/kv_pipe/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
deleted file mode 100644
index 1fe7a90e9..000000000
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This file defines an interface `KVPipeBase`
-that provides an abstraction for sending and receiving tensors, or None, via
-distributed communications.
-
-All classes instantiated from this interface are assumed to be a FIFO pipe.
-
-If your distributed communication platform already supports key-value lookup,
-you can bypass this interface and directly start from `kv_lookup_buffer`.
-"""
-
-from abc import ABC, abstractmethod
-
-import torch
-
-
-class KVPipeBase(ABC):
-    """
-    This class provides an interface for sending and receiving tensors, or
-    None, by distributed communications.
-    """
-
-    @abstractmethod
-    def send_tensor(self, tensor: torch.Tensor | None) -> None:
-        """Send a tensor, or None, via the pipe.
-
-        Need to support sending None -- important for error handling.
-
-        TODO: add a `key` argument so that we can use traditional
-        key-value database as the distributed communication mechanism behind
-        the pipe.
-
-        Args:
-            tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def recv_tensor(self) -> torch.Tensor | None:
-        """Receive a tensor (can be None) from the pipeline.
-
-        Returns:
-            Optional[torch.Tensor]: The tensor received from the pipeline. Can
-                                    be None.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def close(self) -> None:
-        """Close the pipeline and release resources.
-
-        This method is responsible for closing the communication pipeline
-        and releasing any resources associated with it.
-
-        Raises:
-            NotImplementedError: This method must be implemented in subclasses.
-        """
-        raise NotImplementedError
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
deleted file mode 100644
index 542dde09a..000000000
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import json
-import os
-import struct
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-
-import torch
-import zmq
-from safetensors.torch import load as safetensors_load
-from safetensors.torch import save as safetensors_save
-
-from vllm.config.kv_transfer import KVTransferConfig
-from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
-from vllm.logger import init_logger
-from vllm.utils.network_utils import join_host_port, make_zmq_path, split_host_port
-
-logger = init_logger(__name__)
-NONE_INT = -150886311
-
-
-@dataclass
-class MooncakeTransferEngineConfig:
-    prefill_url: str
-    decode_url: str
-    metadata_backend: str | None
-    metadata_server: str
-    protocol: str
-    device_name: str
-
-    @staticmethod
-    def from_file(file_path: str) -> "MooncakeTransferEngineConfig":
-        """Load the config from a JSON file."""
-        with open(file_path) as fin:
-            config = json.load(fin)
-        return MooncakeTransferEngineConfig(
-            prefill_url=config.get("prefill_url"),
-            decode_url=config.get("decode_url"),
-            metadata_backend=config.get("metadata_backend", None),
-            metadata_server=config.get("metadata_server"),
-            protocol=config.get("protocol", "tcp"),
-            device_name=config.get("device_name", ""),
-        )
-
-    @staticmethod
-    def load_from_env() -> "MooncakeTransferEngineConfig":
-        """Load config from a file specified in the environment variable."""
-        config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
-        if config_file_path is None:
-            raise ValueError(
-                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
-            )
-        return MooncakeTransferEngineConfig.from_file(config_file_path)
-
-
-class MooncakeTransferEngine:
-    """Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ."""
-
-    def __init__(self, kv_rank: int, local_rank: int):
-        try:
-            from mooncake.engine import TransferEngine
-        except ImportError as e:
-            raise ImportError(
-                "Please install mooncake by following the instructions at "
-                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
-                "to run vLLM with MooncakeConnector."
-            ) from e
-
-        self.engine = TransferEngine()
-        self.local_rank = local_rank
-
-        try:
-            self.config = MooncakeTransferEngineConfig.load_from_env()
-            logger.info("Mooncake Configuration loaded successfully.")
-        except ValueError as e:
-            logger.error(e)
-            raise
-        except Exception as exc:
-            logger.error("An error occurred while loading the configuration: %s", exc)
-            raise
-        prefill_host, base_prefill_port = split_host_port(self.config.prefill_url)
-        decode_host, base_decode_port = split_host_port(self.config.decode_url)
-
-        # Avoid ports conflict when running prefill and decode on the same node
-        if prefill_host == decode_host and base_prefill_port == base_decode_port:
-            base_decode_port = base_decode_port + 100
-
-        prefill_port = base_prefill_port + self.local_rank
-        decode_port = base_decode_port + self.local_rank
-        self.prefill_url = join_host_port(prefill_host, prefill_port)
-        self.decode_url = join_host_port(decode_host, decode_port)
-
-        self.initialize(
-            self.prefill_url if kv_rank == 0 else self.decode_url,
-            self.config.metadata_server,
-            self.config.protocol,
-            self.config.device_name,
-            self.config.metadata_backend,
-        )
-
-        self.remote_url = self.decode_url if kv_rank == 0 else self.prefill_url
-
-        # Initialize ZeroMQ context and sockets
-        self.context = zmq.Context()  # type: ignore[attr-defined]
-        self.sender_socket = self.context.socket(zmq.constants.PUSH)
-        self.receiver_socket = self.context.socket(zmq.constants.PULL)
-        self.sender_ack = self.context.socket(zmq.constants.PULL)
-        self.receiver_ack = self.context.socket(zmq.constants.PUSH)
-
-        self.buffer_cleaner = ThreadPoolExecutor(max_workers=1)
-        self._setup_metadata_sockets(
-            kv_rank, prefill_host, base_prefill_port, decode_host, base_decode_port
-        )
-
-    def _setup_metadata_sockets(
-        self, kv_rank: int, p_host: str, p_port: int, d_host: str, d_port: int
-    ) -> None:
-        """Set up ZeroMQ sockets for sending and receiving data."""
-        # Offsets < 8 are left for initialization in case tp and pp are enabled
-        p_rank_offset = p_port + 8 + self.local_rank * 2
-        d_rank_offset = d_port + 8 + self.local_rank * 2
-        if kv_rank == 0:
-            self.sender_socket.bind(make_zmq_path("tcp", p_host, p_rank_offset + 1))
-            self.receiver_socket.connect(
-                make_zmq_path("tcp", d_host, d_rank_offset + 1)
-            )
-            self.sender_ack.connect(make_zmq_path("tcp", d_host, d_rank_offset + 2))
-            self.receiver_ack.bind(make_zmq_path("tcp", p_host, p_rank_offset + 2))
-        else:
-            self.receiver_socket.connect(
-                make_zmq_path("tcp", p_host, p_rank_offset + 1)
-            )
-            self.sender_socket.bind(make_zmq_path("tcp", d_host, d_rank_offset + 1))
-            self.receiver_ack.bind(make_zmq_path("tcp", d_host, d_rank_offset + 2))
-            self.sender_ack.connect(make_zmq_path("tcp", p_host, p_rank_offset + 2))
-
-    def initialize(
-        self,
-        local_hostname: str,
-        metadata_server: str,
-        protocol: str,
-        device_name: str,
-        metadata_backend: str | None,
-    ) -> None:
-        """Initialize the mooncake instance."""
-        if metadata_backend is None:
-            self.engine.initialize(
-                local_hostname, metadata_server, protocol, device_name
-            )
-        else:
-            supported_backend = ["etcd", "redis"]
-            metadata_backend = metadata_backend.lower()
-            if metadata_backend not in supported_backend:
-                raise ValueError(
-                    "Mooncake Configuration error. `metadata_backend`"
-                    f" should be one of {supported_backend}."
-                )
-
-            self.engine.initialize_ext(
-                local_hostname, metadata_server, protocol, device_name, metadata_backend
-            )
-
-    def allocate_managed_buffer(self, length: int) -> int:
-        """Allocate a managed buffer of the specified length."""
-        ret = self.engine.allocate_managed_buffer(length)
-        if ret <= 0:
-            logger.error("Allocation Return Error")
-            raise Exception("Allocation Return Error")
-        return ret
-
-    def free_managed_buffer(self, buffer: int, length: int) -> int:
-        """Free a previously allocated managed buffer."""
-        return self.engine.free_managed_buffer(buffer, length)
-
-    def transfer_sync(self, buffer: int, peer_buffer_address: int, length: int) -> int:
-        """Synchronously transfer data to the specified address."""
-        ret = self.engine.transfer_sync_read(
-            self.remote_url, buffer, peer_buffer_address, length
-        )
-        if ret < 0:
-            logger.error("Transfer Return Error")
-            raise Exception("Transfer Return Error")
-        return ret
-
-    def write_bytes_to_buffer(self, buffer: int, user_data: bytes, length: int) -> int:
-        """Write bytes to the allocated buffer."""
-        return self.engine.write_bytes_to_buffer(buffer, user_data, length)
-
-    def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes:
-        """Read bytes from the allocated buffer."""
-        return self.engine.read_bytes_from_buffer(buffer, length)
-
-    def wait_for_ack(self, src_ptr: int, length: int) -> None:
-        """Asynchronously wait for ACK from the receiver."""
-        ack = self.sender_ack.recv()
-        if ack != b"ACK":
-            logger.error("Failed to receive ACK from the receiver")
-
-        self.free_managed_buffer(src_ptr, length)
-
-    def send_bytes(self, user_data: bytes) -> None:
-        """Send bytes to the remote process."""
-        length = len(user_data)
-        src_ptr = self.allocate_managed_buffer(length)
-        self.write_bytes_to_buffer(src_ptr, user_data, length)
-        self.sender_socket.send_multipart(
-            [struct.pack("!Q", src_ptr), struct.pack("!Q", length)]
-        )
-        self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length)
-
-    def recv_bytes(self) -> bytes:
-        """Receive bytes from the remote process."""
-        data = self.receiver_socket.recv_multipart()
-        src_ptr = struct.unpack("!Q", data[0])[0]
-        length = struct.unpack("!Q", data[1])[0]
-        dst_ptr = self.allocate_managed_buffer(length)
-        self.transfer_sync(dst_ptr, src_ptr, length)
-        ret = self.read_bytes_from_buffer(dst_ptr, length)
-
-        # Buffer cleanup
-        self.receiver_ack.send(b"ACK")
-        self.free_managed_buffer(dst_ptr, length)
-
-        return ret
-
-
-class MooncakePipe(KVPipeBase):
-    """MooncakeTransferEngine based Pipe implementation."""
-
-    def __init__(
-        self, local_rank: int, config: KVTransferConfig, device: str | None = None
-    ):
-        """Initialize the mooncake pipe and set related parameters."""
-        self.config = config
-        self.local_rank = local_rank
-        self.kv_rank = self.config.kv_rank
-        assert self.kv_rank is not None
-        if device is None:
-            self.device = self._select_device(self.config.kv_buffer_device)
-        else:
-            self.device = self._select_device(device)
-
-        self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank)
-        self.transport_thread: ThreadPoolExecutor | None = None
-        self.none_tensor = torch.tensor([NONE_INT], device=self.device)
-
-    def _select_device(self, device: str) -> torch.device:
-        """Select available device (CUDA or CPU)."""
-        logger.info("Selecting device: %s", device)
-        if device == "cuda":
-            return torch.device(f"cuda:{self.local_rank}")
-        else:
-            return torch.device("cpu")
-
-    def tensor_hash(self, tensor: torch.Tensor) -> int:
-        """Calculate the hash value of the tensor."""
-        return hash(tensor.data_ptr())
-
-    def _send_impl(self, tensor: torch.Tensor) -> None:
-        """Implement the tensor sending logic using safetensors."""
-        self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))
-
-    def _recv_impl(self) -> torch.Tensor:
-        """Implement the tensor receiving logic using safetensors."""
-        data = self.transfer_engine.recv_bytes()
-        return safetensors_load(data)["tensor"].to(self.device)
-
-    def send_tensor(self, tensor: torch.Tensor | None) -> None:
-        """Send tensor to the target process."""
-        if self.transport_thread is None:
-            self.transport_thread = ThreadPoolExecutor(max_workers=1)
-        tensor = tensor if tensor is not None else self.none_tensor
-        assert len(tensor.shape) > 0
-        self.transport_thread.submit(self._send_impl, tensor)
-
-    def recv_tensor(self) -> torch.Tensor | None:
-        """Receive tensor from other processes."""
-        if self.transport_thread is None:
-            self.transport_thread = ThreadPoolExecutor(max_workers=1)
-        tensor = self.transport_thread.submit(self._recv_impl).result()
-        if tensor.numel() == 1 and tensor.item() == NONE_INT:
-            return None
-        else:
-            return tensor
-
-    def close(self) -> None:
-        """Cleanup logic when closing the pipe."""
-        self.transfer_engine.sender_socket.close()
-        self.transfer_engine.receiver_socket.close()
-        self.transfer_engine.sender_ack.close()
-        self.transfer_engine.receiver_ack.close()
-        self.transfer_engine.context.term()  # Terminate the ZMQ context
-        logger.info("Closed the transfer engine and cleaned up resources.")
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
deleted file mode 100644
index 526c5cd1d..000000000
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ /dev/null
@@ -1,285 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This module implements a PyNccl pipe for sending and receiving
-Optional[torch.Tensor] between distributed ranks with advanced
-communication features.
-
-Key Features:
-- Supports sending and receiving tensors with metadata
-- Handles both CUDA and CPU device communications
-- Implements a non-blocking tensor transfer mechanism
-- Manages buffer size and provides backpressure control
-- Supports distributed process groups with configurable parameters
-"""
-
-import threading
-import time
-from collections.abc import Callable
-from concurrent.futures import ThreadPoolExecutor
-
-import torch
-
-from vllm.config.kv_transfer import KVTransferConfig
-from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
-from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
-from vllm.distributed.utils import StatelessProcessGroup
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-class BrokenPipeException(Exception):
-    def __init__(self, message):
-        self.message = message
-        super().__init__(self.message)
-
-
-Metadata = dict[str, torch.Tensor | None]
-
-
-class PyNcclPipe(KVPipeBase):
-    METADATA_LENGTH = 16
-    MAX_TENSOR_DIMENSIONS = 14
-    METADATA_DTYPE = torch.int64
-
-    def __init__(
-        self,
-        local_rank: int,
-        config: KVTransferConfig,
-        device: str | None = None,
-        port_offset: int = 0,
-    ):
-        self.config = config
-        self.local_rank = local_rank
-        self.kv_rank = self.config.kv_rank
-        assert self.kv_rank is not None
-        self.kv_parallel_size = self.config.kv_parallel_size
-        if device is None:
-            self.device = self._select_device(self.config.kv_buffer_device)
-        else:
-            self.device = self._select_device(device)
-
-        # build distributed connection and send/recv implementation
-        store_timeout = self.config.get_from_extra_config("store_timeout", 300)
-        self.group = StatelessProcessGroup.create(
-            host=self.config.kv_ip,
-            port=self.config.kv_port + port_offset,
-            rank=self.kv_rank,
-            world_size=self.kv_parallel_size,
-            store_timeout=store_timeout,
-        )
-        # add a barrier to make sure the connection is initiated properly
-        self.group.barrier()
-        impl = self._get_device_send_recv_impl(self.group)
-        self.device_send_func, self.device_recv_func = impl
-        # set target rank
-        self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size
-        self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size
-
-        # transportation-related variables
-        self.transport_thread: ThreadPoolExecutor | None = None
-        self.buffer_size = 0
-        self.buffer_size_lock = threading.Lock()
-        self.buffer_size_thresh = self.config.kv_buffer_size
-
-    def _get_device_send_recv_impl(
-        self, group: StatelessProcessGroup
-    ) -> tuple[
-        Callable[[torch.Tensor, int], None], Callable[[torch.Tensor, int], None]
-    ]:
-        send: Callable[[torch.Tensor, int], None]
-        recv: Callable[[torch.Tensor, int], None]
-        if self.device.type == "cuda":
-            # use PyNCCL for send / recv
-            comm = PyNcclCommunicator(group, device=self.local_rank)
-            comm.disabled = False
-            send, recv = comm.send, comm.recv  # type: ignore
-        else:
-            # This send / recv implementation here is NOT intended to transfer
-            # KV caches (and should NOT be repurposed to transfer KV caches).
-            # Currently it is only used to transmit control-plane messages
-            # for PyNcclBuffer.
-            send = group.send_obj
-
-            def my_recv(x, src):
-                x[...] = group.recv_obj(src)
-
-            recv = my_recv
-
-        return send, recv
-
-    def _select_device(self, device: str):
-        logger.info("Selecting device: %s", device)
-        if device == "cuda":
-            return torch.device(f"cuda:{self.local_rank}")
-        else:
-            return torch.device("cpu")
-
-    def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata:
-        """
-        Create the metadata as a dictionary based on the input tensor.
-
-        Args:
-            tensor: The input tensor or None if no tensor is provided.
-
-        Returns:
-            metadata: A dictionary with the following keys:
-                - "dtype": The data type of the tensor or None.
-                - "shape": The shape of the tensor or None.
-        """
-        if tensor is None:
-            return {"dtype": None, "shape": None}
-        else:
-            return {"dtype": tensor.dtype, "shape": tensor.shape}
-
-    def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
-        """
-        Create a buffer to receive the tensor based on the provided metadata.
-
-        Args:
-            metadata: A dictionary with keys "dtype" and "shape",
-                describing the tensor's data type and shape.
-
-        Returns:
-            buffer: A tensor of the specified type and shape,
-                allocated on `self.device`.
-        """
-        return torch.empty(
-            metadata["shape"], dtype=metadata["dtype"], device=self.device
-        )
-
-    def _send_metadata(self, metadata: Metadata):
-        """
-        Send the metadata dictionary to the target rank.
-
-        Args:
-            metadata: A dictionary with keys "dtype" and "shape".
-        """
-        self.group.send_obj(metadata, self.target_rank_for_send)
-
-    def _recv_metadata(self) -> Metadata:
-        """
-        Receive the metadata dictionary from the target rank.
-
-        Returns:
-            metadata: A dictionary with keys "dtype" and "shape"
-                describing the tensor.
-        """
-        return self.group.recv_obj(self.target_rank_for_recv)
-
-    def _send_impl(self, tensor: torch.Tensor | None) -> None:
-        """
-        The actual implementation of sending the tensor and its metadata to the
-        target rank.
-
-        Args:
-            tensor: The input tensor to be sent, or `None` if no tensor is
-                being sent.
-        """
-        metadata = self._make_metadata(tensor)
-        self._send_metadata(metadata)
-        if tensor is not None:
-            self.device_send_func(tensor.to(self.device), self.target_rank_for_send)
-
-    def _recv_impl(self) -> torch.Tensor | None:
-        """
-        The actual implementation of receiving a tensor and its metadata from
-        the target rank.
-
-        Returns:
-            buffer: The received tensor, or `None` if no tensor is received.
-        """
-        metadata = self._recv_metadata()
-        if metadata["dtype"] is None:
-            return None
-        buffer = self._prepare_recv_buffer(metadata)
-        self.device_recv_func(buffer, self.target_rank_for_recv)
-
-        return buffer
-
-    def send_tensor_wrapper(
-        self, tensor: torch.Tensor | None, tensor_size: int
-    ) -> None:
-        """
-        Wrapper for _send_impl to handle exceptions and update buffer size.
-        """
-        try:
-            self._send_impl(tensor)
-
-            with self.buffer_size_lock:
-                self.buffer_size -= tensor_size
-        except Exception as e:
-            logger.error(
-                "[rank%d]: Exception when trying to send %s, msg: %s",
-                torch.distributed.get_rank(),
-                str(tensor),
-                str(e),
-            )
-            import traceback
-
-            traceback.print_exc()
-
-    def block_if_full(self):
-        """
-        Block the current thread if the buffer size is larger than the
-        threshold.
-        """
-        while self.buffer_size > self.buffer_size_thresh:
-            logger.debug("KV cache transfer pipe is full. Waiting...")
-            time.sleep(0.05)
-
-    def send_tensor(self, tensor: torch.Tensor | None) -> None:
-        """
-        Sends a tensor and its metadata to the destination rank in a
-        non-blocking way.
-
-        Args:
-            tensor: The tensor to send, or `None` if no tensor is being sent.
-        """
-        if self.transport_thread is None:
-            self.transport_thread = ThreadPoolExecutor(max_workers=1)
-
-        if tensor is not None:
-            tensor_size = tensor.element_size() * tensor.numel()
-        else:
-            tensor_size = 0
-
-        self.block_if_full()
-
-        with self.buffer_size_lock:
-            self.buffer_size += tensor_size
-
-        self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size)
-
-    def recv_tensor(self) -> torch.Tensor | None:
-        """
-        Receives a tensor and its metadata from the source rank. Blocking call.
-
-        Returns:
-            The received tensor, or `None` if no tensor is received.
-        """
-        if self.transport_thread is None:
-            self.transport_thread = ThreadPoolExecutor(max_workers=1)
-
-        future = self.transport_thread.submit(self._recv_impl)
-
-        try:
-            tensor = future.result()
-        except Exception as e:
-            logger.error("Encountering exception in KV receiving thread")
-            logger.error("%s", e)
-            logger.error("My device: %s", self.device)
-            import traceback
-
-            traceback.print_exc()
-            raise e
-
-        return tensor
-
-    def close(self):
-        """
-        Close the pipe and release associated resources.
-        """
-        if hasattr(self, "transport_thread") and self.transport_thread is not None:
-            self.transport_thread.shutdown()
-- 
GitLab


From e10c84e06af7264d5c0b3e7ec5604ada2eee7094 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 4 Dec 2025 18:42:49 +0000
Subject: [PATCH 098/258] Access `partial_rotary_factor` from `rope_parameters`
 (#29966)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/kernels/core/test_mrope.py              |  8 ++------
 .../layers/rotary_embedding/__init__.py       |  5 ++++-
 vllm/model_executor/models/apertus.py         |  5 +----
 vllm/model_executor/models/bailing_moe.py     |  3 ---
 vllm/model_executor/models/bamba.py           |  4 +---
 vllm/model_executor/models/config.py          |  5 -----
 vllm/model_executor/models/falcon_h1.py       |  4 +---
 vllm/model_executor/models/glm.py             |  3 ++-
 vllm/model_executor/models/glm4.py            |  3 +--
 vllm/model_executor/models/glm4_moe.py        |  3 +--
 vllm/model_executor/models/gpt_neox.py        |  6 ++----
 vllm/model_executor/models/llama.py           |  3 ---
 vllm/model_executor/models/nemotron.py        |  2 --
 vllm/model_executor/models/nemotron_nas.py    |  1 -
 vllm/model_executor/models/persimmon.py       |  2 --
 vllm/model_executor/models/phi.py             |  5 +----
 vllm/model_executor/models/qwen3_next.py      |  1 -
 vllm/model_executor/models/stablelm.py        |  4 ----
 vllm/transformers_utils/config.py             | 10 +++++++++-
 vllm/transformers_utils/configs/nemotron.py   | 20 ++++++++++++-------
 vllm/transformers_utils/configs/qwen3_next.py |  8 +++++---
 21 files changed, 43 insertions(+), 62 deletions(-)

diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py
index 43b242ab2..4e1559a04 100644
--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -113,12 +113,10 @@ def test_mrope(
     is_neox_style = True
 
     max_position = config.max_position_embeddings
-    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
-    rotary_dim = int(head_dim * partial_rotary_factor)
 
     mrope_helper_class = get_rope(
         head_size=head_dim,
-        rotary_dim=rotary_dim,
+        rotary_dim=head_dim,
         max_position=max_position,
         is_neox_style=is_neox_style,
         rope_parameters=config.rope_parameters,
@@ -184,12 +182,10 @@ def test_mrope_torch_compile_tracing(
     )
     is_neox_style = True
     max_position = config.max_position_embeddings
-    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
-    rotary_dim = int(head_dim * partial_rotary_factor)
 
     mrope_helper_class = get_rope(
         head_size=head_dim,
-        rotary_dim=rotary_dim,
+        rotary_dim=head_dim,
         max_position=max_position,
         is_neox_style=is_neox_style,
         rope_parameters=config.rope_parameters,
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
index aa6ece300..4dff984f9 100644
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -30,7 +30,6 @@ def get_rope(
     is_neox_style: bool = True,
     rope_parameters: dict[str, Any] | None = None,
     dtype: torch.dtype | None = None,
-    partial_rotary_factor: float = 1.0,
     dual_chunk_attention_config: dict[str, Any] | None = None,
 ) -> RotaryEmbedding:
     if dtype is None:
@@ -55,6 +54,10 @@ def get_rope(
     else:
         dual_chunk_attention_args = None
 
+    partial_rotary_factor = 1.0
+    if rope_parameters is not None:
+        partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
+
     if partial_rotary_factor < 1.0:
         rotary_dim = int(rotary_dim * partial_rotary_factor)
     key = (
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 4a69787af..2a8be29d8 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -148,8 +148,6 @@ class ApertusAttention(nn.Module):
         if head_dim is None:
             head_dim = self.hidden_size // self.total_num_heads
         self.head_dim = head_dim
-        # Phi models introduced a partial_rotary_factor parameter in the config
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -228,11 +226,10 @@ class ApertusAttention(nn.Module):
 
         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=int(self.partial_rotary_factor * self.head_dim),
+            rotary_dim=self.head_dim,
             max_position=self.max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
 
 
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index f7a5d4e78..0143e140a 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -127,8 +127,6 @@ class BailingAttention(nn.Module):
             prefix=f"{prefix}.dense",
         )
 
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
-
         self.rotary_dim = getattr(config, "rotary_dim", self.head_dim)
 
         self.rotary_emb = get_rope(
@@ -137,7 +135,6 @@ class BailingAttention(nn.Module):
             max_position=config.max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=True,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
 
         self.attn = Attention(
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 1d6493b18..00d742f84 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module):
         self.scaling = self.head_dim**-0.5
         self.max_position_embeddings = max_position_embeddings
 
-        if hasattr(config, "partial_rotary_factor"):
-            rotary_dim = int(self.head_dim * config.partial_rotary_factor)
-        elif hasattr(config, "attn_rotary_emb"):
+        if hasattr(config, "attn_rotary_emb"):
             rotary_dim = config.attn_rotary_emb  # for backward compatibility
         else:
             rotary_dim = self.head_dim  # default
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index d7e802ba1..4bca36aa4 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -8,7 +8,6 @@ import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
-from vllm.transformers_utils.config import set_default_rope_theta
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
             if not model_config.enforce_eager:
                 max_position = round_up(max_position, 8)
 
-            set_default_rope_theta(config, default_theta=config.rotary_emb_base)
-
             config.rotary_kwargs = {
                 "head_size": head_dim,
                 "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
@@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
         rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
         max_trained_positions = getattr(config, "max_trained_positions", 2048)
 
-        set_default_rope_theta(config, default_theta=config.rotary_emb_base)
-
         config.rotary_kwargs = {
             "head_size": head_dim,
             "rotary_dim": rotary_emb_dim,
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 83ceb9303..a1c1263f8 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
         self.scaling = self.head_dim**-0.5
         self.max_position_embeddings = max_position_embeddings
 
-        if hasattr(config, "partial_rotary_factor"):
-            rotary_dim = self.head_dim * config.partial_rotary_factor
-        elif hasattr(config, "attn_rotary_emb"):
+        if hasattr(config, "attn_rotary_emb"):
             rotary_dim = config.attn_rotary_emb  # for backward compatibility
         else:
             rotary_dim = self.head_dim  # default
diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py
index a6991f8e4..26d7c29aa 100644
--- a/vllm/model_executor/models/glm.py
+++ b/vllm/model_executor/models/glm.py
@@ -10,7 +10,8 @@ from .utils import PPMissingLayer
 
 class GlmForCausalLM(LlamaForCausalLM):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        vllm_config.model_config.hf_config.partial_rotary_factor = 0.5
+        hf_config = vllm_config.model_config.hf_config
+        hf_config.rope_parameters["partial_rotary_factor"] = 0.5
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         # Hack Llama model to fit HF format GLM implementation
         # Attention difference between GLM and Llama:
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 002cdb721..9adfa942b 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -78,7 +78,7 @@ class Glm4Attention(nn.Module):
             # Number of KV heads is less than TP size, so we replicate
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_kv_heads == 0
-        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
         self.head_dim = head_dim or hidden_size // self.total_num_heads
         self.rotary_dim = self.head_dim
@@ -106,7 +106,6 @@ class Glm4Attention(nn.Module):
             rotary_dim=self.rotary_dim,
             max_position=max_position,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=partial_rotary_factor,
             is_neox_style=False,
         )
         self.attn = Attention(
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index c99f824e1..8cae5ee42 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module):
             prefix=f"{prefix}.o_proj",
         )
 
-        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=partial_rotary_factor,
         )
         self.attn = Attention(
             self.num_heads,
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index b9959682c..212d605c1 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -89,16 +89,14 @@ class GPTNeoXAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.dense",
         )
-        scaling = self.head_size**-0.5
-        rotary_dim = int(self.head_size * config.rotary_pct)
-        assert rotary_dim % 2 == 0
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             self.head_size,
-            rotary_dim=rotary_dim,
+            rotary_dim=self.head_size,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
         )
+        scaling = self.head_size**-0.5
         self.attn = Attention(
             self.num_heads,
             self.head_size,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 8f5a967cd..167dfbca2 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -149,8 +149,6 @@ class LlamaAttention(nn.Module):
         if head_dim is None:
             head_dim = self.hidden_size // self.total_num_heads
         self.head_dim = head_dim
-        # Phi models introduced a partial_rotary_factor parameter in the config
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -265,7 +263,6 @@ class LlamaAttention(nn.Module):
             max_position=self.max_position_embeddings,
             rope_parameters=getattr(config, "rope_parameters", None),
             is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
 
 
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index ffba6c9df..bf83ee5e4 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -178,7 +178,6 @@ class NemotronAttention(nn.Module):
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
-        self.partial_rotary_factor = config.partial_rotary_factor
         self.max_position_embeddings = max_position_embeddings
 
         self.qkv_proj = QKVParallelLinear(
@@ -203,7 +202,6 @@ class NemotronAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
         self.attn = Attention(
             self.num_heads,
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 9d968dee8..734fbc607 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -122,7 +122,6 @@ class DeciLMAttention(LlamaAttention):
             max_position=self.max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
 
 
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index 795cd25f1..8f26c6872 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
         self.num_heads = self.total_num_heads // tensor_parallel_world_size
         self.head_dim = self.hidden_size // self.total_num_heads
         self.max_position_embeddings = config.max_position_embeddings
-        self.partial_rotary_factor = config.partial_rotary_factor
         self.is_causal = True
 
         assert (self.head_dim * self.total_num_heads) == self.hidden_size
@@ -138,7 +137,6 @@ class PersimmonAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=self.max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
         self.scaling = self.head_dim**-0.5
         self.attn = Attention(
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 70016d9ed..253fbbc41 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -109,10 +109,7 @@ class PhiAttention(nn.Module):
         )
 
         scaling = self.head_size**-0.5
-        rotary_dim = int(
-            config.partial_rotary_factor
-            * (config.hidden_size // config.num_attention_heads)
-        )
+        rotary_dim = config.hidden_size // config.num_attention_heads
         assert rotary_dim % 2 == 0
 
         max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 661a18215..dd64e3983 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -750,7 +750,6 @@ class Qwen3NextAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=config.max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=config.partial_rotary_factor,
             dual_chunk_attention_config=self.dual_chunk_attention_config,
         )
 
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 65092584e..e879599ad 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -119,9 +119,6 @@ class StablelmAttention(nn.Module):
         self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
         self.head_dim = self.hidden_size // self.total_num_heads
         self.max_position_embeddings = config.max_position_embeddings
-        self.partial_rotary_factor = getattr(
-            config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
-        )
         self.scaling = self.head_dim**-0.5
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_key_value_heads * self.head_dim
@@ -154,7 +151,6 @@ class StablelmAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=self.config.max_position_embeddings,
             rope_parameters=self.config.rope_parameters,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
         self.attn = Attention(
             self.num_heads,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 1075bc244..f926b523a 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -25,6 +25,7 @@ from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
+from vllm.config.utils import getattr_iter
 from vllm.logger import init_logger
 from vllm.transformers_utils.utils import parse_safetensors_file_metadata
 
@@ -304,7 +305,8 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
 
 def patch_rope_parameters(config: PretrainedConfig) -> None:
     """Provide backwards compatibility for RoPE."""
-    rope_theta = getattr(config, "rope_theta", None)
+    rope_theta_names = ("rope_theta", "rotary_emb_base")
+    rope_theta = getattr_iter(config, rope_theta_names, None)
     if Version(version("transformers")) < Version("5.0.0.dev0"):
         # Transformers v4 installed, legacy config fields may be present
         if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
@@ -313,6 +315,12 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
             if not hasattr(config, "rope_parameters"):
                 config.rope_parameters = {"rope_type": "default"}
             config.rope_parameters["rope_theta"] = rope_theta
+        partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct")
+        partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None)
+        if partial_rotary_factor is not None:
+            if not hasattr(config, "rope_parameters"):
+                config.rope_parameters = {"rope_type": "default"}
+            config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
     elif rope_theta is not None or hasattr(config, "rope_parameters"):
         # Transformers v5 installed
         config.standardize_rope_params()
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index d112c71d7..62f527030 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -89,9 +89,14 @@ class NemotronConfig(PretrainedConfig):
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_parameters (`dict`, *optional*):
-            The parameters of the RoPE embeddings.
-        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
-            Percentage of the query and keys which will have rotary embedding.
+            The parameters of the RoPE embeddings. Expected contents:
+                `rope_theta` (`float`): The base period of the RoPE embeddings.
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear',
+                    'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the
+                    original RoPE implementation.
+                `partial_rotary_factor` (`float`, *optional*, defaults to 0.5):
+                    Percentage of the query and keys which will have rotary embedding.
         attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output
             projection layers during self-attention.
@@ -133,7 +138,6 @@ class NemotronConfig(PretrainedConfig):
         eos_token_id=3,
         tie_word_embeddings=False,
         rope_parameters=None,
-        partial_rotary_factor=0.5,
         attention_bias=False,
         attention_dropout=0.0,
         mlp_bias=False,
@@ -165,14 +169,16 @@ class NemotronConfig(PretrainedConfig):
         rope_theta = kwargs.pop("rope_theta", 10000.0)
         if "rope_theta" not in rope_parameters:
             rope_parameters["rope_theta"] = rope_theta
-        self.rope_parameters = rope_parameters
         # for backward compatibility
         partial_rotary_factor = (
             kwargs.get("rope_percent")
             or kwargs.get("rope_percentage")
-            or partial_rotary_factor
+            or kwargs.get("partial_rotary_factor")
+            or 0.5
         )
-        self.partial_rotary_factor = partial_rotary_factor
+        if "partial_rotary_factor" not in rope_parameters:
+            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+        self.rope_parameters = rope_parameters
         self._rope_parameters_validation()
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py
index fd36b4924..8230a1834 100644
--- a/vllm/transformers_utils/configs/qwen3_next.py
+++ b/vllm/transformers_utils/configs/qwen3_next.py
@@ -103,8 +103,8 @@ class Qwen3NextConfig(PretrainedConfig):
                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                 `high_freq_factor` (`float`, *optional*):
                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
-            Percentage of the query and keys which will have rotary embedding.
+                `partial_rotary_factor` (`float`, *optional*, defaults to 0.25):
+                    Percentage of the query and keys which will have rotary embedding.
         attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -198,7 +198,6 @@ class Qwen3NextConfig(PretrainedConfig):
         use_cache=True,
         tie_word_embeddings=False,
         rope_parameters=None,
-        partial_rotary_factor=0.25,
         attention_bias=False,
         attention_dropout=0.0,
         head_dim=256,
@@ -239,6 +238,9 @@ class Qwen3NextConfig(PretrainedConfig):
         rope_theta = kwargs.pop("rope_theta", 10000.0)
         if "rope_theta" not in rope_parameters:
             rope_parameters["rope_theta"] = rope_theta
+        partial_rotary_factor = kwargs.pop("partial_rotary_factor", 0.25)
+        if "partial_rotary_factor" not in rope_parameters:
+            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
         self.rope_parameters = rope_parameters
         self.partial_rotary_factor = partial_rotary_factor
         self.attention_bias = attention_bias
-- 
GitLab


From 1119f6e47abd8a56d2279520ccf1721ac5176f66 Mon Sep 17 00:00:00 2001
From: Mercykid-bash <ruanche0218@gmail.com>
Date: Fri, 5 Dec 2025 03:09:09 +0800
Subject: [PATCH 099/258] Abstract eplb algo (#26471)

Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Signed-off-by: mengxingkongzhouhan <117415539+mengxingkongzhouhan@users.noreply.github.com>
Signed-off-by: Mercykid-bash <ruanche0218@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: mengxingkongzhouhan <117415539+mengxingkongzhouhan@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/distributed/test_eplb_algo.py      |  32 +--
 vllm/config/parallel.py                  |   4 +
 vllm/distributed/eplb/__init__.py        |   7 +-
 vllm/distributed/eplb/eplb_state.py      |  20 +-
 vllm/distributed/eplb/policy/__init__.py |  19 ++
 vllm/distributed/eplb/policy/abstract.py |  40 ++++
 vllm/distributed/eplb/policy/default.py  | 267 +++++++++++++++++++++++
 vllm/distributed/eplb/rebalance_algo.py  | 260 ----------------------
 8 files changed, 364 insertions(+), 285 deletions(-)
 create mode 100644 vllm/distributed/eplb/policy/__init__.py
 create mode 100644 vllm/distributed/eplb/policy/abstract.py
 create mode 100644 vllm/distributed/eplb/policy/default.py
 delete mode 100644 vllm/distributed/eplb/rebalance_algo.py

diff --git a/tests/distributed/test_eplb_algo.py b/tests/distributed/test_eplb_algo.py
index 79805a7cc..a53a61840 100644
--- a/tests/distributed/test_eplb_algo.py
+++ b/tests/distributed/test_eplb_algo.py
@@ -4,7 +4,7 @@
 import pytest
 import torch
 
-from vllm.distributed.eplb.rebalance_algo import rebalance_experts
+from vllm.distributed.eplb.policy.default import DefaultEplbPolicy
 
 
 def test_basic_rebalance():
@@ -23,7 +23,7 @@ def test_basic_rebalance():
     num_nodes = 2
     num_gpus = 8
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -77,7 +77,7 @@ def test_single_gpu_case():
     num_nodes = 1
     num_gpus = 1
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -99,7 +99,7 @@ def test_equal_weights():
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -122,7 +122,7 @@ def test_extreme_weight_imbalance():
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -150,7 +150,7 @@ def test_multiple_layers():
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -175,14 +175,14 @@ def test_parameter_validation():
     # Test non-divisible case - this should handle normally without throwing
     # errors because the function will fall back to global load balancing
     # strategy
-    phy2log, log2phy, logcnt = rebalance_experts(weight, 8, 3, 2, 4)
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
     assert phy2log.shape == (1, 8)
     assert logcnt.shape == (1, 4)
 
     # Test cases that will actually cause errors:
     # num_physical_experts not divisible by num_gpus
     with pytest.raises(AssertionError):
-        rebalance_experts(weight, 7, 2, 2, 4)  # 7 not divisible by 4
+        DefaultEplbPolicy.rebalance_experts(weight, 7, 2, 2, 4)  # 7 not divisible by 4
 
 
 def test_small_scale_hierarchical():
@@ -197,7 +197,7 @@ def test_small_scale_hierarchical():
     num_nodes = 2  # 2 nodes
     num_gpus = 4  # 4 GPUs
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -224,7 +224,7 @@ def test_global_load_balance_fallback():
     num_nodes = 2
     num_gpus = 4
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -246,7 +246,7 @@ def test_device_compatibility(device):
     num_nodes = 1
     num_gpus = 2
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
 
@@ -263,7 +263,9 @@ def test_additional_cases():
     weight1 = torch.tensor(
         [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
     )
-    phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8)
+    phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts(
+        weight1, 24, 8, 4, 8
+    )
 
     assert phy2log1.shape == (1, 24)
     assert logcnt1.shape == (1, 16)
@@ -276,7 +278,9 @@ def test_additional_cases():
             [12, 25, 50, 100, 150, 200],  # Increasing weights
         ]
     )
-    phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2)
+    phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts(
+        weight2, 10, 3, 1, 2
+    )
 
     assert phy2log2.shape == (2, 10)
     assert logcnt2.shape == (2, 6)
@@ -300,7 +304,7 @@ if __name__ == "__main__":
     num_nodes = 2
     num_gpus = 8
 
-    phy2log, log2phy, logcnt = rebalance_experts(
+    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
         weight, num_replicas, num_groups, num_nodes, num_gpus
     )
     print(phy2log)
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 20de67225..3a768bcd4 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -35,6 +35,7 @@ logger = init_logger(__name__)
 ExpertPlacementStrategy = Literal["linear", "round_robin"]
 DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
 DataParallelBackend = Literal["ray", "mp"]
+EPLBPolicyOption = Literal["default"]
 
 
 @config
@@ -65,6 +66,9 @@ class EPLBConfig:
     Whether to use non-blocking EPLB.
     """
 
+    policy: EPLBPolicyOption = "default"
+    """The policy type for expert parallel load balancing (EPLB)."""
+
 
 @config
 @dataclass
diff --git a/vllm/distributed/eplb/__init__.py b/vllm/distributed/eplb/__init__.py
index 4cd51dd38..12e6cd417 100644
--- a/vllm/distributed/eplb/__init__.py
+++ b/vllm/distributed/eplb/__init__.py
@@ -1,8 +1,3 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Expert parallelism load balancer (EPLB).
-"""
-
-from .eplb_state import *
-from .rebalance_algo import *
+"""Expert parallelism load balancer (EPLB)."""
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 9f8798a96..c5654659b 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -45,7 +45,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MixtureOfExperts
 
 from .async_worker import start_async_worker
-from .rebalance_algo import rebalance_experts
+from .policy import EPLB_POLICIES, AbstractEplbPolicy, DefaultEplbPolicy
 from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace
 
 logger = init_logger(__name__)
@@ -213,18 +213,23 @@ class EplbState:
         self.parallel_config = parallel_config
         self.device = device
         self.model_states: dict[str, EplbModelState] = {}
+        self.policy: type[AbstractEplbPolicy] = DefaultEplbPolicy
+        """
+        Selected EPLB algorithm class
+        """
+        self.expert_load_window_step: int = 0
         """
         Current step in the sliding window.
 
         Different from `expert_rearrangement_step`, 
         each EP rank may have its own `expert_load_window_step`.
         """
-        self.expert_load_window_step: int = 0
+        self.expert_load_window_size: int = 0
         """
         Size of the expert load sliding window.
         This is a constant and is taken from the config.
         """
-        self.expert_load_window_size: int = 0
+        self.expert_rearrangement_step: int = 0
         """
         Steps after last rearrangement.
         Will trigger a rearrangement if it exceeds the threshold.
@@ -415,6 +420,10 @@ class EplbState:
         )
         self.expert_rearrangement_step_interval = eplb_step_interval
 
+        # Set the policy based on the selected eplb algorithm type.
+        policy_type = self.parallel_config.eplb_config.policy
+        self.policy = EPLB_POLICIES[policy_type]
+        logger.debug("Selected EPLB policy: %d", policy_type)
         if global_expert_load is not None:
             ep_group = get_ep_group().device_group
             assert global_expert_load.shape == (
@@ -441,7 +450,7 @@ class EplbState:
                 new_physical_to_logical_map,
                 new_logical_to_physical_map,
                 new_logical_replica_count,
-            ) = rebalance_experts(
+            ) = self.policy.rebalance_experts(
                 global_expert_load,
                 num_replicas,
                 num_groups,
@@ -776,6 +785,7 @@ class EplbState:
                 f"{num_gpus=}, {num_nodes=}"
             )
 
+        # Get new expert mappings
         for eplb_model_state, global_expert_load_window in zip(
             self.model_states.values(), global_expert_load_windows
         ):
@@ -784,7 +794,7 @@ class EplbState:
                 new_physical_to_logical_map,
                 new_logical_to_physical_map,
                 new_logical_replica_count,
-            ) = rebalance_experts(
+            ) = self.policy.rebalance_experts(
                 global_expert_load_window,
                 num_replicas,
                 num_groups,
diff --git a/vllm/distributed/eplb/policy/__init__.py b/vllm/distributed/eplb/policy/__init__.py
new file mode 100644
index 000000000..8e78d7bac
--- /dev/null
+++ b/vllm/distributed/eplb/policy/__init__.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import get_args
+
+from vllm.config.parallel import EPLBPolicyOption
+
+from .abstract import AbstractEplbPolicy
+from .default import DefaultEplbPolicy
+
+EPLB_POLICIES = {"default": DefaultEplbPolicy}
+
+# Ensure that the EPLB_POLICIES keys match the EPLBPolicyOption values
+assert set(EPLB_POLICIES.keys()) == set(get_args(EPLBPolicyOption))
+
+__all__ = [
+    "AbstractEplbPolicy",
+    "DefaultEplbPolicy",
+    "EPLB_POLICIES",
+]
diff --git a/vllm/distributed/eplb/policy/abstract.py b/vllm/distributed/eplb/policy/abstract.py
new file mode 100644
index 000000000..40ed621c8
--- /dev/null
+++ b/vllm/distributed/eplb/policy/abstract.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class AbstractEplbPolicy(ABC):
+    @classmethod
+    @abstractmethod
+    def rebalance_experts(
+        cls,
+        weight: torch.Tensor,
+        num_replicas: int,
+        num_groups: int,
+        num_nodes: int,
+        num_ranks: int,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Entry point for expert-parallelism load balancer.
+
+        Parameters:
+            weight: [layers, num_logical_experts], the load statistics
+                for all logical experts
+            num_replicas: number of physical experts, must be a multiple of
+                `num_ranks`
+            num_groups: number of expert groups
+            num_nodes: number of server nodes
+            num_ranks: number of ranks, must be a multiple of `num_nodes`
+
+        Returns:
+            physical_to_logical_map: [layers, num_replicas], the expert
+                index of each replica
+            logical_to_physical_map: [layers, num_logical_experts, X],
+                the replica indices for each expert
+            expert_count: [layers, num_logical_experts], number of
+                physical replicas for each logical expert
+        """
+        raise NotImplementedError
diff --git a/vllm/distributed/eplb/policy/default.py b/vllm/distributed/eplb/policy/default.py
new file mode 100644
index 000000000..6127ec703
--- /dev/null
+++ b/vllm/distributed/eplb/policy/default.py
@@ -0,0 +1,267 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Expert parallelism load balancer (EPLB) for vLLM.
+
+This module implements the core rearrangement algorithm.
+
+The rearrangement algorithm is adapted from
+[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
+
+Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
+on how the EPLB algorithm works.
+"""
+
+import numpy as np
+import torch
+
+from .abstract import AbstractEplbPolicy
+
+
+class DefaultEplbPolicy(AbstractEplbPolicy):
+    @classmethod
+    def balanced_packing(
+        cls, weight: torch.Tensor, num_packs: int
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Pack n weighted objects to m packs, such that each bin contains exactly
+        n/m objects and the weights of all packs are as balanced as possible.
+
+        Parameters:
+            weight: [X, n], the weight of each item
+            num_packs: number of packs
+
+        Returns:
+            pack_index: [X, n], the pack index of each item
+            rank_in_pack: [X, n], the rank of the item in the pack
+        """
+        num_layers, num_groups = weight.shape
+        assert num_groups % num_packs == 0
+        groups_per_pack = num_groups // num_packs
+
+        device = weight.device
+
+        if groups_per_pack == 1:
+            pack_index = torch.arange(
+                weight.size(-1), dtype=torch.int64, device=device
+            ).expand(weight.shape)
+            rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device)
+            return pack_index, rank_in_pack
+
+        weight_np = weight.cpu().numpy()
+
+        # Sort and get indices in decending order
+        indices_np = np.argsort(-weight_np, axis=-1)
+
+        pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
+        rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
+
+        # Run the packing algorithm
+        for i in range(num_layers):
+            pack_weights = [0.0] * num_packs
+            pack_items = [0] * num_packs
+
+            for group in indices_np[i]:
+                # Find a pack with capacity that has the lowest weight
+                pack = min(
+                    (j for j in range(num_packs) if pack_items[j] < groups_per_pack),
+                    key=pack_weights.__getitem__,
+                )
+
+                assert pack_items[pack] < groups_per_pack
+                pack_index_np[i, group] = pack
+                rank_in_pack_np[i, group] = pack_items[pack]
+                pack_weights[pack] += weight_np[i, group]
+                pack_items[pack] += 1
+
+        pack_index = torch.from_numpy(pack_index_np).to(device)
+        rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device)
+
+        return pack_index, rank_in_pack
+
+    @classmethod
+    def replicate_experts(
+        cls, weight: torch.Tensor, num_phy: int
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Replicate `num_log` experts to `num_phy` replicas, such that the maximum
+        load of all replicas is minimized.
+
+        Parameters:
+            weight: [X, num_log]
+            num_phy: total number of experts after replication
+
+        Returns:
+            phy2log: [X, num_phy], logical expert id of each physical expert
+            rank: [X, num_phy], the replica rank
+            logcnt: [X, num_log], number of replicas for each logical expert
+        """
+        n, num_log = weight.shape
+        num_redundant = num_phy - num_log
+        assert num_redundant >= 0
+        device = weight.device
+        phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
+        rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
+        logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
+        arangen = torch.arange(n, dtype=torch.int64, device=device)
+        for i in range(num_log, num_phy):
+            redundant_indices = (weight / logcnt).max(dim=-1).indices
+            phy2log[:, i] = redundant_indices
+            rank[:, i] = logcnt[arangen, redundant_indices]
+            logcnt[arangen, redundant_indices] += 1
+        return phy2log, rank, logcnt
+
+    @classmethod
+    def rebalance_experts_hierarchical(
+        cls,
+        weight: torch.Tensor,
+        num_physical_experts: int,
+        num_groups: int,
+        num_nodes: int,
+        num_gpus: int,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            weight: [num_moe_layers, num_logical_experts]
+            num_physical_experts: number of physical experts after replication
+            num_groups: number of expert groups
+            num_nodes: number of server nodes, where the intra-node network
+                (e.g, NVLink) is faster
+            num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+        Returns:
+            phy2log: [layers, num_replicas], the expert
+                index of each replica
+            log2phy: [layers, num_logical_experts, X],
+                the replica indices for each expert
+            logcnt: [layers, num_logical_experts], number of
+                physical replicas for each logical expert
+        """
+        num_layers, num_logical_experts = weight.shape
+        assert num_logical_experts % num_groups == 0
+        group_size = num_logical_experts // num_groups
+        assert num_groups % num_nodes == 0
+        groups_per_node = num_groups // num_nodes
+        assert num_gpus % num_nodes == 0
+        assert num_physical_experts % num_gpus == 0
+        phy_experts_per_gpu = num_physical_experts // num_gpus
+
+        def inverse(perm: torch.Tensor) -> torch.Tensor:
+            inv = torch.empty_like(perm)
+            inv.scatter_(
+                1,
+                perm,
+                torch.arange(
+                    perm.size(1), dtype=torch.int64, device=perm.device
+                ).expand(perm.shape),
+            )
+            return inv
+
+        # Step 1: pack groups to nodes
+        tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+        group_pack_index, group_rank_in_pack = cls.balanced_packing(
+            tokens_per_group, num_nodes
+        )
+        log2mlog = (
+            (
+                (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
+            ).unsqueeze(-1)
+            + torch.arange(
+                group_size, dtype=torch.int64, device=group_pack_index.device
+            )
+        ).flatten(-2)
+        mlog2log = inverse(log2mlog)
+
+        # Step 2: construct redundant experts within nodes
+        # [num_layers * num_nodes, num_logical_experts // num_nodes]
+        tokens_per_mlog = weight.gather(-1, mlog2log).view(
+            -1, num_logical_experts // num_nodes
+        )
+        phy2mlog, phyrank, mlogcnt = cls.replicate_experts(
+            tokens_per_mlog, num_physical_experts // num_nodes
+        )
+
+        # Step 3: pack physical_experts to GPUs
+        # [num_layers * num_nodes, num_physical_experts // num_nodes]
+        tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+        pack_index, rank_in_pack = cls.balanced_packing(
+            tokens_per_phy, num_gpus // num_nodes
+        )
+        phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
+        pphy2phy = inverse(phy2pphy)
+
+        pphy2mlog = phy2mlog.gather(
+            -1, pphy2phy
+        )  # [num_layers * num_nodes, num_log_per_nodes]
+        pphy2mlog = (
+            pphy2mlog.view(num_layers, num_nodes, -1)
+            + torch.arange(
+                0,
+                num_logical_experts,
+                num_logical_experts // num_nodes,
+                device=group_pack_index.device,
+            ).view(1, -1, 1)
+        ).flatten(-2)
+        pphy2log = mlog2log.gather(-1, pphy2mlog)
+        pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
+        logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
+        return pphy2log, pphyrank, logcnt
+
+    @classmethod
+    def rebalance_experts(
+        cls,
+        weight: torch.Tensor,
+        num_replicas: int,
+        num_groups: int,
+        num_nodes: int,
+        num_ranks: int,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Entry point for expert-parallelism load balancer.
+
+        Parameters:
+            weight: [layers, num_logical_experts], the load statistics for all
+                logical experts
+            num_replicas: number of physical experts, must be a multiple of
+                `num_gpus`
+            num_groups: number of expert groups
+            num_nodes: number of server nodes, where the intra-node network
+                (e.g, NVLink) is faster
+            num_ranks: number of ranks, must be a multiple of `num_nodes`
+
+        Returns:
+            phy2log: [layers, num_replicas], the expert
+                index of each replica
+            log2phy: [layers, num_logical_experts, X],
+                the replica indices for each expert
+            logcnt: [layers, num_logical_experts], number of
+                physical replicas for each logical expert
+        """
+        num_layers, num_logical_experts = weight.shape
+        weight = weight.float()
+        if num_groups % num_nodes == 0:
+            # use hierarchical load-balance policy
+            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
+                weight, num_replicas, num_groups, num_nodes, num_ranks
+            )
+        else:
+            # use global load-balance policy
+            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
+                weight, num_replicas, 1, 1, num_ranks
+            )
+        num_redundant_experts = num_replicas - num_logical_experts
+        maxlogcnt = num_redundant_experts + 1
+        log2phy: torch.Tensor = torch.full(
+            (num_layers, num_logical_experts, maxlogcnt),
+            -1,
+            dtype=torch.int64,
+            device=logcnt.device,
+        )
+        log2phy.view(num_layers, -1).scatter_(
+            -1,
+            phy2log * maxlogcnt + phyrank,
+            torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
+                num_layers, -1
+            ),
+        )
+        return phy2log, log2phy, logcnt
diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py
deleted file mode 100644
index e6645e524..000000000
--- a/vllm/distributed/eplb/rebalance_algo.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Expert parallelism load balancer (EPLB) for vLLM.
-
-This module implements the core rearrangement algorithm.
-
-The rearrangement algorithm is adapted from
-[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
-
-Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
-on how the EPLB algorithm works.
-"""
-
-import numpy as np
-import torch
-
-
-def balanced_packing(
-    weight: torch.Tensor, num_packs: int
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """
-    Pack n weighted objects to m packs, such that each bin contains exactly
-    n/m objects and the weights of all packs are as balanced as possible.
-
-    Parameters:
-        weight: [X, n], the weight of each item
-        num_packs: number of packs
-
-    Returns:
-        pack_index: [X, n], the pack index of each item
-        rank_in_pack: [X, n], the rank of the item in the pack
-    """
-    num_layers, num_groups = weight.shape
-    assert num_groups % num_packs == 0
-    groups_per_pack = num_groups // num_packs
-
-    device = weight.device
-
-    if groups_per_pack == 1:
-        pack_index = torch.arange(
-            weight.size(-1), dtype=torch.int64, device=device
-        ).expand(weight.shape)
-        rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device)
-        return pack_index, rank_in_pack
-
-    weight_np = weight.cpu().numpy()
-
-    # Sort and get indices in decending order
-    indices_np = np.argsort(-weight_np, axis=-1)
-
-    pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
-    rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
-
-    # Run the packing algorithm
-    for i in range(num_layers):
-        pack_weights = [0.0] * num_packs
-        pack_items = [0] * num_packs
-
-        for group in indices_np[i]:
-            # Find a pack with capacity that has the lowest weight
-            pack = min(
-                (j for j in range(num_packs) if pack_items[j] < groups_per_pack),
-                key=pack_weights.__getitem__,
-            )
-
-            assert pack_items[pack] < groups_per_pack
-            pack_index_np[i, group] = pack
-            rank_in_pack_np[i, group] = pack_items[pack]
-            pack_weights[pack] += weight_np[i, group]
-            pack_items[pack] += 1
-
-    pack_index = torch.from_numpy(pack_index_np).to(device)
-    rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device)
-
-    return pack_index, rank_in_pack
-
-
-def replicate_experts(
-    weight: torch.Tensor, num_phy: int
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """
-    Replicate `num_log` experts to `num_phy` replicas, such that the maximum
-    load of all replicas is minimized.
-
-    Parameters:
-        weight: [X, num_log]
-        num_phy: total number of experts after replication
-
-    Returns:
-        phy2log: [X, num_phy], logical expert id of each physical expert
-        rank: [X, num_phy], the replica rank
-        logcnt: [X, num_log], number of replicas for each logical expert
-    """
-    n, num_log = weight.shape
-    num_redundant = num_phy - num_log
-    assert num_redundant >= 0
-    device = weight.device
-    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
-    rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
-    logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
-    arangen = torch.arange(n, dtype=torch.int64, device=device)
-    for i in range(num_log, num_phy):
-        redundant_indices = (weight / logcnt).max(dim=-1).indices
-        phy2log[:, i] = redundant_indices
-        rank[:, i] = logcnt[arangen, redundant_indices]
-        logcnt[arangen, redundant_indices] += 1
-    return phy2log, rank, logcnt
-
-
-def rebalance_experts_hierarchical(
-    weight: torch.Tensor,
-    num_physical_experts: int,
-    num_groups: int,
-    num_nodes: int,
-    num_gpus: int,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """
-    Parameters:
-        weight: [num_moe_layers, num_logical_experts]
-        num_physical_experts: number of physical experts after replication
-        num_groups: number of expert groups
-        num_nodes: number of server nodes, where the intra-node network
-            (e.g., NVLink) is faster
-        num_gpus: number of GPUs, must be a multiple of `num_nodes`
-
-    Returns:
-        physical_to_logical_map (torch.Tensor):
-            [num_moe_layers, num_physical_experts]
-        logical_to_physical_map (torch.Tensor):
-            [num_moe_layers, num_logical_experts, X]
-        logical_count (torch.Tensor):
-            [num_moe_layers, num_logical_experts]
-    """
-    num_layers, num_logical_experts = weight.shape
-    assert num_logical_experts % num_groups == 0
-    group_size = num_logical_experts // num_groups
-    assert num_groups % num_nodes == 0
-    groups_per_node = num_groups // num_nodes
-    assert num_gpus % num_nodes == 0
-    assert num_physical_experts % num_gpus == 0
-    phy_experts_per_gpu = num_physical_experts // num_gpus
-
-    def inverse(perm: torch.Tensor) -> torch.Tensor:
-        inv = torch.empty_like(perm)
-        inv.scatter_(
-            1,
-            perm,
-            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
-                perm.shape
-            ),
-        )
-        return inv
-
-    # Step 1: pack groups to nodes
-    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
-    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
-    log2mlog = (
-        (
-            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
-        ).unsqueeze(-1)
-        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
-    ).flatten(-2)
-    mlog2log = inverse(log2mlog)
-
-    # Step 2: construct redundant experts within nodes
-    # [num_layers * num_nodes, num_logical_experts // num_nodes]
-    tokens_per_mlog = weight.gather(-1, mlog2log).view(
-        -1, num_logical_experts // num_nodes
-    )
-    phy2mlog, phyrank, mlogcnt = replicate_experts(
-        tokens_per_mlog, num_physical_experts // num_nodes
-    )
-
-    # Step 3: pack physical_experts to GPUs
-    # [num_layers * num_nodes, num_physical_experts // num_nodes]
-    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
-    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
-    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
-    pphy2phy = inverse(phy2pphy)
-
-    pphy2mlog = phy2mlog.gather(
-        -1, pphy2phy
-    )  # [num_layers * num_nodes, num_log_per_nodes]
-    pphy2mlog = (
-        pphy2mlog.view(num_layers, num_nodes, -1)
-        + torch.arange(
-            0,
-            num_logical_experts,
-            num_logical_experts // num_nodes,
-            device=group_pack_index.device,
-        ).view(1, -1, 1)
-    ).flatten(-2)
-    pphy2log = mlog2log.gather(-1, pphy2mlog)
-    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
-    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
-    return pphy2log, pphyrank, logcnt
-
-
-def rebalance_experts(
-    weight: torch.Tensor,
-    num_replicas: int,
-    num_groups: int,
-    num_nodes: int,
-    num_gpus: int,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """
-    Entry point for expert-parallelism load balancer.
-
-    Parameters:
-        weight: [layers, num_logical_experts], the load statistics for all
-            logical experts
-        num_replicas: number of physical experts, must be a multiple of
-            `num_gpus`
-        num_groups: number of expert groups
-        num_nodes: number of server nodes, where the intra-node network
-            (e.g, NVLink) is faster
-        num_gpus: number of GPUs, must be a multiple of `num_nodes`
-
-    Returns:
-        physical_to_logical_map:
-            [layers, num_replicas], the expert index of each replica
-        logical_to_physical_map:
-            [layers, num_logical_experts, X], the replica indices for each
-            expert
-        expert_count:
-            [layers, num_logical_experts], number of physical
-            replicas for each logical expert
-    """
-    num_layers, num_logical_experts = weight.shape
-    weight = weight.float()
-    if num_groups % num_nodes == 0:
-        # use hierarchical load-balance policy
-        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
-            weight, num_replicas, num_groups, num_nodes, num_gpus
-        )
-    else:
-        # use global load-balance policy
-        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
-            weight, num_replicas, 1, 1, num_gpus
-        )
-    num_redundant_experts = num_replicas - num_logical_experts
-    maxlogcnt = num_redundant_experts + 1
-    log2phy: torch.Tensor = torch.full(
-        (num_layers, num_logical_experts, maxlogcnt),
-        -1,
-        dtype=torch.int64,
-        device=logcnt.device,
-    )
-    log2phy.view(num_layers, -1).scatter_(
-        -1,
-        phy2log * maxlogcnt + phyrank,
-        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
-            num_layers, -1
-        ),
-    )
-    return phy2log, log2phy, logcnt
-
-
-__all__ = ["rebalance_experts"]
-- 
GitLab


From 48a5fff66e78985a634abac0d8d7f271da744000 Mon Sep 17 00:00:00 2001
From: Peng-YM <1048217874pengym@gmail.com>
Date: Fri, 5 Dec 2025 03:09:39 +0800
Subject: [PATCH 100/258] [Bugfix] Missing tokens in `return_token_ids` when
 tool parsers is enabled in streaming mode (#29074)

Signed-off-by: Peng-YM <1048217874pengym@gmail.com>
---
 vllm/entrypoints/openai/serving_chat.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index cecd1da1e..9b7bc461e 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1072,10 +1072,15 @@ class OpenAIServingChat(OpenAIServing):
                     # wasn't ready to send a token, then
                     #   get the next token without streaming a chunk
                     if delta_message is None:
-                        if output.finish_reason is None:
+                        # NOTE: If return_token_ids is enabled, we still need to
+                        # send a chunk with token_ids even if delta_message is None
+                        # to ensure all tokens are included in the response
+                        if (
+                            output.finish_reason is None
+                            and not request.return_token_ids
+                        ):
                             continue
-                        else:
-                            delta_message = DeltaMessage()
+                        delta_message = DeltaMessage()
 
                     # Log streaming delta if output logging is enabled
                     if self.enable_log_outputs and self.request_logger:
-- 
GitLab


From c8ab988b15af5e30e87c6eb27a0ededf0377ac9e Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Thu, 4 Dec 2025 14:48:54 -0500
Subject: [PATCH 101/258] [BugFix] Fix DBO assert `assert B_block_table == B_q`
 (#29933)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 .../v1/attention/test_attention_splitting.py  | 12 +++--
 vllm/v1/spec_decode/eagle.py                  |  4 +-
 vllm/v1/worker/dp_utils.py                    | 43 ++----------------
 vllm/v1/worker/gpu_model_runner.py            | 45 ++++++++++++-------
 vllm/v1/worker/ubatch_utils.py                | 42 +++++++++++++++--
 5 files changed, 83 insertions(+), 63 deletions(-)

diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py
index 1cbd0fe56..f60861e34 100644
--- a/tests/v1/attention/test_attention_splitting.py
+++ b/tests/v1/attention/test_attention_splitting.py
@@ -13,7 +13,7 @@ from vllm.v1.attention.backends.utils import (
     split_attn_metadata,
     split_decodes_and_prefills,
 )
-from vllm.v1.worker.ubatch_utils import create_ubatch_slices
+from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
 
 
 @pytest.fixture
@@ -294,8 +294,14 @@ def test_prefill_split_across_ubatches(
     qsl_np = common.query_start_loc_cpu.numpy()
     num_tokens = common.num_actual_tokens
 
-    ubatch_slices = create_ubatch_slices(num_scheduled_tokens, split_point)
-    assert len(ubatch_slices) == 2
+    ubatch_slices, _ = maybe_create_ubatch_slices(
+        True,
+        num_scheduled_tokens,
+        num_tokens,
+        batch_spec.batch_size,
+        split_point=split_point,
+    )
+    assert ubatch_slices is not None and len(ubatch_slices) == 2
 
     first_meta = _make_metadata_with_slice(ubatch_slices[0], common)
     second_meta = _make_metadata_with_slice(ubatch_slices[1], common)
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 1c7845a14..31428db2d 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1258,7 +1258,7 @@ class EagleProposer:
         num_tokens_padded: int,
     ) -> tuple[int, torch.Tensor]:
         # TODO(Flechman): support DBO ubatching
-        ubatch_slices, num_toks_across_dp = coordinate_batch_across_dp(
+        should_ubatch, num_toks_across_dp = coordinate_batch_across_dp(
             num_tokens_unpadded=num_tokens_unpadded,
             parallel_config=self.vllm_config.parallel_config,
             allow_microbatching=False,
@@ -1267,7 +1267,7 @@ class EagleProposer:
             uniform_decode=None,
             num_scheduled_tokens_per_request=None,
         )
-        assert ubatch_slices is None, "DBO ubatching not implemented for EAGLE"
+        assert not should_ubatch, "DBO ubatching not implemented for EAGLE"
 
         num_tokens_dp_padded = num_tokens_padded
         if num_toks_across_dp is not None:
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 6539d72d8..5da55d740 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -9,10 +10,7 @@ from vllm.config import ParallelConfig
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.logger import init_logger
 from vllm.v1.worker.ubatch_utils import (
-    UBatchSlice,
-    UBatchSlices,
     check_ubatch_thresholds,
-    create_ubatch_slices,
     is_second_ubatch_empty,
 )
 
@@ -91,20 +89,6 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch
         return num_tokens_across_dp.cpu()
 
 
-# This just pads the second ubatch slice out to the total number of tokens
-# (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding.
-def _pad_out_ubatch_slice(
-    ubatch_slices: UBatchSlices, num_total_tokens: int
-) -> UBatchSlices:
-    padded_second_token_slice = slice(
-        ubatch_slices[1].token_slice.start, num_total_tokens
-    )
-    ubatch_slices[1] = UBatchSlice(
-        ubatch_slices[1].request_slice, padded_second_token_slice
-    )
-    return ubatch_slices
-
-
 def _synchronize_dp_ranks(
     num_tokens_unpadded: int,
     num_tokens_padded: int,
@@ -175,7 +159,7 @@ def coordinate_batch_across_dp(
     num_tokens_padded: int | None = None,
     uniform_decode: bool | None = None,
     num_scheduled_tokens_per_request: np.ndarray | None = None,
-) -> tuple[UBatchSlices | None, torch.Tensor | None]:
+) -> tuple[bool, torch.Tensor | None]:
     """
     Coordinates amongst all DP ranks to determine if and how the full batch
     should be split into microbatches.
@@ -204,7 +188,7 @@ def coordinate_batch_across_dp(
     """
     if parallel_config.data_parallel_size == 1:
         # Early exit.
-        return None, None
+        return False, None
 
     # If the caller has explicitly enabled microbatching.
     should_attempt_ubatching = False
@@ -228,23 +212,4 @@ def coordinate_batch_across_dp(
         parallel_config,
     )
 
-    # Don't microbatch unless every other DP worker is also microbatching
-    if not should_ubatch:
-        return (None, num_tokens_after_padding)
-
-    # This doesn't actually pad the ubatch slices. It just initializes the
-    # split point to the padded value so that padding can be applied
-    # to the second ubatch in pad_out_ubatch_slice after attention
-    # metadata creation
-    assert num_tokens_after_padding is not None
-    num_tokens_padded = int(num_tokens_after_padding[0].item())
-    token_split_point = int(num_tokens_padded) // 2
-
-    assert num_scheduled_tokens_per_request is not None
-    ubatch_slices = create_ubatch_slices(
-        num_scheduled_tokens_per_request, token_split_point
-    )
-    ubatch_slices = _pad_out_ubatch_slice(ubatch_slices, num_tokens_padded)
-    assert sum(s.num_tokens for s in ubatch_slices) == num_tokens_padded
-
-    return (ubatch_slices, num_tokens_after_padding)
+    return (should_ubatch, num_tokens_after_padding)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 58043a42d..152bea2c0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -153,6 +153,7 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 from vllm.v1.worker.ubatch_utils import (
     UBatchSlices,
     check_ubatch_thresholds,
+    maybe_create_ubatch_slices,
 )
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
 
@@ -2743,7 +2744,7 @@ class GPUModelRunner(
     ) -> tuple[
         CUDAGraphMode,
         BatchDescriptor,
-        UBatchSlices | None,
+        bool,
         torch.Tensor | None,
         CUDAGraphStat | None,
     ]:
@@ -2779,7 +2780,7 @@ class GPUModelRunner(
 
         # Extra coordination when running data-parallel since we need to coordinate
         # across ranks
-        ubatch_slices, num_tokens_across_dp = None, None
+        should_ubatch, num_tokens_across_dp = False, None
         if self.vllm_config.parallel_config.data_parallel_size > 1:
             # Disable DP padding when running eager to avoid excessive padding when
             # running prefills. This lets us set cudagraph_mode="NONE" on the prefiller
@@ -2789,8 +2790,8 @@ class GPUModelRunner(
                 self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
             )
 
-            ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
-                num_tokens_unpadded=num_tokens_padded,
+            should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp(
+                num_tokens_unpadded=num_tokens,
                 parallel_config=self.parallel_config,
                 allow_microbatching=allow_microbatching,
                 allow_dp_padding=allow_dp_padding,
@@ -2822,7 +2823,7 @@ class GPUModelRunner(
         return (
             cudagraph_mode,
             batch_descriptor,
-            ubatch_slices,
+            should_ubatch,
             num_tokens_across_dp,
             cudagraph_stats,
         )
@@ -2921,7 +2922,7 @@ class GPUModelRunner(
                 (
                     cudagraph_mode,
                     batch_desc,
-                    ubatch_slices,
+                    should_ubatch,
                     num_tokens_across_dp,
                     cudagraph_stats,
                 ) = self._determine_batch_execution_and_padding(
@@ -2934,10 +2935,10 @@ class GPUModelRunner(
 
                 logger.debug(
                     "Running batch with cudagraph_mode: %s, batch_descriptor: %s, "
-                    "ubatch_slices: %s, num_tokens_across_dp: %s",
+                    "should_ubatch: %s, num_tokens_across_dp: %s",
                     cudagraph_mode,
                     batch_desc,
-                    ubatch_slices,
+                    should_ubatch,
                     num_tokens_across_dp,
                 )
 
@@ -2945,10 +2946,18 @@ class GPUModelRunner(
                 num_reqs_padded = (
                     batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
                 )
+                ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
+                    should_ubatch,
+                    num_scheduled_tokens_np,
+                    num_tokens_padded,
+                    num_reqs_padded,
+                )
 
-                use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
                 pad_attn = cudagraph_mode == CUDAGraphMode.FULL
 
+                use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
+                ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
+
                 (attn_metadata, spec_decode_common_attn_metadata) = (
                     self._build_attention_metadata(
                         num_tokens=num_tokens_unpadded,
@@ -2956,7 +2965,7 @@ class GPUModelRunner(
                         num_reqs=num_reqs,
                         num_reqs_padded=num_reqs_padded if pad_attn else None,
                         max_query_len=max_num_scheduled_tokens,
-                        ubatch_slices=ubatch_slices,
+                        ubatch_slices=ubatch_slices_attn,
                         logits_indices=logits_indices,
                         use_spec_decode=use_spec_decode,
                         num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
@@ -2993,7 +3002,7 @@ class GPUModelRunner(
                 num_tokens_across_dp=num_tokens_across_dp,
                 cudagraph_runtime_mode=cudagraph_mode,
                 batch_descriptor=batch_desc,
-                ubatch_slices=ubatch_slices,
+                ubatch_slices=ubatch_slices_padded,
             ),
             record_function_or_nullcontext("gpu_model_runner: forward"),
             self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
@@ -3945,7 +3954,7 @@ class GPUModelRunner(
 
         num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
 
-        _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp, _ = (
+        _cudagraph_mode, batch_desc, should_ubatch, num_tokens_across_dp, _ = (
             self._determine_batch_execution_and_padding(
                 num_tokens=num_tokens_unpadded,
                 num_reqs=num_reqs,
@@ -3979,6 +3988,9 @@ class GPUModelRunner(
         num_reqs_padded = (
             batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
         )
+        ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
+            should_ubatch, num_scheduled_tokens, num_tokens_padded, num_reqs_padded
+        )
 
         attn_metadata: PerLayerAttnMetadata | None = None
 
@@ -4000,11 +4012,12 @@ class GPUModelRunner(
             self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens
             self.query_start_loc.copy_to_gpu()
 
+            pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
             attn_metadata, _ = self._build_attention_metadata(
                 num_tokens=num_tokens_unpadded,
                 num_reqs=num_reqs_padded,
                 max_query_len=max_query_len,
-                ubatch_slices=ubatch_slices,
+                ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices,
                 for_cudagraph_capture=is_graph_capturing,
             )
 
@@ -4056,11 +4069,11 @@ class GPUModelRunner(
                     num_tokens_padded, None, False
                 )
 
-            if ubatch_slices is not None:
+            if ubatch_slices_padded is not None:
                 # Adjust values to reflect a single ubatch.
                 # TODO(sage,lucas): this is cruft that should be addressed in
                 #  the padding refactor.
-                num_tokens_padded = ubatch_slices[0].num_tokens
+                num_tokens_padded = ubatch_slices_padded[0].num_tokens
                 if num_tokens_across_dp is not None:
                     num_tokens_across_dp[:] = num_tokens_padded
 
@@ -4073,7 +4086,7 @@ class GPUModelRunner(
                     num_tokens_across_dp=num_tokens_across_dp,
                     cudagraph_runtime_mode=cudagraph_runtime_mode,
                     batch_descriptor=batch_desc,
-                    ubatch_slices=ubatch_slices,
+                    ubatch_slices=ubatch_slices_padded,
                 ),
             ):
                 outputs = self.model(
diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py
index 33a1921d2..44788476f 100644
--- a/vllm/v1/worker/ubatch_utils.py
+++ b/vllm/v1/worker/ubatch_utils.py
@@ -42,9 +42,37 @@ def check_ubatch_thresholds(
         return num_tokens >= config.dbo_prefill_token_threshold
 
 
-def create_ubatch_slices(
-    num_scheduled_tokens: np.ndarray, split_point: int
+# This just pads the second ubatch slice out to the total number of tokens
+# (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding.
+def _pad_out_ubatch_slices(
+    ubatch_slices: UBatchSlices, num_total_tokens: int, num_reqs_padded: int
 ) -> UBatchSlices:
+    # TODO(lucas): handle empty second ubatch
+    padded_second_request_slice = slice(
+        ubatch_slices[1].request_slice.start, num_reqs_padded
+    )
+    padded_second_token_slice = slice(
+        ubatch_slices[1].token_slice.start, num_total_tokens
+    )
+    return [
+        ubatch_slices[0],
+        UBatchSlice(padded_second_request_slice, padded_second_token_slice),
+    ]
+
+
+def maybe_create_ubatch_slices(
+    should_ubatch: bool,
+    num_scheduled_tokens: np.ndarray,
+    num_tokens_padded: int,
+    num_reqs_padded: int,
+    split_point: int | None = None,
+) -> tuple[UBatchSlices | None, UBatchSlices | None]:
+    if not should_ubatch:
+        return None, None
+
+    if split_point is None:
+        split_point = int(num_tokens_padded) // 2
+
     # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
     # in cu_num_tokens directly (i.e. query_start_loc)
     cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
@@ -67,7 +95,15 @@ def create_ubatch_slices(
     )
     second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1)
 
-    return [
+    ubatch_slices = [
         UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
         UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice),
     ]
+
+    ubatch_slices_padded = _pad_out_ubatch_slices(
+        ubatch_slices, num_tokens_padded, num_reqs_padded
+    )
+
+    assert sum(s.num_tokens for s in ubatch_slices_padded) == num_tokens_padded
+
+    return ubatch_slices, ubatch_slices_padded
-- 
GitLab


From 1f0d1845909877b3492d725cb63b386d80c39b47 Mon Sep 17 00:00:00 2001
From: Laith Sakka <lsakka@meta.com>
Date: Thu, 4 Dec 2025 14:33:45 -0800
Subject: [PATCH 102/258] [aot_compile]change VLLM backend to read fake args
 from example_value (#29104)

Signed-off-by: Laith Sakka <lsakka@meta.com>
---
 tests/compile/test_aot_compile.py | 66 +++++++++++++++++++++++++++++++
 vllm/compilation/backends.py      | 24 ++++++-----
 vllm/compilation/decorators.py    |  1 -
 3 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index c65e5a259..8fa305d6d 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import functools
+import multiprocessing
 import tempfile
 from contextlib import contextmanager
 
@@ -137,3 +139,67 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
                 artifacts = compiled_mod.aot_compiled_fn._artifacts
                 guards_string = artifacts.compiled_fn.shape_env.format_guards()
                 assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+@use_vllm_config(make_vllm_config())
+def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that compiling gpt2 twice results in a cache hit and
+    capture torch dynamic symbol creations to ensure make_symbol
+    not called on cache hit.
+    """
+
+    import torch.fx.experimental.symbolic_shapes as symbolic_shapes_module
+    from torch.utils._sympy.symbol import make_symbol
+
+    from vllm import LLM
+
+    create_symbol_counter = multiprocessing.Value("i", 0)
+    original_make_symbol = make_symbol
+
+    @functools.wraps(original_make_symbol)
+    def counting_make_symbol(prefix, idx, **kwargs):
+        with create_symbol_counter.get_lock():
+            create_symbol_counter.value += 1
+        return original_make_symbol(prefix, idx, **kwargs)
+
+    symbolic_shapes_module.make_symbol = counting_make_symbol
+    try:
+        with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            # First compilation - initialize model and generate
+            llm_model = LLM(
+                model="gpt2",
+                compilation_config=CompilationConfig(
+                    mode=CompilationMode.VLLM_COMPILE,
+                ),
+                max_model_len=256,
+            )
+
+            llm_model.generate("Hello, my name is")
+            assert create_symbol_counter.value == 2
+            create_symbol_counter.value = 0
+
+            # Clean up first model
+            del llm_model
+
+            # Second compilation - should hit cache
+            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+            llm_model = LLM(
+                model="gpt2",
+                compilation_config=CompilationConfig(
+                    mode=CompilationMode.VLLM_COMPILE,
+                ),
+                max_model_len=256,
+            )
+            llm_model.generate("Hello, my name is")
+
+            assert create_symbol_counter.value == 0
+
+    finally:
+        # Restore original method
+        symbolic_shapes_module.make_symbol = original_make_symbol
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 1773913d0..b5b7fe2b7 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -402,6 +402,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
         self.extra_traceback = False
 
     def run(self, *args):
+        # maybe instead just assert inputs are fake?
         fake_args = [
             self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
             for t in args
@@ -416,11 +417,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
         kwargs: dict[str, Any],
     ) -> Any:
         assert isinstance(target, str)
+
         output = super().call_module(target, args, kwargs)
 
         if target in self.compile_submod_names:
             index = self.compile_submod_names.index(target)
             submod = self.fetch_attr(target)
+
             sym_shape_indices = [
                 i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
             ]
@@ -746,11 +749,21 @@ class VllmBackend:
             if not item.is_splitting_graph
         ]
 
+        # Extract fake values from the graph to use them when needed.
+        all_fake_values = []
+        for i in graph.graph.find_nodes(op="placeholder"):
+            all_fake_values.append(i.meta["example_value"])
+
+        fake_args = [
+            all_fake_values[i] if isinstance(t, torch.Tensor) else t
+            for i, t in enumerate(example_inputs)
+        ]
+
         # propagate the split graph to the piecewise backend,
         # compile submodules with symbolic shapes
         PiecewiseCompileInterpreter(
             self.split_gm, submod_names_to_compile, self.vllm_config, self
-        ).run(*example_inputs)
+        ).run(*fake_args)
 
         graph_path = os.path.join(local_cache_dir, "computation_graph.py")
         if not os.path.exists(graph_path):
@@ -780,14 +793,7 @@ class VllmBackend:
             )
 
         # if we need to copy input buffers for cudagraph
-        from torch._guards import detect_fake_mode
-
-        fake_mode = detect_fake_mode()
-        fake_args = [
-            fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
-            for t in example_inputs
-        ]
-
+        #
         # index of tensors that have symbolic shapes (batch size)
         # for weights and static buffers, they will have concrete shapes.
         # symbolic shape only happens for input tensors.
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 6d9da1c48..eed7795cd 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -433,7 +433,6 @@ def _support_torch_compile(
             return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
 
         # This is the path for the first compilation.
-
         # the first compilation needs to have dynamic shapes marked
         _mark_dynamic_inputs(
             self,
-- 
GitLab


From 690cc3ef20eec0d080b8e2fce397bf4f981beaf1 Mon Sep 17 00:00:00 2001
From: TimWang <7367474+haitwang-cloud@users.noreply.github.com>
Date: Fri, 5 Dec 2025 07:37:14 +0800
Subject: [PATCH 103/258] docs: update metrics design doc to use new
 vllm:kv_cache_usage_perc (#30041)

Signed-off-by: Tim <tim.wang03@sap.com>
---
 docs/design/metrics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index 13264f686..28b540587 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -62,7 +62,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
 - `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
 - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
 - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
-- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
+- `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM.
 - `vllm:request_prompt_tokens` - Request prompt length.
 - `vllm:request_generation_tokens` - Request generation length.
 - `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
-- 
GitLab


From 4470ee2f90661d9eb632687750dfb1a5a2404032 Mon Sep 17 00:00:00 2001
From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com>
Date: Thu, 4 Dec 2025 19:03:17 -0500
Subject: [PATCH 104/258] [Perf] Enable separate shared_experts stream only for
 CUDA (#30085)

Signed-off-by: Alexander Matveev <amatveev@redhat.com>
---
 vllm/model_executor/layers/fused_moe/layer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 902a77987..6001b6d83 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -863,7 +863,8 @@ class FusedMoE(CustomOp):
         use_chunked_impl: bool,
     ) -> tuple[bool, torch.Tensor | None]:
         use_shared_experts_stream = (
-            has_separate_shared_experts
+            current_platform.is_cuda()
+            and has_separate_shared_experts
             and not use_chunked_impl
             and self.shared_experts_stream is not None
             and (
-- 
GitLab


From bcf43ab1f380208ea33769c49d116ea83f915080 Mon Sep 17 00:00:00 2001
From: Zhewen Li <zhewenli@meta.com>
Date: Thu, 4 Dec 2025 16:07:20 -0800
Subject: [PATCH 105/258] [CI/Build][AMD] Add Llama4 Maverick FP8 to AMD CI
 (#28695)

Signed-off-by: zhewenli <zhewenli@meta.com>
---
 ...lama-4-Maverick-17B-128E-Instruct-FP8.yaml |   1 +
 .../configs/models-large-rocm.txt             |   1 +
 .../test_lm_eval_correctness.py               |  75 +++++--
 .buildkite/test-amd.yaml                      | 193 +++++++++---------
 4 files changed, 158 insertions(+), 112 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/models-large-rocm.txt

diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
index 46f1a9fbf..6c0b5540c 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -8,3 +8,4 @@ tasks:
     value: 0.80
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
+rtol: 0.05
diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
new file mode 100644
index 000000000..4fb0b84bc
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 3627b760e..f94d68119 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
     --tp-size=1
 """
 
+import os
+from contextlib import contextmanager
+
 import lm_eval
 import numpy as np
 import yaml
 
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+    if not new_env:
+        # Fast path: nothing to do
+        yield
+        return
+
+    old_values = {}
+    new_keys = []
+
+    try:
+        for key, value in new_env.items():
+            if key in os.environ:
+                old_values[key] = os.environ[key]
+            else:
+                new_keys.append(key)
+            os.environ[key] = str(value)
+        yield
+    finally:
+        # Restore / clean up
+        for key, value in old_values.items():
+            os.environ[key] = value
+        for key in new_keys:
+            os.environ.pop(key, None)
 
 
 def launch_lm_eval(eval_config, tp_size):
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
     )
-    results = lm_eval.simple_evaluate(
-        model=backend,
-        model_args=model_args,
-        tasks=[task["name"] for task in eval_config["tasks"]],
-        num_fewshot=eval_config["num_fewshot"],
-        limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm, or explicitly set
-        apply_chat_template=eval_config.get(
-            "apply_chat_template", backend == "vllm-vlm"
-        ),
-        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-        gen_kwargs=eval_config.get("gen_kwargs"),
-        batch_size=batch_size,
-    )
+
+    env_vars = eval_config.get("env_vars", None)
+    with scoped_env_vars(env_vars):
+        results = lm_eval.simple_evaluate(
+            model=backend,
+            model_args=model_args,
+            tasks=[task["name"] for task in eval_config["tasks"]],
+            num_fewshot=eval_config["num_fewshot"],
+            limit=eval_config["limit"],
+            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+            # text models. however, this is regressing measured strict-match for
+            # existing text models in CI, so only apply it for mm, or explicitly set
+            apply_chat_template=eval_config.get(
+                "apply_chat_template", backend == "vllm-vlm"
+            ),
+            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+            gen_kwargs=eval_config.get("gen_kwargs"),
+            batch_size=batch_size,
+        )
     return results
 
 
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 
     results = launch_lm_eval(eval_config, tp_size)
 
+    rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(
                 f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
+                f"ground_truth={ground_truth:.3f} | "
+                f"measured={measured_value:.3f} | rtol={rtol}"
             )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
 
     assert success
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 022b6ea23..6950ad774 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -718,17 +718,6 @@ steps:
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
-- label: LM Eval Small Models # 15min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
 - label: OpenAI API correctness # 10min
   timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -974,19 +963,6 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
-  timeout_in_minutes: 70
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: Multi-Modal Models Test (Extended) 1 # 60min
   timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
@@ -1162,21 +1138,6 @@ steps:
     # Run all e2e fusion tests
     - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
 
-- label: ROCm GPT-OSS Eval
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  agent_pool: mi325_1
-  mirror_hardwares: [amdexperimental, amdproduction]
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - tests/evals/gpt_oss
-  - vllm/model_executor/models/gpt_oss.py
-  - vllm/model_executor/layers/quantization/mxfp4.py
-  - vllm/v1/attention/backends/flashinfer.py
-  commands:
-    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
-
 - label: Blackwell Quantized MoE Test
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
@@ -1194,16 +1155,6 @@ steps:
   commands:
     - pytest -s -v tests/quantization/test_blackwell_moe.py
 
-- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 120
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
 #####  1 GPU test  #####
 #####  multi gpus test  #####
 
@@ -1380,7 +1331,7 @@ steps:
     - pytest -v -s -x lora/test_llm_with_multi_loras.py
     - pytest -v -s -x lora/test_olmoe_tp.py
 
-    # Disabled for now because MXFP4 backend on non-cuda platform 
+    # Disabled for now because MXFP4 backend on non-cuda platform
     # doesn't support LoRA yet
     #- pytest -v -s -x lora/test_gptoss_tp.py
 
@@ -1446,37 +1397,6 @@ steps:
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py
 
-- label: LM Eval Large Models # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: a100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
-##### H100 test #####
-- label: LM Eval Large Models (H100) # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
-    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
   mirror_hardwares: [amdexperimental]
@@ -1508,20 +1428,94 @@ steps:
     - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
-##### RL Integration Tests #####
-- label: Prime-RL Integration Test # 15min
-  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_2
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
   # grade: Blocking
-  timeout_in_minutes: 30
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  timeout_in_minutes: 70
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 Card)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: a100
   optional: true
-  num_gpus: 2
-  working_dir: "/vllm-workspace"
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
+  - csrc/
+  - vllm/model_executor/layers/quantization
   commands:
-    - bash .buildkite/scripts/run-prime-rl-test.sh
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (H100) # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: ROCm LM Eval Large Models (8 Card)
+  mirror_hardwares: [amdproduction]
+  agent_pool: mi325_8
+  num_gpus: 8
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  agent_pool: mi325_1
+  mirror_hardwares: [amdexperimental, amdproduction]
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
 
 - label: DeepSeek V2-Lite Accuracy
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -1554,4 +1548,19 @@ steps:
   num_gpus: 2
   working_dir: "/vllm-workspace"
   commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
-- 
GitLab


From 263c38d74d87b196ad08449002fc7799e76bad9d Mon Sep 17 00:00:00 2001
From: Zhewen Li <zhewenli@meta.com>
Date: Thu, 4 Dec 2025 16:42:37 -0800
Subject: [PATCH 106/258] [CI/Build] Update batch invariant test trigger
 (#30080)

Signed-off-by: zhewenli <zhewenli@meta.com>
---
 .buildkite/test-pipeline.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a79f0b0c6..0a99994e2 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -350,7 +350,8 @@ steps:
   timeout_in_minutes: 25
   gpu: h100
   source_file_dependencies:
-    - vllm/
+    - vllm/v1/attention
+    - vllm/model_executor/layers
     - tests/v1/determinism/
   commands:
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- 
GitLab


From aaddc9c82a6df73f0f93912d3aee987859d28a53 Mon Sep 17 00:00:00 2001
From: Shengqi Chen <harry-chen@outlook.com>
Date: Fri, 5 Dec 2025 08:48:59 +0800
Subject: [PATCH 107/258] [CI] fix silent error in nightly wheel index
 generation script, add generation time to HTML index (#30060)

Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
---
 .buildkite/scripts/generate-nightly-index.py | 33 +++++++++++++++-----
 .buildkite/scripts/upload-wheels.sh          |  5 ++-
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
index 4d28ec961..f10cb2f0b 100644
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -9,6 +9,7 @@ import argparse
 import json
 import sys
 from dataclasses import asdict, dataclass
+from datetime import datetime
 from pathlib import Path
 from typing import Any
 from urllib.parse import quote
@@ -20,6 +21,7 @@ if not sys.version_info >= (3, 12):
 
 INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
 <html>
+  <!-- {comment} -->
   <meta name="pypi:repository-version" content="1.0">
   <body>
 {items}
@@ -90,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
     )
 
 
-def generate_project_list(subdir_names: list[str]) -> str:
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
     """
     Generate project list HTML content linking to each project & variant sub-directory.
     """
@@ -98,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
     for name in sorted(subdir_names):
         name = name.strip("/").strip(".")
         href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
-    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
 
 
 def generate_package_index_and_metadata(
-    wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+    wheel_files: list[WheelFileInfo],
+    wheel_base_dir: Path,
+    index_base_dir: Path,
+    comment: str = "",
 ) -> tuple[str, str]:
     """
     Generate package index HTML content for a specific package, linking to actual wheel files.
@@ -120,7 +125,7 @@ def generate_package_index_and_metadata(
         file_meta = asdict(file)
         file_meta["path"] = file_path_quoted
         metadata.append(file_meta)
-    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
     metadata_str = json.dumps(metadata, indent=2)
     return index_str, metadata_str
 
@@ -131,6 +136,7 @@ def generate_index_and_metadata(
     index_base_dir: Path,
     default_variant: str | None = None,
     alias_to_default: str | None = None,
+    comment: str = "",
 ):
     """
     Generate index for all wheel files.
@@ -141,6 +147,7 @@ def generate_index_and_metadata(
         index_base_dir (Path): Base directory to store index files.
         default_variant (str | None): The default variant name, if any.
         alias_to_default (str | None): Alias variant name for the default variant, if any.
+        comment (str | None): Optional comment to include in the generated HTML files.
 
     First, parse all wheel files to extract metadata.
     We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
@@ -234,6 +241,10 @@ def generate_index_and_metadata(
             variant_to_files[alias_to_default] = variant_to_files["default"].copy()
             print(f"Alias variant '{alias_to_default}' created for default variant.")
 
+    # Generate comment in HTML header
+    comment_str = f" ({comment})" if comment else ""
+    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
     # Generate index for each variant
     subdir_names = set()
     for variant, files in variant_to_files.items():
@@ -253,7 +264,7 @@ def generate_index_and_metadata(
             subdir_names = subdir_names.union(packages)
         else:
             # generate project list for this variant directly
-            project_list_str = generate_project_list(sorted(packages))
+            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
             with open(variant_dir / "index.html", "w") as f:
                 f.write(project_list_str)
 
@@ -263,7 +274,7 @@ def generate_index_and_metadata(
             package_dir = variant_dir / package
             package_dir.mkdir(parents=True, exist_ok=True)
             index_str, metadata_str = generate_package_index_and_metadata(
-                package_files, wheel_base_dir, package_dir
+                package_files, wheel_base_dir, package_dir, comment
             )
             with open(package_dir / "index.html", "w") as f:
                 f.write(index_str)
@@ -271,7 +282,7 @@ def generate_index_and_metadata(
                 f.write(metadata_str)
 
     # Generate top-level project list index
-    project_list_str = generate_project_list(sorted(subdir_names))
+    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
     with open(index_base_dir / "index.html", "w") as f:
         f.write(project_list_str)
 
@@ -283,6 +294,7 @@ if __name__ == "__main__":
         --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
         --output-dir <output_directory> : directory to store generated index files
         --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
+        --comment <comment_string> : (optional) comment string to include in generated HTML files
     """
 
     parser = argparse.ArgumentParser(
@@ -312,6 +324,12 @@ if __name__ == "__main__":
         default=None,
         help="Alias variant name for the default variant",
     )
+    parser.add_argument(
+        "--comment",
+        type=str,
+        default="",
+        help="Optional comment string to include in generated HTML files",
+    )
 
     args = parser.parse_args()
 
@@ -366,5 +384,6 @@ if __name__ == "__main__":
         index_base_dir=index_base_dir,
         default_variant=None,
         alias_to_default=args.alias_to_default,
+        comment=args.comment.strip(),
     )
     print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 0ac8fdd45..8e38ace0b 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -81,7 +81,10 @@ else
     alias_arg=""
 fi
 
-$PYTHON pip install regex && .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
+# HACK: we do not need regex module here, but it is required by pre-commit hook
+# To avoid any external dependency, we simply replace it back to the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
 
 # copy indices to /<commit>/ unconditionally
 echo "Uploading indices to $S3_COMMIT_PREFIX"
-- 
GitLab


From befb59e5b102d2ffbdea85a85347eec8e2f7c27f Mon Sep 17 00:00:00 2001
From: Hubert de La Jonquiere <hubert@hcompany.ai>
Date: Fri, 5 Dec 2025 03:38:45 +0100
Subject: [PATCH 108/258] [Model] Add Holo2 reasoning parser (#30048)

Signed-off-by: hdlj-h <hubert@hcompany.ai>
---
 docs/features/reasoning_outputs.md            |   2 +
 .../reasoning/test_holo2_reasoning_parser.py  | 188 ++++++++++++++++++
 vllm/reasoning/__init__.py                    |   4 +
 vllm/reasoning/holo2_reasoning_parser.py      |  83 ++++++++
 4 files changed, 277 insertions(+)
 create mode 100644 tests/reasoning/test_holo2_reasoning_parser.py
 create mode 100644 vllm/reasoning/holo2_reasoning_parser.py

diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index 08a0dd69e..3315c0949 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -18,6 +18,7 @@ vLLM currently supports the following reasoning models:
 | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
 | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
 | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
+| [Holo2 series](https://huggingface.co/collections/Hcompany/holo2) | `holo2` | `json`, `regex` | ✅ |
 | [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
 | [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) | `minimax_m2_append_think` | `json`, `regex` | ✅ |
@@ -28,6 +29,7 @@ vLLM currently supports the following reasoning models:
     IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
     The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
     DeepSeek-V3.1 tool calling is supported in non-thinking mode.
+    Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`.
 
 ## Quickstart
 
diff --git a/tests/reasoning/test_holo2_reasoning_parser.py b/tests/reasoning/test_holo2_reasoning_parser.py
new file mode 100644
index 000000000..438bb2e95
--- /dev/null
+++ b/tests/reasoning/test_holo2_reasoning_parser.py
@@ -0,0 +1,188 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser
+from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
+
+REASONING_MODEL_NAME = "HCompany/Holo2-4B"
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+@pytest.mark.parametrize(
+    "thinking,expected_parser_type",
+    [
+        (True, DeepSeekR1ReasoningParser),
+        (False, IdentityReasoningParser),
+    ],
+)
+def test_parser_selection(tokenizer, thinking, expected_parser_type):
+    parser = Holo2ReasoningParser(
+        tokenizer,
+        chat_template_kwargs={
+            "thinking": thinking,
+        },
+    )
+
+    assert isinstance(parser._parser, expected_parser_type)
+
+
+def test_holo2_default_parser_is_deepseekr1(tokenizer):
+    parser = Holo2ReasoningParser(tokenizer)
+
+    assert isinstance(parser._parser, DeepSeekR1ReasoningParser)
+
+
+def test_holo2_supports_structured_output(tokenizer):
+    # Structured output manager uses the reasoning parser to check if the
+    # reasoning content is ended before applying the grammar. The main function
+    # used is is_reasoning_end. This test checks if the parser is able to
+    # correctly identify the end of the reasoning content.
+
+    # important to not pass chat_template_kwargs here as it is done in the
+    # StructuredOutputManager
+    parser = Holo2ReasoningParser(tokenizer)
+
+    end_token_id = tokenizer.encode("</think>", add_special_tokens=False)[0]
+
+    assert parser.is_reasoning_end([1, 2, 4, end_token_id])
+    assert not parser.is_reasoning_end([1, 2, 4])
+    assert parser.is_reasoning_end([1, 2, 4, end_token_id, 5])
+
+
+# thinking is True, non-streaming
+WITH_THINK = {
+    "output": "This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+}
+# thinking is True, streaming
+WITH_THINK_STREAM = {
+    "output": "This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+}
+# thinking is False, non-streaming
+THINKING_DISABLED = {
+    "output": "This is the rest",
+    "reasoning": None,
+    "content": "This is the rest",
+}
+# thinking is False, streaming
+THINKING_DISABLED_STREAM = {
+    "output": "This is the rest",
+    "reasoning": None,
+    "content": "This is the rest",
+}
+# thinking is False but the model output </think>, non-streaming
+THINKING_DISABLED_WITH_CLOSE_TAG = {
+    "output": "</think>This is the rest",
+    "reasoning": None,
+    "content": "</think>This is the rest",
+}
+# thinking is False but the model output </think>, streaming
+THINKING_DISABLED_WITH_CLOSE_TAG_STREAM = {
+    "output": "some text</think>This is the rest",
+    "reasoning": None,
+    "content": "some text</think>This is the rest",
+}
+COMPLETE_REASONING = {
+    "output": "This is a reasoning section</think>",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        WITH_THINK,
+        None,
+        id="with_think",
+    ),
+    pytest.param(
+        True,
+        WITH_THINK_STREAM,
+        None,
+        id="with_think_stream",
+    ),
+    pytest.param(
+        False,
+        WITH_THINK,
+        {"thinking": True},
+        id="with_think_enabled",
+    ),
+    pytest.param(
+        True,
+        WITH_THINK_STREAM,
+        {"thinking": True},
+        id="with_think_stream_enabled",
+    ),
+    pytest.param(
+        False,
+        THINKING_DISABLED,
+        {"thinking": False},
+        id="thinking_disabled",
+    ),
+    pytest.param(
+        True,
+        THINKING_DISABLED_STREAM,
+        {"thinking": False},
+        id="thinking_disabled_stream",
+    ),
+    pytest.param(
+        False,
+        THINKING_DISABLED_WITH_CLOSE_TAG,
+        {"thinking": False},
+        id="thinking_disabled_with_close_tag",
+    ),
+    pytest.param(
+        True,
+        THINKING_DISABLED_WITH_CLOSE_TAG_STREAM,
+        {"thinking": False},
+        id="thinking_disabled_with_close_tag_stream",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        None,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        None,
+        id="complete_reasoning_stream",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict, chat_template_kwargs", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    chat_template_kwargs: dict | None,
+    tokenizer,
+):
+    output = tokenizer.tokenize(param_dict["output"])
+    output_tokens: list[str] = [
+        tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser("holo2")(
+        tokenizer,
+        chat_template_kwargs=chat_template_kwargs,
+    )
+
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
+
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 36e58dba6..7b918d2e3 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -44,6 +44,10 @@ _REASONING_PARSERS_TO_REGISTER = {
         "granite_reasoning_parser",
         "GraniteReasoningParser",
     ),
+    "holo2": (
+        "holo2_reasoning_parser",
+        "Holo2ReasoningParser",
+    ),
     "hunyuan_a13b": (
         "hunyuan_a13b_reasoning_parser",
         "HunyuanA13BReasoningParser",
diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py
new file mode 100644
index 000000000..76de1c077
--- /dev/null
+++ b/vllm/reasoning/holo2_reasoning_parser.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning import (
+    ReasoningParser,
+)
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
+from vllm.tokenizers import TokenizerLike
+
+logger = init_logger(__name__)
+
+
+class Holo2ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for the Holo2 models which are based on Qwen3.
+
+    The Holo2 model uses <think>...</think> tokens to denote reasoning text but <think>
+    is part of the chat template. This parser extracts the reasoning content until
+    </think> in the model's output.
+
+    The model provides a switch to enable or disable reasoning
+    output via the 'thinking=False' parameter.
+
+    Chat template args:
+    - thinking: Whether to enable reasoning output (default: True)
+
+
+    Parsing rules on model output:
+        - thinking == False
+            -> Model output is treated as purely the content |content|
+        - thinking == True
+            -> Model output is |reasoning_content|</think>|content|
+    """
+
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+        # Deepseek V3 and Holo2 are similar. However, Holo2 models think by default.
+        # this parser without user specified chat template args is initiated once for
+        # all requests in the structured output manager. So it is important that without
+        # user specified chat template args, the default thinking is True.
+
+        enable_thinking = bool(chat_kwargs.get("thinking", True))
+
+        if enable_thinking:
+            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
+        else:
+            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        return self._parser.is_reasoning_end(input_ids)
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        return self._parser.extract_content_ids(input_ids)
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        return self._parser.extract_reasoning(model_output, request)
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        return self._parser.extract_reasoning_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+        )
-- 
GitLab


From 0098a6e3dab74ac1e3e9371638bd9173c1ba83ad Mon Sep 17 00:00:00 2001
From: Qiu <qiuchunshuo@huawei.com>
Date: Fri, 5 Dec 2025 10:40:51 +0800
Subject: [PATCH 109/258] [PCP&DCP] move CUDAGraph check for PCP&DCP to the
 check func of platforms (#29952)

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 vllm/config/vllm.py    | 30 ++++++++----------------------
 vllm/platforms/cuda.py | 17 +++++++++++++++++
 vllm/platforms/rocm.py | 18 ++++++++++++++++++
 3 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 735b0afba..823bd96db 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -671,36 +671,22 @@ class VllmConfig:
 
         if current_platform.support_static_graph_mode():
             # if cudagraph_mode has full cudagraphs, we need to check support
-            if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
-                # decode context parallel does not support full cudagraphs
-                if self.parallel_config.decode_context_parallel_size > 1:
+            if (
+                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+                and self.model_config is not None
+            ):
+                if self.model_config.pooler_config is not None:
                     logger.warning_once(
-                        "Decode context parallel (DCP) is enabled, which is "
-                        "incompatible with full CUDA graphs. "
+                        "Pooling models do not support full cudagraphs. "
                         "Overriding cudagraph_mode to PIECEWISE."
                     )
                     self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-                # prefill context parallel do not support full cudagraphs
-                elif self.parallel_config.prefill_context_parallel_size > 1:
+                elif self.model_config.is_encoder_decoder:
                     logger.warning_once(
-                        "Prefill context parallel (PCP) is enabled, which is "
-                        "incompatible with full CUDA graphs. "
+                        "Encoder-decoder models do not support full cudagraphs. "
                         "Overriding cudagraph_mode to PIECEWISE."
                     )
                     self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-                elif self.model_config is not None:
-                    if self.model_config.pooler_config is not None:
-                        logger.warning_once(
-                            "Pooling models do not support full cudagraphs. "
-                            "Overriding cudagraph_mode to PIECEWISE."
-                        )
-                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-                    elif self.model_config.is_encoder_decoder:
-                        logger.warning_once(
-                            "Encoder-decoder models do not support full cudagraphs. "
-                            "Overriding cudagraph_mode to PIECEWISE."
-                        )
-                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
             # disable cudagraph when enforce eager execution
             if self.model_config is not None and self.model_config.enforce_eager:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 4bf9401b6..1467ca71e 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -233,6 +233,23 @@ class CudaPlatformBase(Platform):
         from vllm.config import CUDAGraphMode
 
         compilation_config = vllm_config.compilation_config
+        if compilation_config.cudagraph_mode.has_full_cudagraphs():
+            # decode context parallel does not support full cudagraphs
+            if parallel_config.decode_context_parallel_size > 1:
+                logger.warning_once(
+                    "Decode context parallel (DCP) is enabled, which is "
+                    "incompatible with full CUDA graphs. "
+                    "Overriding cudagraph_mode to PIECEWISE."
+                )
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            # prefill context parallel do not support full cudagraphs
+            elif parallel_config.prefill_context_parallel_size > 1:
+                logger.warning_once(
+                    "Prefill context parallel (PCP) is enabled, which is "
+                    "incompatible with full CUDA graphs. "
+                    "Overriding cudagraph_mode to PIECEWISE."
+                )
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
         if (
             parallel_config.all2all_backend == "deepep_high_throughput"
             and parallel_config.data_parallel_size > 1
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index ccf3446a3..32c7f8e53 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -381,6 +381,24 @@ class RocmPlatform(Platform):
         parallel_config = vllm_config.parallel_config
         is_eager_execution = compilation_config == CUDAGraphMode.NONE
 
+        if compilation_config.cudagraph_mode.has_full_cudagraphs():
+            # decode context parallel does not support full cudagraphs
+            if parallel_config.decode_context_parallel_size > 1:
+                logger.warning_once(
+                    "Decode context parallel (DCP) is enabled, which is "
+                    "incompatible with full CUDA graphs. "
+                    "Overriding cudagraph_mode to PIECEWISE."
+                )
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            # prefill context parallel do not support full cudagraphs
+            elif parallel_config.prefill_context_parallel_size > 1:
+                logger.warning_once(
+                    "Prefill context parallel (PCP) is enabled, which is "
+                    "incompatible with full CUDA graphs. "
+                    "Overriding cudagraph_mode to PIECEWISE."
+                )
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
         use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
 
         if cache_config and cache_config.block_size is None:
-- 
GitLab


From 7c9b2c8f8132e47fa9b04c0ae9a49872e0172f5f Mon Sep 17 00:00:00 2001
From: Charlie Fu <charlifu@amd.com>
Date: Thu, 4 Dec 2025 21:34:51 -0600
Subject: [PATCH 110/258] [ROCm][CI] Add jiwer dependency for testing (#30081)

Signed-off-by: charlifu <charlifu@amd.com>
---
 requirements/rocm-test.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index 394728b67..a92f14d7a 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -58,6 +58,7 @@ schemathesis==3.39.15
 
 # Evaluation and benchmarking
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+jiwer==4.0.0
 
 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
 multiprocess==0.70.16
-- 
GitLab


From 5867819eaffe3c939c0920c15d5048cb7f9129f8 Mon Sep 17 00:00:00 2001
From: Laith Sakka <lsakka@meta.com>
Date: Thu, 4 Dec 2025 20:10:12 -0800
Subject: [PATCH 111/258] Do not guard during noop elimination pass (#30095)

Signed-off-by: Laith Sakka <lsakka@meta.com>
---
 vllm/compilation/noop_elimination.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py
index 42b8d3daa..06e1771ba 100644
--- a/vllm/compilation/noop_elimination.py
+++ b/vllm/compilation/noop_elimination.py
@@ -5,6 +5,7 @@ from collections.abc import Iterable
 
 import torch.fx
 from torch import SymInt
+from torch.fx.experimental.symbolic_shapes import statically_known_true
 
 from vllm.logger import init_logger
 
@@ -116,12 +117,7 @@ class NoOpEliminationPass(VllmInductorPass):
         2. The dimensions both correspond to the same SymInt
         """
         # Case 1
-        if isinstance(i_dim, int) and isinstance(dim, int):
-            return dim == i_dim
-        # Case 2
-        if isinstance(i_dim, SymInt) and isinstance(dim, SymInt):
-            return dim == i_dim
-        return False
+        return statically_known_true(dim == i_dim)
 
     def all_dims_equivalent(
         self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]
-- 
GitLab


From 2c22c4ca2d88df503818741470a3ffc21f30a4b4 Mon Sep 17 00:00:00 2001
From: Charlie Fu <charlifu@amd.com>
Date: Thu, 4 Dec 2025 22:51:44 -0600
Subject: [PATCH 112/258] [ROCm][CI] Increase the memory threshold for
 test_deep_sleep_fp8_kvcache (#30104)

Signed-off-by: charlifu <charlifu@amd.com>
---
 tests/basic_correctness/test_cumem.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index dc9c69bf5..3bd0b6609 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -260,13 +260,18 @@ def test_deep_sleep_fp8_kvcache():
     llm.sleep(level=2)
 
     used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
-    assert used_bytes < 3 * GiB_bytes
+
+    # Rocm uses more memory for CudaGraphs, so we add 2 GiB more for the threshold
+    rocm_extra_mem_bytes = 2 * GiB_bytes if current_platform.is_rocm() else 0
+    mem_threshold_after_sleep = 3 * GiB_bytes + rocm_extra_mem_bytes
+    assert used_bytes < mem_threshold_after_sleep
 
     llm.wake_up(tags=["weights"])
     llm.collective_rpc("reload_weights")
 
     used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
-    assert used_bytes < 4 * GiB_bytes
+    mem_threshold_after_wake_up = 4 * GiB_bytes + rocm_extra_mem_bytes
+    assert used_bytes < mem_threshold_after_wake_up
 
     # now allocate kv cache and cuda graph memory
     llm.wake_up(tags=["kv_cache"])
-- 
GitLab


From d698bb382db95af6b8836936eb0dfae71c791d11 Mon Sep 17 00:00:00 2001
From: Jingchun Gao <63247409+gjc0824@users.noreply.github.com>
Date: Fri, 5 Dec 2025 13:54:31 +0800
Subject: [PATCH 113/258] [Bugfix] Correct num_q_heads on DCP for Flashinfer
 backends  (#29487)

Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
Signed-off-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com>
Co-authored-by: Jingchun Gao <gaojingchun1@huawei.com>
---
 vllm/v1/attention/backends/flashinfer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 69a6a5e5f..3d9640a2d 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -482,9 +482,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.dcp_rank = 0
             self.dcp_kv_cache_interleave_size = 1
 
-        self.num_qo_heads = (
-            self.model_config.get_num_attention_heads(self.vllm_config.parallel_config)
-            * self.dcp_world_size
+        self.num_qo_heads = self.model_config.get_num_attention_heads(
+            self.vllm_config.parallel_config
         )
 
         self.num_kv_heads = self.kv_cache_spec.num_kv_heads
-- 
GitLab


From 6e865b6a83565c5d661091ec0886403edb171794 Mon Sep 17 00:00:00 2001
From: Chukwuma Nwaugha <20521315+nwaughachukwuma@users.noreply.github.com>
Date: Fri, 5 Dec 2025 06:44:32 +0000
Subject: [PATCH 114/258] Refactor example prompts fixture (#29854)

Signed-off-by: nwaughac@gmail.com
---
 tests/conftest.py | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index b20c9efef..204452b58 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -27,7 +27,7 @@ import threading
 from collections.abc import Generator
 from contextlib import nullcontext
 from enum import Enum
-from typing import Any, Callable, TypedDict, TypeVar, cast
+from typing import Any, Callable, TypedDict, TypeVar, cast, TYPE_CHECKING
 
 import numpy as np
 import pytest
@@ -67,6 +67,11 @@ from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_num_threads
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+    from transformers.generation.utils import GenerateOutput
+
+
 logger = init_logger(__name__)
 
 _TEST_DIR = os.path.dirname(__file__)
@@ -202,10 +207,7 @@ def dynamo_reset():
 
 @pytest.fixture
 def example_prompts() -> list[str]:
-    prompts = []
-    for filename in _TEST_PROMPTS:
-        prompts += _read_prompts(filename)
-    return prompts
+    return [prompt for filename in _TEST_PROMPTS for prompt in _read_prompts(filename)]
 
 
 @pytest.fixture
@@ -224,10 +226,7 @@ class DecoderPromptType(Enum):
 
 @pytest.fixture
 def example_long_prompts() -> list[str]:
-    prompts = []
-    for filename in _LONG_PROMPTS:
-        prompts += _read_prompts(filename)
-    return prompts
+    return [prompt for filename in _LONG_PROMPTS for prompt in _read_prompts(filename)]
 
 
 @pytest.fixture(scope="session")
@@ -353,10 +352,13 @@ class HfRunner:
                 trust_remote_code=trust_remote_code,
             )
         else:
-            model = auto_cls.from_pretrained(
-                model_name,
-                trust_remote_code=trust_remote_code,
-                **model_kwargs,
+            model = cast(
+                nn.Module,
+                auto_cls.from_pretrained(
+                    model_name,
+                    trust_remote_code=trust_remote_code,
+                    **model_kwargs,
+                ),
             )
 
             # in case some unquantized custom models are not in same dtype
@@ -374,10 +376,12 @@ class HfRunner:
             self.model = model
 
         if not skip_tokenizer_init:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_name,
-                dtype=dtype,
-                trust_remote_code=trust_remote_code,
+            self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = (
+                AutoTokenizer.from_pretrained(
+                    model_name,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                )
             )
 
         # don't put this import at the top level
@@ -495,7 +499,7 @@ class HfRunner:
 
         outputs: list[tuple[list[list[int]], list[str]]] = []
         for inputs in all_inputs:
-            output_ids = self.model.generate(
+            output_ids: torch.Tensor = self.model.generate(
                 **self.wrap_device(inputs),
                 use_cache=True,
                 **kwargs,
@@ -505,8 +509,7 @@ class HfRunner:
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=False,
             )
-            output_ids = output_ids.cpu().tolist()
-            outputs.append((output_ids, output_str))
+            outputs.append((output_ids.cpu().tolist(), output_str))
         return outputs
 
     def generate_greedy(
@@ -574,7 +577,7 @@ class HfRunner:
 
         all_logprobs: list[list[torch.Tensor]] = []
         for inputs in all_inputs:
-            output = self.model.generate(
+            output: "GenerateOutput" = self.model.generate(
                 **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
@@ -656,7 +659,7 @@ class HfRunner:
         all_output_strs: list[str] = []
 
         for inputs in all_inputs:
-            output = self.model.generate(
+            output: "GenerateOutput" = self.model.generate(
                 **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
-- 
GitLab


From 06579f9a82daa4be451a59c0bbc2d28f8f653b1c Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Fri, 5 Dec 2025 00:48:23 -0600
Subject: [PATCH 115/258] [AMD][CI] Add ray[default] Dependency On ROCm To Pass
 v1/metrics/test_engine_logger_apis.py (#30110)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 requirements/rocm-test.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index a92f14d7a..9d3d711c3 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -63,6 +63,9 @@ jiwer==4.0.0
 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
 multiprocess==0.70.16
 
+# Required for v1/metrics/test_engine_logger_apis.py
+ray[cgraph,default]>=2.48.0
+
 # Plugins test
 terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
 torchgeo==0.7.0
-- 
GitLab


From 60a66ea2dc887d50a88f20a1ecffaee2ada8f9dc Mon Sep 17 00:00:00 2001
From: Tiger Xu / Zhonghu Xu <xuzhonghu@huawei.com>
Date: Fri, 5 Dec 2025 16:11:03 +0800
Subject: [PATCH 116/258] [DOC]: Add kthena to integrations (#29931)

Signed-off-by: Zhonghu Xu <xuzhonghu@huawei.com>
---
 docs/deployment/integrations/kthena.md | 333 +++++++++++++++++++++++++
 docs/deployment/k8s.md                 |   1 +
 2 files changed, 334 insertions(+)
 create mode 100644 docs/deployment/integrations/kthena.md

diff --git a/docs/deployment/integrations/kthena.md b/docs/deployment/integrations/kthena.md
new file mode 100644
index 000000000..483dd7474
--- /dev/null
+++ b/docs/deployment/integrations/kthena.md
@@ -0,0 +1,333 @@
+# Kthena
+
+[**Kthena**](https://github.com/volcano-sh/kthena) is a Kubernetes-native LLM inference platform that transforms how organizations deploy and manage Large Language Models in production. Built with declarative model lifecycle management and intelligent request routing, it provides high performance and enterprise-grade scalability for LLM inference workloads.
+
+This guide shows how to deploy a production-grade, **multi-node vLLM** service on Kubernetes.
+
+We’ll:
+
+- Install the required components (Kthena + Volcano).
+- Deploy a multi-node vLLM model via Kthena’s `ModelServing` CR.
+- Validate the deployment.
+
+---
+
+## 1. Prerequisites
+
+You need:
+
+- A Kubernetes cluster with **GPU nodes**.
+- `kubectl` access with cluster-admin or equivalent permissions.
+- **Volcano** installed for gang scheduling.
+- **Kthena** installed with the `ModelServing` CRD available.
+- A valid **Hugging Face token** if loading models from Hugging Face Hub.
+
+### 1.1 Install Volcano
+
+```bash
+helm repo add volcano-sh https://volcano-sh.github.io/helm-charts
+helm repo update
+helm install volcano volcano-sh/volcano -n volcano-system --create-namespace
+```
+
+This provides the gang-scheduling and network topology features used by Kthena.
+
+### 1.2 Install Kthena
+
+```bash
+helm install kthena oci://ghcr.io/volcano-sh/charts/kthena --version v0.1.0 --namespace kthena-system --create-namespace
+```
+
+- The `kthena-system` namespace is created.
+- Kthena controllers and CRDs, including `ModelServing`, are installed and healthy.
+
+Validate:
+
+```bash
+kubectl get crd | grep modelserving
+```
+
+You should see:
+
+```text
+modelservings.workload.serving.volcano.sh   ...
+```
+
+---
+
+## 2. The Multi-Node vLLM `ModelServing` Example
+
+Kthena provides an example manifest to deploy a **multi-node vLLM cluster running Llama**. Conceptually this is equivalent to the vLLM production stack Helm deployment, but expressed with `ModelServing`.
+
+A simplified version of the example (`llama-multinode`) looks like:
+
+- `spec.replicas: 1` – one `ServingGroup` (one logical model deployment).
+- `roles`:
+    - `entryTemplate` – defines **leader** pods that run:
+        - vLLM’s **multi-node cluster bootstrap script** (Ray cluster).
+        - vLLM **OpenAI-compatible API server**.
+    - `workerTemplate` – defines **worker** pods that join the leader’s Ray cluster.
+
+Key points from the example YAML:
+
+- **Image**: `vllm/vllm-openai:latest` (matches upstream vLLM images).
+- **Command** (leader):
+
+  ```yaml
+  command:
+    - sh
+    - -c
+    - >
+      bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
+      python3 -m vllm.entrypoints.openai.api_server
+        --port 8080
+        --model meta-llama/Llama-3.1-405B-Instruct
+        --tensor-parallel-size 8
+        --pipeline-parallel-size 2
+  ```
+
+- **Command** (worker):
+
+  ```yaml
+  command:
+    - sh
+    - -c
+    - >
+      bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
+  ```
+
+---
+
+## 3. Deploying Multi-Node llama vLLM via Kthena
+
+### 3.1 Prepare the Manifest
+
+**Recommended**: use a Secret instead of a raw env var:
+
+```bash
+kubectl create secret generic hf-token \
+  -n default \
+  --from-literal=HUGGING_FACE_HUB_TOKEN='<your-token>'
+```
+
+### 3.2 Apply the `ModelServing`
+
+```bash
+cat  <<EOF | kubectl apply -f -
+apiVersion: workload.serving.volcano.sh/v1alpha1
+kind: ModelServing
+metadata:
+  name: llama-multinode
+  namespace: default
+spec:
+  schedulerName: volcano
+  replicas: 1  # group replicas
+  template:
+    restartGracePeriodSeconds: 60
+    gangPolicy:
+      minRoleReplicas:
+        405b: 1
+    roles:
+      - name: 405b
+        replicas: 2
+        entryTemplate:
+          spec:
+            containers:
+              - name: leader
+                image: vllm/vllm-openai:latest
+                env:
+                  - name: HUGGING_FACE_HUB_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token
+                        key: HUGGING_FACE_HUB_TOKEN
+                command:
+                  - sh
+                  - -c
+                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2; 
+                    python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
+                resources:
+                  limits:
+                    nvidia.com/gpu: "8"
+                    memory: 1124Gi
+                    ephemeral-storage: 800Gi
+                  requests:
+                    ephemeral-storage: 800Gi
+                    cpu: 125
+                ports:
+                  - containerPort: 8080
+                readinessProbe:
+                  tcpSocket:
+                    port: 8080
+                  initialDelaySeconds: 15
+                  periodSeconds: 10
+                volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm
+            volumes:
+            - name: dshm
+              emptyDir:
+                medium: Memory
+                sizeLimit: 15Gi
+        workerReplicas: 1
+        workerTemplate:
+          spec:
+            containers:
+              - name: worker
+                image: vllm/vllm-openai:latest
+                command:
+                  - sh
+                  - -c
+                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
+                resources:
+                  limits:
+                    nvidia.com/gpu: "8"
+                    memory: 1124Gi
+                    ephemeral-storage: 800Gi
+                  requests:
+                    ephemeral-storage: 800Gi
+                    cpu: 125
+                env:
+                  - name: HUGGING_FACE_HUB_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token
+                        key: HUGGING_FACE_HUB_TOKEN
+                volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm   
+            volumes:
+            - name: dshm
+              emptyDir:
+                medium: Memory
+                sizeLimit: 15Gi
+EOF
+```
+
+Kthena will:
+
+- Create a `ModelServing` object.
+- Derive a `PodGroup` for Volcano gang scheduling.
+- Create the leader and worker pods for each `ServingGroup` and `Role`.
+
+---
+
+## 4. Verifying the Deployment
+
+### 4.1 Check ModelServing Status
+
+Use the snippet from the Kthena docs:
+
+```bash
+kubectl get modelserving -oyaml | grep status -A 10
+```
+
+You should see something like:
+
+```yaml
+status:
+  availableReplicas: 1
+  conditions:
+    - type: Available
+      status: "True"
+      reason: AllGroupsReady
+      message: All Serving groups are ready
+    - type: Progressing
+      status: "False"
+      ...
+  replicas: 1
+  updatedReplicas: 1
+```
+
+### 4.2 Check Pods
+
+List pods for your deployment:
+
+```bash
+kubectl get pod -owide -l modelserving.volcano.sh/name=llama-multinode
+```
+
+Example output (from docs):
+
+```text
+NAMESPACE   NAME                          READY   STATUS    RESTARTS   AGE   IP            NODE           ...
+default     llama-multinode-0-405b-0-0    1/1     Running   0          15m   10.244.0.56   192.168.5.12   ...
+default     llama-multinode-0-405b-0-1    1/1     Running   0          15m   10.244.0.58   192.168.5.43   ...
+default     llama-multinode-0-405b-1-0    1/1     Running   0          15m   10.244.0.57   192.168.5.58   ...
+default     llama-multinode-0-405b-1-1    1/1     Running   0          15m   10.244.0.53   192.168.5.36   ...
+```
+
+Pod name pattern:
+
+- `llama-multinode-<group-idx>-<role-name>-<replica-idx>-<ordinal>`.
+
+The first number indicates `ServingGroup`. The second (`405b`) is the `Role`. The remaining indices identify the pod within the role.
+
+---
+
+## 6. Accessing the vLLM OpenAI-Compatible API
+
+Expose the entry via a Service:
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-multinode-openai
+  namespace: default
+spec:
+  selector:
+    modelserving.volcano.sh/name: llama-multinode
+    modelserving.volcano.sh/entry: "true"
+    # optionally further narrow to leader role if you label it
+  ports:
+    - name: http
+      port: 80
+      targetPort: 8080
+  type: ClusterIP
+```
+
+Port-forward from your local machine:
+
+```bash
+kubectl port-forward svc/llama-multinode-openai 30080:80 -n default
+```
+
+Then:
+
+- List models:
+
+  ```bash
+  curl -s http://localhost:30080/v1/models
+  ```
+
+- Send a completion request (mirroring vLLM production stack docs):
+
+  ```bash
+  curl -X POST http://localhost:30080/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "meta-llama/Llama-3.1-405B-Instruct",
+      "prompt": "Once upon a time,",
+      "max_tokens": 10
+    }'
+  ```
+
+You should see an OpenAI-style response from vLLM.
+
+---
+
+## 7. Clean Up
+
+To remove the deployment and its resources:
+
+```bash
+kubectl delete modelserving llama-multinode -n default
+```
+
+If you’re done with the entire stack:
+
+```bash
+helm uninstall kthena -n kthena-system   # or your Kthena release name
+helm uninstall volcano -n volcano-system
+```
diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md
index abffb7bc5..05814cbad 100644
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@@ -14,6 +14,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 - [InftyAI/llmaz](integrations/llmaz.md)
 - [KAITO](integrations/kaito.md)
 - [KServe](integrations/kserve.md)
+- [Kthena](integrations/kthena.md)
 - [KubeRay](integrations/kuberay.md)
 - [kubernetes-sigs/lws](frameworks/lws.md)
 - [meta-llama/llama-stack](integrations/llamastack.md)
-- 
GitLab


From 6038b1b04b7ee1b716a75591040fd3ecaa596a3d Mon Sep 17 00:00:00 2001
From: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
Date: Fri, 5 Dec 2025 10:34:33 +0200
Subject: [PATCH 117/258] [Frontend][Model] Add 'float16' to possible mamba
 cache dtype values, override mamba SSM cache dtype value for NemotronH
 (#29978)

Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
---
 vllm/config/cache.py                 |  2 +-
 vllm/model_executor/models/config.py | 21 +++++++++++++++++++++
 vllm/utils/torch_utils.py            |  1 +
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 91f083a55..067799a44 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -29,7 +29,7 @@ CacheDType = Literal[
     "fp8_inc",
     "fp8_ds_mla",
 ]
-MambaDType = Literal["auto", "float32"]
+MambaDType = Literal["auto", "float32", "float16"]
 PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
 KVOffloadingBackend = Literal["native", "lmcache"]
 
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 4bca36aa4..fbeb28a1c 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -485,6 +485,26 @@ class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
             logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
 
 
+class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
+        (or not explicitly set), to the value specified in the HF config, or to
+        float16 if not specified.
+        """
+        cache_config = vllm_config.cache_config
+        if cache_config.mamba_ssm_cache_dtype == "auto":
+            hf_config = vllm_config.model_config.hf_config
+            mamba_ssm_cache_dtype = getattr(
+                hf_config, "mamba_ssm_cache_dtype", "float16"
+            )
+            logger.info(
+                "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
+                mamba_ssm_cache_dtype,
+            )
+            cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
+
+
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,
     "GteNewModel": GteNewModelConfig,
@@ -502,4 +522,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "Mamba2ForCausalLM": MambaModelConfig,
     "FalconMambaForCausalLM": MambaModelConfig,
     "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
+    "NemotronHForCausalLM": NemotronHForCausalLMConfig,
 }
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index f5c49ac16..c97efce31 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -28,6 +28,7 @@ else:
 STR_DTYPE_TO_TORCH_DTYPE = {
     "float32": torch.float32,
     "half": torch.half,
+    "float16": torch.float16,
     "bfloat16": torch.bfloat16,
     "float": torch.float,
     "fp8": torch.uint8,
-- 
GitLab


From feecba09afcd3c9a37c6b03b868dedf4cb851eb1 Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Fri, 5 Dec 2025 02:42:25 -0600
Subject: [PATCH 118/258] [CI/Build][AMD] Use float16 in
 test_reset_prefix_cache_e2e to avoid accuracy issues (#29997)

Signed-off-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
---
 tests/v1/core/test_reset_prefix_cache_e2e.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/v1/core/test_reset_prefix_cache_e2e.py b/tests/v1/core/test_reset_prefix_cache_e2e.py
index 083fc3f34..b80789945 100644
--- a/tests/v1/core/test_reset_prefix_cache_e2e.py
+++ b/tests/v1/core/test_reset_prefix_cache_e2e.py
@@ -21,6 +21,7 @@ def test_reset_prefix_cache_e2e(monkeypatch):
         max_num_batched_tokens=32,
         max_model_len=2048,
         compilation_config={"mode": 0},
+        dtype="float16",
     )
     engine = LLMEngine.from_engine_args(engine_args)
     sampling_params = SamplingParams(
-- 
GitLab


From 62b3333448c9401f8e14feaab3b30192323f3a33 Mon Sep 17 00:00:00 2001
From: Yanan Cao <gmagogsfm@users.noreply.github.com>
Date: Fri, 5 Dec 2025 00:47:22 -0800
Subject: [PATCH 119/258] [Frontend] Remove deprecated -O.xx flag (#29991)

Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
---
 docs/design/debug_vllm_compile.md   |  2 +-
 tests/utils_/test_argparse_utils.py | 27 ++++++++++++---------------
 vllm/utils/argparse_utils.py        | 21 ++++++++-------------
 3 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md
index e565f17da..731e542a0 100644
--- a/docs/design/debug_vllm_compile.md
+++ b/docs/design/debug_vllm_compile.md
@@ -86,7 +86,7 @@ LLM(model, enforce_eager=True)
 ```
 
 To turn off just torch.compile, pass `mode = NONE` to the compilation config.
-(`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated):
+(`-cc` is short for `--compilation_config`):
 
 ```sh
 # Online
diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py
index 2d969b8c9..6f24c77e0 100644
--- a/tests/utils_/test_argparse_utils.py
+++ b/tests/utils_/test_argparse_utils.py
@@ -460,23 +460,20 @@ def test_flat_product():
     ]
 
 
-def test_o_legacy_syntax_deprecation(caplog_vllm):
-    """Test that -O.* dotted syntax emits warnings and converts correctly to -cc syntax."""
+def test_o_dotted_syntax_error():
+    """Test that -O.* dotted syntax raises a clear error message."""
     parser = FlexibleArgumentParser()
     parser.add_argument("-cc", "--compilation-config", type=json.loads)
 
-    # Test that -O.backend gets converted correctly AND emits warning
-    args = parser.parse_args(["-O.backend=eager"])
-    assert args.compilation_config == {"backend": "eager"}
+    # Test that -O.* syntax raises a clear ValueError
+    with pytest.raises(ValueError, match=r"The -O\.\* syntax is no longer supported"):
+        parser.parse_args(["-O.backend=eager"])
 
-    # Check that deprecation warning was logged
-    assert len(caplog_vllm.records) >= 1
-    assert (
-        "The -O.* dotted syntax for --compilation-config is deprecated"
-        in caplog_vllm.text
-    )
+    with pytest.raises(ValueError, match=r"Please use -cc\.\* instead"):
+        parser.parse_args(["-O.mode=2"])
 
-    # Test that -O.mode gets converted correctly
-    # Note: warning_once won't emit again in same session
-    args = parser.parse_args(["-O.mode=2"])
-    assert args.compilation_config == {"mode": 2}
+    with pytest.raises(
+        ValueError,
+        match=r"replace '-O\.cudagraph_mode=NONE' with '-cc\.cudagraph_mode=NONE'",
+    ):
+        parser.parse_args(["-O.cudagraph_mode=NONE"])
diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py
index 555fcfea4..356f383cc 100644
--- a/vllm/utils/argparse_utils.py
+++ b/vllm/utils/argparse_utils.py
@@ -244,9 +244,15 @@ class FlexibleArgumentParser(ArgumentParser):
                 else:
                     key = pattern.sub(repl, arg, count=1)
                     processed_args.append(key)
-            elif arg.startswith("-O") and arg != "-O" and arg[2] != ".":
+            elif arg.startswith("-O."):
+                # Provide clear error for deprecated -O.* syntax
+                raise ValueError(
+                    f"The -O.* syntax is no longer supported. "
+                    f"Please use -cc.* instead. "
+                    f"For example, replace '{arg}' with '{arg.replace('-O', '-cc', 1)}'"
+                )
+            elif arg.startswith("-O") and arg != "-O":
                 # allow -O flag to be used without space, e.g. -O3 or -Odecode
-                # -O.<...> handled later
                 # also handle -O=<optimization_level> here
                 optimization_level = arg[3:] if arg[2] == "=" else arg[2:]
                 processed_args += ["--optimization-level", optimization_level]
@@ -257,17 +263,6 @@ class FlexibleArgumentParser(ArgumentParser):
             ):
                 # Convert -O <n> to --optimization-level <n>
                 processed_args.append("--optimization-level")
-            elif arg.startswith("-O."):
-                # Handle -O.* dotted syntax - ALL dotted syntax is deprecated
-                logger.warning_once(
-                    "The -O.* dotted syntax for --compilation-config is "
-                    "deprecated and will be removed in v0.13.0 or v1.0.0"
-                    ", whichever is earlier.  Please use -cc.* instead. "
-                    "Example: -cc.backend=eager instead of "
-                    "-O.backend=eager."
-                )
-                converted_arg = arg.replace("-O", "-cc", 1)
-                processed_args.append(converted_arg)
             else:
                 processed_args.append(arg)
 
-- 
GitLab


From 65ee97288a5b3b622de8d7a460fa965514ff543a Mon Sep 17 00:00:00 2001
From: Alec S <10566873+alecsolder@users.noreply.github.com>
Date: Fri, 5 Dec 2025 03:49:37 -0500
Subject: [PATCH 120/258] [BugFix] Adding env variable to disable async grammar
 compilation (#29996)

Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .../test_backend_guidance.py                  | 74 +++++++++++++++++++
 vllm/v1/structured_output/__init__.py         | 17 ++++-
 2 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py
index 771076186..4c01560fc 100644
--- a/tests/v1/structured_output/test_backend_guidance.py
+++ b/tests/v1/structured_output/test_backend_guidance.py
@@ -1,9 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
+from concurrent.futures import Future
+
+import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import StructuredOutputsConfig, VllmConfig
 from vllm.config.model import ModelConfig
+from vllm.config.parallel import ParallelConfig
 from vllm.config.speculative import SpeculativeConfig
 from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.v1.request import Request
@@ -116,3 +121,72 @@ def test_grammar_bitmask_with_specdec():
         )  # EOS not the final token
         grammar_bitmask(request, prompt[i:])  # EOS not present
         grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
+
+
+@pytest.mark.parametrize("async_grammar", [True, False])
+def test_grammar_init_async_and_sync(async_grammar):
+    """Test grammar initialization works correctly in both async and sync modes.
+
+    This test validates that the distributed_executor_backend config option
+    correctly controls whether grammar compilation happens asynchronously
+    (via executor.submit) or synchronously. When set to "external_launcher",
+    grammar compilation is synchronous to avoid deadlocks.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
+    prompt = tokenizer.encode('{"a": "b"}')
+
+    # Use "external_launcher" for sync mode, None for async mode
+    executor_backend = None if async_grammar else "external_launcher"
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(tokenizer=TOKENIZER),
+        structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
+        parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
+    )
+    structured_output_manager = StructuredOutputManager(vllm_config)
+
+    sampling_params = SamplingParams(
+        structured_outputs=StructuredOutputsParams(
+            json='{"type": "object"}',
+        ),
+    )
+    sampling_params.structured_outputs._backend = "guidance"
+
+    request = Request(
+        "test_request",
+        prompt_token_ids=prompt,
+        sampling_params=sampling_params,
+        pooling_params=None,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+
+    structured_output_manager.grammar_init(request)
+
+    # Check the internal _grammar type immediately after init
+    # Before _check_grammar_completion is called, async mode should have a Future
+    raw_grammar = request.structured_output_request._grammar
+    if async_grammar:
+        assert isinstance(raw_grammar, Future), (
+            "Async mode should store a Future before completion"
+        )
+    else:
+        assert not isinstance(raw_grammar, Future), (
+            "Sync mode should store the grammar directly, not a Future"
+        )
+
+    # Wait for grammar to be ready (handles both async and sync cases)
+    start_time = time.time()
+    while not request.structured_output_request._check_grammar_completion():
+        if time.time() - start_time > 5:  # 5-second timeout
+            pytest.fail("Grammar compilation timed out")
+        time.sleep(0.01)
+
+    # After completion, _grammar should no longer be a Future
+    assert not isinstance(request.structured_output_request._grammar, Future)
+
+    # Verify grammar is properly initialized and functional
+    grammar = request.structured_output_request.grammar
+    assert grammar is not None
+    assert not grammar.is_terminated()
+
+    # Verify the grammar can accept valid tokens
+    assert grammar.accept_tokens(request.request_id, prompt)
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index d087d28b1..5ee88178c 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -40,6 +40,16 @@ class StructuredOutputManager:
         self.reasoner: ReasoningParser | None = None
         self.vllm_config = vllm_config
 
+        # When in external_launcher mode, async grammar compilation causes deadlocks
+        # due to external_launcher mode having a scheduler for each TP rank.
+        # Async grammar compilation causes the WAITING_FOR_FSM → WAITING transition to
+        # happen at different times on different TP ranks,
+        # breaking the determinism assumption that external_launcher relies on.
+        self._use_async_grammar_compilation = (
+            vllm_config.parallel_config.distributed_executor_backend
+            != "external_launcher"
+        )
+
         self._grammar_bitmask: torch.Tensor | None = None
         self._full_mask = torch.tensor(-1, dtype=torch.int32)
 
@@ -138,10 +148,13 @@ class StructuredOutputManager:
             else:
                 raise ValueError(f"Unsupported structured output backend: {backend}")
 
-        grammar = self.executor.submit(self._async_create_grammar, request)
+        if self._use_async_grammar_compilation:
+            grammar = self.executor.submit(self._create_grammar, request)
+        else:
+            grammar = self._create_grammar(request)  # type: ignore[assignment]
         request.structured_output_request.grammar = grammar  # type: ignore[assignment]
 
-    def _async_create_grammar(
+    def _create_grammar(
         self,
         request: Request,
     ) -> StructuredOutputGrammar:
-- 
GitLab


From f16356fe361df9ccef3a9b46d6a43d43a854e2e0 Mon Sep 17 00:00:00 2001
From: Ming Yang <minos.future@gmail.com>
Date: Fri, 5 Dec 2025 02:26:52 -0800
Subject: [PATCH 121/258] [bench] Support common prefix len config (for
 decode-only bench) (#29934)

Signed-off-by: Ming Yang <minos.future@gmail.com>
---
 vllm/benchmarks/datasets.py | 1 +
 vllm/benchmarks/serve.py    | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index ec9b0fd6e..638ece260 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1842,6 +1842,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 random_seed=args.seed,
                 dataset_path=args.dataset_path,
                 disable_shuffle=args.disable_shuffle,
+                prefix_len=args.common_prefix_len,
             ).sample(
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 2933f5d01..890cd7e08 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1221,6 +1221,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Repetition penalty sampling parameter. Only has effect on "
         "openai-compatible backends.",
     )
+    sampling_group.add_argument(
+        "--common-prefix-len",
+        type=int,
+        default=None,
+        help="Common prefix length shared by all prompts (used by random dataset)",
+    )
 
     parser.add_argument(
         "--tokenizer-mode",
-- 
GitLab


From 7ae13c66ba63a1e999d9a8939856bea3e6e152a0 Mon Sep 17 00:00:00 2001
From: Ning Xie <andy.xning@gmail.com>
Date: Fri, 5 Dec 2025 18:46:08 +0800
Subject: [PATCH 122/258] [typing] fix type (#29964)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
---
 vllm/reasoning/abs_reasoning_parsers.py   | 2 +-
 vllm/reasoning/gptoss_reasoning_parser.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 4a04292be..5c6ac7dad 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -121,7 +121,7 @@ class ReasoningParser:
         self,
         original_tag: str | None,
         tool_server: ToolServer | None,
-    ) -> str:
+    ) -> str | None:
         """
         Instance method that is implemented for preparing the structured tag
         Otherwise, None is returned
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index 0c1b54d0b..fa45b1285 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -145,7 +145,7 @@ class GptOssReasoningParser(ReasoningParser):
     # This function prepares the structural tag to format reasoning output
     def prepare_structured_tag(
         self, original_tag: str | None, tool_server: ToolServer | None
-    ) -> str:
+    ) -> str | None:
         if original_tag is None:
             if tool_server is None:
                 return json.dumps(no_func_reaonsing_tag)
-- 
GitLab


From b73b158ab0715d860e5a86218e0e9605fc2c1fe0 Mon Sep 17 00:00:00 2001
From: strinczer <strinczer@icloud.com>
Date: Fri, 5 Dec 2025 10:51:12 +0000
Subject: [PATCH 123/258] [Bugfix] Fix parse_output_message crash on commentary
 with no recipient (#29972)

Signed-off-by: Shai Trinczer <strinczer@icloud.com>
Signed-off-by: strinczer <strinczer@icloud.com>
---
 tests/entrypoints/test_harmony_utils.py | 189 +++++++++++++++++++++++-
 vllm/entrypoints/harmony_utils.py       |  10 +-
 2 files changed, 194 insertions(+), 5 deletions(-)

diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/test_harmony_utils.py
index 6fa051a67..82ff562d5 100644
--- a/tests/entrypoints/test_harmony_utils.py
+++ b/tests/entrypoints/test_harmony_utils.py
@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from openai_harmony import Role
+from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem
+from openai_harmony import Author, Message, Role, TextContent
 
 from vllm.entrypoints.harmony_utils import (
     has_custom_tools,
     parse_input_to_harmony_message,
+    parse_output_message,
 )
 
 
@@ -257,6 +259,191 @@ class TestParseInputToHarmonyMessage:
         assert messages[0].content[1].text == "actual text"
 
 
+class TestParseOutputMessage:
+    """Tests for parse_output_message function."""
+
+    def test_commentary_with_no_recipient_creates_reasoning(self):
+        """Test that commentary with recipient=None (preambles) creates reasoning items.
+
+        Per Harmony format, commentary channel can contain preambles to calling
+        multiple functions - explanatory text with no recipient.
+        """
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "I will now search for the weather information."
+        )
+        message = message.with_channel("commentary")
+        # recipient is None by default, representing a preamble
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert (
+            output_items[0].content[0].text
+            == "I will now search for the weather information."
+        )
+        assert output_items[0].content[0].type == "reasoning_text"
+
+    def test_commentary_with_function_recipient_creates_function_call(self):
+        """Test commentary with recipient='functions.X' creates function calls."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}'
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("functions.get_weather")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseFunctionToolCall)
+        assert output_items[0].type == "function_call"
+        assert output_items[0].name == "get_weather"
+        assert (
+            output_items[0].arguments
+            == '{"location": "San Francisco", "units": "celsius"}'
+        )
+        assert output_items[0].call_id.startswith("call_")
+        assert output_items[0].id.startswith("fc_")
+
+    def test_commentary_with_python_recipient_creates_reasoning(self):
+        """Test that commentary with recipient='python' creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))"
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("python")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert (
+            output_items[0].content[0].text
+            == "import numpy as np\nprint(np.array([1, 2, 3]))"
+        )
+
+    def test_commentary_with_browser_recipient_creates_reasoning(self):
+        """Test that commentary with recipient='browser' creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "Navigating to the specified URL"
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("browser")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert output_items[0].content[0].text == "Navigating to the specified URL"
+
+    def test_commentary_with_container_recipient_creates_reasoning(self):
+        """Test that commentary with recipient='container' creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "Running command in container"
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("container")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert output_items[0].content[0].text == "Running command in container"
+
+    def test_commentary_with_empty_content_and_no_recipient(self):
+        """Test edge case: empty commentary with recipient=None."""
+        message = Message.from_role_and_content(Role.ASSISTANT, "")
+        message = message.with_channel("commentary")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].content[0].text == ""
+
+    def test_commentary_with_multiple_contents_and_no_recipient(self):
+        """Test multiple content items in commentary with no recipient."""
+        contents = [
+            TextContent(text="Step 1: Analyze the request"),
+            TextContent(text="Step 2: Prepare to call functions"),
+        ]
+        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
+        message = message.with_channel("commentary")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 2
+        assert all(isinstance(item, ResponseReasoningItem) for item in output_items)
+        assert output_items[0].content[0].text == "Step 1: Analyze the request"
+        assert output_items[1].content[0].text == "Step 2: Prepare to call functions"
+
+    def test_commentary_with_multiple_function_calls(self):
+        """Test multiple function calls in commentary channel."""
+        contents = [
+            TextContent(text='{"location": "San Francisco"}'),
+            TextContent(text='{"location": "New York"}'),
+        ]
+        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
+        message = message.with_channel("commentary")
+        message = message.with_recipient("functions.get_weather")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 2
+        assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items)
+        assert output_items[0].name == "get_weather"
+        assert output_items[1].name == "get_weather"
+        assert output_items[0].arguments == '{"location": "San Francisco"}'
+        assert output_items[1].arguments == '{"location": "New York"}'
+
+    def test_commentary_with_unknown_recipient_raises_error(self):
+        """Test that commentary with unknown recipient raises ValueError."""
+        message = Message.from_role_and_content(Role.ASSISTANT, "some content")
+        message = message.with_channel("commentary")
+        message = message.with_recipient("unknown_recipient")
+
+        try:
+            parse_output_message(message)
+            raise AssertionError("Expected ValueError to be raised")
+        except ValueError as e:
+            assert "Unknown recipient: unknown_recipient" in str(e)
+
+    def test_analysis_channel_creates_reasoning(self):
+        """Test that analysis channel creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "Analyzing the problem step by step..."
+        )
+        message = message.with_channel("analysis")
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert (
+            output_items[0].content[0].text == "Analyzing the problem step by step..."
+        )
+
+    def test_non_assistant_message_returns_empty(self):
+        """Test that non-assistant messages return empty list.
+
+        Per the implementation, tool messages to assistant (e.g., search results)
+        are not included in final output to align with OpenAI behavior.
+        """
+        message = Message.from_author_and_content(
+            Author.new(Role.TOOL, "functions.get_weather"),
+            "The weather is sunny, 72°F",
+        )
+
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 0
+
+
 def test_has_custom_tools() -> None:
     assert not has_custom_tools(set())
     assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"})
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index bb932e39e..7da0914ce 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -455,11 +455,13 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
             output_items.extend(_parse_function_call(message, recipient))
 
         # Built-in tools on commentary channel are treated as reasoning for now
-        elif recipient is not None and (
-            recipient.startswith("python")
-            or recipient.startswith("browser")
-            or recipient.startswith("container")
+        elif (
+            recipient is None  # Preambles: explanatory text before tool calls
+            or recipient.startswith(("python", "browser", "container"))
         ):
+            # Per Harmony format, commentary channel can contain preambles to calling
+            # multiple functions - explanatory text with no recipient. Built-in tool
+            # recipients (python/browser/container) also generate reasoning output.
             output_items.extend(_parse_reasoning_content(message))
         else:
             raise ValueError(f"Unknown recipient: {recipient}")
-- 
GitLab


From 3628bcaaf229f3ce86b64e73ab88dd64211ddf38 Mon Sep 17 00:00:00 2001
From: Zhiwei <532707544@qq.com>
Date: Fri, 5 Dec 2025 19:01:16 +0800
Subject: [PATCH 124/258] [ROCm][MXFP4] Infer w4a4 quant method in rocm aiter
 fused moe (#29775)

Signed-off-by: ZhiweiYan-96 <zhiwei.yan@amd.com>
---
 vllm/model_executor/layers/fused_moe/config.py               | 4 ++++
 vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 1826fafa8..e52845dfa 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -345,6 +345,10 @@ class FusedMoEQuantConfig:
     def use_mxfp4_w4a16(self) -> bool:
         return self._a1.dtype is None and self._w1.dtype == "mxfp4"
 
+    @property
+    def use_mxfp4_w4a4(self) -> bool:
+        return self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4"
+
     @property
     def use_nvfp4_w4a4(self) -> bool:
         return self.quant_dtype == "nvfp4"
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 8f05828d7..882ad0a53 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -221,8 +221,8 @@ def rocm_aiter_fused_experts(
 
     else:
         quant_method = QuantMethod.NO.value
-        # quark moe for mxfp4 w_dtype
-        if quant_config.use_mxfp4_w4a16:
+        # quark moe for mxfp4 w_dtype mxfp4 a_dtype
+        if quant_config.use_mxfp4_w4a4:
             quant_method = QuantMethod.BLOCK_1X32.value
         # w8a8 block-scaled
         if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
-- 
GitLab


From c2894d3883c1b21299822c685e6be73ec3be046b Mon Sep 17 00:00:00 2001
From: Max Hu <hyoung2991@gmail.com>
Date: Fri, 5 Dec 2025 06:20:07 -0500
Subject: [PATCH 125/258] [Feature] Add Layer-wise NVTX Support (#29990)

Signed-off-by: Max Hu <hyoung2991@gmail.com>
Signed-off-by: Max Hu <maxhu@nvidia.com>
Co-authored-by: Max Hu <maxhu@nvidia.com>
---
 vllm/compilation/wrapper.py        |  30 ++-
 vllm/config/observability.py       |   5 +
 vllm/engine/arg_utils.py           |   8 +
 vllm/utils/nvtx_pytorch_hooks.py   | 286 +++++++++++++++++++++++++++++
 vllm/v1/worker/gpu_model_runner.py |  49 +++++
 5 files changed, 375 insertions(+), 3 deletions(-)
 create mode 100644 vllm/utils/nvtx_pytorch_hooks.py

diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index b120c85bf..69e1ed37a 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -14,6 +14,7 @@ import torch._C._dynamo.guards
 import vllm.envs as envs
 from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config
 from vllm.logger import init_logger
+from vllm.utils.nvtx_pytorch_hooks import layerwise_nvtx_marker_context
 
 logger = init_logger(__name__)
 
@@ -92,12 +93,29 @@ class TorchCompileWithNoGuardsWrapper:
 
         return self.forward(*args, **kwargs)
 
+    def _call_with_optional_nvtx_range(self, callable_fn, *args, **kwargs):
+        if self.layerwise_nvtx_tracing_enabled:
+            args_list = list(args)
+            kwargs_dict = dict(kwargs)
+            with layerwise_nvtx_marker_context(
+                "Torch Compiled Module (input):{}".format(self.__class__.__name__),
+                self,
+                in_tensor=args_list,
+                kwargs=kwargs_dict,
+            ) as ctx:
+                ctx.result = callable_fn(*args, **kwargs)
+            return ctx.result
+        return callable_fn(*args, **kwargs)
+
     def __init__(self):
         self.compiled = False
 
         vllm_config = get_current_vllm_config()
         self.vllm_config = vllm_config
         mode = vllm_config.compilation_config.mode
+        self.layerwise_nvtx_tracing_enabled = (
+            vllm_config.observability_config.enable_layerwise_nvtx_tracing
+        )
         if mode is None:
             raise RuntimeError("Compilation mode cannot be NO_COMPILATION")
 
@@ -168,13 +186,19 @@ class TorchCompileWithNoGuardsWrapper:
                 # Make sure a compilation is triggered by clearing dynamo
                 # cache.
                 torch._dynamo.eval_frame.remove_from_cache(self.original_code_object())
-                return self._compiled_callable(*args, **kwargs)
+                return self._call_with_optional_nvtx_range(
+                    self._compiled_callable, *args, **kwargs
+                )
             else:
                 with self._dispatch_to_compiled_code():
-                    return self.forward(*args, **kwargs)
+                    return self._call_with_optional_nvtx_range(
+                        self.forward, *args, **kwargs
+                    )
         else:
             with _compilation_context():
-                return self._compiled_callable(*args, **kwargs)
+                return self._call_with_optional_nvtx_range(
+                    self._compiled_callable, *args, **kwargs
+                )
 
     @abstractmethod
     def forward(self, *args, **kwargs): ...
diff --git a/vllm/config/observability.py b/vllm/config/observability.py
index fdc27aee3..e40bf18a0 100644
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -59,6 +59,11 @@ class ObservabilityConfig:
     """Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
     dispatch modes, and their observed frequencies at every logging interval)."""
 
+    enable_layerwise_nvtx_tracing: bool = False
+    """Enable layerwise NVTX tracing. This traces the execution of each layer or
+    module in the model and attach informations such as input/output shapes to
+    nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
+
     @cached_property
     def collect_model_forward_time(self) -> bool:
         """Whether to collect model forward time for the request."""
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index fd07cded7..883ae370f 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -519,6 +519,9 @@ class EngineArgs:
         ObservabilityConfig, "kv_cache_metrics_sample"
     )
     cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics
+    enable_layerwise_nvtx_tracing: bool = (
+        ObservabilityConfig.enable_layerwise_nvtx_tracing
+    )
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
     scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
 
@@ -1026,6 +1029,10 @@ class EngineArgs:
             "--cudagraph-metrics",
             **observability_kwargs["cudagraph_metrics"],
         )
+        observability_group.add_argument(
+            "--enable-layerwise-nvtx-tracing",
+            **observability_kwargs["enable_layerwise_nvtx_tracing"],
+        )
 
         # Scheduler arguments
         scheduler_kwargs = get_kwargs(SchedulerConfig)
@@ -1704,6 +1711,7 @@ class EngineArgs:
             kv_cache_metrics=self.kv_cache_metrics,
             kv_cache_metrics_sample=self.kv_cache_metrics_sample,
             cudagraph_metrics=self.cudagraph_metrics,
+            enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
         )
 
         # Compilation config overrides
diff --git a/vllm/utils/nvtx_pytorch_hooks.py b/vllm/utils/nvtx_pytorch_hooks.py
new file mode 100644
index 000000000..39e2a9a13
--- /dev/null
+++ b/vllm/utils/nvtx_pytorch_hooks.py
@@ -0,0 +1,286 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from contextlib import contextmanager
+
+import torch
+import torch.cuda.nvtx as nvtx
+
+
+def print_tensor(tensor_obj, prefix, tensor_list=None):
+    """Descends iterators that contains Tensors and prints the Tensor.
+    Recursive function that descends iterator type arguments until
+    it finds a Tensor object.
+    """
+    if tensor_list is None:
+        tensor_list = []
+
+    if isinstance(tensor_obj, (list, tuple)):
+        for ten in tensor_obj:
+            tensor_list = print_tensor(ten, prefix, tensor_list)
+    elif isinstance(tensor_obj, torch.Tensor):
+        tensor_dims = list(tensor_obj.size())
+        tensor_list.append(tensor_dims)
+    return tensor_list
+
+
+def process_layer_params(module_obj):
+    """Extract the static parameters from LLM and VLM relevant layer types"""
+    param_info = {}
+    # Extract parameters for layers commonly used in LLMs and VLMs
+    if isinstance(module_obj, (torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d)):
+        conv_params = {}
+        conv_params["in_chan"] = module_obj.in_channels
+        conv_params["out_chan"] = module_obj.out_channels
+        conv_params["filter_dim"] = module_obj.kernel_size
+        conv_params["stride"] = module_obj.stride
+        conv_params["padding"] = module_obj.padding
+        conv_params["dilation"] = module_obj.dilation
+        conv_params["transposed"] = module_obj.transposed
+        conv_params["output_padding"] = module_obj.output_padding
+        conv_params["groups"] = module_obj.groups
+        conv_params["padding_mode"] = module_obj.padding_mode
+        param_info = conv_params
+    elif isinstance(
+        module_obj,
+        (
+            torch.nn.ConvTranspose1d,
+            torch.nn.ConvTranspose2d,
+            torch.nn.ConvTranspose3d,
+        ),
+    ):
+        convtranspose_params = {}
+        convtranspose_params["in_chan"] = module_obj.in_channels
+        convtranspose_params["out_chan"] = module_obj.out_channels
+        convtranspose_params["filter_dim"] = module_obj.kernel_size
+        convtranspose_params["stride"] = module_obj.stride
+        convtranspose_params["padding"] = module_obj.padding
+        convtranspose_params["dilation"] = module_obj.dilation
+        convtranspose_params["transposed"] = module_obj.transposed
+        convtranspose_params["output_padding"] = module_obj.output_padding
+        convtranspose_params["groups"] = module_obj.groups
+        convtranspose_params["padding_mode"] = module_obj.padding_mode
+        param_info = convtranspose_params
+    elif isinstance(
+        module_obj, (torch.nn.MaxPool1d, torch.nn.MaxPool2d, torch.nn.MaxPool3d)
+    ):
+
+        def _handle_int_or_tuple(parameter):
+            if isinstance(parameter, tuple):
+                return list(parameter)
+            elif isinstance(parameter, int):
+                return [parameter, parameter]
+
+        pooling_params = {}
+        pooling_params["filter_dim"] = _handle_int_or_tuple(module_obj.kernel_size)
+        pooling_params["stride"] = _handle_int_or_tuple(module_obj.stride)
+        pooling_params["padding"] = _handle_int_or_tuple(module_obj.padding)
+        pooling_params["dilation"] = _handle_int_or_tuple(module_obj.dilation)
+        param_info = pooling_params
+    elif isinstance(
+        module_obj, (torch.nn.AvgPool1d, torch.nn.AvgPool2d, torch.nn.AvgPool3d)
+    ):
+        pooling_params = {}
+        pooling_params["filter_dim"] = [
+            module_obj.kernel_size,
+            module_obj.kernel_size,
+        ]
+        pooling_params["stride"] = [module_obj.stride, module_obj.stride]
+        pooling_params["padding"] = [module_obj.padding, module_obj.padding]
+        pooling_params["ceil_mode"] = module_obj.ceil_mode
+        pooling_params["count_include_pad"] = module_obj.count_include_pad
+        param_info = pooling_params
+    elif isinstance(
+        module_obj,
+        (
+            torch.nn.AdaptiveAvgPool1d,
+            torch.nn.AdaptiveAvgPool2d,
+            torch.nn.AdaptiveAvgPool3d,
+        ),
+    ):
+        pooling_params = {}
+        pooling_params["output_size"] = [
+            module_obj.output_size,
+            module_obj.output_size,
+        ]
+        param_info = pooling_params
+    elif isinstance(module_obj, torch.nn.Linear):
+        param_info["in_features"] = module_obj.in_features
+        param_info["out_features"] = module_obj.out_features
+    elif isinstance(
+        module_obj,
+        (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d),
+    ):
+        param_info["num_features"] = module_obj.num_features
+        param_info["epsilon"] = module_obj.eps
+        param_info["momentum"] = module_obj.momentum
+    elif isinstance(module_obj, torch.nn.ReLU):
+        param_info["in_place"] = module_obj.inplace
+    elif isinstance(module_obj, torch.nn.Dropout):
+        param_info["p"] = module_obj.p
+        param_info["in_place"] = module_obj.inplace
+    elif isinstance(module_obj, torch.nn.Embedding):
+        param_info["num_embeddings"] = module_obj.num_embeddings
+        param_info["embedding_dim"] = module_obj.embedding_dim
+    elif isinstance(
+        module_obj,
+        (
+            torch.nn.Upsample,
+            torch.nn.UpsamplingNearest2d,
+            torch.nn.UpsamplingBilinear2d,
+        ),
+    ):
+        param_info["scale_factor"] = module_obj.scale_factor
+
+    return param_info
+
+
+def construct_marker_dict_and_push(
+    module_name, module_obj, in_tensor, kwargs=None, out_tensor=None
+):
+    marker_dict = {}
+    marker_dict["Module"] = module_name
+
+    ## Get trainable parameters like weights and bias
+    module_params = module_obj.named_parameters(recurse=False)
+    for idx, (param_name, param_obj) in enumerate(module_params):
+        if idx == 0:
+            marker_dict["TrainableParams"] = {}
+        marker_dict["TrainableParams"][param_name] = list(param_obj.size())
+
+    in_tensor_list = print_tensor(in_tensor, "Input")
+    if in_tensor_list:
+        marker_dict["Inputs"] = in_tensor_list
+
+    out_tensor_list = print_tensor(out_tensor, "Output")
+    if out_tensor_list:
+        marker_dict["Outputs"] = out_tensor_list
+
+    ## Get Kwargs like input_ids and positions for the top module
+    if kwargs:
+        for key, value in kwargs.items():
+            if isinstance(value, (torch.Tensor, list, tuple)):
+                tensor_list = print_tensor(value, key)
+                if tensor_list:
+                    marker_dict[key] = tensor_list
+
+    param_info = process_layer_params(module_obj)
+    if param_info:
+        marker_dict["StaticParams"] = param_info
+    nvtx.range_push("{}".format(marker_dict))
+
+
+class ResultHolder:
+    """Holder for storing results from within a context manager."""
+
+    result = None
+
+
+@contextmanager
+def layerwise_nvtx_marker_context(module_name, module_obj, in_tensor=None, kwargs=None):
+    """Context manager for NVTX markers that automatically pushes on enter
+    and pops on exit.
+
+    Example:
+        with nvtx_marker_context("Module:MyModule", module, in_tensor=args,
+                                 kwargs=kwargs) as ctx:
+            ctx.result = module(*args, **kwargs)
+        return ctx.result
+    """
+    holder = ResultHolder()
+
+    # Push input marker
+    construct_marker_dict_and_push(
+        module_name,
+        module_obj,
+        in_tensor=in_tensor,
+        kwargs=kwargs,
+    )
+    try:
+        yield holder
+    finally:
+        # Pop input marker
+        nvtx.range_pop()
+        # Push and pop output marker
+        output_name = module_name.replace("(input)", "(output)")
+        construct_marker_dict_and_push(
+            output_name,
+            module_obj,
+            in_tensor=None,
+            kwargs=None,
+            out_tensor=holder.result,
+        )
+        nvtx.range_pop()
+
+
+class PytHooks:
+    """This module contains all the code needed to enable forward hooks
+    in a pytorch network.
+
+    To register the hooks for a given network, the user needs to instantiate
+    a PytHook object. Then call the register_hooks method.
+
+    Example:
+
+        my_hook = PytHook()
+        my_hook.register_hooks(my_network_model)
+    """
+
+    def __init__(self):
+        """Initialize module variables."""
+        super().__init__()
+        self.module_to_name_map = {}
+
+    def _process_layer_params(self, module_obj):
+        return process_layer_params(module_obj)
+
+    def module_fwd_hook(self, module_obj, in_tensor, out_tensor):
+        """Callback function that ends the NVTX marker.
+        Records the module name and tensor information.
+        Called after the module executes the forward method.
+        """
+        nvtx.range_pop()
+        module_name = self.module_to_name_map.get(module_obj, "unknown")
+        construct_marker_dict_and_push(
+            module_name, module_obj, in_tensor=None, kwargs=None, out_tensor=out_tensor
+        )
+        nvtx.range_pop()
+        return
+
+    def module_fwd_pre_hook(self, module_obj, in_tensor, kwargs):
+        """Creates an NVTX marker with the module name in it.
+        This function is called before the module executes.
+        """
+        module_name = self.module_to_name_map.get(module_obj, "unknown")
+        construct_marker_dict_and_push(
+            module_name, module_obj, in_tensor=in_tensor, kwargs=kwargs, out_tensor=None
+        )
+        return
+
+    def register_hooks(self, network_model, module_prefix="top"):
+        """User level function that activates all the hooks.
+        The user needs to call this method from the network source code.
+        The code descends all the modules in the network and registers their
+        respective hooks.
+        """
+        # Module types to skip (simple operations that don't need detailed profiling)
+        skip_types = (
+            torch.nn.Identity,
+            torch.nn.Dropout,
+            torch.nn.Dropout1d,
+            torch.nn.Dropout2d,
+            torch.nn.Dropout3d,
+        )
+
+        for name, module in network_model.named_modules(prefix=module_prefix):
+            # Skip certain module types to reduce profiling overhead
+            if isinstance(module, skip_types):
+                continue
+
+            module.register_forward_pre_hook(self.module_fwd_pre_hook, with_kwargs=True)
+            module.register_forward_hook(self.module_fwd_hook)
+            if module not in self.module_to_name_map:
+                self.module_to_name_map[module] = name
+            else:
+                raise ValueError("Module instance {} is not unique ".format(module))
+        return
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 152bea2c0..b6a814522 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -88,6 +88,7 @@ from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.nvtx_pytorch_hooks import PytHooks
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
     get_dtype_size,
@@ -599,6 +600,7 @@ class GPUModelRunner(
         # Ephemeral state transferred between execute_model() and sample_tokens().
         self.execute_model_state: ExecuteModelState | None = None
         self.kv_connector_output: KVConnectorOutput | None = None
+        self.layerwise_nvtx_hooks_registered = False
 
     def reset_mm_cache(self) -> None:
         if self.mm_budget:
@@ -2828,6 +2830,42 @@ class GPUModelRunner(
             cudagraph_stats,
         )
 
+    def _register_layerwise_nvtx_hooks(self) -> None:
+        """
+        Register layerwise NVTX hooks if --enable-layerwise-nvtx-tracing is enabled
+        to trace detailed information of each layer or module in the model.
+        """
+
+        if (
+            self.vllm_config.observability_config.enable_layerwise_nvtx_tracing
+            and not self.layerwise_nvtx_hooks_registered
+        ):
+            if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
+                logger.debug_once(
+                    "layerwise NVTX tracing is not supported when CUDA graph is "
+                    "turned off; you may observe part or all of the model "
+                    "missing NVTX markers"
+                )
+
+            # In STOCK_TORCH_COMPILE mode, after registering hooks here,
+            # the __call__ function of nn.module will be recompiled with
+            # fullgraph=True. Since nvtx.range_push/pop are not traceable
+            # by torch dynamo, we can't register hook functions here
+            # because hook functions will also be traced by torch dynamo.
+            if (
+                self.vllm_config.compilation_config.mode
+                == CompilationMode.STOCK_TORCH_COMPILE
+            ):
+                logger.debug_once(
+                    "layerwise NVTX tracing is not supported when "
+                    "CompilationMode is STOCK_TORCH_COMPILE, skipping "
+                    "function hooks registration"
+                )
+            else:
+                pyt_hooks = PytHooks()
+                pyt_hooks.register_hooks(self.model, self.model.__class__.__name__)
+                self.layerwise_nvtx_hooks_registered = True
+
     @torch.inference_mode()
     def execute_model(
         self,
@@ -4122,6 +4160,17 @@ class GPUModelRunner(
                     is_graph_capturing=is_graph_capturing,
                 )
 
+        # We register layerwise NVTX hooks here after the first dynamo tracing is
+        # done to avoid nvtx operations in hook functions being traced by
+        # torch dynamo and causing graph breaks.
+        # Note that for DYNAMO_ONCE and VLLM_COMPILE mode,
+        # compiled model's dynamo tracing is only done once and the compiled model's
+        # __call__ function is replaced by calling the compiled function.
+        # So it's safe to register hooks here. Hooks will be registered to
+        # both compiled and uncompiled models but they will never
+        # be called on the compiled model execution path.
+        self._register_layerwise_nvtx_hooks()
+
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real
         # requests to process.
-- 
GitLab


From b7d85cf25c0a6699eb493e595ec44923ffce21b1 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 5 Dec 2025 13:03:45 +0000
Subject: [PATCH 126/258] [CI] Have pre-commit comment on a PR if pre-commit
 was not used (#30077)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .github/mergify.yml | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 997a40e18..5cb9fcdf9 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -14,6 +14,38 @@ pull_request_rules:
     comment:
       message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 
+- name: comment-pre-commit-failure
+  description: Comment on PR when pre-commit check fails
+  conditions:
+    - status-failure=pre-commit
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+        ```bash 
+        uv pip install pre-commit
+        pre-commit install
+        pre-commit run --all-files
+        ```
+
+        Then, commit the changes and push to your branch.
+
+        For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+- name: comment-dco-failure
+  description: Comment on PR when DCO check fails
+  conditions:
+    - status-failure=dco
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
-- 
GitLab


From 9843e332da56307597deb2739bb83b85c18c5dde Mon Sep 17 00:00:00 2001
From: Elham <harirpoush.elham@gmail.com>
Date: Fri, 5 Dec 2025 08:09:20 -0500
Subject: [PATCH 127/258] [CPU][Perf] Add fast vectorized exp impl from Arm
 Optimized Routines (#30068)

Signed-off-by: Ubuntu <ubuntu@ip-10-252-30-150.eu-west-1.compute.internal>
Signed-off-by: Elham Harirpoush <elham.harirpoush@arm.com>
Co-authored-by: Ubuntu <ubuntu@ip-10-252-30-150.eu-west-1.compute.internal>
---
 csrc/cpu/cpu_attn_impl.hpp | 13 ----------
 csrc/cpu/cpu_attn_macros.h | 50 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index 98f55d7c0..02164ed36 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -1246,14 +1246,8 @@ class AttentionMainLoop {
         // rescale sum and partial outputs
         if (need_rescale) {
           // compute rescale factor
-#ifdef DEFINE_FAST_EXP
-          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-          rescale_factor_vec = fast_exp(rescale_factor_vec);
-          rescale_factor = rescale_factor_vec.get_last_elem();
-#else
           rescale_factor = std::exp(rescale_factor);
           vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif
 
           // rescale sum
           new_sum_val += rescale_factor * init_sum_val;
@@ -1889,15 +1883,8 @@ class AttentionMainLoop {
                                    : curr_output_buffer;
           float rescale_factor = final_max > curr_max ? curr_max - final_max
                                                       : final_max - curr_max;
-
-#ifdef DEFINE_FAST_EXP
-          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-          rescale_factor_vec = fast_exp(rescale_factor_vec);
-          rescale_factor = rescale_factor_vec.get_last_elem();
-#else
           rescale_factor = std::exp(rescale_factor);
           vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif
 
           local_sum[head_idx] = final_max > curr_max
                                     ? final_sum + rescale_factor * curr_sum
diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_attn_macros.h
index 6458e4341..35716a079 100644
--- a/csrc/cpu/cpu_attn_macros.h
+++ b/csrc/cpu/cpu_attn_macros.h
@@ -60,4 +60,54 @@
 
 #endif
 
+#ifdef __aarch64__
+  // Implementation copied from Arm Optimized Routines (expf AdvSIMD)
+  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
+  #include <limits>
+  #define DEFINE_FAST_EXP                                                      \
+    const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);                   \
+    const float ln2_hi = 0x1.62e4p-1f;                                         \
+    const float ln2_lo = 0x1.7f7d1cp-20f;                                      \
+    const float c0 = 0x1.0e4020p-7f;                                           \
+    const float c2 = 0x1.555e66p-3f;                                           \
+    const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};                      \
+    const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);                  \
+    const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);                        \
+    const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);                        \
+    const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);                        \
+    const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);         \
+    const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound);        \
+    const float32x4_t inf =                                                    \
+        vdupq_n_f32(std::numeric_limits<float>::infinity());                   \
+    const float32x4_t zero = vdupq_n_f32(0.0f);                                \
+    auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) {  \
+      float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2));                  \
+      float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0);                  \
+      r = vfmsq_laneq_f32(r, n, ln2_c02, 1);                                   \
+      uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
+      float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias));  \
+      float32x4_t r2 = vmulq_f32(r, r);                                        \
+      float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2);                      \
+      float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3);                      \
+      q = vfmaq_f32(q, p, r2);                                                 \
+      p = vmulq_f32(c4, r);                                                    \
+      float32x4_t poly = vfmaq_f32(p, q, r2);                                  \
+      poly = vfmaq_f32(scale, poly, scale);                                    \
+      const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound);         \
+      const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound);         \
+      poly = vbslq_f32(hi_mask, inf, poly);                                    \
+      return vbslq_f32(lo_mask, zero, poly);                                   \
+    };                                                                         \
+    auto fast_exp = [&](vec_op::FP32Vec16& vec)                                \
+                        __attribute__((always_inline)) {                       \
+                          float32x4x4_t result;                                \
+                          result.val[0] = neon_expf(vec.reg.val[0]);           \
+                          result.val[1] = neon_expf(vec.reg.val[1]);           \
+                          result.val[2] = neon_expf(vec.reg.val[2]);           \
+                          result.val[3] = neon_expf(vec.reg.val[3]);           \
+                          return vec_op::FP32Vec16(result);                    \
+                        };
+
+#endif  // __aarch64__
+
 #endif
\ No newline at end of file
-- 
GitLab


From 0d8a7d8a264354ed53f822821b29cb0485bfa70f Mon Sep 17 00:00:00 2001
From: Yi Liu <yi4.liu@intel.com>
Date: Fri, 5 Dec 2025 22:02:09 +0800
Subject: [PATCH 128/258] [Compressed Tensors] Add XPU `wNa16` support (#29484)

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 .../scripts/hardware_ci/run-xpu-test.sh       |  1 +
 .../kernels/mixed_precision/__init__.py       |  4 +
 .../kernels/mixed_precision/xpu.py            | 97 +++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py

diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 4d163399c..1d5dba3f2 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -38,6 +38,7 @@ docker run \
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
     VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
     cd tests
     pytest -v -s v1/core
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index 0cf3f12af..c4160157c 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -30,6 +30,9 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKer
     MPLinearKernel,
     MPLinearLayerConfig,
 )
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.xpu import (  # noqa: E501
+    XPUwNa16LinearKernel,
+)
 from vllm.platforms import current_platform
 
 # in priority/performance order (when available)
@@ -42,6 +45,7 @@ _POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [
     BitBLASLinearKernel,
     ConchLinearKernel,
     ExllamaLinearKernel,
+    XPUwNa16LinearKernel,
 ]
 
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py
new file mode 100644
index 000000000..abd2e047a
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import torch
+
+from vllm.platforms import current_platform
+
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+
+
+class XPUwNa16LinearKernel(MPLinearKernel):
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 0
+
+    @classmethod
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        if not current_platform.is_xpu():
+            return False, "IPEX wNa16 only supported on XPU/CPU devices"
+
+        # TODO: (yiliu30) relax these restrictions in later PRs
+        if c.zero_points:
+            return False, "Zero points not supported for Now"
+
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        from packaging import version
+
+        MIN_IPEX_VERSION = "2.6.0"
+        bias = layer.bias if not layer.skip_bias_add else None
+
+        try:
+            import intel_extension_for_pytorch as ipex
+
+            if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION):
+                raise ImportError(
+                    "intel_extension_for_pytorch version is "
+                    "wrong. Please install "
+                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}."
+                )
+        except ImportError as err:
+            raise ImportError(
+                "Please install "
+                f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via "
+                f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`"
+                " to use IPEX-AWQ linear method."
+            ) from err
+        # Using the compute dtype (lowp_mode) as INT8 to leverage instructions
+        # with better performance.
+        lowp_mode = ipex.quantization.WoqLowpMode.INT8
+        # The weight will be de-packed from INT4 to INT8.
+        weight_dtype = ipex.quantization.WoqWeightDtype.INT4
+        # The float activation will be quantized (dynamic, per-token) to INT8.
+        act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH
+
+        qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
+            weight_dtype=weight_dtype,
+            lowp_mode=lowp_mode,
+            act_quant_mode=act_quant_mode,
+            group_size=self.config.group_size,
+            weight_qscheme=ipex.quantization.WoqWeightQScheme.SYMMETRIC,
+        )
+        qweight = layer.weight_packed
+        g_idx = layer.weight_g_idx if self.config.has_g_idx else None
+        scales = layer.weight_scale
+        qzeros = None
+        if self.config.zero_points:
+            qzeros = layer.weight_zero_point.contiguous()
+        qweight = qweight.t().contiguous()
+        scales = scales.t().contiguous()
+        layer.ipex_output_size = self.config.partition_weight_shape[1]
+        layer.ipex_qlinear = (
+            ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight(
+                qweight,
+                scales,
+                qzeros,
+                in_features=self.config.partition_weight_shape[0],
+                out_features=self.config.partition_weight_shape[1],
+                qconfig=qconfig,
+                g_idx=g_idx,
+                bias=bias,
+                group_size=self.config.group_size,
+                quant_method=0,  # `0` stands for the IPEX GPTQ
+            )
+        )
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        reshaped_x = x.reshape(-1, x.shape[-1])
+        out = layer.ipex_qlinear(reshaped_x)
+        return out.reshape(x.shape[:-1] + (layer.ipex_output_size,))
-- 
GitLab


From 2c174420f5184256159e3d1acfe4184f3f70083e Mon Sep 17 00:00:00 2001
From: Alec S <10566873+alecsolder@users.noreply.github.com>
Date: Fri, 5 Dec 2025 09:02:49 -0500
Subject: [PATCH 129/258] Reduce validation to a warning (#28749)

Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/structured_outputs.py       | 16 ----------------
 vllm/reasoning/abs_reasoning_parsers.py |  5 ++++-
 2 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py
index 1672c1d7c..8c060c816 100644
--- a/vllm/config/structured_outputs.py
+++ b/vllm/config/structured_outputs.py
@@ -65,22 +65,6 @@ class StructuredOutputsConfig:
 
     @model_validator(mode="after")
     def _validate_structured_output_config(self) -> Self:
-        # Import here to avoid circular import
-        from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
-
-        if self.reasoning_parser_plugin and len(self.reasoning_parser_plugin) > 3:
-            ReasoningParserManager.import_reasoning_parser(self.reasoning_parser_plugin)
-
-        valid_reasoning_parsers = ReasoningParserManager.list_registered()
-        if (
-            self.reasoning_parser != ""
-            and self.reasoning_parser not in valid_reasoning_parsers
-        ):
-            raise ValueError(
-                f"invalid reasoning parser: {self.reasoning_parser} "
-                f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
-            )
-
         if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"):
             raise ValueError(
                 "disable_any_whitespace is only supported for "
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 5c6ac7dad..d0661d1f2 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -160,7 +160,10 @@ class ReasoningParserManager:
         if name in cls.lazy_parsers:
             return cls._load_lazy_parser(name)
 
-        raise KeyError(f"Reasoning parser '{name}' not found.")
+        registered = ", ".join(cls.list_registered())
+        raise KeyError(
+            f"Reasoning parser '{name}' not found. Available parsers: {registered}"
+        )
 
     @classmethod
     def list_registered(cls) -> list[str]:
-- 
GitLab


From 949a6a19d26f2179c0e5fc5d94f7d033e4c6a695 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 5 Dec 2025 14:52:45 +0000
Subject: [PATCH 130/258] [NIXL] Add compatibility checking to NIXL KV
 connector handshake (#29503)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 .../kv_connector/unit/test_nixl_connector.py  | 224 +++++++++++++++++-
 tests/v1/kv_connector/unit/utils.py           |  10 +-
 .../kv_connector/v1/nixl_connector.py         | 172 +++++++++++++-
 3 files changed, 380 insertions(+), 26 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index b7d7a1005..ae4125d54 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -9,8 +9,10 @@ import textwrap
 import time
 import uuid
 from collections import defaultdict
-from unittest.mock import patch
+from typing import Any
+from unittest.mock import MagicMock, patch
 
+import msgspec
 import pytest
 import ray
 import torch
@@ -18,6 +20,7 @@ import torch
 from vllm import LLM
 from vllm.config import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
+from vllm.distributed.kv_transfer.kv_connector.v1 import nixl_connector
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
     MultiKVConnectorStats,
@@ -29,7 +32,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
     NixlConnectorMetadata,
     NixlConnectorScheduler,
     NixlConnectorWorker,
+    NixlHandshakePayload,
     NixlKVConnectorStats,
+    compute_nixl_compatibility_hash,
 )
 from vllm.distributed.kv_transfer.kv_transfer_state import (
     ensure_kv_transfer_shutdown,
@@ -317,13 +322,19 @@ def test_kv_transfer_handshake(dist_init):
     }
     prefill_connector.register_kv_caches(kv_caches)
 
-    # Simulate EngineCore initialization that would
-    # gather connector metadata from all workers, the scheduler connector
-    # expects metadata to be in dict[int, KVConnectorHandshakeMetadata],
-    # where the first key is the dp_rank, the second key is the tp_rank.
-    metadata = {0: prefill_connector.get_handshake_metadata()}
+    # Simulate EngineCore initialization that would gather connector
+    # metadata from all workers
+    metadata = prefill_connector.get_handshake_metadata()
+
+    # metadata is a NixlHandshakePayload, decode it to get NixlAgentMetadata
+    decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+    expected_agent_metadata = decoder.decode(metadata.agent_metadata_bytes)
+
+    # The scheduler connector expects metadata to be in
+    # dict[int, KVConnectorHandshakeMetadata], where the first key is
+    # the dp_rank, the second key is the tp_rank.
     scheduler_connector = scheduler.get_kv_connector()
-    scheduler_connector.set_xfer_handshake_metadata(metadata)
+    scheduler_connector.set_xfer_handshake_metadata({0: metadata})
 
     # Simulate a request that finishes prefill, which returns
     # corresponding NixlConnectorMetadata for decode instance.
@@ -362,9 +373,9 @@ def test_kv_transfer_handshake(dist_init):
         )
 
         received_metadata = mock_add_remote_agent.call_args.args
+        assert received_metadata[0] == expected_agent_metadata
         assert received_metadata[1] == 0  # remote_tp_rank
         assert received_metadata[2] == 1  # remote_tp_size
-        assert metadata[0] == received_metadata[0]
 
     # Need to shutdown the background thread to release NIXL side channel port
     scheduler_connector.shutdown()
@@ -403,7 +414,6 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
                 device_id=0,
                 num_blocks=1,
                 block_lens=self.block_len_per_layer,
-                attn_backend_name=self.backend_name,
                 # `self.kv_cache_layout` is only forced to HND when vllm engine
                 # is started. We mock HND here.
                 kv_cache_layout="HND",
@@ -651,7 +661,6 @@ class TestNixlHandshake:
                 device_id=0,
                 num_blocks=1,
                 block_lens=worker.block_len_per_layer,
-                attn_backend_name=worker.backend_name,
                 kv_cache_layout=mismatched_layout,
                 block_size=worker.block_size,
             )
@@ -706,7 +715,6 @@ class TestNixlHandshake:
                 num_blocks=1,
                 # prefill TP=1, decode TP=2, remote block_lens is double to local
                 block_lens=[i * 2 for i in worker.block_len_per_layer],
-                attn_backend_name=worker.backend_name,
                 kv_cache_layout="HND",
                 block_size=worker.block_size,
             )
@@ -1168,6 +1176,9 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
         mock_wrapper_instance = mock_nixl_wrapper.return_value
         connector.connector_worker.nixl_wrapper = mock_wrapper_instance
 
+        # Appease NixlHandshakePayload encoding with some bytes
+        mock_wrapper_instance.get_agent_metadata.return_value = b"fake_agent_metadata"
+
         # Reassure the shutdown() check that the thread is terminated
         mock_thread.return_value.is_alive.return_value = False
 
@@ -1534,3 +1545,194 @@ def test_transfer_setup_failure_returns_finished(dist_init):
     # ensure request appears in get_finished
     _, done_recving = connector.get_finished(finished_req_ids=set())
     assert request_id in done_recving
+
+
+@pytest.mark.parametrize(
+    "mismatch_type,config_overrides,version_override,should_fail,enforce_handshake_compat",
+    [
+        ("vllm_version", {}, {"vllm_version": "0.6.1"}, True, True),
+        ("nixl_connector_version", {}, {"connector_version": 37}, True, True),
+        ("model_name", {"model": "facebook/opt-350m"}, {}, True, True),
+        ("dtype", {"dtype": "bfloat16"}, {}, True, True),
+        ("cache_dtype", {"cache_dtype": "fp8"}, {}, True, True),
+        ("num_kv_heads", {"hf_overrides": {"num_key_value_heads": 8}}, {}, True, True),
+        (
+            "num_hidden_layers",
+            {"hf_overrides": {"num_hidden_layers": 24}},
+            {},
+            True,
+            True,
+        ),
+        ("hidden_size", {"hf_overrides": {"hidden_size": 1536}}, {}, True, True),
+        ("block_size", {"block_size": 8}, {}, False, True),
+        ("matching_config", {}, {}, False, True),
+        ("escape_hatch", {"model": "facebook/opt-350m"}, {}, False, False),
+    ],
+)
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_compatibility_hash_validation(
+    dist_init,
+    mismatch_type,
+    config_overrides,
+    version_override,
+    should_fail,
+    enforce_handshake_compat,
+):
+    """
+    Test NIXL compatibility hash validation during handshake.
+
+    Parameters:
+        mismatch_type: description of what is being tested
+        config_overrides: dict of config to override for the remote instance
+        version_override: version dict e.g. {"vllm_version": "0.6.1"}
+        should_fail: whether the handshake should fail
+        enforce_handshake_compat: whether to enforce compatibility checking
+    """
+    local_vllm_config = create_vllm_config(
+        model="facebook/opt-125m",
+        block_size=16,
+        kv_connector_extra_config={
+            "enforce_handshake_compat": enforce_handshake_compat
+        },
+    )
+    decode_connector = NixlConnector(local_vllm_config, KVConnectorRole.WORKER)
+    decode_worker = decode_connector.connector_worker
+
+    remote_config_params: dict[str, Any] = {
+        "model": "facebook/opt-125m",
+        "block_size": 16,
+        **config_overrides,
+    }
+    remote_vllm_config = create_vllm_config(**remote_config_params)
+
+    with contextlib.ExitStack() as stack:
+        if "vllm_version" in version_override:
+            stack.enter_context(
+                patch("vllm.__version__", version_override["vllm_version"])
+            )
+        elif "connector_version" in version_override:
+            stack.enter_context(
+                patch.object(
+                    nixl_connector,
+                    "NIXL_CONNECTOR_VERSION",
+                    version_override["connector_version"],
+                )
+            )
+        remote_hash = compute_nixl_compatibility_hash(
+            remote_vllm_config, decode_worker.backend_name
+        )
+
+    prefill_block_size = config_overrides.get("block_size", 16)
+    prefill_metadata = NixlAgentMetadata(
+        engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+        agent_metadata=FakeNixlWrapper.AGENT_METADATA,
+        kv_caches_base_addr=[0],
+        device_id=0,
+        num_blocks=1,
+        block_lens=[4096 * prefill_block_size],  # slot_size * block_size
+        kv_cache_layout="HND",
+        block_size=prefill_block_size,
+    )
+    handshake_payload = NixlHandshakePayload(
+        compatibility_hash=remote_hash,
+        agent_metadata_bytes=msgspec.msgpack.encode(prefill_metadata),
+    )
+
+    # Mock ZMQ socket to return our handshake payload
+    mock_socket = MagicMock()
+    mock_socket.recv.return_value = msgspec.msgpack.encode(handshake_payload)
+
+    # Mock add_remote_agent to avoid actual NIXL operations
+    # Patch zmq_ctx to return our mock socket
+    with (
+        patch.object(decode_worker, "add_remote_agent", return_value="fake_agent"),
+        patch.object(nixl_connector, "zmq_ctx") as mock_zmq_ctx,
+    ):
+        mock_zmq_ctx.return_value.__enter__.return_value = mock_socket
+
+        if should_fail:
+            with pytest.raises(RuntimeError, match="compatibility hash mismatch"):
+                decode_worker._nixl_handshake(
+                    host="localhost",
+                    port=1234,
+                    remote_tp_size=1,
+                    expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                )
+        else:
+            result = decode_worker._nixl_handshake(
+                host="localhost",
+                port=1234,
+                remote_tp_size=1,
+                expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            )
+            # Verify handshake returned agent mapping
+            assert isinstance(result, dict)
+            assert len(result) == 1
+
+
+@pytest.mark.parametrize(
+    "error_scenario",
+    [
+        "handshake_decode_error",
+        "handshake_validation_error",
+        "metadata_decode_error",
+        "metadata_validation_error",
+    ],
+)
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FakeNixlWrapper,
+)
+def test_handshake_decode_errors(dist_init, error_scenario):
+    """
+    Test that msgspec decode errors are properly handled during handshake.
+
+    Tests both DecodeError and ValidationError for both decoders:
+    - NixlHandshakePayload decoder
+    - NixlAgentMetadata decoder
+    """
+    local_vllm_config = create_vllm_config(
+        model="facebook/opt-125m",
+        block_size=16,
+    )
+    decode_connector = NixlConnector(local_vllm_config, KVConnectorRole.WORKER)
+    decode_worker = decode_connector.connector_worker
+
+    if error_scenario == "handshake_decode_error":
+        msg_bytes = b"this is not valid msgpack data"
+    elif error_scenario == "handshake_validation_error":
+        msg_bytes = msgspec.msgpack.encode({"wrong_field": "value"})
+    elif error_scenario == "metadata_decode_error":
+        valid_handshake = NixlHandshakePayload(
+            compatibility_hash=decode_worker.compat_hash,
+            agent_metadata_bytes=b"invalid msgpack for metadata",
+        )
+        msg_bytes = msgspec.msgpack.encode(valid_handshake)
+
+    elif error_scenario == "metadata_validation_error":
+        valid_handshake = NixlHandshakePayload(
+            compatibility_hash=decode_worker.compat_hash,
+            agent_metadata_bytes=msgspec.msgpack.encode({"missing": "fields"}),
+        )
+        msg_bytes = msgspec.msgpack.encode(valid_handshake)
+    else:
+        raise AssertionError(f"{error_scenario} not a valid scenario")
+
+    mock_socket = MagicMock()
+    mock_socket.recv.return_value = msg_bytes
+    with (
+        patch.object(decode_worker, "add_remote_agent", return_value="fake_agent"),
+        patch.object(nixl_connector, "zmq_ctx") as mock_zmq_ctx,
+    ):
+        mock_zmq_ctx.return_value.__enter__.return_value = mock_socket
+
+        with pytest.raises(RuntimeError):
+            decode_worker._nixl_handshake(
+                host="localhost",
+                port=1234,
+                remote_tp_size=1,
+                expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            )
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 98f1f4492..cea41c3ab 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -90,13 +90,18 @@ def create_vllm_config(
     max_model_len: int = 10000,
     enable_chunked_prefill: bool = True,
     enable_permute_local_kv: bool = False,
+    kv_connector_extra_config: dict[str, Any] | None = None,
+    dtype: str = "float16",
+    cache_dtype: str = "auto",
+    hf_overrides: dict[str, Any] | None = None,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
     model_config = ModelConfig(
         model=model,
         trust_remote_code=True,
-        dtype="float16",
+        dtype=dtype,
         seed=42,
+        hf_overrides=hf_overrides or {},
     )
     scheduler_config = SchedulerConfig(
         max_num_seqs=max_num_seqs,
@@ -110,13 +115,14 @@ def create_vllm_config(
         block_size=block_size,
         gpu_memory_utilization=0.9,
         swap_space=0,
-        cache_dtype="auto",
+        cache_dtype=cache_dtype,
         enable_prefix_caching=True,
     )
     kv_transfer_config = KVTransferConfig(
         kv_connector="NixlConnector",
         kv_role="kv_both",
         enable_permute_local_kv=enable_permute_local_kv,
+        kv_connector_extra_config=kv_connector_extra_config or {},
     )
     return VllmConfig(
         scheduler_config=scheduler_config,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 24b7599a4..49330abce 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -59,6 +59,21 @@ Transfer = tuple[int, float]  # (xfer_handle, start_time)
 EngineId = str
 ReqId = str
 
+#
+# NIXL Connector Version
+#
+# Increment this version whenever there is an incompatible change to:
+#   - NixlAgentMetadata schema
+#   - kv_transfer_params schema or semantics
+#   - NIXL transfer protocol or wire format
+#   - KV cache memory layout or block organization
+#   - Any other change that breaks P/D interoperability
+#
+# Version History:
+#   1: Initial version with compatibility checking
+#
+NIXL_CONNECTOR_VERSION: int = 1
+
 GET_META_MSG = b"get_meta_msg"
 
 logger = init_logger(__name__)
@@ -97,18 +112,95 @@ _NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices())
 
 
 @dataclass
-class NixlAgentMetadata(KVConnectorHandshakeMetadata):
+class NixlAgentMetadata:
     engine_id: str
     agent_metadata: bytes
     kv_caches_base_addr: list[int]
     device_id: int
     num_blocks: int
     block_lens: list[int]
-    attn_backend_name: str
     kv_cache_layout: str
     block_size: int
 
 
+@dataclass
+class NixlHandshakePayload(KVConnectorHandshakeMetadata):
+    """
+    Wrapper for NIXL handshake sent over the wire.
+
+    Enables two-phase decoding for graceful compatibility checking:
+    1. Decode NixlHandshakePayload to get compatibility_hash
+    2. Compute local hash and compare
+    3. Only if hashes match, decode agent_metadata_bytes
+
+    This prevents decoder errors when NixlAgentMetadata schema is
+    incompatible, allowing graceful failure with clear error message.
+    """
+
+    compatibility_hash: str
+    agent_metadata_bytes: bytes  # NixlAgentMetadata encoded
+
+
+def compute_nixl_compatibility_hash(
+    vllm_config: VllmConfig, attn_backend_name: str
+) -> str:
+    """
+    Compute compatibility hash for NIXL KV transfer.
+
+    Hash only the factors that affect whether two NIXL instances can
+    successfully transfer KV cache data.
+
+    Factors included:
+    - vLLM version and NIXL connector version
+    - Model architecture (name, dtype, KV heads, layers)
+    - KV cache format (dtype, sliding window)
+    - Attention backend
+
+    Note: Factors like tensor_parallel_size, block_size, and kv_cache_layout
+    are validated at runtime in _validate_remote_agent_handshake and are not
+    included in this hash to support heterogeneous deployments.
+
+    Note - the set of factors are likely to evolve significantly over
+    time to be more or less permissive.
+
+    Returns:
+        SHA-256 hex digest
+    """
+    from vllm import __version__ as vllm_version
+    from vllm.config.utils import hash_factors
+
+    model_config = vllm_config.model_config
+    cache_config = vllm_config.cache_config
+
+    factors = {
+        # Version compatibility
+        "vllm_version": vllm_version,
+        "nixl_connector_version": NIXL_CONNECTOR_VERSION,
+        # Model architecture - affects KV cache shape
+        "model": model_config.model,
+        "dtype": str(model_config.dtype),
+        "num_kv_heads": model_config.get_total_num_kv_heads(),
+        "head_size": model_config.get_head_size(),
+        "num_hidden_layers": model_config.get_total_num_hidden_layers(),
+        # Attention backend and KV cache dtype affect memory layout
+        "attn_backend_name": attn_backend_name,
+        "cache_dtype": str(cache_config.cache_dtype),
+    }
+
+    compat_hash = hash_factors(factors)
+    logger.info(
+        "NIXL compatibility hash: %s (model=%s, dtype=%s, num_kv_heads=%d, "
+        "cache_dtype=%s, attn_backend=%s)",
+        compat_hash,
+        factors["model"],
+        factors["dtype"],
+        factors["num_kv_heads"],
+        factors["cache_dtype"],
+        attn_backend_name,
+    )
+    return compat_hash
+
+
 @dataclass
 class ReqMeta:
     local_block_ids: list[int]
@@ -396,14 +488,14 @@ class NixlConnectorScheduler:
         encoded_data: dict[int, bytes] = {}
         encoder = msgspec.msgpack.Encoder()
         for tp_rank, rank_metadata in metadata.items():
-            if not isinstance(rank_metadata, NixlAgentMetadata):
+            if not isinstance(rank_metadata, NixlHandshakePayload):
                 raise ValueError(
-                    "NixlConnectorScheduler expects NixlAgentMetadata for "
+                    "NixlConnectorScheduler expects NixlHandshakePayload for "
                     "handshake metadata."
                 )
             encoded_data[tp_rank] = encoder.encode(rank_metadata)
             logger.debug(
-                "Tp rank %d: encoded NixlAgentMetadata size: %s bytes",
+                "Tp rank %d: encoded NixlHandshakePayload size: %s bytes",
                 tp_rank,
                 str(len(encoded_data[tp_rank])),
             )
@@ -794,7 +886,7 @@ class NixlConnectorWorker:
         self._failed_recv_reqs: set[ReqId] = set()
 
         # Handshake metadata of this worker for NIXL transfers.
-        self.xfer_handshake_metadata: NixlAgentMetadata | None = None
+        self.xfer_handshake_metadata: NixlHandshakePayload | None = None
         # Background thread for initializing new NIXL handshakes.
         self._handshake_initiation_executor = ThreadPoolExecutor(
             # NIXL is not guaranteed to be thread-safe, limit 1 worker.
@@ -829,6 +921,13 @@ class NixlConnectorWorker:
         logger.debug("Detected attention backend %s", self.backend_name)
         logger.debug("Detected kv cache layout %s", self.kv_cache_layout)
 
+        self.compat_hash = compute_nixl_compatibility_hash(
+            self.vllm_config, self.backend_name
+        )
+        self.enforce_compat_hash = self.kv_transfer_config.get_from_extra_config(
+            "enforce_handshake_compat", True
+        )
+
         self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
         self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size}
         # With heterogeneous TP, P must wait for all assigned D TP workers to
@@ -877,14 +976,58 @@ class NixlConnectorWorker:
             # Set receive timeout to 5 seconds to avoid hanging on dead server
             sock.setsockopt(zmq.RCVTIMEO, 5000)  # milliseconds
             sock.send(msg)
-            metadata_bytes = sock.recv()
-            decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
-            metadata = decoder.decode(metadata_bytes)
+            handshake_bytes = sock.recv()
+
+            # Decode handshake payload to get compatibility hash
+            handshake_decoder = msgspec.msgpack.Decoder(NixlHandshakePayload)
+            try:
+                handshake_payload = handshake_decoder.decode(handshake_bytes)
+            except (msgspec.DecodeError, msgspec.ValidationError) as e:
+                raise RuntimeError(
+                    f"Failed to decode NixlHandshakePayload. This likely indicates "
+                    f"an incompatibility between connector version. Error: {e}"
+                ) from e
+
             got_metadata_time = time.perf_counter()
             logger.debug(
                 "NIXL handshake: get metadata took: %s", got_metadata_time - start_time
             )
 
+            # Check compatibility hash BEFORE decoding agent metadata
+            if (
+                self.enforce_compat_hash
+                and handshake_payload.compatibility_hash != self.compat_hash
+            ):
+                raise RuntimeError(
+                    f"NIXL compatibility hash mismatch. "
+                    f"Local: {self.compat_hash}, "
+                    f"Remote: {handshake_payload.compatibility_hash}. "
+                    f"Prefill and decode instances have incompatible configurations. "
+                    f"This may be due to: different vLLM versions, models, dtypes, "
+                    f"KV cache layouts, attention backends, etc. "
+                    f"Both instances must use identical configurations."
+                    f"Disable this check using "
+                    f'--kv-transfer-config \'{{"kv_connector_extra_config": '
+                    f'{{"enforce_handshake_compat": false}}}}\''
+                )
+
+            logger.info(
+                "NIXL compatibility check passed (hash: %s)",
+                handshake_payload.compatibility_hash,
+            )
+
+            # Decode agent metadata
+            metadata_decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+            try:
+                metadata = metadata_decoder.decode(
+                    handshake_payload.agent_metadata_bytes
+                )
+            except (msgspec.DecodeError, msgspec.ValidationError) as e:
+                # This should not happen if hash matched
+                raise RuntimeError(
+                    f"Failed to decode NixlAgentMetadata. Error: {e}"
+                ) from e
+
             # Ensure engine id matches.
             if metadata.engine_id != expected_engine_id:
                 raise RuntimeError(
@@ -1175,19 +1318,24 @@ class NixlConnectorWorker:
             assert len(self.block_window_per_layer) == self.num_layers
 
         # After KV Caches registered, listen for new connections.
-        self.xfer_handshake_metadata = NixlAgentMetadata(
+        agent_metadata = NixlAgentMetadata(
             engine_id=self.engine_id,
             agent_metadata=self.nixl_wrapper.get_agent_metadata(),
             kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
             device_id=self.device_id,
             num_blocks=self.num_blocks,
             block_lens=self.block_len_per_layer,
-            attn_backend_name=self.backend_name,
             kv_cache_layout=self.kv_cache_layout
             if not self.use_host_buffer
             else self.host_buffer_kv_cache_layout,
             block_size=self.block_size,
         )
+        # Wrap metadata in payload with hash for defensive decoding
+        encoder = msgspec.msgpack.Encoder()
+        self.xfer_handshake_metadata = NixlHandshakePayload(
+            compatibility_hash=self.compat_hash,
+            agent_metadata_bytes=encoder.encode(agent_metadata),
+        )
 
     def register_local_xfer_handler(
         self,
@@ -1402,8 +1550,6 @@ class NixlConnectorWorker:
         remote_engine_id = nixl_agent_meta.engine_id
 
         assert self._tp_size[remote_engine_id] == remote_tp_size
-        # TODO We may eventually want to skip enforcing the same attn backend.
-        assert nixl_agent_meta.attn_backend_name == self.backend_name
 
         tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id)
         block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
-- 
GitLab


From da7bc54ea8f44a2dcacc4a9869721bd105006e10 Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Fri, 5 Dec 2025 08:11:50 -0800
Subject: [PATCH 131/258] [responsesAPI][5] ResponsesParser with tools for full
 MCP python loop (#29798)

Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Andrew Xia <axia@fb.com>
---
 .../openai_responses_client_with_tools.py     |  2 +-
 .../test_response_api_parsable_context.py     | 99 ++++++++++++++++++-
 vllm/entrypoints/context.py                   | 92 ++++++++++++++++-
 .../openai/parser/responses_parser.py         | 34 +++++++
 vllm/entrypoints/openai/serving_engine.py     | 65 ++++++++++--
 vllm/entrypoints/openai/serving_responses.py  |  6 +-
 vllm/entrypoints/responses_utils.py           | 21 +++-
 vllm/entrypoints/tool.py                      | 44 +++++++++
 8 files changed, 347 insertions(+), 16 deletions(-)

diff --git a/examples/online_serving/openai_responses_client_with_tools.py b/examples/online_serving/openai_responses_client_with_tools.py
index 276010197..c85c8cf80 100644
--- a/examples/online_serving/openai_responses_client_with_tools.py
+++ b/examples/online_serving/openai_responses_client_with_tools.py
@@ -3,7 +3,7 @@
 """
 Set up this example by starting a vLLM OpenAI-compatible server with tool call
 options enabled.
-Reasoning models can be used through the Responses API as seen here 
+Reasoning models can be used through the Responses API as seen here
 https://platform.openai.com/docs/api-reference/responses
 For example:
 vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
diff --git a/tests/entrypoints/openai/test_response_api_parsable_context.py b/tests/entrypoints/openai/test_response_api_parsable_context.py
index 1b2795770..1899c5f04 100644
--- a/tests/entrypoints/openai/test_response_api_parsable_context.py
+++ b/tests/entrypoints/openai/test_response_api_parsable_context.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib
+import json
 
 import pytest
 import pytest_asyncio
@@ -13,12 +15,27 @@ MODEL_NAME = "Qwen/Qwen3-8B"
 
 @pytest.fixture(scope="module")
 def server():
-    args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
+    assert importlib.util.find_spec("gpt_oss") is not None, (
+        "Harmony tests require gpt_oss package to be installed"
+    )
+
+    args = [
+        "--reasoning-parser",
+        "qwen3",
+        "--max_model_len",
+        "5000",
+        "--structured-outputs-config.backend",
+        "xgrammar",
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "hermes",
+        "--tool-server",
+        "demo",
+    ]
     env_dict = dict(
         VLLM_ENABLE_RESPONSES_API_STORE="1",
         VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
-        # uncomment for tool calling
-        # PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
+        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
     )
 
     with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
@@ -85,3 +102,79 @@ async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
     assert response.output[0].type == "reasoning"
     assert response.output[1].type == "message"
     assert type(response.output[1].content[0].text) is str
+
+
+def get_horoscope(sign):
+    return f"{sign}: Next Tuesday you will befriend a baby otter."
+
+
+def call_function(name, args):
+    if name == "get_horoscope":
+        return get_horoscope(**args)
+    else:
+        raise ValueError(f"Unknown function: {name}")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_call_first_turn(client: OpenAI, model_name: str):
+    tools = [
+        {
+            "type": "function",
+            "name": "get_horoscope",
+            "description": "Get today's horoscope for an astrological sign.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "sign": {"type": "string"},
+                },
+                "required": ["sign"],
+                "additionalProperties": False,
+            },
+            "strict": True,
+        }
+    ]
+
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the horoscope for Aquarius today?",
+        tools=tools,
+        temperature=0.0,
+    )
+    assert response is not None
+    assert response.status == "completed"
+    assert len(response.output) == 2
+    assert response.output[0].type == "reasoning"
+    assert response.output[1].type == "function_call"
+
+    function_call = response.output[1]
+    assert function_call.name == "get_horoscope"
+    assert function_call.call_id is not None
+
+    args = json.loads(function_call.arguments)
+    assert "sign" in args
+
+    # the multi turn function call is tested above in
+    # test_reasoning_and_function_items
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_mcp_tool_call(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is 13 * 24? Use python to calculate the result.",
+        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
+        temperature=0.0,
+    )
+
+    assert response is not None
+    assert response.status == "completed"
+    assert response.output[0].type == "reasoning"
+    assert response.output[1].type == "mcp_call"
+    assert type(response.output[1].arguments) is str
+    assert type(response.output[1].output) is str
+    assert response.output[2].type == "reasoning"
+    # make sure the correct math is in the final output
+    assert response.output[3].type == "message"
+    assert "312" in response.output[3].content[0].text
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 43783c926..f50c473d7 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -9,10 +9,16 @@ from collections.abc import Callable
 from contextlib import AsyncExitStack
 from typing import TYPE_CHECKING, Union
 
+from openai.types.responses.response_function_tool_call_output_item import (
+    ResponseFunctionToolCallOutputItem,
+)
 from openai.types.responses.tool import Mcp
 from openai_harmony import Author, Message, Role, StreamState, TextContent
 
 from vllm import envs
+from vllm.entrypoints.chat_utils import (
+    ChatTemplateContentFormatOption,
+)
 from vllm.entrypoints.harmony_utils import (
     get_encoding,
     get_streamable_parser_for_assistant,
@@ -22,16 +28,20 @@ from vllm.entrypoints.openai.parser.responses_parser import (
     get_responses_parser_for_simple_context,
 )
 from vllm.entrypoints.openai.protocol import (
+    FunctionCall,
     ResponseInputOutputItem,
     ResponseRawMessageAndToken,
     ResponsesRequest,
 )
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
 from vllm.entrypoints.responses_utils import construct_tool_dicts
 from vllm.entrypoints.tool import Tool
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.outputs import RequestOutput
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.tokenizers.protocol import TokenizerLike
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import random_uuid
 
 if TYPE_CHECKING:
     from mcp.client import ClientSession
@@ -221,6 +231,10 @@ class ParsableContext(ConversationContext):
         tokenizer: AnyTokenizer,
         reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser] | None,
         request: ResponsesRequest,
+        available_tools: list[str] | None,
+        tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
     ):
         self.num_prompt_tokens = 0
         self.num_output_tokens = 0
@@ -238,12 +252,19 @@ class ParsableContext(ConversationContext):
             reasoning_parser_cls=reasoning_parser_cls,
             response_messages=response_messages,
             request=request,
+            tool_parser_cls=tool_parser_cls,
         )
+        self.tool_parser_cls = tool_parser_cls
+        self.request = request
+        self.tokenizer = tokenizer
 
+        self.available_tools = available_tools or []
         self._tool_sessions: dict[str, ClientSession | Tool] = {}
         self.called_tools: set[str] = set()
 
         self.tool_dicts = construct_tool_dicts(request.tools, request.tool_choice)
+        self.chat_template = chat_template
+        self.chat_template_content_format = chat_template_content_format
 
     def append_output(self, output: RequestOutput) -> None:
         self.num_prompt_tokens = len(output.prompt_token_ids or [])
@@ -252,14 +273,50 @@ class ParsableContext(ConversationContext):
         self.parser.process(output.outputs[0])
 
     def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
-        raise NotImplementedError("Should not be called.")
+        self.parser.response_messages.extend(output)
 
     def need_builtin_tool_call(self) -> bool:
         """Return true if the last message is a MCP tool call"""
+        last_message = self.parser.response_messages[-1]
+        # TODO: figure out which tools are MCP tools
+        if (  # noqa: SIM103
+            last_message.type == "function_call"
+            and last_message.name in ("code_interpreter", "python")
+        ):
+            return True
+
         return False
 
+    async def call_python_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: FunctionCall
+    ) -> list[ResponseInputOutputItem]:
+        self.called_tools.add("python")
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result_parsable_context(self)
+        args = json.loads(last_msg.arguments)
+        param = {
+            "code": args["code"],
+        }
+        result = await tool_session.call_tool("python", param)
+        result_str = result.content[0].text
+
+        message = ResponseFunctionToolCallOutputItem(
+            id=f"fco_{random_uuid()}",
+            type="function_call_output",
+            call_id=f"call_{random_uuid()}",
+            output=result_str,
+            status="completed",
+        )
+
+        return [message]
+
     async def call_tool(self) -> list[ResponseInputOutputItem]:
-        raise NotImplementedError("Should not be called.")
+        if not self.parser.response_messages:
+            return []
+        last_msg = self.parser.response_messages[-1]
+        if last_msg.name == "code_interpreter":
+            return await self.call_python_tool(self._tool_sessions["python"], last_msg)
+        return []
 
     def render_for_completion(self):
         raise NotImplementedError("Should not be called.")
@@ -271,11 +328,38 @@ class ParsableContext(ConversationContext):
         request_id: str,
         mcp_tools: dict[str, Mcp],
     ):
-        pass
+        if tool_server:
+            for tool_name in self.available_tools:
+                if tool_name in self._tool_sessions:
+                    continue
+
+                tool_type = _map_tool_name_to_tool_type(tool_name)
+                headers = (
+                    mcp_tools[tool_type].headers if tool_type in mcp_tools else None
+                )
+                tool_session = await exit_stack.enter_async_context(
+                    tool_server.new_session(tool_name, request_id, headers)
+                )
+                self._tool_sessions[tool_name] = tool_session
+                exit_stack.push_async_exit(self.cleanup_session)
 
     async def cleanup_session(self, *args, **kwargs) -> None:
         """Can be used as coro to used in __aexit__"""
-        raise NotImplementedError("Should not be called.")
+
+        async def cleanup_tool_session(tool_session):
+            if not isinstance(tool_session, Tool):
+                logger.info(
+                    "Cleaning up tool session for %s", tool_session._client_info
+                )
+                with contextlib.suppress(Exception):
+                    await tool_session.call_tool("cleanup_session", {})
+
+        await asyncio.gather(
+            *(
+                cleanup_tool_session(self._tool_sessions[tool])
+                for tool in self.called_tools
+            )
+        )
 
 
 class HarmonyContext(ConversationContext):
diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py
index 1bc8e81bd..00045a7cc 100644
--- a/vllm/entrypoints/openai/parser/responses_parser.py
+++ b/vllm/entrypoints/openai/parser/responses_parser.py
@@ -3,6 +3,7 @@
 import logging
 from collections.abc import Callable
 
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
 from openai.types.responses.response_output_message import ResponseOutputMessage
 from openai.types.responses.response_output_text import ResponseOutputText
 from openai.types.responses.response_reasoning_item import (
@@ -11,8 +12,10 @@ from openai.types.responses.response_reasoning_item import (
 )
 
 from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
 from vllm.outputs import CompletionOutput
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.tokenizers.protocol import TokenizerLike
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
 
@@ -29,6 +32,7 @@ class ResponsesParser:
         reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
         response_messages: list[ResponseInputOutputItem],
         request: ResponsesRequest,
+        tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
     ):
         self.response_messages: list[ResponseInputOutputItem] = (
             # TODO: initial messages may not be properly typed
@@ -39,6 +43,9 @@ class ResponsesParser:
         self.request = request
 
         self.reasoning_parser_instance = reasoning_parser_cls(tokenizer)
+        self.tool_parser_instance = None
+        if tool_parser_cls is not None:
+            self.tool_parser_instance = tool_parser_cls(tokenizer)
 
     def process(self, output: CompletionOutput) -> "ResponsesParser":
         reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
@@ -59,6 +66,29 @@ class ResponsesParser:
                 )
             )
 
+        function_calls: list[ResponseFunctionToolCall] = []
+        if self.tool_parser_instance is not None:
+            tool_call_info = self.tool_parser_instance.extract_tool_calls(
+                content if content is not None else "",
+                request=self.request,  # type: ignore
+            )
+            if tool_call_info is not None and tool_call_info.tools_called:
+                # extract_tool_calls() returns a list of tool calls.
+                function_calls.extend(
+                    ResponseFunctionToolCall(
+                        id=f"fc_{random_uuid()}",
+                        call_id=f"call_{random_uuid()}",
+                        type="function_call",
+                        status="completed",
+                        name=tool_call.function.name,
+                        arguments=tool_call.function.arguments,
+                    )
+                    for tool_call in tool_call_info.tool_calls
+                )
+                content = tool_call_info.content
+                if content and content.strip() == "":
+                    content = None
+
         if content:
             self.response_messages.append(
                 ResponseOutputMessage(
@@ -76,6 +106,8 @@ class ResponsesParser:
                     ],
                 )
             )
+        if len(function_calls) > 0:
+            self.response_messages.extend(function_calls)
 
         return self
 
@@ -86,6 +118,7 @@ def get_responses_parser_for_simple_context(
     reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
     response_messages: list[ResponseInputOutputItem],
     request: ResponsesRequest,
+    tool_parser_cls,
 ) -> ResponsesParser:
     """Factory function to create a ResponsesParser with
     optional reasoning parser.
@@ -98,4 +131,5 @@ def get_responses_parser_for_simple_context(
         reasoning_parser_cls=reasoning_parser_cls,
         response_messages=response_messages,
         request=request,
+        tool_parser_cls=tool_parser_cls,
     )
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index bfa98f29a..99936f588 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -18,6 +18,16 @@ from pydantic import ConfigDict, TypeAdapter
 from starlette.datastructures import Headers
 from typing_extensions import TypeIs
 
+from vllm.entrypoints.context import (
+    HarmonyContext,
+    ParsableContext,
+    StreamingHarmonyContext,
+)
+from vllm.entrypoints.openai.protocol import (
+    FunctionCall,
+    ResponseInputOutputItem,
+    ResponsesRequest,
+)
 from vllm.entrypoints.pooling.classify.protocol import (
     ClassificationChatRequest,
     ClassificationCompletionRequest,
@@ -39,6 +49,7 @@ from vllm.entrypoints.pooling.score.protocol import (
     ScoreRequest,
     ScoreResponse,
 )
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 if sys.version_info >= (3, 12):
     from typing import TypedDict
@@ -72,9 +83,7 @@ from vllm.entrypoints.openai.protocol import (
     DetokenizeRequest,
     ErrorInfo,
     ErrorResponse,
-    FunctionCall,
     FunctionDefinition,
-    ResponsesRequest,
     TokenizeChatRequest,
     TokenizeCompletionRequest,
     TokenizeResponse,
@@ -85,6 +94,9 @@ from vllm.entrypoints.openai.protocol import (
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
+from vllm.entrypoints.responses_utils import (
+    construct_input_messages,
+)
 from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType
@@ -1224,6 +1236,31 @@ class OpenAIServing:
         )
         return engine_request, tokenization_kwargs
 
+    async def _render_next_turn(
+        self,
+        request: ResponsesRequest,
+        tokenizer: AnyTokenizer,
+        messages: list[ResponseInputOutputItem],
+        tool_dicts: list[dict[str, Any]] | None,
+        tool_parser,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
+    ):
+        new_messages = construct_input_messages(
+            request_input=messages,
+        )
+
+        _, request_prompts, engine_prompts = await self._preprocess_chat(
+            request,
+            tokenizer,
+            new_messages,
+            tool_dicts=tool_dicts,
+            tool_parser=tool_parser,
+            chat_template=chat_template,
+            chat_template_content_format=chat_template_content_format,
+        )
+        return request_prompts, engine_prompts
+
     async def _generate_with_builtin_tools(
         self,
         request_id: str,
@@ -1286,11 +1323,27 @@ class OpenAIServing:
 
             # Create inputs for the next turn.
             # Render the next prompt token ids.
-            prompt_token_ids = context.render_for_completion()
-            engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
-            request_prompt = prompt_token_ids
+            if isinstance(context, (HarmonyContext, StreamingHarmonyContext)):
+                prompt_token_ids = context.render_for_completion()
+                engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+                request_prompt = prompt_token_ids
+            elif isinstance(context, ParsableContext):
+                request_prompts, engine_prompts = await self._render_next_turn(
+                    context.request,
+                    context.tokenizer,
+                    context.parser.response_messages,
+                    context.tool_dicts,
+                    context.tool_parser_cls,
+                    context.chat_template,
+                    context.chat_template_content_format,
+                )
+                engine_prompt = engine_prompts[0]
+                request_prompt = request_prompts[0]
+
             # Update the sampling params.
-            sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids)
+            sampling_params.max_tokens = self.max_model_len - len(
+                engine_prompt["prompt_token_ids"]
+            )
             # OPTIMIZATION
             priority = orig_priority - 1
             sub_request += 1
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 3c9ae8e8c..1eb1243e7 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -375,7 +375,7 @@ class OpenAIServingResponses(OpenAIServing):
         generators: list[AsyncGenerator[ConversationContext, None]] = []
 
         builtin_tool_list: list[str] = []
-        if self.use_harmony and self.tool_server is not None:
+        if self.tool_server is not None:
             if self.tool_server.has_tool("browser"):
                 builtin_tool_list.append("browser")
             if self.tool_server.has_tool("python"):
@@ -423,6 +423,10 @@ class OpenAIServingResponses(OpenAIServing):
                             tokenizer=tokenizer,
                             reasoning_parser_cls=self.reasoning_parser,
                             request=request,
+                            tool_parser_cls=self.tool_parser,
+                            available_tools=available_tools,
+                            chat_template=self.chat_template,
+                            chat_template_content_format=self.chat_template_content_format,
                         )
                     else:
                         context = SimpleContext()
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
index 5f21e2c44..fbc137bac 100644
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -16,6 +16,7 @@ from openai.types.responses.response import ToolChoice
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
 )
+from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_output_message import ResponseOutputMessage
 from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
@@ -25,6 +26,7 @@ from vllm.entrypoints.openai.protocol import (
     ChatCompletionMessageParam,
     ResponseInputOutputItem,
 )
+from vllm.utils import random_uuid
 
 
 def make_response_output_items_from_parsable_context(
@@ -36,7 +38,24 @@ def make_response_output_items_from_parsable_context(
         if not isinstance(message, ResponseFunctionToolCallOutputItem):
             output_messages.append(message)
         else:
-            raise NotImplementedError("tool calls not supported for response context")
+            if len(output_messages) == 0:
+                raise ValueError(
+                    "Cannot have a FunctionToolCallOutput before FunctionToolCall."
+                )
+            if isinstance(output_messages[-1], ResponseFunctionToolCall):
+                mcp_message = McpCall(
+                    id=f"mcp_{random_uuid()}",
+                    arguments=output_messages[-1].arguments,
+                    name=output_messages[-1].name,
+                    server_label=output_messages[
+                        -1
+                    ].name,  # TODO: store the server label
+                    type="mcp_call",
+                    status="completed",
+                    output=message.output,
+                    # TODO: support error output
+                )
+                output_messages[-1] = mcp_message
 
     return output_messages
 
diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py
index c74ce1ee1..4feed8273 100644
--- a/vllm/entrypoints/tool.py
+++ b/vllm/entrypoints/tool.py
@@ -1,12 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
 import os
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any
 
+from openai.types.responses.response_function_tool_call_output_item import (
+    ResponseFunctionToolCallOutputItem,
+)
 from openai_harmony import Author, Message, Role, TextContent
 
 from vllm.logger import init_logger
+from vllm.utils import random_uuid
 
 if TYPE_CHECKING:
     # Avoid circular import.
@@ -46,6 +51,10 @@ class Tool(ABC):
     async def get_result(self, context: "ConversationContext") -> Any:
         pass
 
+    @abstractmethod
+    async def get_result_parsable_context(self, context: "ConversationContext") -> Any:
+        pass
+
 
 class HarmonyBrowserTool(Tool):
     def __init__(self):
@@ -81,6 +90,9 @@ class HarmonyBrowserTool(Tool):
             tool_output_msgs.append(msg)
         return tool_output_msgs
 
+    async def get_result_parsable_context(self, context: "ConversationContext") -> Any:
+        raise NotImplementedError("Not implemented yet")
+
     @property
     def tool_config(self) -> Any:
         return self.browser_tool.tool_config
@@ -138,6 +150,38 @@ class HarmonyPythonTool(Tool):
             tool_output_msgs.append(msg)
         return tool_output_msgs
 
+    async def get_result_parsable_context(self, context: "ConversationContext") -> Any:
+        """
+        This function converts parsable context types to harmony and
+        back so we can use GPTOSS demo python tool
+        """
+        from vllm.entrypoints.context import ParsableContext
+
+        assert isinstance(context, ParsableContext)
+
+        last_msg = context.parser.response_messages[-1]
+        args = json.loads(last_msg.arguments)
+
+        last_msg_harmony = Message(
+            author=Author(role="assistant", name=None),
+            content=[TextContent(text=args["code"])],
+            channel="analysis",
+            recipient="python",
+            content_type="code",
+        )
+
+        tool_output_msgs = []
+        async for msg in self.python_tool.process(last_msg_harmony):
+            processed = ResponseFunctionToolCallOutputItem(
+                id=f"fco_{random_uuid()}",
+                type="function_call_output",
+                call_id=f"call_{random_uuid()}",
+                output=msg.content[0].text,
+                status="completed",
+            )
+            tool_output_msgs.append(processed)
+        return tool_output_msgs
+
     @property
     def tool_config(self) -> Any:
         return self.python_tool.tool_config
-- 
GitLab


From e7296b08da66b4c79eb43d0932a3d8628178d036 Mon Sep 17 00:00:00 2001
From: Angela Yi <yiangela7@gmail.com>
Date: Fri, 5 Dec 2025 08:54:26 -0800
Subject: [PATCH 132/258] [bugfix] Pass globals to aot_compiled function
 (#29428)

Signed-off-by: angelayi <yiangela7@gmail.com>
---
 vllm/compilation/decorators.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index eed7795cd..6bb66ce3e 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -409,7 +409,9 @@ def _support_torch_compile(
                     open(aot_compilation_path, "rb") as f,
                 ):
                     start_monitoring_torch_compile(self.vllm_config)
-                    loaded_fn = torch.compiler.load_compiled_function(f)
+                    loaded_fn = torch.compiler.load_compiled_function(
+                        f, f_globals=self.forward.__globals__
+                    )
                 _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config)
                 loaded_fn.disable_guard_check()
                 self.aot_compiled_fn = loaded_fn
-- 
GitLab


From 78c44fd722fe3ce010b0ba9c5e7349fe4094b39d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Fri, 5 Dec 2025 18:17:36 +0100
Subject: [PATCH 133/258] [NIXL] Small cleanup of unused variables (#29618)

Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 .../kv_connector/unit/test_nixl_connector.py  |  2 +-
 .../kv_connector/v1/nixl_connector.py         | 20 +++++++------------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index ae4125d54..65db16f48 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -1307,7 +1307,7 @@ def test_shutdown_cleans_up_resources(dist_init):
         patch.object(nixl_wrapper, "remove_remote_agent") as mock_rem_agent,
         patch.object(nixl_wrapper, "deregister_memory") as mock_dereg,
     ):
-        worker._recving_transfers = {"req1": [(123, time.perf_counter())]}
+        worker._recving_transfers = {"req1": [123]}
         worker.src_xfer_side_handle = 456
         worker.dst_xfer_side_handles = {"engine1": 789}
         worker._remote_agents = {"engine1": {0: "agent1"}}
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 49330abce..649e54ada 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -55,7 +55,7 @@ if TYPE_CHECKING:
     from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
-Transfer = tuple[int, float]  # (xfer_handle, start_time)
+TransferHandle = int
 EngineId = str
 ReqId = str
 
@@ -874,7 +874,7 @@ class NixlConnectorWorker:
         # In progress transfers.
         # [req_id -> list[handle]]
         self._recving_metadata: dict[ReqId, ReqMeta] = {}
-        self._recving_transfers = defaultdict[ReqId, list[Transfer]](list)
+        self._recving_transfers = defaultdict[ReqId, list[TransferHandle]](list)
         # Track the expiration time of requests that are waiting to be sent.
         self._reqs_to_send: dict[ReqId, float] = {}
         # Set of requests that have been part of a batch, regardless of status.
@@ -1201,14 +1201,11 @@ class NixlConnectorWorker:
         # Enable different block lengths for different layers when MLA is used.
         self.block_len_per_layer = list[int]()
         self.slot_size_per_layer = list[int]()  # HD bytes in kv terms
-        self.device_id = self.tp_rank
         for layer_name, cache_or_caches in xfer_buffers.items():
             cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
 
             for cache in cache_list:
                 base_addr = cache.data_ptr()
-                if not self.use_host_buffer and current_platform.is_cuda_alike():
-                    self.device_id = cache.device.index
                 if base_addr in seen_base_addresses:
                     continue
 
@@ -1251,8 +1248,7 @@ class NixlConnectorWorker:
                         "All kv cache tensors must have the same size"
                     )
                 # Need to make sure the device ID is non-negative for NIXL,
-                # Torch uses -1 to indicate CPU tensors while NIXL uses explicit
-                # memory type.
+                # Torch uses -1 to indicate CPU tensors.
                 self.device_id = max(cache.get_device(), 0)
                 caches_data.append(
                     (base_addr, curr_tensor_size_bytes, self.device_id, "")
@@ -1842,9 +1838,7 @@ class NixlConnectorWorker:
                     self._reqs_to_send.pop(req_id, None)
         return notified_req_ids
 
-    def _pop_done_transfers(
-        self, transfers: dict[str, list[tuple[int, float]]]
-    ) -> set[str]:
+    def _pop_done_transfers(self, transfers: dict[str, list[int]]) -> set[str]:
         """
         Pop completed xfers by checking for DONE state.
         Args:
@@ -1855,7 +1849,7 @@ class NixlConnectorWorker:
         done_req_ids: set[str] = set()
         for req_id, handles in list(transfers.items()):
             in_progress = False
-            for handle, xfer_start_time in handles:
+            for handle in handles:
                 try:
                     xfer_state = self.nixl_wrapper.check_xfer_state(handle)
                     if xfer_state == "DONE":
@@ -2120,7 +2114,7 @@ class NixlConnectorWorker:
             self.nixl_wrapper.transfer(handle)
 
             # Use handle to check completion in future step().
-            self._recving_transfers[request_id].append((handle, time.perf_counter()))
+            self._recving_transfers[request_id].append(handle)
         except Exception:
             logger.exception(
                 "NIXL transfer setup/initiation failed for request %s. "
@@ -2251,7 +2245,7 @@ class NixlConnectorWorker:
         """Shutdown the connector worker."""
         self._handshake_initiation_executor.shutdown(wait=False)
         for handles in self._recving_transfers.values():
-            for handle, _ in handles:
+            for handle in handles:
                 self.nixl_wrapper.release_xfer_handle(handle)
         self._recving_transfers.clear()
         if self.src_xfer_side_handle:
-- 
GitLab


From dc264bcea17b279173c7cbb99645c35a56bed778 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 5 Dec 2025 09:28:32 -0800
Subject: [PATCH 134/258] [BugFix] Eagerly abort cancelled final-step requests
 (#29987)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, when requests are cancelled while executing their final
step, "completion" is handled based on normal stop processing (e.g.
length or stop token), so the abort has no effect. This is typically
not a problem, but when a kv connector is involved it thinks the
request completed successfully rather than being aborted.

This is problematic for disaggregated prefill which will free kv
cache blocks if the request was aborted but not if it completed
successfully—since the cancelled request will never be sent to
the decode side, kv cache blocks remain pinned until the fall-back
timeout expires. The problem is exacerbated when many requests
are cancelled and/or there are large prefills whose forward pass
takes a long time (since the window is bigger).

This PR fixes the problem by processing pending aborts
immediately prior to processing model output each step; we process
only aborts, not new requests, since it's preferable for latency to
process model outputs before new incoming requests.

Fixes #26400.

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 tests/v1/engine/test_abort_final_step.py | 311 +++++++++++++++++++++++
 vllm/v1/engine/core.py                   |  37 ++-
 vllm/v1/worker/gpu_worker.py             |  10 +-
 3 files changed, 352 insertions(+), 6 deletions(-)
 create mode 100644 tests/v1/engine/test_abort_final_step.py

diff --git a/tests/v1/engine/test_abort_final_step.py b/tests/v1/engine/test_abort_final_step.py
new file mode 100644
index 000000000..560c5c2b1
--- /dev/null
+++ b/tests/v1/engine/test_abort_final_step.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Test for the fix in PR #29987: Eagerly abort cancelled final-step requests.
+
+This test verifies that when a request is aborted during its final execution
+step (when it would naturally complete), it is properly marked as aborted
+rather than being treated as normally completed.
+
+The test uses a dummy KV connector to verify that the connector receives
+the correct finish status (FINISHED_ABORTED, not FINISHED_LENGTH_CAPPED).
+"""
+
+import asyncio
+import tempfile
+import time
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+import pytest
+
+from vllm import SamplingParams
+from vllm.config import KVTransferConfig, VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1,
+    KVConnectorMetadata,
+    KVConnectorRole,
+)
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.platforms import current_platform
+from vllm.sampling_params import RequestOutputKind
+from vllm.utils.torch_utils import set_default_torch_num_threads
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.request import Request
+
+if not current_platform.is_cuda():
+    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
+
+TEXT_PROMPT = "Hello"
+
+
+class DummyKVConnectorMetadata(KVConnectorMetadata):
+    """Dummy metadata for the test connector."""
+
+    def __init__(self):
+        self.requests: list = []
+
+
+class DummyKVConnector(KVConnectorBase_V1):
+    """
+    Dummy KV connector that captures request finish statuses to a file.
+    This is used to verify the fix - without the fix, a request aborted
+    during its final step would be captured as FINISHED_LENGTH_CAPPED
+    instead of FINISHED_ABORTED.
+
+    The connector runs in a separate process, so we write statuses to a file
+    that can be read by the test process.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: KVCacheConfig | None = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
+        # Get the status file path from extra config
+        extra_config = vllm_config.kv_transfer_config.kv_connector_extra_config or {}
+        self.status_file = extra_config.get("status_file")
+        # Log that we were initialized
+        if self.status_file:
+            try:
+                with open(self.status_file, "a") as f:
+                    f.write(f"INIT:{role.name}\n")
+            except Exception:
+                pass
+
+    def get_num_new_matched_tokens(
+        self,
+        request: Request,
+        num_computed_tokens: int,
+    ) -> tuple[int | None, bool]:
+        return (0, False)
+
+    def update_state_after_alloc(
+        self,
+        request: Request,
+        blocks: Any,
+        num_external_tokens: int,
+    ):
+        pass
+
+    def build_connector_meta(
+        self, scheduler_output: SchedulerOutput
+    ) -> KVConnectorMetadata:
+        return DummyKVConnectorMetadata()
+
+    def request_finished(
+        self,
+        request: Request,
+        block_ids: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """Capture the request status when finished by writing to a file."""
+        if self.status_file:
+            try:
+                with open(self.status_file, "a") as f:
+                    # Write the status name (e.g., "FINISHED_ABORTED")
+                    f.write(f"{request.status.name}\n")
+            except Exception as e:
+                # Log but don't fail - this is just test instrumentation
+                print(f"[DummyKVConnector] Failed to write status: {e}")
+        return False, None
+
+    def start_load_kv(self, forward_context: Any, **kwargs: Any) -> None:
+        pass
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer: Any,
+        attn_metadata: Any,
+        **kwargs: Any,
+    ) -> None:
+        pass
+
+    def wait_for_save(self):
+        pass
+
+
+# Register the dummy connector
+KVConnectorFactory.register_connector(
+    "DummyKVConnector", __name__, DummyKVConnector.__name__
+)
+
+
+@pytest.mark.parametrize("async_scheduling", [False, True])
+@pytest.mark.asyncio
+async def test_abort_during_final_step(async_scheduling: bool):
+    """
+    Test that a request aborted during its final execution step is treated as
+    aborted rather than completed.
+
+    This test:
+    1. Monkeypatches execute_model to wait for a file to be deleted
+    2. Configures a dummy KV connector to capture finish statuses
+    3. Starts a request with max_tokens=1 (will complete on first decode step)
+    4. Aborts the request, then deletes the file to unblock execute_model
+    5. Verifies the KV connector received FINISHED_ABORTED not FINISHED_LENGTH_CAPPED
+
+    See https://github.com/vllm-project/vllm/pull/29987.
+
+    Without the fix, the KV connector would see FINISHED_LENGTH_CAPPED because
+    update_from_output() would mark the request as completed before processing
+    the abort. This causes KV cache blocks to not be freed properly in
+    disaggregated prefill scenarios.
+
+    With the fix, _process_aborts_queue() runs before update_from_output(), so the
+    abort takes precedence and the KV connector sees FINISHED_ABORTED.
+    """
+
+    # Create three temporary files:
+    # 1. ready_file: deleted by execute_model to signal it has started
+    # 2. block_file: execute_model waits for this to be deleted
+    # 3. status_file: KV connector writes finish statuses here
+    with tempfile.NamedTemporaryFile(delete=False) as f:
+        ready_file = Path(f.name)
+    with tempfile.NamedTemporaryFile(delete=False) as f2:
+        block_file = Path(f2.name)
+    with tempfile.NamedTemporaryFile(delete=False, mode="w") as f3:
+        status_file = Path(f3.name)
+
+    try:
+        # Get the original execute_model method
+        from vllm.v1.worker.gpu_worker import Worker
+
+        original_execute_model = Worker.execute_model
+
+        def execute_model_with_wait(self, scheduler_output):
+            # Signal that execute_model has been called by deleting ready_file
+            if ready_file.exists():
+                ready_file.unlink()
+
+            # Wait for the block file to be deleted (triggered from test after abort)
+            # This runs in the worker process (after fork), so we poll the filesystem
+            while block_file.exists():
+                time.sleep(0.01)
+            return original_execute_model(self, scheduler_output)
+
+        # Patch execute_model to inject the wait
+        # This happens before the worker process is forked, so the patch applies there
+        with patch.object(Worker, "execute_model", execute_model_with_wait):
+            request_id = "test-abort-final-step"
+
+            # Configure engine with dummy KV connector
+            # Pass the status file path so the connector can write to it
+            kv_transfer_config = KVTransferConfig(
+                kv_connector="DummyKVConnector",
+                kv_role="kv_both",
+                kv_connector_extra_config={"status_file": str(status_file)},
+            )
+            engine_args = AsyncEngineArgs(
+                model="meta-llama/Llama-3.2-1B-Instruct",
+                enforce_eager=True,
+                async_scheduling=async_scheduling,
+                kv_transfer_config=kv_transfer_config,
+            )
+
+            with set_default_torch_num_threads(1):
+                engine = AsyncLLM.from_engine_args(engine_args)
+
+            try:
+                # Create a request that will complete after just 1 token
+                sampling_params = SamplingParams(
+                    max_tokens=1,
+                    ignore_eos=True,
+                    output_kind=RequestOutputKind.DELTA,
+                )
+
+                # Start generation in a task
+                outputs = []
+
+                async def generate():
+                    async for output in engine.generate(
+                        request_id=request_id,
+                        prompt=TEXT_PROMPT,
+                        sampling_params=sampling_params,
+                    ):
+                        outputs.append(output)
+
+                gen_task = asyncio.create_task(generate())
+
+                # Wait for execute_model to signal it has started (with timeout)
+                timeout = 5.0  # 5 second timeout
+                start_time = time.time()
+                while ready_file.exists():
+                    if time.time() - start_time > timeout:
+                        raise TimeoutError(
+                            "Timeout waiting for execute_model to start. "
+                            "The monkeypatch may not be working correctly, "
+                            "for example if spawn was used instead of fork."
+                        )
+                    await asyncio.sleep(0.01)
+
+                # Abort the request while execute_model is blocked
+                await engine.abort(request_id)
+
+                # Now unblock execute_model by deleting the file
+                # The abort should be processed before the model output
+                block_file.unlink()
+
+                # Wait for generation to complete
+                await gen_task
+
+                # Give the scheduler a moment to finish cleanup
+                await asyncio.sleep(0.1)
+
+                # Verify we got output
+                assert len(outputs) > 0, "Should have received at least one output"
+
+                # The final output should have finish_reason="abort"
+                final_output = outputs[-1]
+                assert final_output.finished, (
+                    "Final output should be marked as finished"
+                )
+                assert final_output.outputs[0].finish_reason == "abort", (
+                    f"Expected finish_reason='abort' but got "
+                    f"'{final_output.outputs[0].finish_reason}'. "
+                )
+
+                with open(status_file) as f4:
+                    status_lines = f4.read().strip().split("\n")
+                    # Filter for actual finish statuses (not INIT or empty lines)
+                    captured_statuses = [
+                        line
+                        for line in status_lines
+                        if line and line.startswith("FINISHED_")
+                    ]
+
+                assert len(captured_statuses) >= 1, (
+                    f"Expected at least 1 captured finish status, got "
+                    f"{len(captured_statuses)}. File content: {status_lines}"
+                )
+
+                assert "FINISHED_ABORTED" in captured_statuses, (
+                    f"KV connector should see FINISHED_ABORTED but got "
+                    f"{captured_statuses}. "
+                )
+
+                # Verify cleanup
+                assert not engine.output_processor.has_unfinished_requests()
+
+            finally:
+                # Shutdown the engine
+                engine.shutdown()
+
+    finally:
+        # Clean up temporary files if they still exist
+        if ready_file.exists():
+            ready_file.unlink()
+        if block_file.exists():
+            block_file.unlink()
+        if status_file.exists():
+            status_file.unlink()
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 61b8422dd..8e34dfcea 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -204,6 +204,8 @@ class EngineCore:
         )
         self.async_scheduling = vllm_config.scheduler_config.async_scheduling
 
+        self.aborts_queue = queue.Queue[list[str]]()
+
         # Mark the startup heap as static so that it's ignored by GC.
         # Reduces pause times of oldest generation collections.
         freeze_gc_heap()
@@ -347,6 +349,9 @@ class EngineCore:
             if model_output is None:
                 model_output = self.model_executor.sample_tokens(grammar_output)
 
+        # Before processing the model output, process any aborts that happened
+        # during the model execution.
+        self._process_aborts_queue()
         engine_core_outputs = self.scheduler.update_from_output(
             scheduler_output, model_output
         )
@@ -440,6 +445,9 @@ class EngineCore:
         with self.log_error_detail(scheduler_output):
             model_output = future.result()
 
+        # Before processing the model output, process any aborts that happened
+        # during the model execution.
+        self._process_aborts_queue()
         engine_core_outputs = self.scheduler.update_from_output(
             scheduler_output, model_output
         )
@@ -458,6 +466,18 @@ class EngineCore:
 
         return engine_core_outputs, model_executed
 
+    def _process_aborts_queue(self):
+        if not self.aborts_queue.empty():
+            request_ids = []
+            while not self.aborts_queue.empty():
+                ids = self.aborts_queue.get_nowait()
+                if isinstance(ids, str):
+                    # Should be a list here, but also handle string just in case.
+                    ids = (ids,)
+                request_ids.extend(ids)
+            # More efficient to abort all as a single batch.
+            self.abort_requests(request_ids)
+
     def shutdown(self):
         self.structured_output_manager.clear_backend()
         if self.model_executor:
@@ -871,9 +891,13 @@ class EngineCoreProc(EngineCore):
             and not self.scheduler.has_requests()
             and not self.batch_queue
         ):
-            if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
-                logger.debug("EngineCore waiting for work.")
-                waited = True
+            if self.input_queue.empty():
+                # Drain aborts queue; all aborts are also processed via input_queue.
+                with self.aborts_queue.mutex:
+                    self.aborts_queue.queue.clear()
+                if logger.isEnabledFor(DEBUG):
+                    logger.debug("EngineCore waiting for work.")
+                    waited = True
             req = self.input_queue.get()
             self._handle_client_request(*req)
 
@@ -1027,6 +1051,13 @@ class EngineCoreProc(EngineCore):
                     else:
                         request = generic_decoder.decode(data_frames)
 
+                        if request_type == EngineCoreRequestType.ABORT:
+                            # Aborts are added to *both* queues, allows us to eagerly
+                            # process aborts while also ensuring ordering in the input
+                            # queue to avoid leaking requests. This is ok because
+                            # aborting in the scheduler is idempotent.
+                            self.aborts_queue.put_nowait(request)
+
                     # Push to input queue for core busy loop.
                     self.input_queue.put_nowait((request_type, request))
 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index a133575cb..d189d0860 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -51,7 +51,6 @@ from vllm.v1.outputs import (
     ModelRunnerOutput,
 )
 from vllm.v1.utils import report_usage_stats
-from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
 from vllm.v1.worker.worker_base import WorkerBase
 
@@ -59,6 +58,7 @@ logger = init_logger(__name__)
 
 if TYPE_CHECKING:
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+    from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
 
 class Worker(WorkerBase):
@@ -259,7 +259,11 @@ class Worker(WorkerBase):
                 self.vllm_config, self.device
             )
         else:
-            self.model_runner = GPUModelRunner(self.vllm_config, self.device)
+            from vllm.v1.worker.gpu_model_runner import (
+                GPUModelRunner as GPUModelRunnerV1,
+            )
+
+            self.model_runner = GPUModelRunnerV1(self.vllm_config, self.device)
 
         if self.rank == 0:
             # If usage stat is enabled, collect relevant info.
@@ -556,7 +560,7 @@ class Worker(WorkerBase):
             and forward_pass
         ):
             # currently only supported by V1 GPUModelRunner
-            assert isinstance(self.model_runner, GPUModelRunner)
+            assert not self.use_v2_model_runner
             num_scheduled_tokens_np = np.array(
                 list(scheduler_output.num_scheduled_tokens.values()),
                 dtype=np.int32,
-- 
GitLab


From dff0a2b39475096f5456721bfc8df3c7fea3cc57 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 5 Dec 2025 17:43:48 +0000
Subject: [PATCH 135/258] [NIXL] Add remote_request_id to kv_transfer_params
 (#29665)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 .../v1/kv_connector/unit/test_nixl_connector.py |  6 ++++++
 tests/v1/kv_connector/unit/utils.py             |  1 +
 .../kv_connector/v1/nixl_connector.py           | 17 ++++++++++++++---
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 65db16f48..5045ae0ee 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -470,6 +470,7 @@ class TestNixlHandshake:
                             num_xfers + 6,
                         ],
                         "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                        "remote_request_id": f"prefill-{request_id}",
                         "remote_host": "localhost",
                         "remote_port": 1234,
                         "remote_tp_size": 1,
@@ -536,6 +537,7 @@ class TestNixlHandshake:
             kv_transfer_params={
                 "remote_block_ids": [4, 5, 6],
                 "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                "remote_request_id": "prefill-id",
                 "remote_host": "localhost",
                 "remote_port": 1234,
                 "remote_tp_size": prefill_tp_size,
@@ -591,6 +593,7 @@ class TestNixlHandshake:
                 kv_transfer_params={
                     "remote_block_ids": [4, 5, 6],
                     "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                    "remote_request_id": f"prefill-id-{i}",
                     "remote_host": "localhost",
                     "remote_port": 1234,
                     "remote_tp_size": 1,
@@ -754,6 +757,7 @@ def test_kv_connector_stats(dist_init):
         kv_transfer_params={
             "remote_block_ids": [4, 5, 6],
             "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_request_id": f"prefill-{request_id}",
             "remote_host": "localhost",
             "remote_port": 1234,
             "remote_tp_size": 1,
@@ -1470,6 +1474,7 @@ def test_handshake_failure_returns_finished(dist_init):
         kv_transfer_params={
             "remote_block_ids": [4, 5, 6],
             "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_request_id": f"prefill-{request_id}",
             "remote_host": "localhost",
             "remote_port": 1234,
             "remote_tp_size": 1,
@@ -1519,6 +1524,7 @@ def test_transfer_setup_failure_returns_finished(dist_init):
         kv_transfer_params={
             "remote_block_ids": [10, 11, 12],
             "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+            "remote_request_id": f"prefill-{request_id}",
             "remote_host": "localhost",
             "remote_port": 1234,
             "remote_tp_size": 1,
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index cea41c3ab..58f1a7282 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -194,6 +194,7 @@ def create_request(
             do_remote_prefill=True,
             do_remote_decode=False,
             remote_engine_id="my-engine-id",
+            remote_request_id=f"prefill-{request_id}",
             remote_block_ids=list(range(num_remote_blocks)),
             remote_host="my-host",
             remote_port=1234,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 649e54ada..7aa12e999 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -71,8 +71,9 @@ ReqId = str
 #
 # Version History:
 #   1: Initial version with compatibility checking
+#   2: Add remote_request_id to kv_transfer_params
 #
-NIXL_CONNECTOR_VERSION: int = 1
+NIXL_CONNECTOR_VERSION: int = 2
 
 GET_META_MSG = b"get_meta_msg"
 
@@ -210,6 +211,7 @@ class ReqMeta:
     remote_host: str
     remote_port: int
     remote_engine_id: str
+    remote_request_id: str
     tp_size: int
 
 
@@ -236,6 +238,7 @@ class NixlConnectorMetadata(KVConnectorMetadata):
             local_physical_block_ids=local_block_ids,
             remote_block_ids=kv_transfer_params["remote_block_ids"],
             remote_engine_id=kv_transfer_params["remote_engine_id"],
+            remote_request_id=kv_transfer_params["remote_request_id"],
             remote_host=kv_transfer_params["remote_host"],
             remote_port=kv_transfer_params["remote_port"],
             # P workers don't need to receive tp_size from proxy here.
@@ -622,7 +625,12 @@ class NixlConnectorScheduler:
             if params.get("remote_block_ids"):
                 if all(
                     p in params
-                    for p in ("remote_engine_id", "remote_host", "remote_port")
+                    for p in (
+                        "remote_engine_id",
+                        "remote_request_id",
+                        "remote_host",
+                        "remote_port",
+                    )
                 ):
                     # If remote_blocks and num_external_tokens = 0, we have
                     # a full prefix cache hit on the D worker. We need to call
@@ -751,6 +759,7 @@ class NixlConnectorScheduler:
             do_remote_decode=False,
             remote_block_ids=block_ids,
             remote_engine_id=self.engine_id,
+            remote_request_id=request.request_id,
             remote_host=self.side_channel_host,
             remote_port=self.side_channel_port,
             tp_size=self.vllm_config.parallel_config.tensor_parallel_size,
@@ -1964,6 +1973,7 @@ class NixlConnectorWorker:
         self._read_blocks(
             request_id=req_id,
             dst_engine_id=meta.remote_engine_id,
+            remote_request_id=meta.remote_request_id,
             local_block_ids=meta.local_physical_block_ids,
             remote_block_ids=meta.remote_block_ids,
         )
@@ -1974,6 +1984,7 @@ class NixlConnectorWorker:
         remote_block_ids: list[int],
         dst_engine_id: str,
         request_id: str,
+        remote_request_id: str,
     ):
         block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id)
         if block_size_ratio > 1:
@@ -2006,7 +2017,7 @@ class NixlConnectorWorker:
         # Number of D TP workers that will read from dst P. Propagate tp_ratio
         # on notification so that dst worker can wait before freeing blocks.
         tp_ratio = self.kv_topo.tp_ratio_from_engine_id(dst_engine_id)
-        notif_id = f"{request_id}:{tp_ratio}".encode()
+        notif_id = f"{remote_request_id}:{tp_ratio}".encode()
 
         # Full prefix cache hit: do not need to read remote blocks,
         # just notify P worker that we have the blocks we need.
-- 
GitLab


From 66e674cdd549e89b341fb33405a2c02ec5f5ae8d Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Fri, 5 Dec 2025 12:48:43 -0500
Subject: [PATCH 136/258] [Attention][UX][1/N] Add AttentionConfig and change
 attention env vars to CLI arguments (#26315)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
---
 tests/compile/test_fusion_attn.py             |   5 +-
 .../kv_connector/unit/test_nixl_connector.py  |  39 +++-
 tests/v1/worker/test_gpu_model_runner.py      | 172 +++++++++---------
 vllm/attention/backends/abstract.py           |  26 +--
 vllm/attention/layer.py                       |   4 +-
 vllm/attention/selector.py                    | 137 +-------------
 vllm/attention/utils/fa_utils.py              |  11 +-
 vllm/config/__init__.py                       |   3 +
 vllm/config/attention.py                      | 114 ++++++++++++
 vllm/config/model.py                          |  13 --
 vllm/config/vllm.py                           |   7 +
 vllm/engine/arg_utils.py                      |  31 +++-
 vllm/model_executor/models/config.py          |  11 +-
 vllm/model_executor/models/vision.py          |   7 +-
 vllm/platforms/cuda.py                        |  23 ++-
 vllm/utils/flashinfer.py                      |  37 ++--
 vllm/v1/attention/backends/flash_attn.py      |   9 +-
 vllm/v1/attention/backends/flashinfer.py      |  26 +--
 vllm/v1/attention/backends/mla/common.py      |  19 +-
 .../attention/backends/mla/flashattn_mla.py   |   5 +-
 vllm/v1/attention/backends/rocm_attn.py       |   2 +-
 vllm/v1/attention/backends/triton_attn.py     |   5 +-
 22 files changed, 374 insertions(+), 332 deletions(-)
 create mode 100644 vllm/config/attention.py

diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 9b4486e56..db95dff5e 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -12,13 +12,13 @@ from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
-from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
 from vllm.compilation.matcher_utils import QUANT_OPS
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
 from vllm.config import (
+    AttentionConfig,
     CacheConfig,
     CompilationConfig,
     CompilationMode,
@@ -335,6 +335,7 @@ def test_attention_quant_pattern(
             custom_ops=custom_ops_list,
         ),
         cache_config=CacheConfig(cache_dtype="fp8"),
+        attention_config=AttentionConfig(backend=backend),
     )
 
     # Create test inputs
@@ -352,7 +353,6 @@ def test_attention_quant_pattern(
     with (
         set_current_vllm_config(vllm_config_unfused),
         set_forward_context(attn_metadata=None, vllm_config=vllm_config_unfused),
-        global_force_attn_backend_context_manager(backend),
     ):
         model_unfused = model_class(
             num_qo_heads=num_qo_heads,
@@ -378,7 +378,6 @@ def test_attention_quant_pattern(
     with (
         set_current_vllm_config(vllm_config),
         set_forward_context(attn_metadata=None, vllm_config=vllm_config),
-        global_force_attn_backend_context_manager(backend),
     ):
         model_fused = model_class(
             num_qo_heads=num_qo_heads,
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 5045ae0ee..ec9ff7315 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -1151,13 +1151,29 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
     }
 
     # Store tensor info for validation
-    expected_tensor_size = shared_tensor[0].element_size() * shared_tensor[0].numel()
-    expected_base_addrs = [
-        shared_tensor[0].data_ptr(),
-        shared_tensor[1].data_ptr(),
-        unique_tensor[0].data_ptr(),
-        unique_tensor[1].data_ptr(),
-    ]
+    test_shape = backend_cls.get_kv_cache_shape(
+        num_blocks=1, block_size=16, num_kv_heads=1, head_size=1
+    )
+    is_blocks_first = len(test_shape) == 5 and test_shape[0] == 1
+
+    if is_blocks_first:
+        expected_tensor_size = shared_tensor.element_size() * shared_tensor.numel()
+        expected_base_addrs = [
+            shared_tensor.data_ptr(),
+            unique_tensor.data_ptr(),
+        ]
+        expected_num_entries = 2
+    else:
+        expected_tensor_size = (
+            shared_tensor[0].element_size() * shared_tensor[0].numel()
+        )
+        expected_base_addrs = [
+            shared_tensor[0].data_ptr(),
+            shared_tensor[1].data_ptr(),
+            unique_tensor[0].data_ptr(),
+            unique_tensor[1].data_ptr(),
+        ]
+        expected_num_entries = 4
 
     with (
         patch(
@@ -1192,7 +1208,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
         # Verify get_reg_descs was called with caches_data
         assert mock_wrapper_instance.get_reg_descs.called
         caches_data, _ = mock_wrapper_instance.get_reg_descs.call_args[0]
-        assert len(caches_data) == 4
+        assert len(caches_data) == expected_num_entries
 
         for i, cache_entry in enumerate(caches_data):
             base_addr, size, _tp_rank, _ = cache_entry
@@ -1214,7 +1230,12 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
             f"Expected {expected_blocks_count} blocks, got {len(blocks_data)}"
         )
 
-        expected_block_len = expected_tensor_size // 2
+        num_blocks = 2
+        if is_blocks_first:
+            expected_block_len = expected_tensor_size // num_blocks // 2
+        else:
+            expected_block_len = expected_tensor_size // num_blocks
+
         for i, block_entry in enumerate(blocks_data):
             block_start_addr, block_len, tp_rank = block_entry
             assert block_len == expected_block_len, (
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 459abcfdd..7b8c4268a 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -6,8 +6,10 @@ import pytest
 import torch
 
 from vllm.attention.backends.abstract import MultipleOf
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
 from vllm.config import (
+    AttentionConfig,
     CacheConfig,
     ModelConfig,
     ParallelConfig,
@@ -765,7 +767,7 @@ def test_init_kv_cache_with_kv_sharing_valid():
     current_platform.is_rocm(),
     reason="Attention backend FLASHINFER is not supported on ROCm.",
 )
-def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
+def test_hybrid_attention_mamba_tensor_shapes():
     """
     The GPU model runner creates different views into the
     KVCacheTensors for the attention and mamba layers
@@ -806,11 +808,13 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
         cache_dtype="auto",
     )
     parallel_config = ParallelConfig()
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
     vllm_config = VllmConfig(
         model_config=model_config,
         cache_config=cache_config,
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
+        attention_config=attention_config,
     )
 
     layer_0 = "model.layers.0.self_attn.attn"
@@ -820,8 +824,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     layer_4 = "model.layers.4.mixer"
     layer_5 = "model.layers.5.mixer"
 
-    with set_current_vllm_config(vllm_config), monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
+    with set_current_vllm_config(vllm_config):
         hf_config = vllm_config.model_config.hf_config
         fwd_context = {}
         for key in [layer_0, layer_1]:
@@ -851,10 +854,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
             )
         # suppress var not used error
         assert fwd_context is not None
-    vllm_ctx = vllm_config.compilation_config.static_forward_context
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
+        vllm_ctx = vllm_config.compilation_config.static_forward_context
 
         runner = GPUModelRunner(vllm_config, DEVICE)
         kv_cache_spec = runner.get_kv_cache_spec()
@@ -865,94 +865,94 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
         )[0]
         runner.initialize_kv_cache(kv_cache_config)
 
-        # random partition of blocks
-        # blocks0 will be assigned to attention layers
-        # blocks1 will be assigned to mamba layers
-        num_blocks = kv_cache_config.num_blocks
-        ind = np.arange(num_blocks)
-        np.random.shuffle(ind)
-        blocks0, blocks1 = ind[: (num_blocks // 2)], ind[(num_blocks // 2) :]
+    # random partition of blocks
+    # blocks0 will be assigned to attention layers
+    # blocks1 will be assigned to mamba layers
+    num_blocks = kv_cache_config.num_blocks
+    ind = np.arange(num_blocks)
+    np.random.shuffle(ind)
+    blocks0, blocks1 = ind[: (num_blocks // 2)], ind[(num_blocks // 2) :]
 
-        attn_shape = vllm_ctx[layer_0].kv_cache[0].shape
-        conv_shape = vllm_ctx[layer_2].kv_cache[0][0].shape
-        ssm_shape = vllm_ctx[layer_2].kv_cache[0][1].shape
+    attn_shape = vllm_ctx[layer_0].kv_cache[0].shape
+    conv_shape = vllm_ctx[layer_2].kv_cache[0][0].shape
+    ssm_shape = vllm_ctx[layer_2].kv_cache[0][1].shape
 
-        # assert we are using FlashInfer
-        assert attn_shape[0] % num_blocks == 0
-        block_split_ratio = attn_shape[0] // num_blocks
+    # assert we are using FlashInfer
+    assert attn_shape[0] % num_blocks == 0
+    block_split_ratio = attn_shape[0] // num_blocks
 
-        # use small blocks for testing to avoid memory issues
-        test_block_size = min(2, len(blocks0), len(blocks1))
+    # use small blocks for testing to avoid memory issues
+    test_block_size = min(2, len(blocks0), len(blocks1))
 
-        # use non-overlapping blocks to avoid data contamination
-        # Split kernel blocks: first half for attention, second half for mamba
-        mid_point = num_blocks // 2
+    # use non-overlapping blocks to avoid data contamination
+    # Split kernel blocks: first half for attention, second half for mamba
+    mid_point = num_blocks // 2
 
-        # attention uses kernel blocks from first half (mapped to logical blocks)
-        kv_blocks_for_attention = np.array([0, 1])[:test_block_size]
+    # attention uses kernel blocks from first half (mapped to logical blocks)
+    kv_blocks_for_attention = np.array([0, 1])[:test_block_size]
 
-        # mamba uses kernel blocks from second half
-        kv_blocks_for_mamba = np.array([mid_point, mid_point + 1])[:test_block_size]
+    # mamba uses kernel blocks from second half
+    kv_blocks_for_mamba = np.array([mid_point, mid_point + 1])[:test_block_size]
 
-        # create small constant tensors for testing with corrected shapes
-        # attention: [block_size, ...] starting from dimension 2
-        attn_constant_shape = attn_shape[2:]
-        conv_constant_shape = conv_shape[1:]
-        ssm_constant_shape = ssm_shape[1:]
+    # create small constant tensors for testing with corrected shapes
+    # attention: [block_size, ...] starting from dimension 2
+    attn_constant_shape = attn_shape[2:]
+    conv_constant_shape = conv_shape[1:]
+    ssm_constant_shape = ssm_shape[1:]
 
-        attn_blocks_constant = torch.full(
-            (test_block_size, *attn_constant_shape), device=DEVICE, fill_value=3.33
-        )
-        conv_blocks_constant = torch.full(
-            (test_block_size, *conv_constant_shape), device=DEVICE, fill_value=6.66
-        )
-        ssm_blocks_constant = torch.full(
-            (test_block_size, *ssm_constant_shape), device=DEVICE, fill_value=9.99
-        )
+    attn_blocks_constant = torch.full(
+        (test_block_size, *attn_constant_shape), device=DEVICE, fill_value=3.33
+    )
+    conv_blocks_constant = torch.full(
+        (test_block_size, *conv_constant_shape), device=DEVICE, fill_value=6.66
+    )
+    ssm_blocks_constant = torch.full(
+        (test_block_size, *ssm_constant_shape), device=DEVICE, fill_value=9.99
+    )
 
-        # Fill attention blocks with constants using kv block indices
-        kernel_blocks_for_attention = kv_blocks_for_attention * block_split_ratio
-
-        for layer in [layer_0, layer_1]:
-            # attention: kv_cache[0][kernel_block_idx, kv_idx, ...]
-            for i, kernel_block in enumerate(kernel_blocks_for_attention):
-                vllm_ctx[layer].kv_cache[0][kernel_block, :] = attn_blocks_constant[i]
-
-        # fill mamba blocks with constants using kernel block indices
-        for layer in [layer_2, layer_3, layer_4, layer_5]:
-            # mamba: kv_cache[0][component][kernel_block_idx, ...]
-            for i, kv_block in enumerate(kv_blocks_for_mamba):
-                vllm_ctx[layer].kv_cache[0][0][kv_block, :] = conv_blocks_constant[i]
-                vllm_ctx[layer].kv_cache[0][1][kv_block, :] = ssm_blocks_constant[i]
-
-        # verify attention and mamba contents are correct
-        for layer in [layer_0, layer_1]:
-            for i, kernel_block in enumerate(kernel_blocks_for_attention):
-                actual_kv = vllm_ctx[layer].kv_cache[0][kernel_block, :]
-                expected = attn_blocks_constant[i]
-
-                # Check K and V separately
-                assert torch.equal(actual_kv[0], expected)
-                assert torch.equal(actual_kv[1], expected)
-
-        for layer in [layer_2, layer_3, layer_4, layer_5]:
-            for i, kv_block in enumerate(kv_blocks_for_mamba):
-                actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :]
-                actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :]
-                expected_conv = conv_blocks_constant[i]
-                expected_ssm = ssm_blocks_constant[i]
-
-                assert torch.equal(actual_conv, expected_conv)
-                assert torch.equal(actual_ssm, expected_ssm)
-
-        for layer in [layer_2, layer_3, layer_4, layer_5]:
-            for i, kv_block in enumerate(kv_blocks_for_mamba):
-                actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :]
-                actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :]
-                expected_conv = conv_blocks_constant[i]
-                expected_ssm = ssm_blocks_constant[i]
-                assert torch.equal(actual_conv, expected_conv)
-                assert torch.equal(actual_ssm, expected_ssm)
+    # Fill attention blocks with constants using kv block indices
+    kernel_blocks_for_attention = kv_blocks_for_attention * block_split_ratio
+
+    for layer in [layer_0, layer_1]:
+        # attention: kv_cache[0][kernel_block_idx, kv_idx, ...]
+        for i, kernel_block in enumerate(kernel_blocks_for_attention):
+            vllm_ctx[layer].kv_cache[0][kernel_block, :] = attn_blocks_constant[i]
+
+    # fill mamba blocks with constants using kernel block indices
+    for layer in [layer_2, layer_3, layer_4, layer_5]:
+        # mamba: kv_cache[0][component][kernel_block_idx, ...]
+        for i, kv_block in enumerate(kv_blocks_for_mamba):
+            vllm_ctx[layer].kv_cache[0][0][kv_block, :] = conv_blocks_constant[i]
+            vllm_ctx[layer].kv_cache[0][1][kv_block, :] = ssm_blocks_constant[i]
+
+    # verify attention and mamba contents are correct
+    for layer in [layer_0, layer_1]:
+        for i, kernel_block in enumerate(kernel_blocks_for_attention):
+            actual_kv = vllm_ctx[layer].kv_cache[0][kernel_block, :]
+            expected = attn_blocks_constant[i]
+
+            # Check K and V separately
+            assert torch.equal(actual_kv[0], expected)
+            assert torch.equal(actual_kv[1], expected)
+
+    for layer in [layer_2, layer_3, layer_4, layer_5]:
+        for i, kv_block in enumerate(kv_blocks_for_mamba):
+            actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :]
+            actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :]
+            expected_conv = conv_blocks_constant[i]
+            expected_ssm = ssm_blocks_constant[i]
+
+            assert torch.equal(actual_conv, expected_conv)
+            assert torch.equal(actual_ssm, expected_ssm)
+
+    for layer in [layer_2, layer_3, layer_4, layer_5]:
+        for i, kv_block in enumerate(kv_blocks_for_mamba):
+            actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :]
+            actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :]
+            expected_conv = conv_blocks_constant[i]
+            expected_ssm = ssm_blocks_constant[i]
+            assert torch.equal(actual_conv, expected_conv)
+            assert torch.equal(actual_ssm, expected_ssm)
 
 
 def test_hybrid_block_table_initialization():
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index c290670ee..84cca8e68 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -289,6 +289,16 @@ class AttentionImpl(ABC, Generic[T]):
     # even if they can return lse (for efficiency reasons)
     need_to_return_lse_for_decode: bool = False
 
+    # Whether this attention implementation supports pre-quantized query input.
+    # When True, the attention layer will quantize queries before passing them
+    # to this backend, allowing torch.compile to fuse the quantization with
+    # previous operations. This is typically supported when using FP8 KV cache
+    # with compatible attention kernels (e.g., TRT-LLM).
+    # Subclasses should set this in __init__.
+    # TODO add support to more backends:
+    # https://github.com/vllm-project/vllm/issues/25584
+    supports_quant_query_input: bool = False
+
     dcp_world_size: int
     dcp_rank: int
 
@@ -368,22 +378,6 @@ class AttentionImpl(ABC, Generic[T]):
         """
         return False
 
-    def supports_quant_query_input(self) -> bool:
-        """
-        Check if this attention implementation supports pre-quantized query input.
-
-        When True, the attention layer will quantize queries before passing them
-        to this backend, allowing torch.compile to fuse the quantization with
-        previous operations. This is typically supported when using FP8 KV cache
-        with compatible attention kernels (e.g., TRT-LLM).
-        TODO add support to more backends:
-        https://github.com/vllm-project/vllm/issues/25584
-
-        Returns:
-            bool: True if the implementation can accept pre-quantized queries.
-        """
-        return False
-
     def process_weights_after_loading(self, act_dtype: torch.dtype):
         pass
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index da5a62617..8a522deed 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -303,7 +303,7 @@ class Attention(nn.Module, AttentionLayerBase):
         self.query_quant = None
         if (
             self.kv_cache_dtype.startswith("fp8")
-            and self.impl.supports_quant_query_input()
+            and self.impl.supports_quant_query_input
         ):
             self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR)
 
@@ -338,7 +338,7 @@ class Attention(nn.Module, AttentionLayerBase):
             assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"}
 
             # check if query quantization is supported
-            if self.impl.supports_quant_query_input():
+            if self.impl.supports_quant_query_input:
                 query, _ = self.query_quant(query, self._q_scale)
 
         if self.use_output:
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index a7190df3c..aeb130dfe 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -2,19 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
-import os
-from collections.abc import Generator
-from contextlib import contextmanager
 from functools import cache
 from typing import cast, get_args
 
 import torch
 
-import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.backends.registry import (
     MAMBA_TYPE_TO_BACKEND_MAP,
-    AttentionBackendEnum,
     MambaAttentionBackendEnum,
 )
 from vllm.config.cache import CacheDType
@@ -24,60 +19,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
 logger = init_logger(__name__)
 
 
-def get_env_variable_attn_backend() -> AttentionBackendEnum | None:
-    """
-    Get the backend override specified by the vLLM attention
-    backend environment variable, if one is specified.
-
-    Returns:
-
-    * AttentionBackendEnum value if an override is specified
-    * None otherwise
-    """
-    backend_name = os.environ.get("VLLM_ATTENTION_BACKEND")
-    if backend_name is None:
-        return None
-    if backend_name == "XFORMERS":
-        raise ValueError(
-            "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
-            "details). Please select a supported attention backend."
-        )
-    return AttentionBackendEnum[backend_name]
-
-
-# Global state allows a particular choice of backend
-# to be forced, overriding the logic which auto-selects
-# a backend based on system & workload configuration
-# (default behavior if this variable is None)
-#
-# THIS SELECTION TAKES PRECEDENCE OVER THE
-# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
-forced_attn_backend: AttentionBackendEnum | None = None
-
-
-def global_force_attn_backend(attn_backend: AttentionBackendEnum | None) -> None:
-    """
-    Force all attention operations to use a specified backend.
-
-    Passing `None` for the argument re-enables automatic
-    backend selection.,
-
-    Arguments:
-
-    * attn_backend: backend selection (None to revert to auto)
-    """
-    global forced_attn_backend
-    forced_attn_backend = attn_backend
-
-
-def get_global_forced_attn_backend() -> AttentionBackendEnum | None:
-    """
-    Get the currently-forced choice of attention backend,
-    or None if auto-selection is currently enabled.
-    """
-    return forced_attn_backend
-
-
 def get_attn_backend(
     head_size: int,
     dtype: torch.dtype,
@@ -97,7 +38,13 @@ def get_attn_backend(
             f"Valid values are: {valid_cache_dtypes}"
         )
 
+    from vllm.config import get_current_vllm_config
+
+    vllm_config = get_current_vllm_config()
+    backend_enum = vllm_config.attention_config.backend
+
     return _cached_get_attn_backend(
+        backend=backend_enum,
         head_size=head_size,
         dtype=dtype,
         kv_cache_dtype=cast(CacheDType | None, kv_cache_dtype),
@@ -111,6 +58,7 @@ def get_attn_backend(
 
 @cache
 def _cached_get_attn_backend(
+    backend,
     head_size: int,
     dtype: torch.dtype,
     kv_cache_dtype: CacheDType | None,
@@ -120,39 +68,6 @@ def _cached_get_attn_backend(
     use_sparse: bool = False,
     attn_type: str | None = None,
 ) -> type[AttentionBackend]:
-    # Check whether a particular choice of backend was
-    # previously forced.
-    #
-    # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-    # ENVIRONMENT VARIABLE.
-    selected_backend = None
-    backend_by_global_setting: AttentionBackendEnum | None = (
-        get_global_forced_attn_backend()
-    )
-    if backend_by_global_setting is not None:
-        selected_backend = backend_by_global_setting
-    else:
-        # Check the environment variable and override if specified
-        backend_by_env_var: str | None = envs.VLLM_ATTENTION_BACKEND
-        if backend_by_env_var is not None:
-            if backend_by_env_var.endswith("_VLLM_V1"):
-                logger.warning(
-                    "The suffix '_VLLM_V1' in the environment variable "
-                    "VLLM_ATTENTION_BACKEND is no longer necessary as "
-                    "V0 backends have been deprecated. "
-                    "Please remove this suffix from your "
-                    "environment variable setting.",
-                )
-                backend_by_env_var = backend_by_env_var.removesuffix("_VLLM_V1")
-            try:
-                selected_backend = AttentionBackendEnum[backend_by_env_var]
-            except KeyError as e:
-                raise ValueError(
-                    f"Invalid attention backend: '{backend_by_env_var}'. Valid "
-                    f"backends are: {list(AttentionBackendEnum.__members__.keys())}"
-                ) from e
-
-    # get device-specific attn_backend
     from vllm.platforms import current_platform
 
     sig = inspect.signature(current_platform.get_attn_backend_cls)
@@ -163,7 +78,7 @@ def _cached_get_attn_backend(
             "remove it from your plugin code."
         )
         attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend,
+            backend,
             head_size,
             dtype,
             kv_cache_dtype,
@@ -176,7 +91,7 @@ def _cached_get_attn_backend(
         )
     else:
         attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend,
+            backend,
             head_size,
             dtype,
             kv_cache_dtype,
@@ -232,37 +147,3 @@ def _cached_get_mamba_attn_backend(
 
     mamba_attn_backend = selected_backend.get_class()
     return mamba_attn_backend
-
-
-@contextmanager
-def global_force_attn_backend_context_manager(
-    attn_backend: AttentionBackendEnum,
-) -> Generator[None, None, None]:
-    """
-    Globally force a vLLM attention backend override within a
-    context manager, reverting the global attention backend
-    override to its prior state upon exiting the context
-    manager.
-
-    Arguments:
-
-    * attn_backend: attention backend to force
-
-    Returns:
-
-    * Generator
-    """
-
-    # Save the current state of the global backend override (if any)
-    original_value = get_global_forced_attn_backend()
-
-    # Globally force the new backend override
-    global_force_attn_backend(attn_backend)
-
-    # Yield control back to the enclosed code block
-    try:
-        yield
-    finally:
-        # Revert the original global backend override, if any
-        global_force_attn_backend(original_value)
-        _cached_get_attn_backend.cache_clear()
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py
index 8a4658747..e38c88f48 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm import envs
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
@@ -49,10 +48,12 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
             3 if (device_capability.major == 9 and is_fa_version_supported(3)) else 2
         )
 
-        # 2. override if passed by environment
-        if envs.VLLM_FLASH_ATTN_VERSION is not None:
-            assert envs.VLLM_FLASH_ATTN_VERSION in [2, 3]
-            fa_version = envs.VLLM_FLASH_ATTN_VERSION
+        # 2. override if passed by environment or config
+        from vllm.config import get_current_vllm_config
+
+        vllm_config = get_current_vllm_config()
+        if vllm_config.attention_config.flash_attn_version is not None:
+            fa_version = vllm_config.attention_config.flash_attn_version
 
         # 3. fallback for unsupported combinations
         if device_capability.major == 10 and fa_version == 3:
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index dd76a7221..0f84f3ca9 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from vllm.config.attention import AttentionConfig
 from vllm.config.cache import CacheConfig
 from vllm.config.compilation import (
     CompilationConfig,
@@ -46,6 +47,8 @@ from vllm.config.vllm import (
 # __all__ should only contain classes and functions.
 # Types and globals should be imported from their respective modules.
 __all__ = [
+    # From vllm.config.attention
+    "AttentionConfig",
     # From vllm.config.cache
     "CacheConfig",
     # From vllm.config.compilation
diff --git a/vllm/config/attention.py b/vllm/config/attention.py
new file mode 100644
index 000000000..dd62d8882
--- /dev/null
+++ b/vllm/config/attention.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Any, Literal
+
+from pydantic import field_validator
+from pydantic.dataclasses import dataclass
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.config.utils import config
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@config
+@dataclass
+class AttentionConfig:
+    """Configuration for attention mechanisms in vLLM."""
+
+    backend: AttentionBackendEnum | None = None
+    """Attention backend to use. If None, will be selected automatically."""
+
+    flash_attn_version: Literal[2, 3] | None = None
+    """Force vllm to use a specific flash-attention version (2 or 3).
+    Only valid when using the flash-attention backend."""
+
+    use_prefill_decode_attention: bool = False
+    """Use separate prefill and decode kernels for attention instead of
+    the unified triton kernel."""
+
+    flash_attn_max_num_splits_for_cuda_graph: int = 32
+    """Flash Attention max number splits for cuda graph decode."""
+
+    use_cudnn_prefill: bool = False
+    """Whether to use cudnn prefill."""
+
+    use_trtllm_ragged_deepseek_prefill: bool = False
+    """Whether to use TRTLLM ragged deepseek prefill."""
+
+    use_trtllm_attention: bool | None = None
+    """If set to True/False, use or don't use the TRTLLM attention backend
+    in flashinfer. If None, auto-detect the attention backend in flashinfer."""
+
+    disable_flashinfer_prefill: bool = False
+    """Whether to disable flashinfer prefill."""
+
+    disable_flashinfer_q_quantization: bool = False
+    """If set, when using fp8 kv, do not quantize Q to fp8."""
+
+    def compute_hash(self) -> str:
+        """
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        from vllm.config.utils import get_hash_factors, hash_factors
+
+        ignored_factors: list[str] = []
+        factors = get_hash_factors(self, ignored_factors)
+        return hash_factors(factors)
+
+    @field_validator("backend", mode="before")
+    @classmethod
+    def validate_backend_before(cls, value: Any) -> Any:
+        """Enable parsing of the `backend` enum type from string."""
+        if isinstance(value, str):
+            return AttentionBackendEnum[value.upper()]
+        return value
+
+    def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None:
+        """Set field from env var if set, with deprecation warning."""
+        from vllm import envs
+
+        if envs.is_set(env_var_name):
+            value = getattr(envs, env_var_name)
+            if field_name == "backend":
+                value = self.validate_backend_before(value)
+            setattr(self, field_name, value)
+            logger.warning_once(
+                "Using %s environment variable is deprecated and will be removed in "
+                "v0.14.0 or v1.0.0, whichever is soonest. Please use "
+                "--attention-config.%s command line argument or "
+                "AttentionConfig(%s=...) config field instead.",
+                env_var_name,
+                field_name,
+                field_name,
+            )
+
+    def __post_init__(self) -> None:
+        self._set_from_env_if_set("backend", "VLLM_ATTENTION_BACKEND")
+        self._set_from_env_if_set("flash_attn_version", "VLLM_FLASH_ATTN_VERSION")
+        self._set_from_env_if_set(
+            "use_prefill_decode_attention", "VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
+        )
+        self._set_from_env_if_set(
+            "flash_attn_max_num_splits_for_cuda_graph",
+            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
+        )
+        self._set_from_env_if_set("use_cudnn_prefill", "VLLM_USE_CUDNN_PREFILL")
+        self._set_from_env_if_set(
+            "use_trtllm_ragged_deepseek_prefill",
+            "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
+        )
+        self._set_from_env_if_set("use_trtllm_attention", "VLLM_USE_TRTLLM_ATTENTION")
+        self._set_from_env_if_set(
+            "disable_flashinfer_prefill", "VLLM_DISABLE_FLASHINFER_PREFILL"
+        )
+        self._set_from_env_if_set(
+            "disable_flashinfer_q_quantization",
+            "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
+        )
diff --git a/vllm/config/model.py b/vllm/config/model.py
index ae5189ce6..5be7d5e7f 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -4,7 +4,6 @@
 import warnings
 from collections.abc import Callable
 from dataclasses import InitVar, field
-from importlib.util import find_spec
 from typing import TYPE_CHECKING, Any, Literal, cast, get_args
 
 import torch
@@ -467,18 +466,6 @@ class ModelConfig:
 
         self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
 
-        if (
-            (backend := envs.VLLM_ATTENTION_BACKEND)
-            and backend == "FLASHINFER"
-            and find_spec("flashinfer") is None
-        ):
-            raise ValueError(
-                "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
-                "module was not found. See "
-                "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile "  # noqa: E501
-                "for instructions on how to install it."
-            )
-
         from vllm.platforms import current_platform
 
         if self.override_attention_dtype is not None and not current_platform.is_rocm():
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 823bd96db..ce3d3b208 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -27,6 +27,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid
 from vllm.utils.hashing import safe_hash
 
+from .attention import AttentionConfig
 from .cache import CacheConfig
 from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
 from .device import DeviceConfig
@@ -192,6 +193,8 @@ class VllmConfig:
     """Device configuration."""
     load_config: LoadConfig = Field(default_factory=LoadConfig)
     """Load configuration."""
+    attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
+    """Attention configuration."""
     lora_config: LoRAConfig | None = None
     """LoRA configuration."""
     speculative_config: SpeculativeConfig | None = None
@@ -279,6 +282,10 @@ class VllmConfig:
             vllm_factors.append(self.load_config.compute_hash())
         else:
             vllm_factors.append("None")
+        if self.attention_config:
+            vllm_factors.append(self.attention_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
         else:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 883ae370f..aad071954 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -34,6 +34,7 @@ from typing_extensions import TypeIs
 import vllm.envs as envs
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
+    AttentionConfig,
     CacheConfig,
     CompilationConfig,
     ConfigType,
@@ -527,6 +528,7 @@ class EngineArgs:
 
     pooler_config: PoolerConfig | None = ModelConfig.pooler_config
     compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
+    attention_config: AttentionConfig = get_field(VllmConfig, "attention_config")
     worker_cls: str = ParallelConfig.worker_cls
     worker_extension_cls: str = ParallelConfig.worker_extension_cls
 
@@ -542,6 +544,7 @@ class EngineArgs:
     )
     model_impl: str = ModelConfig.model_impl
     override_attention_dtype: str = ModelConfig.override_attention_dtype
+    attention_backend: AttentionBackendEnum | None = AttentionConfig.backend
 
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
     mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
@@ -580,6 +583,8 @@ class EngineArgs:
         # CompilationConfig object
         if isinstance(self.compilation_config, dict):
             self.compilation_config = CompilationConfig(**self.compilation_config)
+        if isinstance(self.attention_config, dict):
+            self.attention_config = AttentionConfig(**self.attention_config)
         if isinstance(self.eplb_config, dict):
             self.eplb_config = EPLBConfig(**self.eplb_config)
         # Setup plugins
@@ -717,6 +722,16 @@ class EngineArgs:
             "--pt-load-map-location", **load_kwargs["pt_load_map_location"]
         )
 
+        # Attention arguments
+        attention_kwargs = get_kwargs(AttentionConfig)
+        attention_group = parser.add_argument_group(
+            title="AttentionConfig",
+            description=AttentionConfig.__doc__,
+        )
+        attention_group.add_argument(
+            "--attention-backend", **attention_kwargs["backend"]
+        )
+
         # Structured outputs arguments
         structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
         structured_outputs_group = parser.add_argument_group(
@@ -1140,6 +1155,9 @@ class EngineArgs:
         vllm_group.add_argument(
             "--compilation-config", "-cc", **vllm_kwargs["compilation_config"]
         )
+        vllm_group.add_argument(
+            "--attention-config", "-ac", **vllm_kwargs["attention_config"]
+        )
         vllm_group.add_argument(
             "--additional-config", **vllm_kwargs["additional_config"]
         )
@@ -1693,6 +1711,16 @@ class EngineArgs:
         if model_config.quantization == "bitsandbytes":
             self.quantization = self.load_format = "bitsandbytes"
 
+        # Attention config overrides
+        attention_config = copy.deepcopy(self.attention_config)
+        if self.attention_backend is not None:
+            if attention_config.backend is not None:
+                raise ValueError(
+                    "attention_backend and attention_config.backend "
+                    "are mutually exclusive"
+                )
+            attention_config.backend = self.attention_backend
+
         load_config = self.create_load_config()
 
         # Pass reasoning_parser into StructuredOutputsConfig
@@ -1750,9 +1778,10 @@ class EngineArgs:
             parallel_config=parallel_config,
             scheduler_config=scheduler_config,
             device_config=device_config,
+            load_config=load_config,
+            attention_config=attention_config,
             lora_config=lora_config,
             speculative_config=speculative_config,
-            load_config=load_config,
             structured_outputs_config=self.structured_outputs_config,
             observability_config=observability_config,
             compilation_config=compilation_config,
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index fbeb28a1c..55dd6e50a 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -4,7 +4,7 @@ from copy import deepcopy
 from math import lcm
 from typing import TYPE_CHECKING
 
-import vllm.envs as envs
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
@@ -331,6 +331,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
         # Enable FULL_AND_PIECEWISE by default
         MambaModelConfig.verify_and_update_config(vllm_config)
 
+        attention_config = vllm_config.attention_config
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
@@ -347,7 +348,9 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
         #   * CUTLASS_MLA backend: kernel_block_size 128 alignment
         #   * Other MLA backends: kernel_block_size 64 alignment
         if model_config.use_mla:
-            use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA"
+            use_cutlass_mla = (
+                attention_config.backend == AttentionBackendEnum.CUTLASS_MLA
+            )
             kernel_block_alignment_size = 128 if use_cutlass_mla else 64
             attn_page_size_1_token = MLAAttentionSpec(
                 block_size=1,
@@ -361,8 +364,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
                 current_platform.is_device_capability(100)
                 and model_config.get_head_size() == 256
                 and (
-                    envs.VLLM_ATTENTION_BACKEND is None
-                    or envs.VLLM_ATTENTION_BACKEND == "FLASHINFER"
+                    attention_config.backend is None
+                    or attention_config.backend == AttentionBackendEnum.FLASHINFER
                 )
             ):
                 # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that`
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index e5d70eb7b..7602eca9c 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -11,7 +11,7 @@ import torch
 from transformers import PretrainedConfig
 
 from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -91,10 +91,7 @@ def get_vit_attn_backend(
     if attn_backend_override is not None:
         return attn_backend_override
 
-    # Lazy import to avoid circular dependency
-    from vllm.attention.selector import get_env_variable_attn_backend
-
-    selected_backend: AttentionBackendEnum | None = get_env_variable_attn_backend()
+    selected_backend = get_current_vllm_config().attention_config.backend
     if selected_backend is not None:
         return selected_backend
 
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 1467ca71e..7e6ce6aee 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -14,7 +14,6 @@ from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
 import vllm._C  # noqa
-import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
@@ -149,6 +148,8 @@ class CudaPlatformBase(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        from vllm.attention.backends.registry import AttentionBackendEnum
+
         parallel_config = vllm_config.parallel_config
         model_config = vllm_config.model_config
 
@@ -171,7 +172,7 @@ class CudaPlatformBase(Platform):
             and cache_config.block_size is not None
         ):
             use_sparse = hasattr(vllm_config.model_config.hf_config, "index_topk")
-            # If `VLLM_ATTENTION_BACKEND` is not set and we are using MLA,
+            # If `--attention-config.backend` is not set and we are using MLA,
             # then we default to FlashMLA backend for non-blackwell GPUs,
             # else we default to CutlassMLA. For each case, we force the
             # required block_size.
@@ -179,23 +180,25 @@ class CudaPlatformBase(Platform):
             use_cutlass_mla = False
             use_flashinfer_mla = False
 
-            if envs.VLLM_ATTENTION_BACKEND is None:
+            if vllm_config.attention_config.backend is None:
                 # Default case
                 if cls.is_device_capability(100):
                     # Blackwell => Force CutlassMLA.
                     use_cutlass_mla = True
-                    # TODO: This does not work, because the
-                    # global_force_attn_backend_context_manager is not set.
-                    # See vllm/attention/selector.py:_cached_get_attn_backend
-                    envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
+                    # Set the backend in AttentionConfig so it's used during
+                    # backend selection
+                    vllm_config.attention_config.backend = (
+                        AttentionBackendEnum.CUTLASS_MLA
+                    )
                 else:
                     # Not Blackwell
                     use_flashmla = True
             else:
                 # Forced case
-                use_flashmla = envs.VLLM_ATTENTION_BACKEND == "FLASHMLA"
-                use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA"
-                use_flashinfer_mla = envs.VLLM_ATTENTION_BACKEND == "FLASHINFER_MLA"
+                backend = vllm_config.attention_config.backend
+                use_flashmla = backend == AttentionBackendEnum.FLASHMLA
+                use_cutlass_mla = backend == AttentionBackendEnum.CUTLASS_MLA
+                use_flashinfer_mla = backend == AttentionBackendEnum.FLASHINFER_MLA
 
             from vllm.attention.ops.flashmla import is_flashmla_dense_supported
 
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 9f9976d52..7aaf690cb 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -267,21 +267,16 @@ def supports_trtllm_attention() -> bool:
     return current_platform.is_device_capability(100) and has_nvidia_artifactory()
 
 
-@functools.cache
-def _force_use_trtllm_attention(env_value: bool | None) -> bool | None:
-    """Cache the env value for VLLM_USE_TRTLLM_ATTENTION"""
-    if env_value is not None:
-        logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value)
-    return env_value
-
-
 def force_use_trtllm_attention() -> bool | None:
     """
-    Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set,
+    Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
     """
-    return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
+    from vllm.config import get_current_vllm_config
+
+    vllm_config = get_current_vllm_config()
+    return vllm_config.attention_config.use_trtllm_attention
 
 
 def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool:
@@ -307,7 +302,7 @@ def use_trtllm_attention(
     """Return `True` if TRTLLM attention is used."""
     force_use_trtllm = force_use_trtllm_attention()
 
-    # Environment variable is set to 0 - respect it
+    # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
         return False
 
@@ -324,7 +319,7 @@ def use_trtllm_attention(
         if force_use_trtllm:
             logger.warning_once(
                 "TRTLLM attention is not supported on this platform, "
-                "but VLLM_USE_TRTLLM_ATTENTION is set to 1"
+                "but --attention-config.use_trtllm_attention is set to 1"
             )
         return False
 
@@ -333,7 +328,8 @@ def use_trtllm_attention(
         if force_use_trtllm:
             logger.warning_once(
                 "TRTLLM attention is not supported for this combination of "
-                "query and key heads, but VLLM_USE_TRTLLM_ATTENTION is set to 1"
+                "query and key heads, but --attention-config.use_trtllm_attention is "
+                "set to 1"
             )
         return False
 
@@ -354,7 +350,7 @@ def use_trtllm_attention(
         return True
 
     if force_use_trtllm is None:
-        # Environment variable not set - use auto-detection
+        # CLI argument not set - use auto-detection
         if is_prefill:
             # Prefill auto-detection
             use_trtllm = kv_cache_dtype == "auto"
@@ -367,8 +363,10 @@ def use_trtllm_attention(
                 logger.warning_once("Using TRTLLM decode attention (auto-detected).")
         return use_trtllm
 
-    # Environment variable is set to 1 - respect it
-    logger.info_once("Using TRTLLM attention (VLLM_USE_TRTLLM_ATTENTION is set to 1)")
+    # CLI argument is set to 1 - respect it
+    logger.info_once(
+        "Using TRTLLM attention (--attention-config.use_trtllm_attention is set to 1)"
+    )
     return True
 
 
@@ -500,12 +498,6 @@ def flashinfer_scaled_fp8_mm(
     return output
 
 
-@functools.cache
-def flashinfer_disable_q_quantization() -> bool:
-    """Cache result which only depends on the environment"""
-    return envs.VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION
-
-
 __all__ = [
     "has_flashinfer",
     "flashinfer_trtllm_fp8_block_scale_moe",
@@ -526,7 +518,6 @@ __all__ = [
     "supports_trtllm_attention",
     "can_use_trtllm_attention",
     "use_trtllm_attention",
-    "flashinfer_disable_q_quantization",
     "flashinfer_scaled_fp4_mm",
     "flashinfer_scaled_fp8_mm",
 ]
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index fb080b0b3..f5ad98cf2 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -8,7 +8,6 @@ from typing import ClassVar
 import numpy as np
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
@@ -264,6 +263,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
         self.parallel_config = vllm_config.parallel_config
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
+        self.attention_config = vllm_config.attention_config
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
             self.parallel_config
@@ -304,7 +304,9 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
             # When using cuda graph, we need to set the upper bound of the
             # number of splits so that large enough intermediate buffers are
             # pre-allocated during capture.
-            self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
+            self.max_num_splits = (
+                self.attention_config.flash_attn_max_num_splits_for_cuda_graph
+            )
 
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
@@ -554,8 +556,7 @@ class FlashAttentionImpl(AttentionImpl):
                 "heads in the layer"
             )
 
-    def supports_quant_query_input(self) -> bool:
-        return True
+        self.supports_quant_query_input = True
 
     def forward(
         self,
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 3d9640a2d..8e9d764e4 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -26,7 +26,7 @@ from vllm.attention.backends.abstract import (
 )
 from vllm.attention.ops.common import cp_lse_ag_out_rs
 from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.config import CUDAGraphMode, VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig, get_current_vllm_config
 from vllm.config.cache import CacheDType
 from vllm.distributed.parallel_state import get_dcp_group
 from vllm.logger import init_logger
@@ -43,7 +43,6 @@ from vllm.platforms.interface import DeviceCapability
 from vllm.triton_utils import tl, triton
 from vllm.utils.flashinfer import (
     can_use_trtllm_attention,
-    flashinfer_disable_q_quantization,
     use_trtllm_attention,
 )
 from vllm.utils.math_utils import cdiv
@@ -362,7 +361,8 @@ class FlashInferBackend(AttentionBackend):
             supports_trtllm_attention,
         )
 
-        # Respect explicit disable flag (e.g., VLLM_USE_TRTLLM_ATTENTION=0)
+        # Respect explicit disable flag (e.g.,
+        # --attention-config.use_trtllm_attention=0)
         if force_use_trtllm_attention() is False:
             return False
 
@@ -500,11 +500,14 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.kv_cache_dtype = self.kv_cache_spec.dtype
 
         # Use model dtype as q dtype when TRTLLM attn is not supported, or
-        # VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION is set to 1. Otherwise, try to
-        # use fp8 q if kv cache is fp8, and will fall back to model dtype
+        # --attention-config.disable_flashinfer_q_quantization is set to 1. Otherwise,
+        # try to use fp8 q if kv cache is fp8, and will fall back to model dtype
         # if TRTLLM attention kernel is not used when building attn metadata
         can_use_trtllm = can_use_trtllm_attention(self.num_qo_heads, self.num_kv_heads)
-        if can_use_trtllm and not flashinfer_disable_q_quantization():
+        if (
+            can_use_trtllm
+            and not vllm_config.attention_config.disable_flashinfer_q_quantization
+        ):
             self.q_data_type = self.kv_cache_dtype
         else:
             self.q_data_type = self.model_config.dtype
@@ -1035,6 +1038,11 @@ class FlashInferImpl(AttentionImpl):
             self.sinks = sinks
 
         self.support_trtllm_attn = can_use_trtllm_attention(num_heads, num_kv_heads)
+        vllm_config = get_current_vllm_config()
+        self.supports_quant_query_input = (
+            self.support_trtllm_attn
+            and not vllm_config.attention_config.disable_flashinfer_q_quantization
+        )
         self.bmm1_scale: float | None = None
         self.bmm2_scale: float | None = None
         self.o_sf_scale: float | None = None
@@ -1046,12 +1054,6 @@ class FlashInferImpl(AttentionImpl):
             and quant_key in (kFp8StaticTensorSym, kNvfp4Quant)
         )
 
-    def supports_quant_query_input(self) -> bool:
-        if flashinfer_disable_q_quantization():
-            return False
-
-        return self.support_trtllm_attn
-
     # FlashInfer requires attention sinks to be float32
     def process_weights_after_loading(self, act_dtype: torch.dtype):
         if self.sinks is not None and self.sinks.dtype != torch.float32:
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 180625b6c..309ddee4f 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -438,19 +438,25 @@ A = TypeVar("A")
 def use_flashinfer_prefill() -> bool:
     # For blackwell default to flashinfer prefill if it's available since
     # it is faster than FA2.
+    from vllm.config import get_current_vllm_config
+
+    vllm_config = get_current_vllm_config()
     return (
-        not envs.VLLM_DISABLE_FLASHINFER_PREFILL
+        not vllm_config.attention_config.disable_flashinfer_prefill
         and flashinfer_available
-        and not envs.VLLM_USE_CUDNN_PREFILL
-        and not envs.VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL
+        and not vllm_config.attention_config.use_cudnn_prefill
+        and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
         and current_platform.is_device_capability(100)
     )
 
 
 def use_cudnn_prefill() -> bool:
+    from vllm.config import get_current_vllm_config
+
+    vllm_config = get_current_vllm_config()
     return (
         flashinfer_available
-        and envs.VLLM_USE_CUDNN_PREFILL
+        and vllm_config.attention_config.use_cudnn_prefill
         and current_platform.is_device_capability(100)
         and has_nvidia_artifactory()
     )
@@ -458,9 +464,12 @@ def use_cudnn_prefill() -> bool:
 
 def use_trtllm_ragged_deepseek_prefill() -> bool:
     """Check if TRT-LLM ragged DeepSeek prefill should be used."""
+    from vllm.config import get_current_vllm_config
+
+    vllm_config = get_current_vllm_config()
     return (
         flashinfer_available
-        and envs.VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL
+        and vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
         and current_platform.is_device_capability(100)
     )
 
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index d369814c1..eccf4ec79 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -6,7 +6,6 @@ from typing import ClassVar
 
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionLayer,
     AttentionType,
@@ -131,7 +130,9 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
             # When using cuda graph, we need to set the upper bound of the
             # number of splits so that large enough intermediate buffers are
             # pre-allocated during capture.
-            self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
+            self.max_num_splits = (
+                vllm_config.attention_config.flash_attn_max_num_splits_for_cuda_graph
+            )
 
         if vllm_is_batch_invariant():
             self.max_num_splits = 1
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 868143cc1..e2410a70b 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -165,7 +165,7 @@ class RocmAttentionBackend(AttentionBackend):
             raise ValueError(
                 f"Head size {head_size} is not supported by {attn_type}. "
                 f"Supported head sizes are: {cls.get_supported_head_sizes()}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
+                "Set --attention-config.backend=FLEX_ATTENTION to use "
                 "FlexAttention backend which supports all head sizes."
             )
 
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index d051a89f0..3b17c4bcd 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -210,9 +210,6 @@ class TritonAttentionImpl(AttentionImpl):
     def fused_output_quant_supported(self, quant_key: QuantKey):
         return quant_key == kFp8StaticTensorSym
 
-    def supports_quant_query_input(self) -> bool:
-        return current_platform.is_cuda()
-
     def __init__(
         self,
         num_heads: int,
@@ -262,6 +259,8 @@ class TritonAttentionImpl(AttentionImpl):
                 f"num_heads: {num_heads}."
             )
 
+        self.supports_quant_query_input = current_platform.is_cuda()
+
     def forward(
         self,
         layer: torch.nn.Module,
-- 
GitLab


From 4e26d3b09e9895279bf127f225c5f45922612261 Mon Sep 17 00:00:00 2001
From: Ilya Markov <markovilya197@gmail.com>
Date: Fri, 5 Dec 2025 19:17:32 +0100
Subject: [PATCH 137/258] [Compile] Conditional compilation. Introduce
 compile_ranges (#24252)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Signed-off-by: Luka Govedič <luka.govedic@gmail.com>
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
---
 tests/compile/distributed/test_fusions_e2e.py |  23 ++-
 tests/compile/test_compile_ranges.py          | 168 ++++++++++++++++++
 tests/conftest.py                             |  14 ++
 vllm/compilation/backends.py                  | 147 ++++++---------
 vllm/compilation/collective_fusion.py         | 146 +++++++--------
 vllm/compilation/compiler_interface.py        |  40 ++---
 vllm/compilation/inductor_pass.py             |  11 +-
 vllm/compilation/pass_manager.py              |  16 +-
 vllm/compilation/piecewise_backend.py         | 125 ++++++++-----
 vllm/compilation/sequence_parallelism.py      |   5 +-
 vllm/config/compilation.py                    |  38 +++-
 vllm/config/utils.py                          |  34 +++-
 vllm/config/vllm.py                           |  48 +++++
 vllm/v1/worker/gpu_worker.py                  |  35 +++-
 vllm/v1/worker/utils.py                       |   2 +-
 15 files changed, 583 insertions(+), 269 deletions(-)
 create mode 100644 tests/compile/test_compile_ranges.py

diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 5d2786e12..75a81efed 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -298,10 +298,14 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
         r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
         log_holder.text,
     )
-    assert len(log_matches) == 2, log_holder.text
+    # 2 for each compile range
+    # (global compile range can be split due to fuse_allreduce_rmsnorm)
+    num_compile_ranges = len(compilation_config.get_compile_ranges())
+    assert num_compile_ranges in [1, 2]
 
-    assert int(log_matches[0]) == matches.attention_fusion
-    assert int(log_matches[1]) == matches.attention_fusion
+    assert len(log_matches) == 2 * num_compile_ranges, log_holder.text
+
+    assert all(int(log_match) == matches.attention_fusion for log_match in log_matches)
 
     log_matches = re.findall(
         r"collective_fusion.py:\d+] Replaced (\d+) patterns",
@@ -312,6 +316,12 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
     assert int(log_matches[0]) == matches.allreduce_fusion
     assert int(log_matches[1]) == matches.allreduce_fusion
 
+    log_matches = re.findall(
+        r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range",
+        log_holder.text,
+    )
+    assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text
+
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
@@ -446,7 +456,6 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
     # No cudagraphs by default
     if compilation_config.cudagraph_mode is None:
         compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
     llm = LLM(
         model=model,
         compilation_config=compilation_config,
@@ -459,3 +468,9 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Get the compile ranges split points after vllm config post init
+    # in order to compute compile ranges correctly
+    compilation_config.compile_ranges_split_points = (
+        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
+    )
diff --git a/tests/compile/test_compile_ranges.py b/tests/compile/test_compile_ranges.py
new file mode 100644
index 000000000..d849a8617
--- /dev/null
+++ b/tests/compile/test_compile_ranges.py
@@ -0,0 +1,168 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+import torch
+from torch import fx as fx
+from torch import nn
+
+# This import automatically registers `torch.ops.silly.attention`
+import tests.compile.silly_attention  # noqa
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import support_torch_compile
+from vllm.compilation.inductor_pass import (
+    InductorPass,
+    get_pass_context,
+)
+from vllm.config import (
+    VllmConfig,
+    set_current_vllm_config,
+)
+from vllm.config.compilation import CompilationConfig, CompilationMode
+from vllm.config.scheduler import SchedulerConfig
+from vllm.config.utils import Range
+from vllm.forward_context import set_forward_context
+
+BATCH_SIZE = 64
+MLP_SIZE = 128
+
+
+@support_torch_compile
+class TestModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + x
+        attn_output = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, attn_output)
+        x = attn_output
+        x = x * 3
+        return x
+
+
+@torch.inference_mode
+def run_model(vllm_config: VllmConfig, model: nn.Module, batch_sizes: list[int]):
+    with set_forward_context({}, vllm_config=vllm_config):
+        model(torch.randn(BATCH_SIZE, MLP_SIZE))
+        for batch_size in batch_sizes:
+            model(torch.randn(batch_size, MLP_SIZE))
+
+
+class PostGradRangeChecker(InductorPass):
+    def __init__(self, ranges: list[Range]):
+        self.ranges = ranges
+        self.num_calls = 0
+
+    def __call__(self, graph: fx.Graph):
+        compile_range = get_pass_context().compile_range
+        assert compile_range in self.ranges, (
+            f"Compile range {compile_range} not in {self.ranges}"
+        )
+        self.num_calls += 1
+
+    def uuid(self) -> str:
+        state: dict[str, Any] = {}
+        return InductorPass.hash_dict(state)
+
+
+def test_compile_ranges(use_fresh_inductor_cache):
+    post_grad_range_checker = PostGradRangeChecker(
+        [
+            Range(start=1, end=8),
+            Range(start=16, end=16),
+            Range(start=9, end=32),
+            Range(start=64, end=64),
+            Range(start=33, end=8192),
+        ]
+    )
+    torch.set_default_device("cuda")
+    vllm_config = VllmConfig(
+        scheduler_config=SchedulerConfig(
+            max_num_batched_tokens=8192,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            compile_ranges_split_points=[8, 32],
+            compile_sizes=[16, 64, 128],
+            inductor_compile_config={
+                "post_grad_custom_post_pass": post_grad_range_checker,
+            },
+        ),
+    )
+
+    with set_current_vllm_config(vllm_config):
+        model = TestModel(vllm_config=vllm_config, prefix="").eval()
+        # Number of compilations: 3 for each compile range + 2 compile sizes
+        batch_sizes = [1, 4, 16, 24, 48, 64, 8192]
+
+        with compilation_counter.expect(
+            num_graphs_seen=1,
+            num_piecewise_graphs_seen=1,
+            num_backend_compilations=5,
+        ):
+            run_model(vllm_config, model, batch_sizes)
+        assert post_grad_range_checker.num_calls == 5
+
+
+def test_compile_config_get_compile_ranges():
+    compilation_config = CompilationConfig(
+        compile_ranges_split_points=[8, 32],
+    )
+    VllmConfig(
+        scheduler_config=SchedulerConfig(
+            max_num_batched_tokens=8192,
+        ),
+        compilation_config=compilation_config,
+    )
+    assert compilation_config.get_compile_ranges() == [
+        Range(start=1, end=8),
+        Range(start=9, end=32),
+        Range(start=33, end=8192),
+    ]
+
+
+def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
+    # To force multiple compilations, we disable the compile cache
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    post_grad_range_checker = PostGradRangeChecker(
+        ranges=[
+            Range(start=1, end=8),
+            Range(start=9, end=8192),
+        ]
+    )
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens=8192,
+    )
+    torch.set_default_device("cuda")
+
+    def create_vllm_config():
+        return VllmConfig(
+            scheduler_config=scheduler_config,
+            compilation_config=CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+                compile_ranges_split_points=[8],
+                inductor_compile_config={
+                    "post_grad_custom_post_pass": post_grad_range_checker,
+                },
+            ),
+        )
+
+    vllm_config_1 = create_vllm_config()
+    with set_current_vllm_config(vllm_config_1):
+        model1 = TestModel(vllm_config=vllm_config_1, prefix="").eval()
+        batch_sizes = [1, 16]
+        run_model(vllm_config_1, model1, batch_sizes)
+        assert post_grad_range_checker.num_calls == 2
+
+    post_grad_range_checker.num_calls = 0
+    # Create a new vllm config with the new pass context
+    vllm_config_2 = create_vllm_config()
+    with set_current_vllm_config(vllm_config_2):
+        model2 = TestModel(vllm_config=vllm_config_2, prefix="").eval()
+        batch_sizes = [4, 32]
+        run_model(vllm_config_2, model2, batch_sizes)
+        # Check that cache is used, so the number of calls
+        # should be 0
+        assert post_grad_range_checker.num_calls == 0
diff --git a/tests/conftest.py b/tests/conftest.py
index 204452b58..0d456fb36 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -67,6 +67,9 @@ from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_num_threads
 
+from torch._inductor.utils import fresh_cache
+
+
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
     from transformers.generation.utils import GenerateOutput
@@ -1465,3 +1468,14 @@ def clean_gpu_memory_between_tests():
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         gc.collect()
+
+
+@pytest.fixture
+def use_fresh_inductor_cache():
+    """
+    Use a fresh inductor cache for the test.
+    This is useful to ensure that the test is not affected by the
+    previous test calls.
+    """
+    with fresh_cache():
+        yield
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index b5b7fe2b7..26f4f16a8 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -26,7 +26,7 @@ from vllm.compilation.partition_rules import (
     should_split,
 )
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.config.utils import hash_factors
+from vllm.config.utils import Range, hash_factors
 from vllm.logger import init_logger
 from vllm.logging_utils import lazy
 from vllm.platforms import current_platform
@@ -90,7 +90,7 @@ class CompilerManager:
     """
 
     def __init__(self, compilation_config: CompilationConfig):
-        self.cache: dict[tuple[int | None, int, str], Any] = dict()
+        self.cache: dict[tuple[Range, int, str], Any] = dict()
         self.is_cache_updated = False
         self.compilation_config = compilation_config
         self.compiler = make_compiler(compilation_config)
@@ -99,11 +99,11 @@ class CompilerManager:
         return self.compiler.compute_hash(vllm_config)
 
     @contextmanager
-    def compile_context(self, runtime_shape: int | None = None):
+    def compile_context(self, compile_range: Range):
         """Provide compilation context for the duration of compilation to set
         any torch global properties we want to scope to a single Inductor
         compilation (e.g. partition rules, pass context)."""
-        with pass_context(runtime_shape):
+        with pass_context(compile_range):
             if self.compilation_config.use_inductor_graph_partition:
                 with inductor_partition_rule_context(
                     self.compilation_config.splitting_ops
@@ -159,29 +159,21 @@ class CompilerManager:
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: int | None = None,
+        compile_range: Range,
     ) -> Callable | None:
-        if (runtime_shape, graph_index, self.compiler.name) not in self.cache:
+        if (compile_range, graph_index, self.compiler.name) not in self.cache:
             return None
-        handle = self.cache[(runtime_shape, graph_index, self.compiler.name)]
+        handle = self.cache[(compile_range, graph_index, self.compiler.name)]
         compiled_graph = self.compiler.load(
-            handle, graph, example_inputs, graph_index, runtime_shape
+            handle, graph, example_inputs, graph_index, compile_range
+        )
+        logger.debug(
+            "Directly load the %s-th graph for compile range %sfrom %s via handle %s",
+            graph_index,
+            str(compile_range),
+            self.compiler.name,
+            handle,
         )
-        if runtime_shape is None:
-            logger.debug(
-                "Directly load the %s-th graph for dynamic shape from %s via handle %s",
-                graph_index,
-                self.compiler.name,
-                handle,
-            )
-        else:
-            logger.debug(
-                "Directly load the %s-th graph for shape %s from %s via handle %s",
-                graph_index,
-                str(runtime_shape),
-                self.compiler.name,
-                handle,
-            )
         return compiled_graph
 
     def compile(
@@ -190,9 +182,9 @@ class CompilerManager:
         example_inputs,
         additional_inductor_config,
         compilation_config: CompilationConfig,
+        compile_range: Range,
         graph_index: int = 0,
         num_graphs: int = 1,
-        runtime_shape: int | None = None,
     ) -> Any:
         if graph_index == 0:
             # before compiling the first graph, record the start time
@@ -204,7 +196,7 @@ class CompilerManager:
         compiled_graph = None
 
         # try to load from the cache
-        compiled_graph = self.load(graph, example_inputs, graph_index, runtime_shape)
+        compiled_graph = self.load(graph, example_inputs, graph_index, compile_range)
         if compiled_graph is not None:
             if graph_index == num_graphs - 1:
                 # after loading the last graph for this shape, record the time.
@@ -212,19 +204,12 @@ class CompilerManager:
                 now = time.time()
                 elapsed = now - compilation_start_time
                 compilation_config.compilation_time += elapsed
-                if runtime_shape is None:
-                    logger.info(
-                        "Directly load the compiled graph(s) for dynamic shape "
-                        "from the cache, took %.3f s",
-                        elapsed,
-                    )
-                else:
-                    logger.info(
-                        "Directly load the compiled graph(s) for shape %s "
-                        "from the cache, took %.3f s",
-                        str(runtime_shape),
-                        elapsed,
-                    )
+                logger.info(
+                    "Directly load the compiled graph(s) for compile range %s "
+                    "from the cache, took %.3f s",
+                    str(compile_range),
+                    elapsed,
+                )
             return compiled_graph
 
         # no compiler cached the graph, or the cache is disabled,
@@ -233,14 +218,15 @@ class CompilerManager:
             # Let compile_fx generate a key for us
             maybe_key = None
         else:
-            maybe_key = f"artifact_shape_{runtime_shape}_subgraph_{graph_index}"
-
-        with self.compile_context(runtime_shape):
+            maybe_key = "artifact_compile_range_"
+            maybe_key += f"{compile_range.start}_{compile_range.end}"
+            maybe_key += f"_subgraph_{graph_index}"
+        with self.compile_context(compile_range):
             compiled_graph, handle = self.compiler.compile(
                 graph,
                 example_inputs,
                 additional_inductor_config,
-                runtime_shape,
+                compile_range,
                 maybe_key,
             )
 
@@ -248,55 +234,34 @@ class CompilerManager:
 
         # store the artifact in the cache
         if is_compile_cache_enabled(additional_inductor_config) and handle is not None:
-            self.cache[(runtime_shape, graph_index, self.compiler.name)] = handle
+            self.cache[(compile_range, graph_index, self.compiler.name)] = handle
             compilation_counter.num_cache_entries_updated += 1
             self.is_cache_updated = True
             if graph_index == 0:
                 # adds some info logging for the first graph
-                if runtime_shape is None:
-                    logger.info_once(
-                        "Cache the graph for dynamic shape for later use", scope="local"
-                    )
-                else:
-                    logger.info_once(
-                        "Cache the graph of shape %s for later use",
-                        str(runtime_shape),
-                        scope="local",
-                    )
-            if runtime_shape is None:
-                logger.debug(
-                    "Store the %s-th graph for dynamic shape from %s via handle %s",
-                    graph_index,
-                    self.compiler.name,
-                    handle,
-                )
-            else:
-                logger.debug(
-                    "Store the %s-th graph for shape %s from %s via handle %s",
-                    graph_index,
-                    str(runtime_shape),
-                    self.compiler.name,
-                    handle,
+                logger.info_once(
+                    "Cache the graph of compile range %s for later use",
+                    str(compile_range),
                 )
+            logger.debug(
+                "Store the %s-th graph for compile range%s from %s via handle %s",
+                graph_index,
+                str(compile_range),
+                self.compiler.name,
+                handle,
+            )
 
         # after compiling the last graph, record the end time
         if graph_index == num_graphs - 1:
             now = time.time()
             elapsed = now - compilation_start_time
             compilation_config.compilation_time += elapsed
-            if runtime_shape is None:
-                logger.info_once(
-                    "Compiling a graph for dynamic shape takes %.2f s",
-                    elapsed,
-                    scope="local",
-                )
-            else:
-                logger.info_once(
-                    "Compiling a graph for shape %s takes %.2f s",
-                    runtime_shape,
-                    elapsed,
-                    scope="local",
-                )
+            logger.info_once(
+                "Compiling a graph for compile range %s takes %.2f s",
+                str(compile_range),
+                elapsed,
+                scope="local",
+            )
 
         return compiled_graph
 
@@ -427,19 +392,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
             sym_shape_indices = [
                 i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
             ]
-            global compilation_start_time
 
-            compiled_graph_for_dynamic_shape = (
-                self.vllm_backend.compiler_manager.compile(
-                    submod,
-                    args,
-                    self.vllm_backend.inductor_config,
-                    self.compilation_config,
-                    graph_index=index,
-                    num_graphs=len(self.compile_submod_names),
-                    runtime_shape=None,
-                )
-            )
             # Lazy import here to avoid circular import
             from .piecewise_backend import PiecewiseBackend
 
@@ -449,7 +402,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
                 index,
                 len(self.compile_submod_names),
                 sym_shape_indices,
-                compiled_graph_for_dynamic_shape,
                 self.vllm_backend,
             )
 
@@ -589,8 +541,13 @@ class VllmBackend:
                 )
             else:
                 # Config should automatically wrap all inductor passes
-                assert isinstance(self.inductor_config[self.pass_key], InductorPass)
-                self.pass_manager.add(self.inductor_config[self.pass_key])
+                assert isinstance(
+                    self.compilation_config.inductor_compile_config[self.pass_key],
+                    InductorPass,
+                )
+                self.pass_manager.add(
+                    self.compilation_config.inductor_compile_config[self.pass_key]
+                )
         self.inductor_config[self.pass_key] = self.pass_manager
 
     def __call__(
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 69d4606d7..2717738dd 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -10,6 +10,7 @@ from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch.distributed._symmetric_memory import enable_symm_mem_for_group
 
 from vllm.config import VllmConfig
+from vllm.config.utils import Range
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_rank,
@@ -431,7 +432,7 @@ class AsyncTPPass(VllmPatternMatcherPass):
 
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable(self, shape: int | None) -> bool:
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
         # This pass is applied on top of the sequence parallelism pass.
         # It inherits the same applicability condition as `SequenceParallelismPass`.
         # See `SequenceParallelismPass.is_applicable` for more details.
@@ -441,7 +442,7 @@ class AsyncTPPass(VllmPatternMatcherPass):
         ):
             return True
         tp_size = get_tensor_model_parallel_world_size()
-        return shape is not None and shape % tp_size == 0
+        return compile_range.is_single_size() and compile_range.end % tp_size == 0
 
     @VllmInductorPass.time_and_log
     def __call__(self, graph: fx.Graph):
@@ -505,91 +506,60 @@ if flashinfer_comm is not None:
         num_tokens, hidden_size = allreduce_in.shape
         element_size = allreduce_in.element_size()
         current_tensor_size = num_tokens * hidden_size * element_size
+        max_tensor_size = max_token_num * hidden_size * element_size
+        assert current_tensor_size <= max_tensor_size, (
+            f"Current tensor size {current_tensor_size} is larger than "
+            f"max token num {max_token_num} * hidden size {hidden_size} * "
+            f"element size {element_size}"
+        )
+        device_capability = current_platform.get_device_capability().to_int()
+        # Get one shot input size limit for the current world size
+        # for the current device capability
+        max_one_shot_size = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get(
+            device_capability, {}
+        ).get(world_size, None)
+        # Use one shot if no max size is specified
+        use_oneshot = (
+            max_one_shot_size is None or current_tensor_size <= max_one_shot_size * MiB
+        )
 
-        if num_tokens <= max_token_num:
-            device_capability = current_platform.get_device_capability().to_int()
-            # Get one shot input size limit for the current world size
-            # for the current device capability
-            max_one_shot_size_mb = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get(
-                device_capability, {}
-            ).get(world_size, None)
-            # Use one shot if no max size for one shot is specified
-            use_oneshot = (
-                max_one_shot_size_mb is None
-                or current_tensor_size <= max_one_shot_size_mb * MiB
-            )
-
-            assert _FI_WORKSPACE_TENSOR is not None, (
-                "Flashinfer must be enabled when using flashinfer"
-            )
-            if norm_out is None:
-                norm_out = allreduce_in
-                residual_out = residual
-            else:
-                # return residual_out as allreduce_out with zeroed residual_in
-                # as flashinfer does not support rms_norm
-                # and allreduce_out together
-                residual_out = allreduce_in
-            # For the sizes that are smaller than the max size,
-            # we only use flashinfer one shot allreduce
-            flashinfer_comm.trtllm_allreduce_fusion(
-                allreduce_in=allreduce_in,
-                token_num=allreduce_in.shape[0],
-                residual_in=residual,
-                residual_out=residual_out,
-                norm_out=norm_out,
-                rms_gamma=rms_gamma,
-                rms_eps=rms_eps,
-                world_rank=world_rank,
-                world_size=world_size,
-                hidden_dim=allreduce_in.shape[-1],
-                workspace_ptrs=_FI_WORKSPACE_TENSOR,
-                launch_with_pdl=launch_with_pdl,
-                use_oneshot=use_oneshot,
-                trigger_completion_at_end=trigger_completion_at_end,
-                fp32_acc=fp32_acc,
-                pattern_code=pattern_code,
-                allreduce_out=None,
-                quant_out=quant_out,
-                scale_out=scale_out,
-                # in vllm we only support swizzled layout
-                layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
-                scale_factor=scale_factor,
-            )
+        assert _FI_WORKSPACE_TENSOR is not None, (
+            "Flashinfer must be enabled when using flashinfer"
+        )
+        if norm_out is None:
+            norm_out = allreduce_in
+            residual_out = residual
         else:
-            allreduce_out = tensor_model_parallel_all_reduce(allreduce_in)
-            if scale_factor is not None and scale_out is None:
-                # Do fused rms norm static fp8 quant fused op
-                if norm_out is None:
-                    torch.ops._C.fused_add_rms_norm_static_fp8_quant(
-                        quant_out,
-                        allreduce_out,
-                        residual,
-                        rms_gamma,
-                        scale_factor,
-                        rms_eps,
-                    )
-                else:
-                    torch.ops._C.rms_norm_static_fp8_quant(
-                        quant_out, allreduce_out, rms_gamma, scale_factor, rms_eps
-                    )
-            else:
-                if norm_out is None:
-                    torch.ops._C.fused_add_rms_norm(
-                        allreduce_out, residual, rms_gamma, rms_eps
-                    )
-                    norm_out = allreduce_out
-                else:
-                    torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma, rms_eps)
-                if scale_factor is not None and scale_out is not None:
-                    torch.ops._C.scaled_fp4_quant(
-                        quant_out, norm_out, scale_out, scale_factor
-                    )
-            if scale_factor is None or norm_out is not None:
-                # we need to return allreduce output
-                # in cases of non quant fused AR + RMS norm
-                # and fused AR + RMS norm + quant without fused add
-                allreduce_in.copy_(allreduce_out)
+            # return residual_out as allreduce_out with zeroed residual_in
+            # as flashinfer does not support rms_norm
+            # and allreduce_out together
+            residual_out = allreduce_in
+        # For the sizes that are smaller than the max size,
+        # we only use flashinfer one shot allreduce
+        flashinfer_comm.trtllm_allreduce_fusion(
+            allreduce_in=allreduce_in,
+            token_num=allreduce_in.shape[0],
+            residual_in=residual,
+            residual_out=residual_out,
+            norm_out=norm_out,
+            rms_gamma=rms_gamma,
+            rms_eps=rms_eps,
+            world_rank=world_rank,
+            world_size=world_size,
+            hidden_dim=allreduce_in.shape[-1],
+            workspace_ptrs=_FI_WORKSPACE_TENSOR,
+            launch_with_pdl=launch_with_pdl,
+            use_oneshot=use_oneshot,
+            trigger_completion_at_end=trigger_completion_at_end,
+            fp32_acc=fp32_acc,
+            pattern_code=pattern_code,
+            allreduce_out=None,
+            quant_out=quant_out,
+            scale_out=scale_out,
+            # in vllm we only support swizzled layout
+            layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+            scale_factor=scale_factor,
+        )
 
     def call_trtllm_fused_allreduce_norm_fake(
         allreduce_in: torch.Tensor,
@@ -1128,7 +1098,8 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         if max_size is None:
             # Flashinfer doesn't support current world size
             logger.warning(
-                "Flashinfer allreduce fusion is not supported for world size %s",
+                "Flashinfer allreduce fusion is not supported for world size %s"
+                " or max size is not provided",
                 self.tp_size,
             )
             return
@@ -1216,6 +1187,9 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
 
         self.disabled = False
 
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
+        return compile_range.end <= self.max_token_num
+
     @VllmInductorPass.time_and_log
     def __call__(self, graph: fx.Graph):
         if self.disabled:
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 7deaba1a9..ab56d3561 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -15,6 +15,7 @@ import torch.fx as fx
 import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
+from vllm.config.utils import Range
 from vllm.utils.hashing import safe_hash
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
@@ -63,16 +64,16 @@ class CompilerInterface:
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: int | None = None,
+        compile_range: Range,
         key: str | None = None,
     ) -> tuple[Callable | None, Any | None]:
         """
         Compile the graph with the given example inputs and compiler config,
-        with a runtime shape. If the `runtime_shape` is None, it means
-        the `example_inputs` have a dynamic shape. Otherwise, the
-        `runtime_shape` specifies the shape of the inputs. Right now we only
-        support one variable shape for all inputs, which is the batchsize
-        (number of tokens) during inference.
+        with a range. The `compile_range` specifies the range of the inputs,
+        it could be concrete size (if compile_sizes is provided), e.g. [4, 4]
+        or a range [5, 8].
+        Right now we only support one variable in ranges for all inputs,
+         which is the batchsize (number of tokens) during inference.
 
         Dynamo will make sure `graph(*example_inputs)` is valid.
 
@@ -98,7 +99,7 @@ class CompilerInterface:
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: int | None = None,
+        compile_range: Range,
     ) -> Callable:
         """
         Load the compiled function from the handle.
@@ -212,20 +213,20 @@ class InductorStandaloneAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: int | None = None,
+        compile_range: Range,
         key: str | None = None,
     ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
         current_config = {}
         if compiler_config is not None:
             current_config.update(compiler_config)
-        set_inductor_config(current_config, runtime_shape)
+        set_inductor_config(current_config, compile_range)
         set_functorch_config()
 
-        if isinstance(runtime_shape, int):
+        if compile_range.is_single_size():
             dynamic_shapes = "from_example_inputs"
         else:
-            dynamic_shapes = "from_tracing_context"
+            dynamic_shapes = "from_graph"
 
         from torch._inductor import standalone_compile
 
@@ -235,7 +236,6 @@ class InductorStandaloneAdaptor(CompilerInterface):
             dynamic_shapes=dynamic_shapes,
             options={"config_patches": current_config},
         )
-
         # Save the compiled artifact to disk in the specified path
         assert key is not None
         path = os.path.join(self.cache_dir, key)
@@ -251,7 +251,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: int | None = None,
+        compile_range: Range,
     ) -> Callable:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
@@ -315,7 +315,7 @@ class InductorAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: int | None = None,
+        compile_range: Range,
         key: str | None = None,
     ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
@@ -329,7 +329,7 @@ class InductorAdaptor(CompilerInterface):
         current_config["fx_graph_cache"] = True
         current_config["fx_graph_remote_cache"] = False
 
-        set_inductor_config(current_config, runtime_shape)
+        set_inductor_config(current_config, compile_range)
         set_functorch_config()
 
         # inductor can inplace modify the graph, so we need to copy it
@@ -512,7 +512,7 @@ class InductorAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: int | None = None,
+        compile_range: Range,
     ) -> Callable:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
@@ -608,9 +608,9 @@ class InductorAdaptor(CompilerInterface):
             return contextlib.nullcontext()
 
 
-def set_inductor_config(config, runtime_shape):
-    if isinstance(runtime_shape, int):
-        # for a specific batchsize, tuning triton kernel parameters
+def set_inductor_config(config, compile_range: Range):
+    if compile_range.is_single_size():
+        # for a specific batch size, tuning triton kernel parameters
         # can be beneficial
         config["max_autotune"] = envs.VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE
         config["coordinate_descent_tuning"] = (
@@ -630,7 +630,7 @@ class EagerAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: int | None = None,
+        compile_range: Range,
         key: str | None = None,
     ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_eager_compiles += 1
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index 9af635a92..8159b817f 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -14,6 +14,7 @@ import torch
 from torch import fx
 from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
 
+from vllm.config.utils import Range
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 if is_torch_equal_or_newer("2.6"):
@@ -28,8 +29,8 @@ _pass_context = None
 
 
 class PassContext:
-    def __init__(self, runtime_shape: int | None):
-        self.runtime_shape = runtime_shape
+    def __init__(self, compile_range: Range):
+        self.compile_range: Range = compile_range
 
 
 def get_pass_context() -> PassContext:
@@ -39,13 +40,13 @@ def get_pass_context() -> PassContext:
 
 
 @contextmanager
-def pass_context(runtime_shape: int | None):
+def pass_context(compile_range: Range):
     """A context manager that stores the current pass context,
     usually it is a list of sizes to specialize.
     """
     global _pass_context
     prev_context = _pass_context
-    _pass_context = PassContext(runtime_shape)
+    _pass_context = PassContext(compile_range)
     try:
         yield
     finally:
@@ -96,7 +97,7 @@ class InductorPass(CustomGraphPass):
         encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
         return hashlib.sha256(encoded).hexdigest()
 
-    def is_applicable(self, shape: int | None):
+    def is_applicable_for_range(self, compile_range: Range):
         return True
 
 
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 37f48721e..6848bfb6a 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -24,7 +24,11 @@ if current_platform.is_cuda():
     from .collective_fusion import AllReduceFusionPass, AsyncTPPass
 
 from .fix_functionalization import FixFunctionalizationPass
-from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context
+from .inductor_pass import (
+    CustomGraphPass,
+    InductorPass,
+    get_pass_context,
+)
 from .noop_elimination import NoOpEliminationPass
 
 logger = init_logger(__name__)
@@ -70,13 +74,13 @@ class PostGradPassManager(CustomGraphPass):
     def __call__(self, graph: fx.Graph):
         VllmInductorPass.dump_prefix = 0  # reset dump index
 
-        shape = get_pass_context().runtime_shape
+        compile_range = get_pass_context().compile_range
         for pass_ in self.passes:
-            if pass_.is_applicable(shape):
+            if pass_.is_applicable_for_range(compile_range):
                 pass_(graph)
                 VllmInductorPass.dump_prefix += 1
             else:
-                logger.debug("Skipping %s with shape %s", pass_, shape)
+                logger.debug("Skipping %s with compile range %s", pass_, compile_range)
 
         # post-cleanup goes before fix_functionalization
         # because it requires a functional graph
@@ -133,4 +137,8 @@ class PostGradPassManager(CustomGraphPass):
             state["passes"].append(pass_.uuid())
         state["passes"].append(self.fix_functionalization.uuid())
 
+        # Include the compile range in the uuid to ensure that inductor
+        # recompiles the graph for the new dynamic compile range.
+        state["compile_range"] = str(get_pass_context().compile_range)
+
         return InductorPass.hash_dict(state)
diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py
index e535d2c46..129b9b5de 100644
--- a/vllm/compilation/piecewise_backend.py
+++ b/vllm/compilation/piecewise_backend.py
@@ -7,18 +7,18 @@ from typing import Any
 
 import torch.fx as fx
 
-import vllm.envs as envs
 from vllm.compilation.backends import VllmBackend
 from vllm.compilation.monitor import end_monitoring_torch_compile
 from vllm.config import VllmConfig
+from vllm.config.compilation import Range
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
 @dataclasses.dataclass
-class ConcreteSizeEntry:
-    runtime_shape: int
+class RangeEntry:
+    compile_range: Range
     compiled: bool = False
     runnable: Callable = None  # type: ignore
 
@@ -31,7 +31,6 @@ class PiecewiseBackend:
         piecewise_compile_index: int,
         total_piecewise_compiles: int,
         sym_shape_indices: list[int],
-        compiled_graph_for_general_shape: Callable,
         vllm_backend: VllmBackend,
     ):
         """
@@ -55,67 +54,111 @@ class PiecewiseBackend:
 
         self.is_full_graph = total_piecewise_compiles == 1
 
-        self.compile_sizes: set[int] = set(self.compilation_config.compile_sizes)
+        self.compile_ranges = self.compilation_config.get_compile_ranges()
+        log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
+        logger.debug_once(log_string)
 
-        self.first_run_finished = False
-
-        self.compiled_graph_for_general_shape = compiled_graph_for_general_shape  # noqa
+        self.compile_sizes = self.compilation_config.compile_sizes
+        log_string = f"PiecewiseBackend: compile_sizes: {self.compile_sizes}"
+        logger.debug_once(log_string)
 
         self.sym_shape_indices = sym_shape_indices
 
-        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
-
-        # the entries for different shapes that we need to compile
-        self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {}
+        # the entries for ranges that we need to either
+        self.range_entries: dict[Range, RangeEntry] = {}
 
-        # to_be_compiled_sizes tracks the remaining sizes to compile,
+        # to_be_compiled_ranges tracks the remaining ranges to compile,
         # and updates during the compilation process, so we need to copy it
-        self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy()
+        self.to_be_compiled_ranges: set[Range] = set(self.compile_ranges)
 
         # We only keep compilation management inside this class directly.
-        for shape in self.compile_sizes:
-            self.concrete_size_entries[shape] = ConcreteSizeEntry(
-                runtime_shape=shape,
-                runnable=self.compiled_graph_for_general_shape,
+        for size in self.compile_sizes:
+            range = Range(start=size, end=size)
+            if range not in self.compile_ranges:
+                self.range_entries[range] = RangeEntry(
+                    compile_range=range,
+                )
+                self.to_be_compiled_ranges.add(range)
+
+        for range in self.compile_ranges:
+            self.range_entries[range] = RangeEntry(
+                compile_range=range,
             )
 
     def check_for_ending_compilation(self):
-        if self.is_last_graph and not self.to_be_compiled_sizes:
+        if self.is_last_graph and not self.to_be_compiled_ranges:
             # no specific sizes to compile
             # save the hash of the inductor graph for the next run
             self.vllm_backend.compiler_manager.save_to_file()
             end_monitoring_torch_compile(self.vllm_config)
 
-    def __call__(self, *args) -> Any:
-        if not self.first_run_finished:
-            self.first_run_finished = True
-            self.check_for_ending_compilation()
-            return self.compiled_graph_for_general_shape(*args)
-
-        runtime_shape = args[self.sym_shape_indices[0]]
-
-        if runtime_shape not in self.concrete_size_entries:
-            # we don't need to do anything for this shape
-            return self.compiled_graph_for_general_shape(*args)
-
-        entry = self.concrete_size_entries[runtime_shape]
+    def _fakify_args(self, args: list[Any]) -> list[Any]:
+        # We need to pass fake example_inputs, otherwise torch.compile
+        # will fakify the example_inputs potentially causing some non dynamic
+        # dimension to be be duck shaped to other existing shapes that have hints
+        # matching their values.
+        # This is problem because it can lead to unintended specializations!
+        # if the new wrongly dynamic dim is specialized
+        # it will force specializing the whole shape
+        # torch.compile probably should not accept
+        # non fake tensors as example inputs!
+        # See issue https://github.com/vllm-project/vllm/issues/27899
+        fake_example_inputs = []
+        for node in self.graph.graph.nodes:
+            # All place holders come first
+            if node.op == "placeholder":
+                fake_example_inputs.append(node.meta["example_value"])
+            else:
+                break
+        assert len(fake_example_inputs) == len(args)
+        return fake_example_inputs
+
+    def _maybe_compile_for_range_entry(self, range_entry: RangeEntry, args) -> Any:
+        if not range_entry.compiled:
+            range_entry.compiled = True
+            self.to_be_compiled_ranges.remove(range_entry.compile_range)
 
-        if not entry.compiled:
-            entry.compiled = True
-            self.to_be_compiled_sizes.remove(runtime_shape)
             # args are real arguments
-            entry.runnable = self.vllm_backend.compiler_manager.compile(
+            # fakify for range, real args for concrete size.
+            # For concrete size, we clear the shape env in
+            # compiler_manager.compile() so no need to fakify.
+            args = (
+                self._fakify_args(args)
+                if not range_entry.compile_range.is_single_size()
+                else args
+            )
+            range_entry.runnable = self.vllm_backend.compiler_manager.compile(
                 self.graph,
                 args,
                 self.vllm_backend.inductor_config,
                 self.compilation_config,
+                compile_range=range_entry.compile_range,
                 graph_index=self.piecewise_compile_index,
                 num_graphs=self.total_piecewise_compiles,
-                runtime_shape=runtime_shape,
             )
 
-            # finished compilations for all required shapes
-            if self.is_last_graph and not self.to_be_compiled_sizes:
-                self.check_for_ending_compilation()
+            self.check_for_ending_compilation()
+
+    def _find_range_for_shape(self, runtime_shape: int) -> Range | None:
+        # First we try to find the range entry for the concrete compile size
+        # If not found, we search for the range entry
+        # that contains the runtime shape.
+        if runtime_shape in self.compile_sizes:
+            return self.range_entries[Range(start=runtime_shape, end=runtime_shape)]
+        else:
+            for range in self.compile_ranges:
+                if runtime_shape in range:
+                    return self.range_entries[range]
+        return None
+
+    def __call__(self, *args) -> Any:
+        runtime_shape = args[self.sym_shape_indices[0]]
+        range_entry = self._find_range_for_shape(runtime_shape)
+
+        assert range_entry is not None, (
+            f"Shape out of considered range: {runtime_shape} "
+            "[1, max_num_batched_tokens]"
+        )
 
-        return entry.runnable(*args)
+        self._maybe_compile_for_range_entry(range_entry, args)
+        return range_entry.runnable(*args)
diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py
index cf4b8118f..a4046356b 100644
--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -9,6 +9,7 @@ import torch.fx as fx
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
 from vllm.config import VllmConfig
+from vllm.config.compilation import Range
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
@@ -333,7 +334,7 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
 
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable(self, shape: int | None) -> bool:
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
         # When sequence parallelism is enabled, the residual tensor from RMSNorm
         # needs to be split along the sequence dimension. However, this dimension
         # is symbolic during piecewise compilation, and splitting symbolic shapes
@@ -353,7 +354,7 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
         ):
             return True
         tp_size = get_tensor_model_parallel_world_size()
-        return shape is not None and shape % tp_size == 0
+        return (compile_range.is_single_size()) and (compile_range.end % tp_size == 0)
 
     @VllmInductorPass.time_and_log
     def __call__(self, graph: fx.Graph):
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index d3d50e6ae..5f9e2cfdd 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -13,7 +13,13 @@ from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
-from vllm.config.utils import config, get_hash_factors, handle_deprecated, hash_factors
+from vllm.config.utils import (
+    Range,
+    config,
+    get_hash_factors,
+    handle_deprecated,
+    hash_factors,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -173,6 +179,9 @@ class PassConfig:
         """
 
         MiB = 1024 * 1024
+        FI_SUPPORTED_WORLD_SIZES = [2, 4, 8]
+        if world_size not in FI_SUPPORTED_WORLD_SIZES:
+            return None
         max_size_mb = self.fi_allreduce_fusion_max_size_mb
         if max_size_mb is None:
             max_size_mb = self.default_fi_allreduce_fusion_max_size_mb().get(world_size)
@@ -379,6 +388,8 @@ class CompilationConfig:
         [vllm.config.CompilationConfig.cudagraph_copy_inputs]
     - Inductor compilation:
         - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
+        - [`compile_ranges_split_points`]
+            [vllm.config.CompilationConfig.compile_ranges_split_points]
         - [`inductor_compile_config`]
         [vllm.config.CompilationConfig.inductor_compile_config]
         - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
@@ -492,6 +503,21 @@ class CompilationConfig:
     to integers, it also supports "cudagraph_capture_sizes" to
     specify the sizes for cudagraph capture."""
 
+    compile_ranges_split_points: list[int] | None = None
+    """Split points that represent compile ranges for inductor.
+    The compile ranges are 
+    [1, split_points[0]], 
+    [split_points[0] + 1, split_points[1]], ..., 
+    [split_points[-1] + 1, max_num_batched_tokens].
+    Compile sizes are also used single element ranges,
+    the range is represented as [compile_sizes[i], compile_sizes[i]].
+    
+    If a range overlaps with the compile size, graph for compile size 
+    will be prioritized, i.e. if we have a range [1, 8] and a compile size 4,
+    graph for compile size 4 will be compiled and used instead of the graph
+    for range [1, 8].
+    """
+
     inductor_compile_config: dict = field(default_factory=dict)
     """Additional configurations for inductor.
     - None: use default configurations."""
@@ -1153,3 +1179,13 @@ class CompilationConfig:
                     self.bs_to_padded_graph_size[bs] = start
                 else:
                     self.bs_to_padded_graph_size[bs] = end
+
+    def get_compile_ranges(self) -> list[Range]:
+        """Get the compile ranges for the compilation config."""
+        if self.compile_ranges_split_points is None:
+            return []
+        split_points = sorted(set(self.compile_ranges_split_points))
+        return [
+            Range(start=s + 1, end=e)
+            for s, e in zip([0] + split_points[:-1], split_points)
+        ]
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 3124fcf00..93da3fd41 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -10,7 +10,7 @@ import json
 import pathlib
 import textwrap
 from collections.abc import Iterable, Mapping, Sequence, Set
-from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
+from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
 from itertools import pairwise
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar
 
@@ -322,3 +322,35 @@ def handle_deprecated(
 
     for new_name in new_names:
         setattr(config, new_name, old_val)
+
+
+@dataclass
+class Range:
+    """
+    A range of numbers.
+    Inclusive of start, inclusive of end.
+    """
+
+    start: int
+    end: int
+
+    def is_single_size(self) -> bool:
+        return self.start == self.end
+
+    def __contains__(self, size: int) -> bool:
+        # Inclusive of start, inclusive of end
+        return self.start <= size <= self.end
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Range):
+            return False
+        return self.start == other.start and self.end == other.end
+
+    def __hash__(self) -> int:
+        return hash((self.start, self.end))
+
+    def __str__(self) -> str:
+        return f"({self.start}, {self.end})"
+
+    def __repr__(self) -> str:
+        return self.__str__()
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index ce3d3b208..47e7ffded 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -725,6 +725,8 @@ class VllmConfig:
                 "--kv-sharing-fast-prefill requires changes on model side for "
                 "correctness and to realize prefill savings. "
             )
+        # TODO: Move after https://github.com/vllm-project/vllm/pull/26847 lands
+        self._set_compile_ranges()
 
         if self.model_config and self.model_config.is_encoder_decoder:
             from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -1126,6 +1128,52 @@ class VllmConfig:
         # complete the remaining process.
         self.compilation_config.post_init_cudagraph_sizes()
 
+    def _set_compile_ranges(self):
+        """
+        Set the compile ranges for the compilation config.
+        """
+        compilation_config = self.compilation_config
+        computed_compile_ranges_split_points = []
+
+        # The upper bound of the compile ranges is the max_num_batched_tokens
+        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+        if max_num_batched_tokens is not None:
+            computed_compile_ranges_split_points.append(max_num_batched_tokens)
+
+        # Add the compile ranges for flashinfer
+        if compilation_config.pass_config.fuse_allreduce_rms:
+            tp_size = self.parallel_config.tensor_parallel_size
+            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
+            if max_size is not None:
+                max_token_num = max_size // (
+                    self.model_config.get_hidden_size()
+                    * self.model_config.dtype.itemsize
+                )
+                if (
+                    max_num_batched_tokens is not None
+                    and max_token_num < max_num_batched_tokens
+                ):
+                    computed_compile_ranges_split_points.append(max_token_num)
+                else:
+                    logger.debug(
+                        "Max num batched tokens below allreduce-rms fusion threshold, "
+                        "allreduce-rms fusion will be enabled for all num_tokens."
+                    )
+
+        if compilation_config.compile_ranges_split_points is not None:
+            for x in compilation_config.compile_ranges_split_points:
+                assert isinstance(x, int)
+                assert x > 0, f"Invalid compile range split point: {x}"
+                if (
+                    max_num_batched_tokens is not None
+                    and x < max_num_batched_tokens
+                    and x > 1
+                ):
+                    computed_compile_ranges_split_points.append(x)
+        compilation_config.compile_ranges_split_points = sorted(
+            computed_compile_ranges_split_points
+        )
+
     def recalculate_max_model_len(self, max_model_len: int):
         # Can only be called in try_verify_and_update_config
         model_config = self.model_config
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index d189d0860..a46ec2bd1 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -15,6 +15,7 @@ import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.config import CUDAGraphMode, VllmConfig
+from vllm.config.compilation import CompilationMode
 from vllm.distributed import (
     ensure_model_parallel_initialized,
     init_distributed_environment,
@@ -407,15 +408,31 @@ class Worker(WorkerBase):
             self.model_runner.initialize_kv_cache(kv_cache_config)
 
     def compile_or_warm_up_model(self) -> None:
-        # warm up sizes that are not in cudagraph capture sizes,
-        # but users still want to compile for better performance,
-        # e.g. for the max-num-batched token size in chunked prefill.
-        compile_sizes = self.vllm_config.compilation_config.compile_sizes
-        warmup_sizes = compile_sizes.copy() if compile_sizes is not None else []
-        if not self.model_config.enforce_eager:
-            capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes
-            if capture_sizes is not None:
-                warmup_sizes = [x for x in warmup_sizes if x not in capture_sizes]
+        warmup_sizes = []
+
+        if self.vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+            # warm up sizes that are not in cudagraph capture sizes,
+            # but users still want to compile for better performance,
+            # e.g. for the max-num-batched token size in chunked prefill.
+            compile_sizes = self.vllm_config.compilation_config.compile_sizes
+            warmup_sizes = compile_sizes.copy() if compile_sizes is not None else []
+            cg_capture_sizes: list[int] = []
+
+            if self.vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
+                cg_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes
+                cg_capture_sizes = [] if cg_sizes is None else cg_sizes
+                warmup_sizes = [x for x in warmup_sizes if x not in cg_capture_sizes]
+
+            compile_ranges = self.vllm_config.compilation_config.get_compile_ranges()
+            # For each compile_range, if none of the batch sizes
+            # in warmup_sizes or cudagraph_capture_sizes are in the range,
+            # add the end of the range to ensure compilation/warmup.
+            all_sizes = set(cg_capture_sizes)
+            all_sizes.update([x for x in warmup_sizes if isinstance(x, int)])
+            for compile_range in compile_ranges:
+                if not any(x in compile_range for x in all_sizes):
+                    warmup_sizes.append(compile_range.end)
+
         # We skip EPLB here since we don't want to record dummy metrics
         for size in sorted(warmup_sizes, reverse=True):
             logger.info("Compile and warming up model for size %d", size)
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 427a0d296..0b0e2006d 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -337,7 +337,7 @@ def is_residual_scattered_for_sp(
     The residual tensor is scattered across tensor parallel ranks when sequence
     parallelism and tensor parallelism is enabled.
 
-    This follows the same logic as SequenceParallelismPass.is_applicable():
+    This follows the same logic as SequenceParallelismPass.is_applicable_for_range():
     - In full-graph compilation mode (no splitting ops or using inductor graph
       partition), SP is always applied
     - Otherwise, SP is only applied for specific shapes in compile_sizes
-- 
GitLab


From adb315060cd9943b38e68141cd9a54421144ae64 Mon Sep 17 00:00:00 2001
From: Tova Movshovitz <tovam@pliops.com>
Date: Fri, 5 Dec 2025 20:33:26 +0200
Subject: [PATCH 138/258] [KVConnector][Feature] Support KV connector cache
 reset via /reset_prefix_cache (#27170)

Signed-off-by: tovam <tovam@pliops.com>
Signed-off-by: Tova Movshovitz <tovam@pliops.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../kv_transfer/kv_connector/v1/base.py       | 14 ++++++++++
 .../kv_connector/v1/multi_connector.py        |  4 +++
 vllm/engine/protocol.py                       |  6 +++--
 vllm/entrypoints/llm.py                       |  8 ++++--
 vllm/entrypoints/openai/api_server.py         | 21 ++++++++++++---
 vllm/v1/core/sched/interface.py               |  4 ++-
 vllm/v1/core/sched/scheduler.py               | 22 +++++++++++++++-
 vllm/v1/engine/async_llm.py                   |  8 ++++--
 vllm/v1/engine/core.py                        |  8 ++++--
 vllm/v1/engine/core_client.py                 | 26 +++++++++++++------
 vllm/v1/engine/llm_engine.py                  |  8 ++++--
 11 files changed, 105 insertions(+), 24 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index d37ec2567..8e9182a9b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -573,3 +573,17 @@ class KVConnectorBase_V1(ABC):
         expose connector transfer stats via Prometheus.
         """
         return None
+
+    def reset_cache(self) -> bool | None:
+        """
+        Reset the connector's internal cache.
+
+        Returns:
+            bool: True if the cache was successfully reset, False otherwise.
+        """
+        logger.debug(
+            "Connector cache reset requested, but %s does not implement reset_cache().",
+            type(self).__name__,
+        )
+
+        return None
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 51d5df6c6..c80dc1a56 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -452,3 +452,7 @@ class MultiConnector(KVConnectorBase_V1):
             per_engine_labelvalues,
             prom_metrics,
         )
+
+    def reset_cache(self) -> bool:
+        results = [c.reset_cache() is not False for c in self._connectors]
+        return all(results)
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 1b6330c9f..d94951a0c 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -116,8 +116,10 @@ class EngineClient(ABC):
         ...
 
     @abstractmethod
-    async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
-        """Reset the prefix cache"""
+    async def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        """Reset the prefix cache and optionally any configured connector cache"""
         ...
 
     @abstractmethod
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 481a47a97..add917634 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1491,8 +1491,12 @@ class LLM:
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()
 
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
-        return self.llm_engine.reset_prefix_cache(reset_running_requests)
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        return self.llm_engine.reset_prefix_cache(
+            reset_running_requests, reset_connector
+        )
 
     def sleep(self, level: int = 1):
         """
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2fa6afa2b..7be601d82 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -663,14 +663,27 @@ if envs.VLLM_SERVER_DEV_MODE:
 
     @router.post("/reset_prefix_cache")
     async def reset_prefix_cache(
-        raw_request: Request, reset_running_requests: bool = Query(default=False)
+        raw_request: Request,
+        reset_running_requests: bool = Query(default=False),
+        reset_external: bool = Query(default=False),
     ):
         """
-        Reset the prefix cache. Note that we currently do not check if the
-        prefix cache is successfully reset in the API server.
+        Reset the local prefix cache.
+
+        Optionally, if the query parameter `reset_external=true`
+        also resets the external (connector-managed) prefix cache.
+
+        Note that we currently do not check if the prefix cache
+        is successfully reset in the API server.
+
+        Example:
+            POST /reset_prefix_cache?reset_external=true
         """
         logger.info("Resetting prefix cache...")
-        await engine_client(raw_request).reset_prefix_cache(reset_running_requests)
+
+        await engine_client(raw_request).reset_prefix_cache(
+            reset_running_requests, reset_external
+        )
         return Response(status_code=200)
 
     @router.post("/reset_mm_cache")
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index c2f503ef2..596ab05ad 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -152,7 +152,9 @@ class SchedulerInterface(ABC):
         return self.has_unfinished_requests() or self.has_finished_requests()
 
     @abstractmethod
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
         """Reset the prefix cache for KV cache.
 
         This is particularly required when the model weights are live-updated.
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 75a7385df..0a8efa2fd 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1380,7 +1380,9 @@ class Scheduler(SchedulerInterface):
     def has_finished_requests(self) -> bool:
         return len(self.finished_req_ids) > 0
 
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
         """Reset the KV prefix cache.
 
         If reset_running_requests is True, all the running requests will be
@@ -1418,8 +1420,26 @@ class Scheduler(SchedulerInterface):
                 "the presence of running requests waiting for remote KV transfer, "
                 "which is not supported yet."
             )
+
+        if reset_connector:
+            reset_successful = self.reset_connector_cache() and reset_successful
+
         return reset_successful
 
+    def reset_connector_cache(self) -> bool:
+        if self.connector is None:
+            logger.warning("reset_connector called but no KV connector is configured.")
+            return False
+
+        if self.connector.reset_cache() is False:
+            return False
+
+        if self.log_stats:
+            assert self.connector_prefix_cache_stats is not None
+            self.connector_prefix_cache_stats.reset = True
+
+        return True
+
     def make_stats(
         self,
         spec_decoding_stats: SpecDecodingStats | None = None,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index ec5d6e95c..fd7e04dc0 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -749,8 +749,12 @@ class AsyncLLM(EngineClient):
         self.input_processor.clear_mm_cache()
         await self.engine_core.reset_mm_cache_async()
 
-    async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
-        return await self.engine_core.reset_prefix_cache_async(reset_running_requests)
+    async def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        return await self.engine_core.reset_prefix_cache_async(
+            reset_running_requests, reset_connector
+        )
 
     async def sleep(self, level: int = 1) -> None:
         await self.reset_prefix_cache()
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 8e34dfcea..3d3a1e138 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -503,8 +503,12 @@ class EngineCore:
 
         self.model_executor.reset_mm_cache()
 
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
-        return self.scheduler.reset_prefix_cache(reset_running_requests)
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        return self.scheduler.reset_prefix_cache(
+            reset_running_requests, reset_connector
+        )
 
     def sleep(self, level: int = 1):
         self.model_executor.sleep(level)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index afa059392..c936646aa 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -138,7 +138,9 @@ class EngineCoreClient(ABC):
     def reset_mm_cache(self) -> None:
         raise NotImplementedError
 
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
         raise NotImplementedError
 
     def sleep(self, level: int = 1) -> None:
@@ -209,7 +211,7 @@ class EngineCoreClient(ABC):
         raise NotImplementedError
 
     async def reset_prefix_cache_async(
-        self, reset_running_requests: bool = False
+        self, reset_running_requests: bool = False, reset_connector: bool = False
     ) -> bool:
         raise NotImplementedError
 
@@ -289,8 +291,12 @@ class InprocClient(EngineCoreClient):
     def reset_mm_cache(self) -> None:
         self.engine_core.reset_mm_cache()
 
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
-        return self.engine_core.reset_prefix_cache(reset_running_requests)
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        return self.engine_core.reset_prefix_cache(
+            reset_running_requests, reset_connector
+        )
 
     def sleep(self, level: int = 1) -> None:
         self.engine_core.sleep(level)
@@ -753,8 +759,12 @@ class SyncMPClient(MPClient):
     def reset_mm_cache(self) -> None:
         self.call_utility("reset_mm_cache")
 
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
-        return self.call_utility("reset_prefix_cache", reset_running_requests)
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        return self.call_utility(
+            "reset_prefix_cache", reset_running_requests, reset_connector
+        )
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.call_utility("add_lora", lora_request)
@@ -958,10 +968,10 @@ class AsyncMPClient(MPClient):
         await self.call_utility_async("reset_mm_cache")
 
     async def reset_prefix_cache_async(
-        self, reset_running_requests: bool = False
+        self, reset_running_requests: bool = False, reset_connector: bool = False
     ) -> bool:
         return await self.call_utility_async(
-            "reset_prefix_cache", reset_running_requests
+            "reset_prefix_cache", reset_running_requests, reset_connector
         )
 
     async def sleep_async(self, level: int = 1) -> None:
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 8772f2e48..4c3129100 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -328,8 +328,12 @@ class LLMEngine:
         self.input_processor.clear_mm_cache()
         self.engine_core.reset_mm_cache()
 
-    def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
-        return self.engine_core.reset_prefix_cache(reset_running_requests)
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        return self.engine_core.reset_prefix_cache(
+            reset_running_requests, reset_connector
+        )
 
     def sleep(self, level: int = 1):
         self.engine_core.sleep(level)
-- 
GitLab


From bff78310d979bf516b45d8908b5e621a170578b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Fri, 5 Dec 2025 20:23:33 +0100
Subject: [PATCH 139/258] [Enc-Dec] Fix OOT tokenizer issue (#30144)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 vllm/inputs/preprocess.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 2893a56b1..0372b06d0 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -198,7 +198,7 @@ class InputPreprocessor:
     ) -> dict[str, Any]:
         kwargs = dict[str, Any]()
 
-        if self.model_config.hf_config.model_type == "whisper":
+        if self.model_config.is_encoder_decoder:
             # For Whisper, special tokens should be provided by the user based
             # on the task and language of their request. Also needed to avoid
             # appending an EOS token to the prompt which disrupts generation.
@@ -573,7 +573,6 @@ class InputPreprocessor:
         """
         encoder_inputs: SingletonInputs
         decoder_inputs: SingletonInputs | None
-
         if is_explicit_encoder_decoder_prompt(prompt):
             # `cast` is needed for mypy, but not pyright
             prompt_ = cast(ExplicitEncoderDecoderPrompt, prompt)
@@ -585,7 +584,9 @@ class InputPreprocessor:
             if (decoder_input := prompt_["decoder_prompt"]) is None:
                 decoder_inputs = None
             else:
-                decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
+                decoder_inputs = self._prompt_to_llm_inputs(
+                    decoder_input, tokenization_kwargs=tokenization_kwargs
+                )
             # For multimodal model, override decoder prompt from processor
             # with explicit decoder prompt.
             if self.model_config.is_multimodal_model:
-- 
GitLab


From 3633035a3fdee20cca8a8deb72490dc9cacea0f8 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 5 Dec 2025 14:41:40 -0500
Subject: [PATCH 140/258] [Misc] Rename CohereForAI references to CohereLabs
 (#30147)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 docs/models/supported_models.md                           | 2 +-
 examples/offline_inference/vision_language.py             | 2 +-
 examples/offline_inference/vision_language_multi_image.py | 2 +-
 tests/distributed/test_pipeline_parallel.py               | 2 +-
 tests/models/multimodal/generation/test_common.py         | 4 ++--
 tests/models/registry.py                                  | 6 +++---
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 96d5ec25c..1089de87b 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -666,7 +666,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 |--------------|--------|--------|-------------------|----------------------|---------------------------|
 | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
-| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ |
+| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
 | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 0888a9d60..22802dddf 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -72,7 +72,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
 # Aya Vision
 def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
-    model_name = "CohereForAI/aya-vision-8b"
+    model_name = "CohereLabs/aya-vision-8b"
 
     engine_args = EngineArgs(
         model=model_name,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 560ca768d..28c466c03 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -76,7 +76,7 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
 
 
 def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "CohereForAI/aya-vision-8b"
+    model_name = "CohereLabs/aya-vision-8b"
 
     engine_args = EngineArgs(
         model=model_name,
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 89f035d2c..cc6251514 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -109,7 +109,7 @@ TEXT_GENERATION_MODELS = {
     "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
     "bigscience/bloomz-1b1": PPTestSettings.fast(),
     "zai-org/chatglm3-6b": PPTestSettings.fast(),
-    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
+    "CohereLabs/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
     "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
     "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
     "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index f896126a4..fd26b838a 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -278,7 +278,7 @@ VLM_TEST_SETTINGS = {
         marks=[large_gpu_mark(min_gb=64)],
     ),
     "aya_vision": VLMTestInfo(
-        models=["CohereForAI/aya-vision-8b"],
+        models=["CohereLabs/aya-vision-8b"],
         test_type=(VLMTestType.IMAGE),
         prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
         single_image_prompts=IMAGE_ASSETS.prompts(
@@ -294,7 +294,7 @@ VLM_TEST_SETTINGS = {
         vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
     ),
     "aya_vision-multi_image": VLMTestInfo(
-        models=["CohereForAI/aya-vision-8b"],
+        models=["CohereLabs/aya-vision-8b"],
         test_type=(VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
         single_image_prompts=IMAGE_ASSETS.prompts(
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 352abdd2d..020cb7493 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -211,10 +211,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         trust_remote_code=True,
     ),
     "CohereForCausalLM": _HfExamplesInfo(
-        "CohereForAI/c4ai-command-r-v01", trust_remote_code=True
+        "CohereLabs/c4ai-command-r-v01", trust_remote_code=True
     ),
     "Cohere2ForCausalLM": _HfExamplesInfo(
-        "CohereForAI/c4ai-command-r7b-12-2024",
+        "CohereLabs/c4ai-command-r7b-12-2024",
         trust_remote_code=True,
     ),
     "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"),
@@ -581,7 +581,7 @@ _AUTOMATIC_CONVERTED_MODELS = {
 _MULTIMODAL_EXAMPLE_MODELS = {
     # [Decoder-only]
     "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
-    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"),
+    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
     "BeeForConditionalGeneration": _HfExamplesInfo(
         "Open-Bee/Bee-8B-RL",
         trust_remote_code=True,
-- 
GitLab


From e23ca3a0e8bbf8d1e386bd619e0056ec7255810c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Fri, 5 Dec 2025 20:47:37 +0100
Subject: [PATCH 141/258] [CI] Re-use whisper_client for all tests (#30148)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 .../test_transcription_validation_whisper.py  | 30 ++++++++-----------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py
index 47cd7b1f1..3c507ee0a 100644
--- a/tests/entrypoints/openai/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py
@@ -32,24 +32,20 @@ async def whisper_client(server):
 
 
 @pytest.mark.asyncio
-async def test_basic_audio(mary_had_lamb):
-    server_args = ["--enforce-eager"]
-
+async def test_basic_audio(whisper_client, mary_had_lamb):
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
-    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
-        client = remote_server.get_async_client()
-        transcription = await client.audio.transcriptions.create(
-            model=MODEL_NAME,
-            file=mary_had_lamb,
-            language="en",
-            response_format="text",
-            temperature=0.0,
-        )
-        out = json.loads(transcription)
-        out_text = out["text"]
-        out_usage = out["usage"]
-        assert "Mary had a little lamb," in out_text
-        assert out_usage["seconds"] == 16, out_usage["seconds"]
+    transcription = await whisper_client.audio.transcriptions.create(
+        model=MODEL_NAME,
+        file=mary_had_lamb,
+        language="en",
+        response_format="text",
+        temperature=0.0,
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    out_usage = out["usage"]
+    assert "Mary had a little lamb," in out_text
+    assert out_usage["seconds"] == 16, out_usage["seconds"]
 
 
 @pytest.mark.asyncio
-- 
GitLab


From 962d703818c038b6ea2ae18eab8061a6f2fb8f65 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Fri, 5 Dec 2025 13:57:26 -0600
Subject: [PATCH 142/258] [Bugfix][llama4_eagle] Fix missing 'lm_head'
 attribute (#29926)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
---
 tests/v1/e2e/test_spec_decode.py           |  6 +++++-
 vllm/model_executor/models/llama4_eagle.py | 13 +++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 5246ea651..575a6a151 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -402,7 +402,11 @@ def test_eagle_correctness(
             # Scout requires default backend selection
             # because vision encoder has head_dim 88 being incompatible
             #  with FLASH_ATTN and needs to fall back to Flex Attn
-            pass
+
+            # pass if not ROCm
+            if current_platform.is_rocm():
+                # TODO: Enable Flex Attn for spec_decode on ROCm
+                pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
         else:
             m.setenv("VLLM_MLA_DISABLE", "1")
             m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index 0146b3057..02f5b5ff6 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -28,7 +28,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.torchao import TorchAOConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama4 import Llama4DecoderLayer, Llama4ForCausalLM
 from vllm.model_executor.models.utils import extract_layer_index
@@ -182,6 +185,12 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
             self.config.vocab_size, scale=logit_scale
         )
 
+        self.lm_head = ParallelLMHead(
+            self.config.draft_vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+
         # Set MoE hyperparameters
         self.set_moe_parameters()
 
@@ -211,6 +220,6 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
         loader = AutoWeightsLoader(
             self,
             # lm_head is tied with target model (Llama4ForCausalLM)
-            skip_prefixes=(["lm_head."]),
+            skip_prefixes=([]),
         )
         loader.load_weights(map(transform, weights))
-- 
GitLab


From 77e44728090cf11b6ece2ae25d7c732a065fbb45 Mon Sep 17 00:00:00 2001
From: Bangsheng Tang <5318912+bangshengtang@users.noreply.github.com>
Date: Fri, 5 Dec 2025 13:33:42 -0800
Subject: [PATCH 143/258] let draft model follow target model's config_format
 (#30152)

---
 vllm/config/speculative.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index c6d6f705f..bf533bf14 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -337,6 +337,7 @@ class SpeculativeConfig:
                     enforce_eager=self.target_model_config.enforce_eager,
                     max_logprobs=self.target_model_config.max_logprobs,
                     hf_overrides=SpeculativeConfig.hf_config_override,
+                    config_format=self.target_model_config.config_format,
                 )
 
                 # Automatically detect the method
-- 
GitLab


From 7b5575fa7dcf76ac86ab8d18501b9cc04f74f6bb Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 5 Dec 2025 16:42:12 -0500
Subject: [PATCH 144/258] [Bug] Fix vLLM config is not set error (#29999)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .../layers/fused_moe/cutlass_moe.py           |  2 +
 .../fused_moe/fused_moe_modular_method.py     |  6 ++
 .../layers/fused_moe/modular_kernel.py        | 57 ++++++++++---------
 .../compressed_tensors_moe.py                 |  3 +
 .../quantization/utils/flashinfer_utils.py    |  6 ++
 5 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index 6753a1925..30144ca54 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -460,6 +460,7 @@ def cutlass_moe_fp8(
     expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
+    parallel_config=None,
 ) -> torch.Tensor:
     """
     This function computes a a8w8-quantized Mixture of Experts (MoE) layer
@@ -537,6 +538,7 @@ def cutlass_moe_fp8(
             c_strides2=c_strides2,
             quant_config=quant_config,
         ),
+        parallel_config=parallel_config,
     )
 
     return fn(
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
index c23c41df2..b33e7fd8a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -44,6 +44,11 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
         prepare_finalize: FusedMoEPrepareAndFinalize,
         shared_experts: torch.nn.Module | None,
     ) -> "FusedMoEModularMethod":
+        parallel_config = getattr(
+            getattr(moe_layer, "vllm_config", None),
+            "parallel_config",
+            None,
+        )
         return FusedMoEModularMethod(
             old_quant_method,
             FusedMoEModularKernel(
@@ -51,6 +56,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
                 old_quant_method.select_gemm_impl(prepare_finalize, moe_layer),
                 shared_experts,
                 getattr(moe_layer, "shared_experts_stream", None),
+                parallel_config=parallel_config,
             ),
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index b2af58cdc..51d3299e7 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -10,7 +10,7 @@ from typing import final
 import torch
 
 import vllm.envs as envs
-from vllm.config import get_current_vllm_config
+from vllm.config import ParallelConfig, get_current_vllm_config
 from vllm.forward_context import get_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
@@ -716,6 +716,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         fused_experts: FusedMoEPermuteExpertsUnpermute,
         shared_experts: torch.nn.Module | None = None,
         shared_experts_stream: torch.cuda.Stream | None = None,
+        parallel_config: ParallelConfig | None = None,
     ):
         super().__init__()
         self.prepare_finalize = prepare_finalize
@@ -723,6 +724,14 @@ class FusedMoEModularKernel(torch.nn.Module):
         self.shared_experts = shared_experts
         self.shared_experts_stream = shared_experts_stream
 
+        # cache whether this worker is using DP+EP
+        if parallel_config is None:
+            parallel_config = get_current_vllm_config().parallel_config
+        self.is_dp_ep = (
+            parallel_config.data_parallel_size > 1
+            and parallel_config.enable_expert_parallel
+        )
+
         self._post_init_setup()
         assert (
             prepare_finalize.activation_format == fused_experts.activation_formats[0]
@@ -811,33 +820,27 @@ class FusedMoEModularKernel(torch.nn.Module):
             is_forward_context_available()
             and get_forward_context().attn_metadata is None
         )
-        if is_profile_run and self.fused_experts.supports_chunking():
-            parallel_config = get_current_vllm_config().parallel_config
-            is_dp_ep = (
-                parallel_config.data_parallel_size > 1
-                and parallel_config.enable_expert_parallel
-            )
-            if is_dp_ep:
-                max_workspace_13, max_workspace_2, max_fused_out_shape = (
-                    self.fused_experts.workspace_shapes(
-                        envs.VLLM_FUSED_MOE_CHUNK_SIZE,
-                        N,
-                        K,
-                        top_k,
-                        global_num_experts,
-                        local_num_experts,
-                        expert_tokens_meta,
-                    )
-                )
-                buffers.workspace13.get(
-                    max_workspace_13, device=device, dtype=workspace_dtype
-                )
-                buffers.workspace2.get(
-                    max_workspace_2, device=device, dtype=workspace_dtype
-                )
-                buffers.fused_out.get(
-                    max_fused_out_shape, device=device, dtype=workspace_dtype
+        if is_profile_run and self.fused_experts.supports_chunking() and self.is_dp_ep:
+            max_workspace_13, max_workspace_2, max_fused_out_shape = (
+                self.fused_experts.workspace_shapes(
+                    envs.VLLM_FUSED_MOE_CHUNK_SIZE,
+                    N,
+                    K,
+                    top_k,
+                    global_num_experts,
+                    local_num_experts,
+                    expert_tokens_meta,
                 )
+            )
+            buffers.workspace13.get(
+                max_workspace_13, device=device, dtype=workspace_dtype
+            )
+            buffers.workspace2.get(
+                max_workspace_2, device=device, dtype=workspace_dtype
+            )
+            buffers.fused_out.get(
+                max_fused_out_shape, device=device, dtype=workspace_dtype
+            )
 
         # Get intermediate workspace shapes based off the chunked M size.
         workspace13_shape, workspace2_shape, _ = self.fused_experts.workspace_shapes(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index d7fb6d2ca..8013b29f7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1287,6 +1287,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                     ab_strides2=self.ab_strides2,
                     c_strides1=self.c_strides1,
                     c_strides2=self.ab_strides1_c_strides2,
+                    parallel_config=getattr(
+                        getattr(layer, "vllm_config", None), "parallel_config", None
+                    ),
                 )
 
         else:
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index eef7a0896..00c2720a3 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -247,6 +247,11 @@ def flashinfer_cutlass_moe_fp8(
     assert quant_config is not None
 
     # Construct modular kernel with block-scale support when requested.
+    parallel_config = getattr(
+        getattr(layer, "vllm_config", None),
+        "parallel_config",
+        None,
+    )
     fused_experts = mk.FusedMoEModularKernel(
         build_flashinfer_fp8_cutlass_moe_prepare_finalize(
             moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
@@ -257,6 +262,7 @@ def flashinfer_cutlass_moe_fp8(
             out_dtype=hidden_states.dtype,
             use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
         ),
+        parallel_config=parallel_config,
     )
 
     return fused_experts(
-- 
GitLab


From 02a41691932683aa544b8a0139586f43e2f8b4bd Mon Sep 17 00:00:00 2001
From: Deboleina <debroy@redhat.com>
Date: Fri, 5 Dec 2025 22:03:29 -0500
Subject: [PATCH 145/258] [Tests] Tool call tests for openai/gpt-oss-20b
 (#26237)

Signed-off-by: Debolina Roy <debroy@redhat.com>
---
 requirements/rocm-test.txt                    |   1 +
 .../tool_parsers/test_openai_tool_parser.py   | 359 ++++++++++++++++++
 2 files changed, 360 insertions(+)
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py

diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index 9d3d711c3..f25835c68 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -49,6 +49,7 @@ blobfile==3.0.0
     # Multi-Modal Models Test
 decord==0.6.0
     # video processing, required by entrypoints/openai/test_video.py
+rapidfuzz==3.12.1
 
 # OpenAI compatibility and testing
 gpt-oss==0.0.8
diff --git a/tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py
new file mode 100644
index 000000000..7cb87fd13
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py
@@ -0,0 +1,359 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import jsonschema
+import openai
+import pytest
+import pytest_asyncio
+from rapidfuzz import fuzz
+
+from ....utils import RemoteOpenAIServer
+
+MODEL_NAME = "openai/gpt-oss-20b"
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "openai",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    """Async fixture providing an OpenAI-compatible vLLM client."""
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+# ==========================================================
+# Tool Definitions
+# ==========================================================
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "calculator",
+            "description": "Performs basic arithmetic calculations.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "expression": {
+                        "type": "string",
+                        "description": (
+                            "Arithmetic expression to evaluate, e.g. '123 + 456'."
+                        ),
+                    }
+                },
+                "required": ["expression"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_time",
+            "description": "Retrieves the current local time for a given city.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string",
+                        "description": "City name, e.g. 'New York'.",
+                    }
+                },
+                "required": ["city"],
+            },
+        },
+    },
+]
+
+
+# ==========================================================
+# Message Examples
+# ==========================================================
+MESSAGES_CALC = [
+    {"role": "user", "content": "Calculate 123 + 456 using the calculator."}
+]
+
+MESSAGES_GET_TIME = [
+    {"role": "user", "content": "What is the current time in New York?"}
+]
+
+MESSAGES_MULTIPLE_CALLS = [
+    {
+        "role": "system",
+        "content": (
+            "You can call multiple tools. "
+            "When using more than one, return single JSON object with tool_calls array"
+            "containing each tool call with its function name and arguments. "
+            "Do not output multiple JSON objects separately."
+        ),
+    },
+    {
+        "role": "user",
+        "content": "First, calculate 7 * 8 using the calculator. "
+        "Then, use get_time to tell me the current time in New York.",
+    },
+]
+
+MESSAGES_INVALID_CALL = [
+    {
+        "role": "user",
+        "content": "Can you help with something, "
+        "but don’t actually perform any calculation?",
+    }
+]
+
+
+# Expected outputs
+FUNC_CALC = "calculator"
+FUNC_ARGS_CALC = '{"expression":"123 + 456"}'
+
+FUNC_TIME = "get_time"
+FUNC_ARGS_TIME = '{"city": "New York"}'
+
+
+# ==========================================================
+# Utility to extract reasoning and tool calls
+# ==========================================================
+def extract_reasoning_and_calls(chunks: list) -> tuple[str, list[str], list[str]]:
+    """
+    Extract accumulated reasoning text and tool call arguments
+    from streaming chunks.
+    """
+    reasoning_content: str = ""
+    tool_calls: dict[int, dict[str, str]] = {}
+
+    for chunk in chunks:
+        choice = getattr(chunk.choices[0], "delta", None)
+        if not choice:
+            continue
+
+        if hasattr(choice, "reasoning_content") and choice.reasoning_content:
+            reasoning_content += choice.reasoning_content
+
+        for tc in getattr(choice, "tool_calls", []) or []:
+            idx = getattr(tc, "index", 0)
+            tool_entry = tool_calls.setdefault(idx, {"name": "", "arguments": ""})
+
+            if getattr(tc, "function", None):
+                func = tc.function
+                if getattr(func, "name", None):
+                    tool_entry["name"] = func.name
+                if getattr(func, "arguments", None):
+                    tool_entry["arguments"] += func.arguments
+
+    function_names: list[str] = [v["name"] for _, v in sorted(tool_calls.items())]
+    arguments: list[str] = [v["arguments"] for _, v in sorted(tool_calls.items())]
+
+    return reasoning_content, arguments, function_names
+
+
+# ==========================================================
+# Test Scenarios
+# ==========================================================
+@pytest.mark.asyncio
+async def test_calculator_tool_call_and_argument_accuracy(client: openai.AsyncOpenAI):
+    """Verify calculator tool call is made and arguments are accurate."""
+
+    response = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES_CALC,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=False,
+    )
+
+    message = response.choices[0].message
+    tool_calls = getattr(message, "tool_calls", [])
+    assert tool_calls, "No tool calls detected"
+
+    calc_call = next((c for c in tool_calls if c.function.name == FUNC_CALC), None)
+    assert calc_call, "Calculator function not called"
+
+    raw_args = calc_call.function.arguments
+    assert raw_args, "Calculator arguments missing"
+    assert "123" in raw_args and "456" in raw_args, (
+        f"Expected values not in raw arguments: {raw_args}"
+    )
+
+    try:
+        parsed_args = json.loads(raw_args)
+    except json.JSONDecodeError:
+        pytest.fail(f"Invalid JSON in calculator arguments: {raw_args}")
+
+    expected_expr = "123 + 456"
+    actual_expr = parsed_args.get("expression", "")
+    similarity = fuzz.ratio(actual_expr, expected_expr)
+
+    assert similarity > 90, (
+        f"Expression mismatch: expected '{expected_expr}' "
+        f"got '{actual_expr}' (similarity={similarity}%)"
+    )
+
+
+@pytest.mark.asyncio
+async def test_streaming_tool_call_get_time_with_reasoning(client: openai.AsyncOpenAI):
+    """Verify streamed reasoning and tool call behavior for get_time."""
+
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES_GET_TIME,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=True,
+    )
+
+    chunks = [chunk async for chunk in stream]
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+
+    assert FUNC_TIME in function_names, "get_time function not called"
+
+    assert any("New York" in arg for arg in arguments), (
+        f"Expected get_time arguments for New York not found in {arguments}"
+    )
+
+    assert len(reasoning) > 0, "Expected reasoning content missing"
+
+    assert any(keyword in reasoning for keyword in ["New York", "time", "current"]), (
+        f"Reasoning is not relevant to the request: {reasoning}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_streaming_multiple_tools(client: openai.AsyncOpenAI):
+    """Test streamed multi-tool response with reasoning."""
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES_MULTIPLE_CALLS,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=True,
+    )
+
+    chunks = [chunk async for chunk in stream]
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+
+    try:
+        assert FUNC_CALC in function_names, (
+            f"Calculator tool missing — found {function_names}"
+        )
+        assert FUNC_TIME in function_names, (
+            f"Time tool missing — found {function_names}"
+        )
+        assert len(reasoning) > 0, "Expected reasoning content in streamed response"
+    except AssertionError as e:
+        print(f"ERROR: {e}")
+
+
+@pytest.mark.asyncio
+async def test_invalid_tool_call(client: openai.AsyncOpenAI):
+    """
+    Verify that ambiguous instructions that should not trigger a tool
+    do not produce any tool calls.
+    """
+    response = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES_INVALID_CALL,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=False,
+    )
+
+    message = response.choices[0].message
+
+    assert message is not None, "Expected message in response"
+    assert hasattr(message, "content"), "Expected 'content' field in message"
+
+    tool_calls = getattr(message, "tool_calls", [])
+    assert not tool_calls, (
+        f"Model unexpectedly attempted a tool call on invalid input: {tool_calls}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_tool_call_with_temperature(client: openai.AsyncOpenAI):
+    """
+    Verify model produces valid tool or text output
+    under non-deterministic sampling.
+    """
+    response = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES_CALC,
+        tools=TOOLS,
+        temperature=0.7,
+        stream=False,
+    )
+
+    message = response.choices[0].message
+    assert message is not None, "Expected non-empty message in response"
+    assert message.tool_calls or message.content, (
+        "Response missing both text and tool calls"
+    )
+
+    print(f"\nTool calls: {message.tool_calls}")
+    print(f"Text: {message.content}")
+
+
+@pytest.mark.asyncio
+async def test_tool_response_schema_accuracy(client: openai.AsyncOpenAI):
+    """Validate that tool call arguments adhere to their declared JSON schema."""
+    response = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES_MULTIPLE_CALLS,
+        tools=TOOLS,
+        temperature=0.0,
+    )
+
+    calls = response.choices[0].message.tool_calls
+    assert calls, "No tool calls produced"
+
+    for call in calls:
+        func_name = call.function.name
+        args = json.loads(call.function.arguments)
+
+        schema: dict[str, object] | None = None
+        for tool_entry in TOOLS:
+            function_def = tool_entry.get("function")
+            if (
+                function_def
+                and isinstance(function_def, dict)
+                and function_def.get("name") == func_name
+            ):
+                schema = function_def.get("parameters")
+                break
+
+        assert schema is not None, f"No matching tool schema found for {func_name}"
+
+        jsonschema.validate(instance=args, schema=schema)
+
+
+@pytest.mark.asyncio
+async def test_semantic_consistency_with_temperature(client: openai.AsyncOpenAI):
+    """Test that temperature variation doesn't cause contradictory reasoning."""
+    responses = []
+    for temp in [0.0, 0.5, 1.0]:
+        resp = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=MESSAGES_CALC,
+            tools=TOOLS,
+            temperature=temp,
+        )
+        text = (resp.choices[0].message.content or "").strip()
+        responses.append(text)
+
+    # Compare fuzzy similarity between low- and mid-temperature outputs
+    low_mid_sim = fuzz.ratio(responses[0], responses[1])
+    assert low_mid_sim > 60, (
+        f"Semantic drift too large between T=0.0 and T=0.5 ({low_mid_sim}%)"
+    )
-- 
GitLab


From dc839ad03d31104c8ebcb0b8f5a75021f1796760 Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Fri, 5 Dec 2025 22:52:11 -0600
Subject: [PATCH 146/258] [CI/Build][AMD][Quantization] Fix test_int8_kernel.py
 by updating int8_utils to use hip.libdevice.round (#30151)

Signed-off-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
---
 .../layers/quantization/utils/int8_utils.py   | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 925d0a516..32192225f 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -83,26 +83,11 @@ def block_dequant(
 
 
 if current_platform.is_rocm():
-    from triton.language import core
-
-    # NOTE: This can be removed when hip.libdevice.round() is available.
-    @core.extern
-    def round_f32(arg0, _builder=None):
-        return core.extern_elementwise(
-            "",
-            "",
-            [arg0],
-            {
-                (core.dtype("fp32"),): ("llvm.round", core.dtype("fp32")),
-                (core.dtype("fp64"),): ("llvm.round", core.dtype("fp64")),
-            },
-            is_pure=True,
-            _builder=_builder,
-        )
 
     @triton.jit
     def round_int8(x):
-        return round_f32(x).to(tl.int8)
+        return tl.extra.hip.libdevice.round(x).to(tl.int8)
+
 else:
 
     @triton.jit
-- 
GitLab


From 7e31c3a3f699b2ec4bf18cb195edaf7270c1434c Mon Sep 17 00:00:00 2001
From: Samuel Shen <102553648+sammshen@users.noreply.github.com>
Date: Fri, 5 Dec 2025 20:53:34 -0800
Subject: [PATCH 147/258] [CI]: Remove unnecessary imports from
 test_lmache_integration (#30157)

Signed-off-by: Samuel Shen <slshen@uchicago.edu>
Co-authored-by: Samuel Shen <slshen@uchicago.edu>
---
 .../unit/test_lmcache_integration.py          | 62 +------------------
 1 file changed, 2 insertions(+), 60 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py
index 33418edc3..cfe8d810c 100644
--- a/tests/v1/kv_connector/unit/test_lmcache_integration.py
+++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py
@@ -64,22 +64,6 @@ def test_multimodal_interface():
     assumes(PlaceholderRange, "offset")
     assumes(PlaceholderRange, "length")
 
-    # test a minimal case
-    import torch
-
-    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
-        apply_mm_hashes_to_token_ids,
-    )
-
-    token_ids = torch.arange(10, dtype=torch.long)
-    mm_hashes = ["0000", "1111"]  # hex repr of 0 and 4369
-    mm_positions = [
-        PlaceholderRange(offset=0, length=4),
-        PlaceholderRange(offset=5, length=4),
-    ]
-    apply_mm_hashes_to_token_ids(token_ids, mm_hashes, mm_positions)
-    assert token_ids.tolist() == [0, 0, 0, 0, 4, 4369, 4369, 4369, 4369, 9]
-
 
 @pytest.mark.skipif(
     current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm"
@@ -122,16 +106,6 @@ def test_config_interface():
     assumes(CacheConfig, "block_size")
     assumes(CacheConfig, "gpu_memory_utilization")
 
-    # mla metadata minimal cases
-    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
-        mla_enabled,
-    )
-
-    model_config = ModelConfig(model="deepseek-ai/DeepSeek-R1")
-    assert mla_enabled(model_config)
-    model_config = ModelConfig(model="Qwen/Qwen3-0.6B")
-    assert not mla_enabled(model_config)
-
     # kv metadata minimal case
     from vllm.utils.torch_utils import get_kv_cache_torch_dtype
 
@@ -139,7 +113,7 @@ def test_config_interface():
     parallel_config = ParallelConfig()
     cache_config = CacheConfig(cache_dtype="bfloat16")
     kv_dtype = get_kv_cache_torch_dtype(cache_config.cache_dtype, model_config.dtype)
-    use_mla = mla_enabled(model_config)
+    use_mla = False
     chunk_size = 256
     num_layer = model_config.get_num_layers(parallel_config)
     num_kv_head = model_config.get_num_kv_heads(parallel_config)
@@ -184,43 +158,11 @@ def test_request_interface():
     assumes(req, "num_tokens")
     assumes(req, "kv_transfer_params", is_instance_of=(dict, NoneType))
 
-    from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem
+    from vllm.multimodal.inputs import MultiModalFeatureSpec
 
     assumes(MultiModalFeatureSpec, "identifier")
     assumes(MultiModalFeatureSpec, "mm_position")
 
-    # minimal case:
-    from vllm.multimodal.inputs import PlaceholderRange
-
-    request = Request(
-        request_id="test_request",
-        prompt_token_ids=[1, 2, 3],
-        sampling_params=SamplingParams(max_tokens=10),
-        pooling_params=None,
-        eos_token_id=100,
-        lora_request=None,
-        mm_features=[
-            MultiModalFeatureSpec(
-                modality="image",
-                identifier="0000",
-                data=MultiModalKwargsItem.dummy("dummy_m"),
-                mm_position=PlaceholderRange(offset=0, length=10),
-            )
-        ],
-    )
-
-    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
-        extract_mm_features,
-    )
-
-    mm_hashes, mm_positions = extract_mm_features(request)
-    assert isinstance(mm_hashes, list)
-    assert len(mm_hashes) == 1
-    assert isinstance(mm_positions, list)
-    assert len(mm_positions) == 1
-    assert mm_positions[0].offset == 0
-    assert mm_positions[0].length == 10
-
 
 def test_new_request_interface():
     # protect against interface changes
-- 
GitLab


From bf4a901af91e431a4cdb51ed4557b31bf89c0e5d Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 6 Dec 2025 04:53:52 +0000
Subject: [PATCH 148/258] Better error when world size is larger than node and
 `distributed_executor_backend` is not set (#30140)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/parallel.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 3a768bcd4..0327832c4 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -184,13 +184,14 @@ class ParallelConfig:
     distributed_executor_backend: (
         str | DistributedExecutorBackend | type[Executor] | None
     ) = None
-    """Backend to use for distributed model
-    workers, either "ray" or "mp" (multiprocessing). If the product
-    of pipeline_parallel_size and tensor_parallel_size is less than
-    or equal to the number of GPUs available, "mp" will be used to
-    keep processing on a single host. Otherwise, this will default
-    to "ray" if Ray is installed and fail otherwise. Note that tpu
-    only support Ray for distributed inference."""
+    """Backend to use for distributed model workers, either "ray" or "mp"
+    (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size
+    is less than or equal to the number of GPUs available, "mp" will be used to
+    keep processing on a single host. Otherwise, an error will be raised. To use "mp"
+    you must also set nnodes, and to use "ray" you must manually set
+    distributed_executor_backend to "ray".
+
+    Note that tpu only support Ray for distributed inference."""
 
     worker_cls: str = "auto"
     """The full name of the worker class to use. If "auto", the worker class
@@ -566,8 +567,11 @@ class ParallelConfig:
             ):
                 gpu_count = cuda_device_count_stateless()
                 raise ValueError(
-                    f"Tensor parallel size ({self.world_size}) cannot be "
-                    f"larger than the number of available GPUs ({gpu_count})."
+                    f"World size ({self.world_size}) is larger than the number of "
+                    f"available GPUs ({gpu_count}) in this node. If this is "
+                    "intentional and you are using:\n"
+                    "- ray, set '--distributed-executor-backend ray'.\n"
+                    "- multiprocessing, set '--nnodes' appropriately."
                 )
             elif self.data_parallel_backend == "ray":
                 logger.info(
-- 
GitLab


From 62079d86004448a42c4205d00e4a977fe85b69a6 Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Fri, 5 Dec 2025 22:54:17 -0600
Subject: [PATCH 149/258] [CI/Build][AMD] Skip marlin, machete, and hadacore
 tests since these require _C functions not defined for ROCm (#30109)

Signed-off-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
---
 tests/kernels/quantization/test_hadacore.py    | 7 +++++++
 tests/kernels/quantization/test_machete_mm.py  | 6 ++++++
 tests/kernels/quantization/test_marlin_gemm.py | 8 ++++++++
 3 files changed, 21 insertions(+)

diff --git a/tests/kernels/quantization/test_hadacore.py b/tests/kernels/quantization/test_hadacore.py
index 3ccee9db0..7a5c7fbd5 100644
--- a/tests/kernels/quantization/test_hadacore.py
+++ b/tests/kernels/quantization/test_hadacore.py
@@ -8,6 +8,13 @@ import torch
 from compressed_tensors.transform import deterministic_hadamard_matrix
 
 from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+
+if current_platform.is_rocm():
+    pytest.skip(
+        "These tests require hadacore_transform, not supported on ROCm.",
+        allow_module_level=True,
+    )
 
 
 @pytest.mark.parametrize("batch_size", [1, 32])
diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py
index efa81de15..7f4ce2a08 100644
--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
@@ -23,6 +23,12 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
+if current_platform.is_rocm():
+    pytest.skip(
+        "These tests require machete_prepack_B, not supported on ROCm.",
+        allow_module_level=True,
+    )
+
 CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
 # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py
index 59516db1b..995e777bb 100644
--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -56,6 +56,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
+if current_platform.is_rocm():
+    pytest.skip(
+        "These tests require gptq_marlin_repack,"
+        "marlin_int4_fp8_preprocess, gptq_marlin_24_gemm,"
+        "or gptq_marlin_gemm which are not supported on ROCm.",
+        allow_module_level=True,
+    )
+
 ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]
 USE_ATOMIC_ADD_OPTS = [False, True]
-- 
GitLab


From c4d62618ca7cf8507c1e357fa1180fb162c670fa Mon Sep 17 00:00:00 2001
From: yuttian1 <yuttian@amd.com>
Date: Sat, 6 Dec 2025 12:54:38 +0800
Subject: [PATCH 150/258] Fix AWQ MoE marlin check issue in marlin_utils.py for
 AMD backend (#30102)

Signed-off-by: yuttian1 <yuttian@amd.com>
---
 vllm/model_executor/layers/quantization/utils/marlin_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 14337ee1d..072b46f05 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -179,6 +179,8 @@ def check_marlin_supports_shape(
 
 
 def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
+    if current_platform.is_rocm():
+        return False
     output_size_per_partition = (
         getattr(layer, "output_size_per_partition", None) or layer.output_size
     )
@@ -195,6 +197,8 @@ def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
 
 
 def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
+    if current_platform.is_rocm():
+        return False
     hidden_size = layer.hidden_size
     intermediate_size_per_partition = layer.intermediate_size_per_partition
     # apply_router_weight_on_input is not supported for moe marlin
-- 
GitLab


From e3fbb6f152fe721506f22b3d8d2ec70c10229569 Mon Sep 17 00:00:00 2001
From: Dongjie Zou <85092850+baonudesifeizhai@users.noreply.github.com>
Date: Fri, 5 Dec 2025 23:55:09 -0500
Subject: [PATCH 151/258] fix#30092 Kimi-Linear model loading failure with
 missing indexer_rotary_emb (#30093)

Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
---
 vllm/model_executor/layers/mla.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index dad960160..1656f4deb 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -24,9 +24,9 @@ class MLAModules:
     q_b_proj: torch.nn.Module | None
     q_proj: torch.nn.Module | None
     indexer: torch.nn.Module | None
-    indexer_rotary_emb: torch.nn.Module | None
     is_sparse: bool
     topk_indices_buffer: torch.Tensor | None
+    indexer_rotary_emb: torch.nn.Module | None = None
 
 
 @CustomOp.register("multi_head_latent_attention")
-- 
GitLab


From e858bc4d1490808e23bbc32e17202ebbf8713a76 Mon Sep 17 00:00:00 2001
From: Peter Salas <peter@fixie.ai>
Date: Fri, 5 Dec 2025 20:55:43 -0800
Subject: [PATCH 152/258] [Model] Add support for transformer-based Ultravox
 v0.7 projector (#30089)

Signed-off-by: Peter Salas <peter@fixie.ai>
---
 vllm/model_executor/models/ultravox.py      | 96 +++++++++++++++++++--
 vllm/transformers_utils/configs/ultravox.py |  2 +
 2 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 26a8355cd..2444159b2 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -4,15 +4,21 @@
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
 
+import copy
 from collections.abc import Iterable, Mapping, Sequence
+from types import SimpleNamespace
 from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 from torch import nn
 from torch.nn import functional as F
 from transformers import BatchFeature, ProcessorMixin
+from transformers.modeling_utils import ModuleUtilsMixin
 from transformers.models.whisper import WhisperFeatureExtractor
-from transformers.models.whisper.modeling_whisper import WhisperEncoder
+from transformers.models.whisper.modeling_whisper import (
+    WhisperEncoder,
+    WhisperEncoderLayer,
+)
 
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -282,7 +288,7 @@ class StackAudioFrames(nn.Module):
         return audio_embeds
 
 
-class UltravoxProjector(nn.Module):
+class UltravoxFeedForwardProjector(nn.Module):
     def __init__(self, config: UltravoxConfig):
         super().__init__()
         self.hidden_dim = config.hidden_size
@@ -310,7 +316,9 @@ class UltravoxProjector(nn.Module):
             self.ln_mid = nn.Identity()
             self.ln_post = RMSNorm(dim_out)
 
-    def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, audio_features: torch.Tensor, audio_token_len: torch.Tensor
+    ) -> torch.Tensor:
         audio_features = self._pad_and_stack(audio_features)
         audio_features = self.ln_pre(audio_features)
         hidden_states = self.linear_1(audio_features)
@@ -321,6 +329,70 @@ class UltravoxProjector(nn.Module):
         return hidden_states
 
 
+class UltravoxTransformerProjector(nn.Module, ModuleUtilsMixin):
+    def __init__(self, config: UltravoxConfig):
+        super().__init__()
+        self.config = SimpleNamespace(is_decoder=False)
+
+        self._pad_and_stack = StackAudioFrames(config.stack_factor)
+        dim_in = config.audio_config.hidden_size * config.stack_factor
+
+        projector_audio_config = copy.deepcopy(config.audio_config)
+
+        self.ln_pre = RMSNorm(dim_in)
+        self.linear_in = nn.Linear(dim_in, projector_audio_config.d_model)
+
+        self.embed_positions = nn.Embedding(
+            projector_audio_config.max_source_positions,
+            projector_audio_config.d_model,
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                WhisperEncoderLayer(projector_audio_config)
+                for _ in range(config.num_projector_layers)
+            ]
+        )
+
+        self.ln_post = RMSNorm(projector_audio_config.d_model)
+        self.linear_out = nn.Linear(
+            projector_audio_config.d_model, config.text_config.hidden_size
+        )
+
+    def forward(
+        self, audio_features: torch.Tensor, audio_token_len: torch.Tensor
+    ) -> torch.Tensor:
+        audio_features = self._pad_and_stack(audio_features)
+
+        max_len_stacked = audio_features.shape[1]
+        attention_mask = torch.arange(max_len_stacked, device=audio_features.device)[
+            None, :
+        ].lt(audio_token_len[:, None])
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, attention_mask.shape, audio_features.dtype
+        )
+
+        hidden_states = self.ln_pre(audio_features)
+        hidden_states = self.linear_in(hidden_states)
+
+        positions = self.embed_positions(
+            torch.arange(hidden_states.size(1), device=hidden_states.device)
+        )
+        hidden_states = hidden_states + positions
+
+        for layer in self.layers:
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask=extended_attention_mask,
+                layer_head_mask=None,
+            )
+            hidden_states = layer_outputs[0]
+
+        hidden_states = self.ln_post(hidden_states)
+        hidden_states = self.linear_out(hidden_states)
+        return hidden_states
+
+
 class ModifiedWhisperEncoder(WhisperEncoder):
     """
     Encoder portion of OpenAI's Whisper model.
@@ -464,7 +536,10 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
                     prefix="audio_tower.",
                 )
             )
-        self.multi_modal_projector = UltravoxProjector(config)
+        if config.num_projector_layers > 0:
+            self.multi_modal_projector = UltravoxTransformerProjector(config)
+        else:
+            self.multi_modal_projector = UltravoxFeedForwardProjector(config)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.wrapped_model_config,
@@ -496,7 +571,10 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         )
 
     def _audio_features_to_embeddings(
-        self, input_features: torch.Tensor, audio_lens: torch.Tensor
+        self,
+        input_features: torch.Tensor,
+        audio_lens: torch.Tensor,
+        audio_token_len: torch.Tensor,
     ) -> torch.Tensor:
         audio_features = input_features.to(self.audio_tower.dtype)
         batch_size = audio_features.size(0)
@@ -512,7 +590,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
             batch_features = batch_features.to(self.audio_tower.dtype)
 
             # Process through projector
-            batch_embeddings = self.multi_modal_projector(batch_features)
+            batch_embeddings = self.multi_modal_projector(
+                batch_features, audio_token_len[start:end]
+            )
             audio_embeddings.append(batch_embeddings)
 
         # Concatenate results
@@ -559,7 +639,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         audio_lens = audio_input["lens"]
         audio_token_len = audio_input["token_len"]
 
-        embeddings = self._audio_features_to_embeddings(audio_features, audio_lens)
+        embeddings = self._audio_features_to_embeddings(
+            audio_features, audio_lens, audio_token_len
+        )
 
         # We should flatten and concatenate embeddings based on token lengths
         # For example, with token_len = [4, 2, 3], flattened_embeddings will be
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index fc0360a9e..395b3130d 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -61,6 +61,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         norm_init: float = 0.4,
         projector_act: str = "swiglu",
         projector_ln_mid: bool = False,
+        num_projector_layers: int = 0,
         **kwargs,
     ):
         self.ignore_index = ignore_index
@@ -71,6 +72,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         self.norm_init = norm_init
         self.projector_act = projector_act
         self.projector_ln_mid = projector_ln_mid
+        self.num_projector_layers = num_projector_layers
 
         # N.B. May set the wrapped_model_config below.
         self.text_model_id = text_model_id
-- 
GitLab


From 40a046cd82af87e65d3be9db8bd27a4be65f1b00 Mon Sep 17 00:00:00 2001
From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com>
Date: Fri, 5 Dec 2025 22:56:40 -0600
Subject: [PATCH 153/258] [Bugfix]: Fix `TokenizerLike` interface (#30009)

Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
---
 vllm/benchmarks/datasets.py    | 59 ++++++++++++++++++----------------
 vllm/benchmarks/serve.py       | 32 +++++++++---------
 vllm/benchmarks/throughput.py  | 22 +++++++++----
 vllm/config/model.py           |  3 +-
 vllm/tokenizers/deepseekv32.py |  3 ++
 vllm/tokenizers/mistral.py     |  6 +++-
 vllm/tokenizers/protocol.py    |  3 ++
 vllm/tokenizers/registry.py    |  2 +-
 8 files changed, 78 insertions(+), 52 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 638ece260..49ee0faf0 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -32,7 +32,6 @@ from typing import Any, cast
 
 import numpy as np
 from PIL import Image
-from transformers import PreTrainedTokenizerBase
 from typing_extensions import deprecated
 
 from vllm.lora.request import LoRARequest
@@ -189,7 +188,7 @@ class BenchmarkDataset(ABC):
     @abstractmethod
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -201,7 +200,7 @@ class BenchmarkDataset(ABC):
         for generating a list of SampleRequest objects.
 
         Args:
-            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
+            tokenizer (TokenizerLike): The tokenizer to be used
                 for processing the dataset's text.
             num_requests (int): The number of sample requests to generate.
             request_id_prefix (str): The prefix of request_id.
@@ -380,7 +379,7 @@ def process_video(video: Any) -> Mapping[str, Any]:
 
 
 def gen_prompt_decode_to_target_len(
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: TokenizerLike,
     token_sequence: list[int],
     target_token_len: int,
     max_retry: int = 10,
@@ -468,7 +467,7 @@ class RandomDataset(BenchmarkDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -580,7 +579,7 @@ class RandomDataset(BenchmarkDataset):
         range_ratio: float,
         input_len: int,
         output_len: int,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
     ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
         """
         Get the sampling parameters for the dataset.
@@ -626,7 +625,7 @@ class RandomDataset(BenchmarkDataset):
     def generate_token_sequence(
         self,
         *,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         prefix_token_ids: list[int],
         prefix_len: int,
         vocab_size: int,
@@ -686,7 +685,7 @@ class RandomDatasetForReranking(RandomDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         request_id_prefix: str = "",
         range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
@@ -716,7 +715,11 @@ class RandomDatasetForReranking(RandomDataset):
         doc_lens, _, doc_offsets = self.get_sampling_params(
             num_requests, range_ratio, doc_len_param, 0, tokenizer
         )
+
         vocab_size = tokenizer.vocab_size
+        prohibited_tokens = tokenizer.all_special_ids
+        all_tokens = np.arange(vocab_size)
+        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
 
         query_prompt, query_input_len, token_mismatch_total = (
             self.generate_token_sequence(
@@ -727,6 +730,7 @@ class RandomDatasetForReranking(RandomDataset):
                 input_len=query_len,
                 offset=int(query_offsets[0]),
                 index=0,
+                allowed_tokens=allowed_tokens,
             )
         )
 
@@ -740,6 +744,7 @@ class RandomDatasetForReranking(RandomDataset):
                 input_len=int(doc_lens[i]),
                 offset=int(doc_offsets[i]),
                 index=i + 1,
+                allowed_tokens=allowed_tokens,
             )
             token_mismatch_total += token_mismatch
             requests.append((prompt, total_input_len))
@@ -1077,7 +1082,7 @@ class RandomMultiModalDataset(RandomDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -1231,7 +1236,7 @@ class ShareGPTDataset(BenchmarkDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         lora_path: str | None = None,
         max_loras: int | None = None,
@@ -1633,7 +1638,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
     )
 
 
-def get_samples(args, tokenizer) -> list[SampleRequest]:
+def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
     if not hasattr(args, "request_id_prefix"):
         args.request_id_prefix = ""
 
@@ -1971,7 +1976,7 @@ class CustomDataset(BenchmarkDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         lora_path: str | None = None,
         max_loras: int | None = None,
@@ -2101,7 +2106,7 @@ class SonnetDataset(BenchmarkDataset):
 
     def sample(
         self,
-        tokenizer,
+        tokenizer: TokenizerLike,
         num_requests: int,
         prefix_len: int = DEFAULT_PREFIX_LEN,
         input_len: int = DEFAULT_INPUT_LEN,
@@ -2202,7 +2207,7 @@ class BurstGPTDataset(BenchmarkDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         max_loras: int | None = None,
         lora_path: str | None = None,
@@ -2287,7 +2292,7 @@ class ConversationDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
@@ -2347,7 +2352,7 @@ class MultiModalConversationDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
@@ -2416,7 +2421,7 @@ class VisionArenaDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
@@ -2470,7 +2475,7 @@ class MMVUDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
@@ -2531,7 +2536,7 @@ class InstructCoderDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
@@ -2595,7 +2600,7 @@ class MTBenchDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
@@ -2661,7 +2666,7 @@ class BlazeditDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         skip_chat_template: bool = False,
@@ -2742,7 +2747,7 @@ class AIMODataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         request_id_prefix: str = "",
@@ -2852,7 +2857,7 @@ class NextEditPredictionDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -2924,7 +2929,7 @@ class ASRDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         request_id_prefix: str = "",
@@ -3002,7 +3007,7 @@ class MLPerfDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         request_id_prefix: str = "",
@@ -3081,7 +3086,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         prefix_len: int = DEFAULT_PREFIX_LEN,
         suffix_len: int = DEFAULT_SUFFIX_LEN,
@@ -3167,7 +3172,7 @@ class MMStarDataset(HuggingFaceDataset):
 
     def sample(
         self,
-        tokenizer: PreTrainedTokenizerBase,
+        tokenizer: TokenizerLike,
         num_requests: int,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 890cd7e08..568290aa8 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -36,7 +36,6 @@ from typing import Any, Literal
 import aiohttp
 import numpy as np
 from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
 
 from vllm.benchmarks.datasets import SampleRequest, add_dataset_parser, get_samples
 from vllm.benchmarks.lib.endpoint_request_func import (
@@ -47,7 +46,7 @@ from vllm.benchmarks.lib.endpoint_request_func import (
 )
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.tokenizers import get_tokenizer
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.utils.gc_utils import freeze_gc_heap
 from vllm.utils.network_utils import join_host_port
 
@@ -286,7 +285,7 @@ def calculate_metrics(
     input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
     dur_s: float,
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: TokenizerLike,
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
 ) -> tuple[BenchmarkMetrics, list[int]]:
@@ -489,7 +488,7 @@ async def benchmark(
     base_url: str,
     model_id: str,
     model_name: str,
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: TokenizerLike,
     input_requests: list[SampleRequest],
     logprobs: int | None,
     request_rate: float,
@@ -1032,6 +1031,19 @@ def add_cli_args(parser: argparse.ArgumentParser):
         type=str,
         help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        help="""Tokenizer mode:\n
+        - "auto" will use the tokenizer from `mistral_common` for Mistral models
+        if available, otherwise it will use the "hf" tokenizer.\n
+        - "hf" will use the fast tokenizer if available.\n
+        - "slow" will always use the slow tokenizer.\n
+        - "mistral" will always use the tokenizer from `mistral_common`.\n
+        - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
+        - Other custom values can be supported via plugins.""",
+    )
     parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument(
         "--logprobs",
@@ -1228,18 +1240,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Common prefix length shared by all prompts (used by random dataset)",
     )
 
-    parser.add_argument(
-        "--tokenizer-mode",
-        type=str,
-        default="auto",
-        choices=["auto", "slow", "mistral", "custom"],
-        help='The tokenizer mode.\n\n* "auto" will use the '
-        'fast tokenizer if available.\n* "slow" will '
-        "always use the slow tokenizer. \n* "
-        '"mistral" will always use the `mistral_common` tokenizer. \n*'
-        '"custom" will use --tokenizer to select the preregistered tokenizer.',
-    )
-
     parser.add_argument(
         "--served-model-name",
         type=str,
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 23b5faa1b..ea693613f 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -14,7 +14,7 @@ from typing import Any
 import torch
 import uvloop
 from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
 
 from vllm.benchmarks.datasets import (
     AIMODataset,
@@ -35,6 +35,7 @@ from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.utils.async_utils import merge_async_iterators
 
 
@@ -246,12 +247,15 @@ async def run_vllm_async(
 def run_hf(
     requests: list[SampleRequest],
     model: str,
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: TokenizerLike,
     n: int,
     max_batch_size: int,
     trust_remote_code: bool,
     disable_detokenize: bool = False,
 ) -> float:
+    assert isinstance(tokenizer, PreTrainedTokenizerBase), (
+        "the hf backend only supports HF tokenizers"
+    )
     llm = AutoModelForCausalLM.from_pretrained(
         model, dtype=torch.float16, trust_remote_code=trust_remote_code
     )
@@ -692,15 +696,21 @@ def add_cli_args(parser: argparse.ArgumentParser):
 
 
 def main(args: argparse.Namespace):
-    if args.tokenizer is None:
-        args.tokenizer = args.model
     validate_args(args)
     if args.seed is None:
         args.seed = 0
     random.seed(args.seed)
     # Sample the requests.
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer, trust_remote_code=args.trust_remote_code
+    if (
+        args.backend == "hf" or args.backend == "mii"
+    ) and args.tokenizer_mode == "auto":
+        # mistral_common tokenizer is only supported on vllm and vllm-chat backends;
+        # for hf and mii backends, we use hf tokenizer
+        args.tokenizer_mode = "hf"
+    tokenizer = get_tokenizer(
+        args.tokenizer,
+        tokenizer_mode=args.tokenizer_mode,
+        trust_remote_code=args.trust_remote_code,
     )
     requests = get_requests(args, tokenizer)
     is_multi_modal = any(request.multi_modal_data is not None for request in requests)
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 5be7d5e7f..509a9c5e1 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -136,7 +136,8 @@ class ModelConfig:
     name or path will be used."""
     tokenizer_mode: TokenizerMode | str = "auto"
     """Tokenizer mode:\n
-    - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
+    - "auto" will use the tokenizer from `mistral_common` for Mistral models
+    if available, otherwise it will use the "hf" tokenizer.\n
     - "hf" will use the fast tokenizer if available.\n
     - "slow" will always use the slow tokenizer.\n
     - "mistral" will always use the tokenizer from `mistral_common`.\n
diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index 1140357cf..b0490dacb 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -54,6 +54,9 @@ class DeepseekV32Tokenizer(HfTokenizer):
         prompt_str = encode_messages(messages, **encode_config)  # type: ignore
         return prompt_str
 
+    def num_special_tokens_to_add(self) -> int:
+        return len(self.encode(""))
+
     @property
     def all_special_tokens(self) -> list[str]:
         return self.tokenizer.all_special_tokens
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index 37d67607c..1f44037dd 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -309,6 +309,9 @@ class MistralTokenizer(TokenizerLike):
             for i in all_special_ids
         ]
 
+    def num_special_tokens_to_add(self) -> int:
+        return len(self.encode(""))
+
     # the following attributes are set to fit vLLM's design and are used
     # by the structured output backends.
     @property
@@ -421,6 +424,7 @@ class MistralTokenizer(TokenizerLike):
     ) -> list[int]:
         add_generation_prompt = kwargs.pop("add_generation_prompt", False)
         continue_final_message = kwargs.get("continue_final_message", False)
+        tokenize = kwargs.get("tokenize", True)
         padding = kwargs.get("padding", False)
         truncation = kwargs.get("truncation", False)
         max_length = kwargs.get("max_length")
@@ -433,7 +437,7 @@ class MistralTokenizer(TokenizerLike):
             conversation=messages,
             tools=tools,
             continue_final_message=continue_final_message,
-            tokenize=True,
+            tokenize=tokenize,
             padding=padding,
             truncation=truncation,
             max_length=max_length,
diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py
index 6c807bd99..d6a3b0ba9 100644
--- a/vllm/tokenizers/protocol.py
+++ b/vllm/tokenizers/protocol.py
@@ -22,6 +22,9 @@ class TokenizerLike(Protocol):
     ) -> "TokenizerLike":
         raise NotImplementedError
 
+    def num_special_tokens_to_add(self) -> int:
+        raise NotImplementedError
+
     @property
     def all_special_tokens(self) -> list[str]:
         raise NotImplementedError
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index 87048f2ec..1d44feeee 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -183,7 +183,7 @@ def get_tokenizer(
             "`tokenizer_mode='custom'` when initializing vLLM.",
             tokenizer_args,
             str(tokenizer_kwargs),
-            tokenizer_mode,
+            tokenizer_name,
         )
 
         tokenizer_mode = str(tokenizer_name)
-- 
GitLab


From b12f4a983077f0f085e3734d4d5b0c25f2576cec Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Fri, 5 Dec 2025 22:57:38 -0600
Subject: [PATCH 154/258] [CI/Build][AMD] Use ROCM_ATTN instead of FLASH_ATTN
 test for test_register_kv_caches for ROCm and update test for TRITON_ATTN
 (#29985)

Signed-off-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
---
 .../kv_connector/unit/test_nixl_connector.py  | 48 ++++++++++++++-----
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index ec9ff7315..53da09cfb 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -41,6 +41,7 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
     has_kv_transfer_group,
 )
 from vllm.forward_context import ForwardContext
+from vllm.platforms import current_platform
 from vllm.platforms.interface import Platform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
@@ -1111,7 +1112,26 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
     llm.llm_engine.engine_core.shutdown()
 
 
-@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "TRITON_ATTN"])
+@pytest.mark.parametrize(
+    "attn_backend",
+    [
+        pytest.param(
+            "FLASH_ATTN",
+            marks=pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="Attention backend FLASH_ATTN is not supported on ROCm",
+            ),
+        ),
+        pytest.param(
+            "ROCM_ATTN",
+            marks=pytest.mark.skipif(
+                not current_platform.is_rocm(),
+                reason="Attention backend ROCM_ATTN is only supported on ROCm",
+            ),
+        ),
+        "TRITON_ATTN",
+    ],
+)
 def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
     """
     Test that register_kv_caches() properly calls nixl_wrapper methods with
@@ -1133,6 +1153,10 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
         from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
 
         backend_cls = FlashAttentionBackend
+    elif attn_backend == "ROCM_ATTN":
+        from vllm.v1.attention.backends.rocm_attn import RocmAttentionBackend
+
+        backend_cls = RocmAttentionBackend
     else:  # TRITON_ATTN
         from vllm.v1.attention.backends.triton_attn import TritonAttentionBackend
 
@@ -1151,6 +1175,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
     }
 
     # Store tensor info for validation
+
     test_shape = backend_cls.get_kv_cache_shape(
         num_blocks=1, block_size=16, num_kv_heads=1, head_size=1
     )
@@ -1175,17 +1200,18 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
         ]
         expected_num_entries = 4
 
+    nixl_module = "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector"
     with (
-        patch(
-            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper"
-        ) as mock_nixl_wrapper,
-        patch(
-            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"
-        ),
-        patch(
-            "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"
-        ) as mock_thread,
-    ):  # noqa: E501
+        patch(f"{nixl_module}.NixlWrapper") as mock_nixl_wrapper,
+        patch(f"{nixl_module}.threading.Event"),
+        patch(f"{nixl_module}.threading.Thread") as mock_thread,
+        patch(f"{nixl_module}.get_attn_backend") as mock_get_attn_backend,
+    ):
+        # Ensure get_attn_backend returns the correct value due to
+        # _cached_get_attn_backend returning the backend from previous
+        # test run if not mocking.
+        mock_get_attn_backend.return_value = backend_cls
+
         # Create connector
         connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
         connector.connector_worker = FakeNixlConnectorWorker(
-- 
GitLab


From 4026ae31e910d50da2b80c1c386f1d1db7f1b7d8 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 5 Dec 2025 20:59:04 -0800
Subject: [PATCH 155/258] [Misc] Move `disable_nccl_for_dp_synchronization`
 init logic into `VllmConfig` (#30161)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 vllm/config/vllm.py      | 9 +++++++++
 vllm/engine/arg_utils.py | 6 ------
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 47e7ffded..b99be1e5d 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -586,6 +586,15 @@ class VllmConfig:
             else:
                 self.scheduler_config.async_scheduling = True
 
+        if (
+            self.scheduler_config.async_scheduling
+            and not self.parallel_config.disable_nccl_for_dp_synchronization
+        ):
+            logger.info(
+                "Disabling NCCL for DP synchronization when using async scheduling."
+            )
+            self.parallel_config.disable_nccl_for_dp_synchronization = True
+
         from vllm.platforms import current_platform
 
         if (
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index aad071954..ceac5407a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1602,12 +1602,6 @@ class EngineArgs:
             model_config.skip_tokenizer_init = True
             logger.info("Skipping tokenizer initialization for tokens-only mode.")
 
-        if self.async_scheduling and not self.disable_nccl_for_dp_synchronization:
-            logger.info(
-                "Disabling NCCL for DP synchronization when using async scheduling."
-            )
-            self.disable_nccl_for_dp_synchronization = True
-
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
-- 
GitLab


From a238cbd89d07b4b0ed8fb3dff3c219a3ee3a1651 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 5 Dec 2025 21:42:47 -0800
Subject: [PATCH 156/258] [Model Runner V2] Support min-p sampling (#30171)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu/sample/metadata.py | 13 +++++++
 vllm/v1/worker/gpu/sample/min_p.py    | 53 +++++++++++++++++++++++++++
 vllm/v1/worker/gpu/sample/sampler.py  |  4 ++
 vllm/v1/worker/gpu/states.py          |  7 ++++
 4 files changed, 77 insertions(+)
 create mode 100644 vllm/v1/worker/gpu/sample/min_p.py

diff --git a/vllm/v1/worker/gpu/sample/metadata.py b/vllm/v1/worker/gpu/sample/metadata.py
index 040771c05..f10c72049 100644
--- a/vllm/v1/worker/gpu/sample/metadata.py
+++ b/vllm/v1/worker/gpu/sample/metadata.py
@@ -13,6 +13,7 @@ class SamplingMetadata:
 
     top_p: torch.Tensor | None
     top_k: torch.Tensor | None
+    min_p: torch.Tensor | None
 
     repetition_penalty: torch.Tensor
     frequency_penalty: torch.Tensor
@@ -44,6 +45,7 @@ class SamplingMetadata:
         # top_k = torch.full((num_reqs,), 20, dtype=torch.int32, device=device)
         top_p = None
         top_k = None
+        min_p = torch.zeros(num_reqs, dtype=torch.float32, device=device)
         # NOTE(woosuk): We must set penalties to their default values to make sure
         # the penalties kernel does not touch the placeholder bin_counts tensors.
         repetition_penalty = torch.ones(num_reqs, dtype=torch.float32, device=device)
@@ -64,6 +66,7 @@ class SamplingMetadata:
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            min_p=min_p,
             repetition_penalty=repetition_penalty,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
@@ -85,6 +88,8 @@ def _expand_sampling_metadata_kernel(
     expanded_top_p_ptr,
     top_k_ptr,
     expanded_top_k_ptr,
+    min_p_ptr,
+    expanded_min_p_ptr,
     rep_penalty_ptr,
     expanded_rep_penalty_ptr,
     freq_penalty_ptr,
@@ -115,6 +120,10 @@ def _expand_sampling_metadata_kernel(
         top_k = tl.load(top_k_ptr + req_idx)
         tl.store(expanded_top_k_ptr + start_idx + block, top_k, mask=mask)
 
+    if min_p_ptr is not None:
+        min_p = tl.load(min_p_ptr + req_idx)
+        tl.store(expanded_min_p_ptr + start_idx + block, min_p, mask=mask)
+
     rep_penalty = tl.load(rep_penalty_ptr + req_idx)
     tl.store(expanded_rep_penalty_ptr + start_idx + block, rep_penalty, mask=mask)
 
@@ -138,6 +147,7 @@ def expand_sampling_metadata(
     expanded_temp = create_empty(sampling_metadata.temperature)
     expanded_top_p = create_empty(sampling_metadata.top_p)
     expanded_top_k = create_empty(sampling_metadata.top_k)
+    expanded_min_p = create_empty(sampling_metadata.min_p)
     expanded_repetition_penalty = create_empty(sampling_metadata.repetition_penalty)
     expanded_frequency_penalty = create_empty(sampling_metadata.frequency_penalty)
     expanded_presence_penalty = create_empty(sampling_metadata.presence_penalty)
@@ -151,6 +161,8 @@ def expand_sampling_metadata(
         expanded_top_p,
         sampling_metadata.top_k,
         expanded_top_k,
+        sampling_metadata.min_p,
+        expanded_min_p,
         sampling_metadata.repetition_penalty,
         expanded_repetition_penalty,
         sampling_metadata.frequency_penalty,
@@ -166,6 +178,7 @@ def expand_sampling_metadata(
         temperature=expanded_temp,
         top_p=expanded_top_p,
         top_k=expanded_top_k,
+        min_p=expanded_min_p,
         seeds=expanded_seeds,
         repetition_penalty=expanded_repetition_penalty,
         frequency_penalty=expanded_frequency_penalty,
diff --git a/vllm/v1/worker/gpu/sample/min_p.py b/vllm/v1/worker/gpu/sample/min_p.py
new file mode 100644
index 000000000..063881800
--- /dev/null
+++ b/vllm/v1/worker/gpu/sample/min_p.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.triton_utils import tl, triton
+
+
+@triton.jit
+def _min_p_kernel(
+    logits_ptr,
+    logits_stride,
+    min_p_ptr,
+    vocab_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+    req_idx = tl.program_id(0)
+    min_p = tl.load(min_p_ptr + req_idx).to(tl.float32)
+    if min_p == 0.0:
+        return
+
+    max_val = float("-inf")
+    for i in range(0, vocab_size, BLOCK_SIZE):
+        block = i + tl.arange(0, BLOCK_SIZE)
+        mask = block < vocab_size
+        logits = tl.load(
+            logits_ptr + req_idx * logits_stride + block, mask=mask, other=float("-inf")
+        )
+        max_val = tl.max(tl.maximum(logits, max_val))
+    max_val = max_val.to(tl.float32)  # type: ignore
+
+    threshold = max_val + tl.log(min_p)
+    for i in range(0, vocab_size, BLOCK_SIZE):
+        block = i + tl.arange(0, BLOCK_SIZE)
+        mask = block < vocab_size
+        logits = tl.load(
+            logits_ptr + req_idx * logits_stride + block, mask=mask, other=float("-inf")
+        )
+        logits = tl.where(logits < threshold, float("-inf"), logits)
+        tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask)
+
+
+def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None:
+    if min_p is None:
+        return
+    num_reqs, vocab_size = logits.shape
+    BLOCK_SIZE = 1024
+    _min_p_kernel[(num_reqs,)](
+        logits,
+        logits.stride(0),
+        min_p,
+        vocab_size,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py
index 3429dd3e4..9a4224d8f 100644
--- a/vllm/v1/worker/gpu/sample/sampler.py
+++ b/vllm/v1/worker/gpu/sample/sampler.py
@@ -9,6 +9,7 @@ from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
 from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
 from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs
 from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
+from vllm.v1.worker.gpu.sample.min_p import apply_min_p
 from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature
 
 
@@ -61,6 +62,9 @@ class Sampler:
 
         # Apply penalties and temperature in place.
         apply_penalties_and_temperature(logits, sampling_metadata)
+        # Apply min_p in place.
+        apply_min_p(logits, sampling_metadata.min_p)
+        # Apply top_k and/or top_p. This might return a new tensor.
         logits = apply_top_k_top_p(
             logits, sampling_metadata.top_k, sampling_metadata.top_p
         )
diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py
index 367348c4a..6823c0c8e 100644
--- a/vllm/v1/worker/gpu/states.py
+++ b/vllm/v1/worker/gpu/states.py
@@ -87,6 +87,7 @@ class RequestState:
         self.temperature = self._make_param(self.max_num_reqs, torch.float32)
         self.top_p = self._make_param(self.max_num_reqs, torch.float32)
         self.top_k = self._make_param(self.max_num_reqs, torch.int32)
+        self.min_p = self._make_param(self.max_num_reqs, torch.float32)
         self.repetition_penalty = self._make_param(self.max_num_reqs, torch.float32)
         self.frequency_penalty = self._make_param(self.max_num_reqs, torch.float32)
         self.presence_penalty = self._make_param(self.max_num_reqs, torch.float32)
@@ -162,6 +163,7 @@ class RequestState:
         else:
             top_k = self.vocab_size
         self.top_k.np[req_idx] = top_k
+        self.min_p.np[req_idx] = sampling_params.min_p
         self.repetition_penalty.np[req_idx] = sampling_params.repetition_penalty
         self.frequency_penalty.np[req_idx] = sampling_params.frequency_penalty
         self.presence_penalty.np[req_idx] = sampling_params.presence_penalty
@@ -217,6 +219,10 @@ class RequestState:
         no_top_k = np.all(top_k == self.vocab_size)
         top_k = self.top_k.copy_np_to_gpu(top_k) if not no_top_k else None
 
+        min_p = self.min_p.np[idx_mapping_np]
+        no_min_p = np.all(min_p == 0.0)
+        min_p = self.min_p.copy_np_to_gpu(min_p) if not no_min_p else None
+
         rep_penalty = self.repetition_penalty.np[idx_mapping_np]
         rep_penalty = self.repetition_penalty.copy_np_to_gpu(rep_penalty)
         freq_penalty = self.frequency_penalty.np[idx_mapping_np]
@@ -236,6 +242,7 @@ class RequestState:
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            min_p=min_p,
             repetition_penalty=rep_penalty,
             frequency_penalty=freq_penalty,
             presence_penalty=pres_penalty,
-- 
GitLab


From d6aeaddf4a6201e35ec89bcd4b3719e4e7293f1f Mon Sep 17 00:00:00 2001
From: kx <1670186653@qq.com>
Date: Sat, 6 Dec 2025 15:11:31 +0800
Subject: [PATCH 157/258] [bugfix] fix type[AttentionBackend] bug in
 kv_connector_base_v1 (#30051)

Signed-off-by: 01267596 <xiongkai123@cmbchina.com>
Co-authored-by: 01267596 <xiongkai123@cmbchina.com>
---
 vllm/distributed/kv_transfer/kv_connector/v1/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index 8e9182a9b..91f6443f9 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -239,7 +239,7 @@ class KVConnectorBase_V1(ABC):
         return
 
     def register_cross_layers_kv_cache(
-        self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend]
+        self, kv_cache: torch.Tensor, attn_backend: type["AttentionBackend"]
     ):
         """
         Initialize with a single KV cache tensor used by all layers.
-- 
GitLab


From 6476382384d3560367e30732995e2456ec569c5d Mon Sep 17 00:00:00 2001
From: redwrasse <mail@redwrasse.io>
Date: Fri, 5 Dec 2025 23:39:56 -0800
Subject: [PATCH 158/258] prefix caching design doc sha256 now default (#29261)

Signed-off-by: redwrasse <mail@redwrasse.io>
---
 docs/design/prefix_caching.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md
index cf792fdab..6f2eb3062 100644
--- a/docs/design/prefix_caching.md
+++ b/docs/design/prefix_caching.md
@@ -22,8 +22,8 @@ In the example above, the KV cache in the first block can be uniquely identified
     We only cache full blocks.
 
 !!! note "Note 2"
-    The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash.
-    SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context).
+    The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we use SHA256** as hash function instead of the builtin hash.
+    SHA256 is supported since vLLM v0.8.3 and the default since v0.10.2. It comes with a negligible performance impact of about 75ns per token (<4ms for 50k tokens of context).
 
 **A hashing example with multi-modality inputs**  
 In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages:
-- 
GitLab


From c46b932df2b801ba0a6452e436268f086029d82b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 6 Dec 2025 15:57:28 +0800
Subject: [PATCH 159/258] [Chore] Deprecate
 `SupportsMultiModal.merge_by_field_config` (#30170)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/aria.py            |  2 --
 vllm/model_executor/models/aya_vision.py      |  2 --
 vllm/model_executor/models/blip2.py           |  2 --
 vllm/model_executor/models/chameleon.py       |  2 --
 vllm/model_executor/models/clip.py            |  1 -
 vllm/model_executor/models/cohere2_vision.py  |  2 --
 vllm/model_executor/models/deepseek_ocr.py    |  2 --
 vllm/model_executor/models/deepseek_vl2.py    |  2 --
 vllm/model_executor/models/dots_ocr.py        |  2 --
 vllm/model_executor/models/ernie45_vl.py      |  2 --
 vllm/model_executor/models/fuyu.py            |  2 --
 vllm/model_executor/models/gemma3_mm.py       |  2 --
 vllm/model_executor/models/gemma3n_mm.py      |  1 -
 vllm/model_executor/models/glm4_1v.py         |  2 --
 vllm/model_executor/models/glm4v.py           |  2 --
 vllm/model_executor/models/granite_speech.py  |  1 -
 vllm/model_executor/models/hunyuan_vision.py  |  1 -
 .../models/hyperclovax_vision.py              |  2 --
 vllm/model_executor/models/idefics3.py        |  2 --
 vllm/model_executor/models/interfaces.py      | 25 ++++++++++++++++---
 vllm/model_executor/models/interns1.py        |  2 --
 vllm/model_executor/models/internvl.py        |  2 --
 vllm/model_executor/models/keye.py            |  2 --
 vllm/model_executor/models/kimi_vl.py         |  2 --
 vllm/model_executor/models/llava.py           |  2 --
 vllm/model_executor/models/llava_next.py      |  2 --
 .../model_executor/models/llava_next_video.py |  2 --
 vllm/model_executor/models/llava_onevision.py |  2 --
 vllm/model_executor/models/midashenglm.py     |  2 --
 vllm/model_executor/models/minicpmv.py        |  2 --
 vllm/model_executor/models/minimax_vl_01.py   |  2 --
 vllm/model_executor/models/mistral3.py        |  2 --
 vllm/model_executor/models/mllama4.py         |  2 --
 vllm/model_executor/models/molmo.py           |  2 --
 .../model_executor/models/nano_nemotron_vl.py |  2 --
 vllm/model_executor/models/nemotron_vl.py     |  2 --
 vllm/model_executor/models/opencua.py         |  1 -
 vllm/model_executor/models/ovis.py            |  2 --
 vllm/model_executor/models/ovis2_5.py         |  2 --
 vllm/model_executor/models/paddleocr_vl.py    |  2 --
 vllm/model_executor/models/paligemma.py       |  2 --
 vllm/model_executor/models/phi3v.py           |  2 --
 vllm/model_executor/models/phi4mm.py          |  2 --
 vllm/model_executor/models/pixtral.py         |  2 --
 .../models/qwen2_5_omni_thinker.py            |  2 --
 vllm/model_executor/models/qwen2_5_vl.py      |  1 -
 vllm/model_executor/models/qwen2_audio.py     |  2 --
 vllm/model_executor/models/qwen2_vl.py        |  1 -
 .../models/qwen3_omni_moe_thinker.py          |  2 --
 vllm/model_executor/models/qwen3_vl.py        |  1 -
 vllm/model_executor/models/qwen_vl.py         |  2 --
 vllm/model_executor/models/siglip.py          |  1 -
 vllm/model_executor/models/skyworkr1v.py      |  2 --
 vllm/model_executor/models/step3_vl.py        |  2 --
 vllm/model_executor/models/tarsier.py         |  2 --
 vllm/model_executor/models/terratorch.py      |  1 -
 .../models/transformers/multimodal.py         |  2 +-
 vllm/model_executor/models/ultravox.py        |  2 --
 vllm/model_executor/models/voxtral.py         |  2 --
 vllm/model_executor/models/whisper.py         |  1 -
 vllm/multimodal/utils.py                      |  1 -
 61 files changed, 23 insertions(+), 110 deletions(-)

diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 3d07e6b61..c6d7f19cb 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -499,8 +499,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     model to perform tasks that involve both image and text inputs.
     """
 
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 0ada2ed50..ee9e210a3 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -318,8 +318,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
     dummy_inputs=AyaVisionDummyInputsBuilder,
 )
 class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index f71b9c01d..1244f97a1 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
 class Blip2ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 3aa01bb19..dfc05a366 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -918,8 +918,6 @@ class ChameleonModel(nn.Module):
 class ChameleonForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index b8af30509..22f3ecad7 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     is_pooling_model = True
 
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
-    merge_by_field_config = True
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index 139ccba9d..07dc7a01d 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor(
     dummy_inputs=Cohere2VisionDummyInputsBuilder,
 )
 class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_tower.": "vision_tower.",
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index a612ebd95..1f07381c0 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor(
     dummy_inputs=DeepseekOCRDummyInputsBuilder,
 )
 class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # map prefix for language backbone
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 56c1a87a2..9f8faf9ed 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor(
     dummy_inputs=DeepseekVL2DummyInputsBuilder,
 )
 class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "language.": "language_model.",
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index 5cc2a48f2..da19d8fdb 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module):
     dummy_inputs=DotsOCRDummyInputsBuilder,
 )
 class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".attn.qkv_proj.": ".attn.qkv.",
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 81663dd7b..3305b6a0e 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
 class Ernie4_5_VLMoeForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 269c36ab5..8a7a3dd77 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
     dummy_inputs=FuyuDummyInputsBuilder,
 )
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.": "vision_embed_tokens.",
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 43c69e5e1..e8dec36a1 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module):
 class Gemma3ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 6ae76976e..7036118ad 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module):
 class Gemma3nForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsTranscription
 ):
-    merge_by_field_config = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 5ba3c0a35..3cb53f2cb 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1424,8 +1424,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
 class Glm4vForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 514082cf6..ec5af94e2 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
 class GLM4VForCausalLM(
     ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"],
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index accf7e6ef..a4e50f408 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration(
     SupportsLoRA,
     SupportsTranscription,
 ):
-    merge_by_field_config = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 5aef09ca9..52ce9564c 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -786,7 +786,6 @@ class HunYuanVLForConditionalGeneration(
     SupportsQuant,
     SupportsXDRoPE,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     # To ensure correct weight loading and mapping.
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index db46353ef..3a083870e 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -592,8 +592,6 @@ class HCXVisionCAbstractor(nn.Module):
     dummy_inputs=HCXVisionDummyInputsBuilder,
 )
 class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 7c3933c6f..0eed46448 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -576,8 +576,6 @@ class Idefics3Model(nn.Module):
     dummy_inputs=Idefics3DummyInputsBuilder,
 )
 class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 01b3e7827..416ab236c 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -78,9 +78,9 @@ class SupportsMultiModal(Protocol):
     `multimodal_config.mm_encoder_tp_mode="data"`.
     """
 
-    merge_by_field_config: ClassVar[bool] = True
+    merge_by_field_config: ClassVar[bool | None] = None
     """
-    A flag that indicates which implementation of
+    [DEPRECATED] A flag that indicates which implementation of
     `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
     """
 
@@ -260,7 +260,26 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
 def supports_multimodal(
     model: type[object] | object,
 ) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
-    return getattr(model, "supports_multimodal", False)
+    res = getattr(model, "supports_multimodal", False)
+
+    if res:
+        # We can remove this starting from v0.14
+        merge_by_field_config = getattr(model, "merge_by_field_config", None)
+        if merge_by_field_config is False:
+            raise ValueError(
+                "`merge_by_field_config=False` is no longer effective, "
+                "please update your model to consider the new batching logic "
+                "in `group_mm_kwargs_by_modality` (refer to "
+                "https://github.com/vllm-project/vllm/issues/26149), "
+                "and then remove the override from your model."
+            )
+        if merge_by_field_config is True:
+            logger.warning_once(
+                "`merge_by_field_config=True` is redundant, "
+                "please remove the override from your model."
+            )
+
+    return res
 
 
 def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index c2195fd0c..18985cefb 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -509,8 +509,6 @@ class InternS1MultiModalProcessor(BaseMultiModalProcessor[InternS1ProcessingInfo
 class InternS1ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 ):
-    merge_by_field_config = True
-
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index fccddf3a6..15f7d4f41 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1074,8 +1074,6 @@ class InternVLMultiModalProcessor(
     dummy_inputs=InternVLDummyInputsBuilder,
 )
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 09acf8372..f31da0ee3 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1292,8 +1292,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
 
 
 class BaseKeyeModule(nn.Module):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index 8167b82f3..85267ccda 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -298,8 +298,6 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
     dummy_inputs=KimiVLDummyInputsBuilder,
 )
 class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c1fb2d4f4..66a327bb7 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -506,8 +506,6 @@ def init_vision_tower_for_llava(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index b995cac47..526846d0d 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -223,8 +223,6 @@ class LlavaNextMultiModalProcessor(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 902c598c2..cd55cfec6 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -299,8 +299,6 @@ class LlavaNextMultiModalProjector(nn.Module):
     dummy_inputs=LlavaNextVideoDummyInputsBuilder,
 )
 class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 4e243ade6..5aa8de7dc 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -479,8 +479,6 @@ class LlavaOnevisionMultiModalProjector(nn.Module):
     dummy_inputs=LlavaOnevisionDummyInputsBuilder,
 )
 class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py
index d9b238117..2d506978d 100644
--- a/vllm/model_executor/models/midashenglm.py
+++ b/vllm/model_executor/models/midashenglm.py
@@ -683,8 +683,6 @@ class MiDashengLMMultiModalProcessor(
     dummy_inputs=MiDashengLMDummyInputsBuilder,
 )
 class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 6d0ebf5c9..c45bdf95e 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1003,8 +1003,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     instantiated.
     """
 
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index 0939a72ba..e48045495 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -179,8 +179,6 @@ class MiniMaxVL01MultiModalProcessor(
     dummy_inputs=MiniMaxVL01DummyInputsBuilder,
 )
 class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 1ddb470a0..e9161e69e 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -423,8 +423,6 @@ def init_vision_tower_for_llava(
 class Mistral3ForConditionalGeneration(
     nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 286859d18..e944c0ee3 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -741,8 +741,6 @@ class Llama4ForConditionalGeneration(
     SupportsEagle3,
     SupportsLoRA,
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 7b53299cc..a6cd9ad16 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1354,8 +1354,6 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
 class MolmoForCausalLM(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsQuant
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             # vision backbone mapping
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index c4198d36b..6dfab595e 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1116,8 +1116,6 @@ class NanoNemotronVLDummyInputsBuilder(
 class NemotronH_Nano_VL_V2(
     nn.Module, HasInnerState, IsHybrid, SupportsMultiModal, SupportsMultiModalPruning
 ):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index a57668b21..391980fc6 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -358,8 +358,6 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
     dummy_inputs=BaseInternVLDummyInputsBuilder[NemotronVLProcessingInfo],
 )
 class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py
index b92f0c9da..76a2d1cc2 100644
--- a/vllm/model_executor/models/opencua.py
+++ b/vllm/model_executor/models/opencua.py
@@ -201,7 +201,6 @@ class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder):
     dummy_inputs=OpenCUADummyInputsBuilder,
 )
 class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index a0fab8207..0691bbc61 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -414,8 +414,6 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
     dummy_inputs=OvisDummyInputsBuilder,
 )
 class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index 85f37cfea..0ad22aab7 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -456,8 +456,6 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
     dummy_inputs=Ovis2_5DummyInputsBuilder,
 )
 class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 1df5ff62f..9703a5b41 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -1103,8 +1103,6 @@ class SiglipVisionModel(nn.Module):
     dummy_inputs=PaddleOCRVLDummyInputsBuilder,
 )
 class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsMRoPE):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 9fa32f01d..67240c6e7 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -251,8 +251,6 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
     dummy_inputs=PaliGemmaDummyInputsBuilder,
 )
 class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 384572217..b7ae54806 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -562,8 +562,6 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
     dummy_inputs=Phi3VDummyInputsBuilder,
 )
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.wte": "embed_tokens",
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 8425549a7..179d5df86 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -984,8 +984,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     Implements the Phi-4-multimodal-instruct model in vLLM.
     """
 
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "qkv_proj",
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index cad241842..faf2d80d2 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -365,8 +365,6 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
     dummy_inputs=PixtralDummyInputsBuilder,
 )
 class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 1ce0fb4e4..3438406c4 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -773,8 +773,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     SupportsMRoPE,
     Qwen2_5OmniConditionalGenerationMixin,
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index cb521ebdf..488af192b 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1039,7 +1039,6 @@ class Qwen2_5_VLForConditionalGeneration(
     SupportsMultiModalPruning,
     SupportsMRoPE,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 7e883a393..f84ddfa84 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -313,8 +313,6 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessing
     dummy_inputs=Qwen2AudioDummyInputsBuilder,
 )
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index b74876849..9da5080f8 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1131,7 +1131,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
 class Qwen2VLForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     # To ensure correct weight loading and mapping.
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index e6979211b..dbe7bcd07 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1131,8 +1131,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     SupportsMRoPE,
     Qwen3OmniMoeConditionalGenerationMixin,
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 58721303d..a5b10c958 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1190,7 +1190,6 @@ class Qwen3VLForConditionalGeneration(
     SupportsMRoPE,
     SupportsEagle3,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index 55680b8e7..caac14716 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -703,8 +703,6 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
 class QwenVLForConditionalGeneration(
     QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "c_attn": ["c_attn"],
         "gate_up_proj": [
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 9db1423d9..2600dc1c9 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -989,7 +989,6 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     is_pooling_model = True
 
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
-    merge_by_field_config = True
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 55c25ce61..f95fbffc1 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -647,8 +647,6 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
     dummy_inputs=SkyworkR1VDummyInputsBuilder,
 )
 class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 3e55ada0e..e5038e56a 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -916,8 +916,6 @@ class Step3VisionTransformer(nn.Module):
     dummy_inputs=Step3VLDummyInputsBuilder,
 )
 class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 4d310712f..7e82a4d72 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -400,8 +400,6 @@ def init_vision_tower_for_tarsier(
     dummy_inputs=TarsierDummyInputsBuilder,
 )
 class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index 9f34090e3..402081a70 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -227,7 +227,6 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
     dummy_inputs=TerratorchInputBuilder,
 )
 class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
-    merge_by_field_config = True
     supports_multimodal_raw_input_only = True
     is_pooling_model = True
 
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index ccf605371..9d77dee28 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -264,7 +264,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
 
 class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
     supports_multimodal_raw_input_only = True
-    merge_by_field_config = True
+
     # Backwards compatibility for prev released models. State dicts back then
     # had different formats and cannot be loaded with `AutoModel` mapping as is
     hf_to_vllm_mapper = WeightsMapper(
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 2444159b2..32a2ba1ef 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -498,8 +498,6 @@ class ModifiedWhisperEncoder(WhisperEncoder):
     dummy_inputs=UltravoxDummyInputsBuilder,
 )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 45f8fa079..7b408248e 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -330,8 +330,6 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
 class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
-    merge_by_field_config = True
-
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 0daf6bda6..b2feff133 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -775,7 +775,6 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
 class WhisperForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal
 ):
-    merge_by_field_config = True
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index f8e8847e8..9c5e3fb2b 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -426,7 +426,6 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
-    # TODO: After v0.13, remove merge_by_field_config attribute from model impls
     if merge_by_field_config is not None:
         logger.warning_once(
             "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
-- 
GitLab


From 43e75930318e1cb101fb6178876b027634f91f71 Mon Sep 17 00:00:00 2001
From: Yu Jiaqi <54204033+piood@users.noreply.github.com>
Date: Sat, 6 Dec 2025 17:12:53 +0800
Subject: [PATCH 160/258] Support tokenization_kwargs override (#29794)

Signed-off-by: piood <2477084691@qq.com>
---
 tests/conftest.py                             | 17 ++++++++++----
 .../models/multimodal/pooling/test_siglip.py  | 18 +++++++++++++--
 vllm/entrypoints/llm.py                       | 22 +++++++++++++++++--
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 0d456fb36..9f811d5d8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -405,6 +405,7 @@ class HfRunner:
         images: PromptImageInput | None = None,
         videos: PromptVideoInput | None = None,
         audios: PromptAudioInput | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -418,10 +419,18 @@ class HfRunner:
         all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
         for i, prompt in enumerate(prompts):
             if isinstance(prompt, str):
-                processor_kwargs: dict[str, Any] = {
-                    "text": prompt,
-                    "return_tensors": "pt",
-                }
+                # Create a copy to avoid modifying the original dict
+                processor_kwargs = (
+                    tokenization_kwargs.copy()
+                    if tokenization_kwargs is not None
+                    else {}
+                )
+                processor_kwargs.update(
+                    {
+                        "text": prompt,
+                        "return_tensors": "pt",
+                    }
+                )
                 if images is not None and (image := images[i]) is not None:
                     processor_kwargs["images"] = image
                 if videos is not None and (video := videos[i]) is not None:
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index 92ae115a1..72886cbf7 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
+
 import pytest
 from transformers import SiglipModel
 
@@ -35,7 +37,11 @@ def _run_test(
     model: str,
     *,
     dtype: str,
+    tokenization_kwargs: dict[str, Any] | None = None,
 ) -> None:
+    if tokenization_kwargs is None:
+        tokenization_kwargs = {}
+
     with vllm_runner(
         model,
         runner="pooling",
@@ -44,10 +50,14 @@ def _run_test(
         max_model_len=64,
         gpu_memory_utilization=0.7,
     ) as vllm_model:
-        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
+        vllm_outputs = vllm_model.embed(
+            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
+        )
 
     with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
-        all_inputs = hf_model.get_inputs(input_texts, images=input_images)
+        all_inputs = hf_model.get_inputs(
+            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
+        )
 
         all_outputs = []
         for inputs in all_inputs:
@@ -94,6 +104,10 @@ def test_models_text(
         input_images,  # type: ignore
         model,
         dtype=dtype,
+        tokenization_kwargs={
+            "padding": "max_length",
+            "max_length": 64,
+        },  # siglip2 was trained with this padding setting.
     )
 
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index add917634..913324fd5 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1076,6 +1076,7 @@ class LLM:
             params=pooling_params,
             use_tqdm=use_tqdm,
             lora_request=lora_request,
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
@@ -1113,6 +1114,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[EmbeddingRequestOutput]:
         """
         Generate an embedding vector for each prompt.
@@ -1150,6 +1152,7 @@ class LLM:
             pooling_params=pooling_params,
             lora_request=lora_request,
             pooling_task="embed",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         return [EmbeddingRequestOutput.from_base(item) for item in items]
@@ -1161,6 +1164,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ClassificationRequestOutput]:
         """
         Generate class logits for each prompt.
@@ -1196,6 +1200,7 @@ class LLM:
             pooling_params=pooling_params,
             lora_request=lora_request,
             pooling_task="classify",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         return [ClassificationRequestOutput.from_base(item) for item in items]
@@ -1209,6 +1214,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[PoolingRequestOutput]:
         """
         Generate rewards for each prompt.
@@ -1236,6 +1242,7 @@ class LLM:
             pooling_params=pooling_params,
             truncate_prompt_tokens=truncate_prompt_tokens,
             pooling_task="token_classify",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
     def _embedding_score(
@@ -1247,6 +1254,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
         encoded_output: list[PoolingRequestOutput] = self.encode(
             text_1 + text_2,
@@ -1255,6 +1263,7 @@ class LLM:
             lora_request=lora_request,
             pooling_params=pooling_params,
             pooling_task="embed",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)]
@@ -1279,6 +1288,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
         model_config = self.model_config
 
@@ -1294,7 +1304,8 @@ class LLM:
         pooling_params.verify("score", model_config)
         pooling_params_list = list[PoolingParams]()
 
-        tokenization_kwargs: dict[str, Any] = {}
+        local_kwargs = tokenization_kwargs or {}
+        tokenization_kwargs = local_kwargs.copy()
 
         _validate_truncation_size(
             model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs
@@ -1557,6 +1568,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: Sequence[LoRARequest] | LoRARequest | None,
         priority: list[int] | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> None:
         if isinstance(prompts, (str, dict)):
             # Convert a single prompt to a list.
@@ -1602,6 +1614,7 @@ class LLM:
                     if isinstance(lora_request, Sequence)
                     else lora_request,
                     priority=priority[i] if priority else 0,
+                    tokenization_kwargs=tokenization_kwargs,
                 )
                 added_request_ids.append(request_id)
         except Exception as e:
@@ -1665,9 +1678,12 @@ class LLM:
         *,
         lora_request: LoRARequest | None,
         priority: int,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for LLMEngine."""
-        tokenization_kwargs: dict[str, Any] = {}
+
+        local_kwargs = tokenization_kwargs or {}
+        tokenization_kwargs = local_kwargs.copy()
         _validate_truncation_size(
             self.model_config.max_model_len,
             params.truncate_prompt_tokens,
@@ -1690,6 +1706,7 @@ class LLM:
         params: SamplingParams | PoolingParams,
         lora_request: LoRARequest | None = None,
         priority: int = 0,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> str:
         prompt_text, _, _ = get_prompt_components(prompt)
         request_id = str(next(self.request_counter))
@@ -1700,6 +1717,7 @@ class LLM:
             params,
             lora_request=lora_request,
             priority=priority,
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         self.llm_engine.add_request(
-- 
GitLab


From 92c35abb242babbf592390960fb5a4155261e017 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Sat, 6 Dec 2025 01:24:03 -0800
Subject: [PATCH 161/258] [Misc] Fix circular import in
 vllm.transformers_utils.config (#30179)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
---
 vllm/transformers_utils/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index f926b523a..773fc05a5 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -25,7 +25,6 @@ from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
-from vllm.config.utils import getattr_iter
 from vllm.logger import init_logger
 from vllm.transformers_utils.utils import parse_safetensors_file_metadata
 
@@ -305,6 +304,8 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
 
 def patch_rope_parameters(config: PretrainedConfig) -> None:
     """Provide backwards compatibility for RoPE."""
+    from vllm.config.utils import getattr_iter
+
     rope_theta_names = ("rope_theta", "rotary_emb_base")
     rope_theta = getattr_iter(config, rope_theta_names, None)
     if Version(version("transformers")) < Version("5.0.0.dev0"):
-- 
GitLab


From 17a9abec2b7c38d55ebb6ea0dfe5cbe135993af2 Mon Sep 17 00:00:00 2001
From: Chukwuma Nwaugha <20521315+nwaughachukwuma@users.noreply.github.com>
Date: Sat, 6 Dec 2025 09:42:41 +0000
Subject: [PATCH 162/258] simplify requires_files list creation (#29656)

Signed-off-by: Chukwuma Nwaugha <nwaughac@gmail.com>
---
 use_existing_torch.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/use_existing_torch.py b/use_existing_torch.py
index fd4caa69e..e2d3f2ec8 100644
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -3,9 +3,7 @@
 
 import glob
 
-requires_files = glob.glob("requirements/*.txt")
-requires_files += ["pyproject.toml"]
-for file in requires_files:
+for file in (*glob.glob("requirements/*.txt"), "pyproject.toml"):
     print(f">>> cleaning {file}")
     with open(file) as f:
         lines = f.readlines()
@@ -17,5 +15,4 @@ for file in requires_files:
                     f.write(line)
                 else:
                     print(line.strip())
-    print(f"<<< done cleaning {file}")
-    print()
+    print(f"<<< done cleaning {file}\n")
-- 
GitLab


From 21bb323542bad9d7a7206d949f33734caf48c40c Mon Sep 17 00:00:00 2001
From: Viacheslav <slava.barinov2002@yandex.ru>
Date: Sat, 6 Dec 2025 15:04:14 +0300
Subject: [PATCH 163/258] Gigachat 3 tool parser and tests (#29905)

Signed-off-by: Viacheslav Barinov <viacheslav.teh@gmail.com>
---
 docs/features/tool_calling.md                 |  13 ++
 .../test_gigachat3_tool_parser.py             | 176 ++++++++++++++++
 .../openai/tool_parsers/__init__.py           |   4 +
 .../tool_parsers/gigachat3_tool_parser.py     | 190 ++++++++++++++++++
 4 files changed, 383 insertions(+)
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
 create mode 100644 vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py

diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index b6dfbf10b..c77fe4465 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -376,6 +376,19 @@ Supported models:
 
 Flags: `--tool-call-parser olmo3`
 
+### Gigachat 3 Models (`gigachat3`)
+
+Use chat template from the Hugging Face model files.
+
+Supported models:
+
+* `ai-sage/GigaChat3-702B-A36B-preview`
+* `ai-sage/GigaChat3-702B-A36B-preview-bf16`
+* `ai-sage/GigaChat3-10B-A1.8B`
+* `ai-sage/GigaChat3-10B-A1.8B-bf16`
+
+Flags: `--tool-call-parser gigachat3`
+
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
diff --git a/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
new file mode 100644
index 000000000..02c5189d0
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.utils import (
+    run_tool_extraction,
+    run_tool_extraction_streaming,
+)
+from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.tokenizers import TokenizerLike
+
+SIMPLE_ARGS_DICT = {
+    "action": "create",
+    "id": "preferences",
+}
+SIMPLE_FUNCTION_JSON = json.dumps(
+    {
+        "name": "manage_user_memory",
+        "arguments": SIMPLE_ARGS_DICT,
+    },
+    ensure_ascii=False,
+)
+SIMPLE_FUNCTION_OUTPUT = "function call" + SIMPLE_FUNCTION_JSON
+SIMPLE_FUNCTION_CALL = FunctionCall(
+    name="manage_user_memory",
+    arguments=json.dumps(SIMPLE_ARGS_DICT, ensure_ascii=False),
+)
+
+
+PARAMETERLESS_FUNCTION_JSON = json.dumps(
+    {
+        "name": "manage_user_memory",
+        "arguments": {},
+    },
+    ensure_ascii=False,
+)
+PARAMETERLESS_FUNCTION_OUTPUT = "function call" + PARAMETERLESS_FUNCTION_JSON
+PARAMETERLESS_FUNCTION_CALL = FunctionCall(
+    name="manage_user_memory",
+    arguments=json.dumps({}, ensure_ascii=False),
+)
+
+
+COMPLEX_ARGS_DICT = {
+    "action": "create",
+    "id": "preferences",
+    "content": {
+        "short_answers": True,
+        "hate_emojis": True,
+        "english_ui": False,
+        "russian_math_explanations": True,
+    },
+}
+COMPLEX_FUNCTION_JSON = json.dumps(
+    {
+        "name": "manage_user_memory",
+        "arguments": COMPLEX_ARGS_DICT,
+    },
+    ensure_ascii=False,
+)
+COMPLEX_FUNCTION_OUTPUT = "function call" + COMPLEX_FUNCTION_JSON
+COMPLEX_FUNCTION_CALL = FunctionCall(
+    name="manage_user_memory",
+    arguments=json.dumps(COMPLEX_ARGS_DICT, ensure_ascii=False),
+)
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
+        default_tokenizer
+    )
+    model_output = "How can I help you today?"
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
+    assert content == model_output
+    assert len(tool_calls) == 0
+
+
+TEST_CASES = [
+    pytest.param(
+        True,
+        SIMPLE_FUNCTION_OUTPUT,
+        [SIMPLE_FUNCTION_CALL],
+        None,
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False,
+        SIMPLE_FUNCTION_OUTPUT,
+        [SIMPLE_FUNCTION_CALL],
+        None,
+        id="simple_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        PARAMETERLESS_FUNCTION_OUTPUT,
+        [PARAMETERLESS_FUNCTION_CALL],
+        None,
+        id="parameterless_streaming",
+    ),
+    pytest.param(
+        False,
+        PARAMETERLESS_FUNCTION_OUTPUT,
+        [PARAMETERLESS_FUNCTION_CALL],
+        None,
+        id="parameterless_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        COMPLEX_FUNCTION_OUTPUT,
+        [COMPLEX_FUNCTION_CALL],
+        None,
+        id="complex_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLEX_FUNCTION_OUTPUT,
+        [COMPLEX_FUNCTION_CALL],
+        None,
+        id="complex_nonstreaming",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "streaming, model_output, expected_tool_calls, expected_content", TEST_CASES
+)
+def test_tool_call(
+    streaming: bool,
+    model_output: str,
+    expected_tool_calls: list[FunctionCall],
+    expected_content: str | None,
+    default_tokenizer: TokenizerLike,
+):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
+        default_tokenizer
+    )
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
+    assert content == expected_content
+    assert len(tool_calls) == len(expected_tool_calls)
+    for actual, expected in zip(tool_calls, expected_tool_calls):
+        assert actual.type == "function"
+        assert actual.function.name == expected.name
+        actual_args = json.loads(actual.function.arguments)
+        expected_args = json.loads(expected.arguments)
+        assert actual_args == expected_args
+
+
+def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
+        default_tokenizer
+    )
+    model_output_deltas = [
+        "function call",
+        COMPLEX_FUNCTION_JSON[:40],
+        COMPLEX_FUNCTION_JSON[40:],
+    ]
+    reconstructor = run_tool_extraction_streaming(
+        tool_parser,
+        model_output_deltas,
+        assert_one_tool_per_delta=False,
+    )
+    assert len(reconstructor.tool_calls) == 1
+    call = reconstructor.tool_calls[0]
+    assert call.type == "function"
+    assert call.function.name == "manage_user_memory"
+    args_dict = json.loads(call.function.arguments)
+    assert args_dict == COMPLEX_ARGS_DICT
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index ed43ea7ee..7be1263e8 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -134,6 +134,10 @@ _TOOL_PARSERS_TO_REGISTER = {
         "xlam_tool_parser",
         "xLAMToolParser",
     ),
+    "gigachat3": (
+        "gigachat3_tool_parser",
+        "GigaChat3ToolParser",
+    ),
 }
 
 
diff --git a/vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py
new file mode 100644
index 000000000..dd27ffa83
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py
@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+
+import regex as re
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
+from vllm.logger import init_logger
+from vllm.tokenizers import TokenizerLike
+
+logger = init_logger(__name__)
+
+REGEX_FUNCTION_CALL = re.compile(
+    r"function call(?:<\|role_sep\|>\n)?(\{.*)",
+    re.DOTALL,
+)
+
+NAME_REGEX = re.compile(
+    r'"name"\s*:\s*"([^"]*)"',
+    re.DOTALL,
+)
+
+ARGS_REGEX = re.compile(
+    r'"arguments"\s*:\s*(.*)',
+    re.DOTALL,
+)
+
+
+class GigaChat3ToolParser(ToolParser):
+    def __init__(self, tokenizer: TokenizerLike):
+        super().__init__(tokenizer)
+        self.tool_started: bool = False
+        self.tool_name_sent: bool = False
+        self.tool_id: str | None = None
+        self.prev_tool_call_arr: list[dict] = []
+        self.content_buffer: str = ""
+        self.trigger_start = "function call{"
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        match = REGEX_FUNCTION_CALL.search(model_output)
+        if not match:
+            return ExtractedToolCallInformation(
+                tools_called=False,
+                tool_calls=[],
+                content=model_output,
+            )
+        json_candidate = match.group(1).strip()
+        try:
+            data = json.loads(json_candidate)
+        except json.JSONDecodeError:
+            return ExtractedToolCallInformation(
+                tools_called=False,
+                tool_calls=[],
+                content=model_output,
+            )
+        if not (isinstance(data, dict) and "name" in data and "arguments" in data):
+            return ExtractedToolCallInformation(
+                tools_called=False,
+                tool_calls=[],
+                content=model_output,
+            )
+        name = data["name"]
+        args = data["arguments"]
+        if not isinstance(args, str):
+            args = json.dumps(args, ensure_ascii=False)
+
+        tool_calls = [
+            ToolCall(
+                type="function",
+                function=FunctionCall(
+                    name=name,
+                    arguments=args,
+                ),
+            )
+        ]
+        prefix = model_output[: match.start()]
+        content = prefix.rstrip() if prefix and prefix.strip() else None
+
+        return ExtractedToolCallInformation(
+            tools_called=True,
+            tool_calls=tool_calls,
+            content=content,
+        )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        func_name = None
+        cur_args = None
+        if not self.tool_started:
+            match = REGEX_FUNCTION_CALL.search(current_text)
+            if match:
+                self.tool_started = True
+                self.content_buffer = ""
+            else:
+                self.content_buffer += delta_text
+                clean_buffer = self.content_buffer.lstrip()
+                is_prefix = self.trigger_start.startswith(clean_buffer)
+                starts_with_trigger = clean_buffer.startswith(self.trigger_start)
+                if is_prefix or starts_with_trigger:
+                    return None
+                else:
+                    flush_text = self.content_buffer
+                    self.content_buffer = ""
+                    return DeltaMessage(content=flush_text)
+
+        match = REGEX_FUNCTION_CALL.search(current_text)
+        if not match:
+            return None
+        json_tail = match.group(1).strip()
+        name_match = NAME_REGEX.search(json_tail)
+        if name_match:
+            func_name = name_match.group(1)
+        args_match = ARGS_REGEX.search(json_tail)
+        if args_match:
+            cur_args = args_match.group(1).strip()
+            if cur_args.endswith("}"):  # last '}' end of json
+                try:
+                    candidate = cur_args[:-1].strip()
+                    json.loads(candidate)
+                    cur_args = candidate
+                except json.JSONDecodeError:
+                    pass
+        if not self.prev_tool_call_arr:
+            self.prev_tool_call_arr.append({})
+        if not self.tool_name_sent:
+            if not func_name:
+                return None
+            self.tool_name_sent = True
+            self.tool_id = make_tool_call_id()
+            self.prev_tool_call_arr[0]["name"] = func_name
+            return DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        index=0,
+                        id=self.tool_id,
+                        type="function",
+                        function=DeltaFunctionCall(
+                            name=func_name,
+                        ).model_dump(exclude_none=True),
+                    )
+                ],
+                content=None,
+            )
+        if cur_args is None:
+            return None
+        prev_args = self.prev_tool_call_arr[0].get("arguments", "")
+        if not prev_args:
+            delta_args = cur_args
+        elif cur_args.startswith(prev_args):
+            delta_args = cur_args[len(prev_args) :]
+        else:
+            return None
+        if not delta_args:
+            return None
+        self.prev_tool_call_arr[0]["arguments"] = cur_args
+        return DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=0,
+                    function=DeltaFunctionCall(
+                        arguments=delta_args,
+                    ).model_dump(exclude_none=True),
+                )
+            ],
+            content=None,
+        )
-- 
GitLab


From 671427efbf57d4e40370a336cf1d3a4d1fd8eb91 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 6 Dec 2025 21:40:02 +0800
Subject: [PATCH 164/258] [Model] Move `multimodal_cpu_fields` definition to
 field config (#30181)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/distributed/test_shm_storage.py        |   2 +-
 tests/multimodal/test_cache.py               |   2 +-
 tests/v1/test_serial_utils.py                |  21 +++-
 vllm/model_executor/models/glm4_1v.py        |  29 ++---
 vllm/model_executor/models/hunyuan_vision.py |   4 +-
 vllm/model_executor/models/interfaces.py     |  13 ++-
 vllm/model_executor/models/opencua.py        |   2 -
 vllm/model_executor/models/qwen2_5_vl.py     |   2 -
 vllm/model_executor/models/qwen2_vl.py       |  12 +-
 vllm/model_executor/models/qwen3_vl.py       |   6 +-
 vllm/multimodal/inputs.py                    | 116 +++++++++++++------
 vllm/multimodal/utils.py                     |   8 +-
 vllm/v1/serial_utils.py                      |  11 +-
 vllm/v1/worker/gpu_model_runner.py           |   5 -
 vllm/v1/worker/tpu_model_runner.py           |   3 -
 15 files changed, 141 insertions(+), 95 deletions(-)

diff --git a/tests/distributed/test_shm_storage.py b/tests/distributed/test_shm_storage.py
index b9a5c2244..9ab35a292 100644
--- a/tests/distributed/test_shm_storage.py
+++ b/tests/distributed/test_shm_storage.py
@@ -28,7 +28,7 @@ def _dummy_elem(modality: str, key: str, size: int):
         modality=modality,
         key=key,
         data=torch.empty((size,), dtype=torch.int8),
-        field=MultiModalSharedField(1),
+        field=MultiModalSharedField(batch_size=1),
     )
 
 
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index e4fcc3474..e641b1111 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -51,7 +51,7 @@ def _dummy_elem(
         modality=modality,
         key=key,
         data=data,
-        field=MultiModalSharedField(1),
+        field=MultiModalSharedField(batch_size=1),
     )
 
 
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index 00749c541..dbbbfce97 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -104,22 +104,31 @@ class MyRequest(msgspec.Struct):
 
 def test_multimodal_kwargs():
     e1 = MultiModalFieldElem(
-        "audio", "a0", torch.zeros(1000, dtype=torch.bfloat16), MultiModalBatchedField()
+        "audio",
+        "a0",
+        torch.zeros(1000, dtype=torch.bfloat16),
+        MultiModalBatchedField(),
     )
     e2 = MultiModalFieldElem(
         "video",
         "v0",
         [torch.zeros(1000, dtype=torch.int8) for _ in range(4)],
-        MultiModalFlatField([[slice(1, 2, 3), slice(4, 5, 6)], [slice(None, 2)]], 0),
+        MultiModalFlatField(
+            slices=[[slice(1, 2, 3), slice(4, 5, 6)], [slice(None, 2)]],
+            dim=0,
+        ),
     )
     e3 = MultiModalFieldElem(
-        "image", "i0", torch.zeros(1000, dtype=torch.int32), MultiModalSharedField(4)
+        "image",
+        "i0",
+        torch.zeros(1000, dtype=torch.int32),
+        MultiModalSharedField(batch_size=4),
     )
     e4 = MultiModalFieldElem(
         "image",
         "i1",
         torch.zeros(1000, dtype=torch.int32),
-        MultiModalFlatField([slice(1, 2, 3), slice(4, 5, 6)], 2),
+        MultiModalFlatField(slices=[slice(1, 2, 3), slice(4, 5, 6)], dim=2),
     )
     audio = MultiModalKwargsItem.from_elems([e1])
     video = MultiModalKwargsItem.from_elems([e2])
@@ -138,8 +147,8 @@ def test_multimodal_kwargs():
 
     total_len = sum(memoryview(x).cast("B").nbytes for x in encoded)
 
-    # expected total encoding length, should be 14306, +-20 for minor changes
-    assert 14275 <= total_len <= 14325
+    # expected total encoding length, should be 14395, +-20 for minor changes
+    assert 14375 <= total_len <= 14425
     decoded = decoder.decode(encoded).mm[0]
     assert isinstance(decoded, MultiModalKwargsItems)
 
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 3cb53f2cb..39a837b78 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -787,10 +787,10 @@ class Glm4vVisionTransformer(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        grid_thw: list[list[int]],
+        grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
-        # Convert grid_thw to tensor (always expecting list format now)
-        grid_thw = torch.tensor(grid_thw, device=x.device, dtype=torch.long)
+        if isinstance(grid_thw, list):
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
 
         # patchify
         x = x.to(device=self.device, dtype=self.dtype)
@@ -805,7 +805,8 @@ class Glm4vVisionTransformer(nn.Module):
         cu_seqlens = torch.repeat_interleave(
             grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
         ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
 
         # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
         max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
@@ -1548,7 +1549,6 @@ class Glm4vForConditionalGeneration(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
@@ -1559,12 +1559,10 @@ class Glm4vForConditionalGeneration(
                     self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
                 )
             else:
-                image_embeds = self.visual(pixel_values, grid_thw=grid_thw.tolist())
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+
         merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return image_embeds.split(sizes)
 
     def _process_video_input(
@@ -1572,7 +1570,6 @@ class Glm4vForConditionalGeneration(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
@@ -1588,15 +1585,11 @@ class Glm4vForConditionalGeneration(
                     rope_type="rope_3d",
                 )
             else:
-                video_embeds = self.visual(
-                    pixel_values_videos, grid_thw=grid_thw.tolist()
-                )
+                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 52ce9564c..e5c1be626 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -563,7 +563,7 @@ def _hunyuan_vl_field_config(hf_inputs: Mapping[str, torch.Tensor]):
     return dict(
         pixel_values=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
         image_embeds=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
-        image_grid_thw=MultiModalFieldConfig.batched("image"),
+        image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
     )
 
 
@@ -786,8 +786,6 @@ class HunYuanVLForConditionalGeneration(
     SupportsQuant,
     SupportsXDRoPE,
 ):
-    multimodal_cpu_fields = {"image_grid_thw"}
-
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 416ab236c..607ff5583 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -84,9 +84,9 @@ class SupportsMultiModal(Protocol):
     `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
     """
 
-    multimodal_cpu_fields: ClassVar[Set[str]] = frozenset()
+    multimodal_cpu_fields: ClassVar[Set[str] | None] = None
     """
-    A set indicating CPU-only multimodal fields.
+    [DEPRECATED] A set indicating CPU-only multimodal fields.
     """
 
     _processor_factory: ClassVar[_ProcessorFactories]
@@ -279,6 +279,15 @@ def supports_multimodal(
                 "please remove the override from your model."
             )
 
+        multimodal_cpu_fields = getattr(model, "multimodal_cpu_fields", None)
+        if multimodal_cpu_fields is not None:
+            raise ValueError(
+                "`multimodal_cpu_fields` is no longer effective, "
+                "please set `keep_on_cpu=True` in `MultiModalFieldConfig` "
+                "(refer to https://github.com/vllm-project/vllm/pull/30181), "
+                "and then remove the override from your model."
+            )
+
     return res
 
 
diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py
index 76a2d1cc2..23668cc2b 100644
--- a/vllm/model_executor/models/opencua.py
+++ b/vllm/model_executor/models/opencua.py
@@ -201,8 +201,6 @@ class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder):
     dummy_inputs=OpenCUADummyInputsBuilder,
 )
 class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
-    multimodal_cpu_fields = {"image_grid_thw"}
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 488af192b..3cc3a3a78 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1039,8 +1039,6 @@ class Qwen2_5_VLForConditionalGeneration(
     SupportsMultiModalPruning,
     SupportsMRoPE,
 ):
-    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 9da5080f8..885e172d1 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -811,14 +811,14 @@ def _create_qwen2vl_field_factory(
             image_embeds=MultiModalFieldConfig.flat_from_sizes(
                 "image", image_embed_grid_sizes
             ),
-            image_grid_thw=MultiModalFieldConfig.batched("image"),
+            image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
             pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
                 "video", video_grid_sizes
             ),
             video_embeds=MultiModalFieldConfig.flat_from_sizes(
                 "video", video_embed_grid_sizes
             ),
-            video_grid_thw=MultiModalFieldConfig.batched("video"),
+            video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True),
         )
 
     return _qwen2vl_field_config
@@ -1131,8 +1131,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
 class Qwen2VLForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
-
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -1393,9 +1391,11 @@ class Qwen2VLForConditionalGeneration(
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
             if self.use_data_parallel:
-                grid_thw_list = grid_thw.tolist()
                 return run_dp_sharded_mrope_vision_model(
-                    self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d"
+                    self.visual,
+                    pixel_values_videos,
+                    grid_thw.tolist(),
+                    rope_type="rope_3d",
                 )
             else:
                 video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index a5b10c958..1add39d6b 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -984,14 +984,14 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
             image_embeds=MultiModalFieldConfig.flat_from_sizes(
                 "image", image_grid_sizes
             ),
-            image_grid_thw=MultiModalFieldConfig.batched("image"),
+            image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
             pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
                 "video", video_grid_sizes
             ),
             video_embeds=MultiModalFieldConfig.flat_from_sizes(
                 "video", video_grid_sizes
             ),
-            video_grid_thw=MultiModalFieldConfig.batched("video"),
+            video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True),
         )
 
     def _get_prompt_updates(
@@ -1190,8 +1190,6 @@ class Qwen3VLForConditionalGeneration(
     SupportsMRoPE,
     SupportsEagle3,
 ):
-    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 32f15240c..d9118f5b9 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -3,7 +3,7 @@
 
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
-from collections.abc import Mapping, Sequence, Set
+from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
 from functools import partial
 from itertools import accumulate
@@ -223,6 +223,23 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
     return a == b
 
 
+def _nested_tensors_h2d(
+    tensors: NestedTensors,
+    device: torch.types.Device,
+) -> NestedTensors:
+    if device is None:
+        return tensors
+
+    return json_map_leaves(
+        (
+            lambda x: x.to(device=device, non_blocking=True)
+            if isinstance(x, torch.Tensor)
+            else x
+        ),
+        tensors,
+    )
+
+
 BatchedTensorInputs: TypeAlias = dict[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
@@ -334,7 +351,7 @@ class MultiModalFieldElem:
         )  # noqa: E721
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class BaseMultiModalField(ABC):
     """
     Defines how to interpret tensor data belonging to a keyword argument in
@@ -342,6 +359,12 @@ class BaseMultiModalField(ABC):
     multi-modal items, and vice versa.
     """
 
+    keep_on_cpu: bool = False
+    """
+    If `True`, then this field is excluded from being moved to the accelerator
+    when `MultiModalKwargsItems.get_data()` is called to batch the data.
+    """
+
     def _field_factory(self, *, modality: str, key: str):
         f = partial(
             MultiModalFieldElem,
@@ -386,6 +409,7 @@ class BaseMultiModalField(ABC):
         self,
         elems: list[MultiModalFieldElem],
         *,
+        device: torch.types.Device = None,
         pin_memory: bool = False,
     ) -> NestedTensors:
         """
@@ -399,11 +423,17 @@ class BaseMultiModalField(ABC):
         if len(set(field_types)) > 1:
             raise ValueError(f"Cannot merge different {field_types=}")
 
+        if device is not None and self.keep_on_cpu:
+            device = "cpu"
+        if pin_memory and self.keep_on_cpu:
+            pin_memory = False
+
         batch = [elem.data for elem in elems]
-        return self._reduce_data(batch, pin_memory=pin_memory)
+        out = self._reduce_data(batch, pin_memory=pin_memory)
+        return _nested_tensors_h2d(out, device=device)
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class MultiModalBatchedField(BaseMultiModalField):
     """
     Info:
@@ -445,7 +475,7 @@ class MultiModalBatchedField(BaseMultiModalField):
         return batch
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class MultiModalFlatField(BaseMultiModalField):
     """
     Info:
@@ -505,7 +535,7 @@ class MultiModalFlatField(BaseMultiModalField):
         return [e for elem in batch for e in elem]
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, kw_only=True)
 class MultiModalSharedField(BaseMultiModalField):
     """
     Info:
@@ -532,9 +562,10 @@ class MultiModalSharedField(BaseMultiModalField):
         return batch[0]
 
 
+@dataclass(frozen=True)
 class MultiModalFieldConfig:
     @staticmethod
-    def batched(modality: str):
+    def batched(modality: str, *, keep_on_cpu: bool = False):
         """
         Defines a field where an element in the batch is obtained by
         indexing into the first dimension of the underlying data.
@@ -542,6 +573,7 @@ class MultiModalFieldConfig:
         Args:
             modality: The modality of the multi-modal item that uses this
                 keyword argument.
+            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.
 
         Example:
 
@@ -558,7 +590,7 @@ class MultiModalFieldConfig:
         ```
         """
         return MultiModalFieldConfig(
-            field=MultiModalBatchedField(),
+            field=MultiModalBatchedField(keep_on_cpu=keep_on_cpu),
             modality=modality,
         )
 
@@ -567,6 +599,8 @@ class MultiModalFieldConfig:
         modality: str,
         slices: Sequence[slice] | Sequence[Sequence[slice]],
         dim: int = 0,
+        *,
+        keep_on_cpu: bool = False,
     ):
         """
         Defines a field where an element in the batch is obtained by
@@ -579,6 +613,7 @@ class MultiModalFieldConfig:
                 slices (dim>0) that is used to extract the data corresponding
                 to it.
             dim: The dimension to extract data, default to 0.
+            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.
 
         Example:
 
@@ -613,12 +648,22 @@ class MultiModalFieldConfig:
         ```
         """
         return MultiModalFieldConfig(
-            field=MultiModalFlatField(slices=slices, dim=dim),
+            field=MultiModalFlatField(
+                slices=slices,
+                dim=dim,
+                keep_on_cpu=keep_on_cpu,
+            ),
             modality=modality,
         )
 
     @staticmethod
-    def flat_from_sizes(modality: str, size_per_item: "torch.Tensor", dim: int = 0):
+    def flat_from_sizes(
+        modality: str,
+        size_per_item: "torch.Tensor",
+        dim: int = 0,
+        *,
+        keep_on_cpu: bool = False,
+    ):
         """
         Defines a field where an element in the batch is obtained by
         slicing along the first dimension of the underlying data.
@@ -629,6 +674,7 @@ class MultiModalFieldConfig:
             size_per_item: For each multi-modal item, the size of the slice
                 that is used to extract the data corresponding to it.
             dim: The dimension to slice, default to 0.
+            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.
 
         Example:
 
@@ -676,10 +722,20 @@ class MultiModalFieldConfig:
             for i in range(len(size_per_item))
         ]
 
-        return MultiModalFieldConfig.flat(modality, slices, dim=dim)
+        return MultiModalFieldConfig.flat(
+            modality,
+            slices,
+            dim=dim,
+            keep_on_cpu=keep_on_cpu,
+        )
 
     @staticmethod
-    def shared(modality: str, batch_size: int):
+    def shared(
+        modality: str,
+        batch_size: int,
+        *,
+        keep_on_cpu: bool = False,
+    ):
         """
         Defines a field where an element in the batch is obtained by
         taking the entirety of the underlying data.
@@ -690,6 +746,7 @@ class MultiModalFieldConfig:
             modality: The modality of the multi-modal item that uses this
                 keyword argument.
             batch_size: The number of multi-modal items which share this data.
+            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.
 
         Example:
 
@@ -708,18 +765,15 @@ class MultiModalFieldConfig:
         ```
         """
         return MultiModalFieldConfig(
-            field=MultiModalSharedField(batch_size),
+            field=MultiModalSharedField(
+                batch_size=batch_size,
+                keep_on_cpu=keep_on_cpu,
+            ),
             modality=modality,
         )
 
-    def __init__(self, field: BaseMultiModalField, modality: str) -> None:
-        super().__init__()
-
-        self.field = field
-        self.modality = modality
-
-    def __repr__(self) -> str:
-        return f"MultiModalFieldConfig(field={self.field}, modality={self.modality})"
+    field: BaseMultiModalField
+    modality: str
 
     def build_elems(
         self,
@@ -744,7 +798,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
             modality=modality,
             key="dummy",
             data=torch.empty(nbytes, dtype=torch.uint8),
-            field=MultiModalSharedField(1),
+            field=MultiModalSharedField(batch_size=1),
         )
         return MultiModalKwargsItem.from_elems([mm_elem])
 
@@ -844,7 +898,6 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
         *,
         device: torch.types.Device = None,
         pin_memory: bool = False,
-        cpu_fields: Set[str] = frozenset(),
     ) -> BatchedTensorInputs:
         """Construct a dictionary of keyword arguments to pass to the model."""
         elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
@@ -859,21 +912,14 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
                     elems_by_key[key].append(elem)
 
         data = {
-            key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
+            key: elems[0].field.reduce_data(
+                elems,
+                device=device,
+                pin_memory=pin_memory,
+            )
             for key, elems in elems_by_key.items()
         }
 
-        if device is not None:
-            for k in data.keys() - cpu_fields:
-                data[k] = json_map_leaves(
-                    (
-                        lambda x: x.to(device=device, non_blocking=True)
-                        if isinstance(x, torch.Tensor)
-                        else x
-                    ),
-                    data[k],
-                )
-
         return data
 
 
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 9c5e3fb2b..d4bdc55e5 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -413,7 +413,7 @@ def group_mm_kwargs_by_modality(
     device: torch.types.Device = None,
     pin_memory: bool = False,
     merge_by_field_config: bool | None = None,
-    multimodal_cpu_fields: Set[str] = frozenset(),
+    multimodal_cpu_fields: Set[str] | None = None,
 ) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]:
     """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same
     modality together into the same `MultiModalKwargs` instance.
@@ -431,6 +431,11 @@ def group_mm_kwargs_by_modality(
             "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
             "is deprecated and will be removed in v0.13."
         )
+    if multimodal_cpu_fields is not None:
+        logger.warning_once(
+            "The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
+            "is deprecated and will be removed in v0.13."
+        )
 
     from vllm.multimodal.inputs import MultiModalKwargsItems
 
@@ -440,7 +445,6 @@ def group_mm_kwargs_by_modality(
         mm_kwargs_data = mm_kwargs_items.get_data(
             device=device,
             pin_memory=pin_memory,
-            cpu_fields=multimodal_cpu_fields,
         )
 
         yield modality, len(items_lst), mm_kwargs_data
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index 14ae487f3..a3c30e368 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -269,10 +269,11 @@ class MsgpackEncoder:
         name = MMF_CLASS_TO_FACTORY.get(field.__class__)
         if not name:
             raise TypeError(f"Unsupported field type: {field.__class__}")
+
         # We just need to copy all of the field values in order
         # which will be then used to reconstruct the field.
-        field_values = (getattr(field, f.name) for f in dataclasses.fields(field))
-        return name, *field_values
+        factory_kw = {f.name: getattr(field, f.name) for f in dataclasses.fields(field)}
+        return name, factory_kw
 
 
 class MsgpackDecoder:
@@ -392,15 +393,15 @@ class MsgpackDecoder:
             obj["data"] = self._decode_nested_tensors(obj["data"])
 
         # Reconstruct the field processor using MultiModalFieldConfig
-        factory_meth_name, *field_args = obj["field"]
+        factory_meth_name, factory_kw = obj["field"]
         factory_meth = getattr(MultiModalFieldConfig, factory_meth_name)
 
         # Special case: decode the union "slices" field of
         # MultiModalFlatField
         if factory_meth_name == "flat":
-            field_args[0] = self._decode_nested_slices(field_args[0])
+            factory_kw["slices"] = self._decode_nested_slices(factory_kw["slices"])
 
-        obj["field"] = factory_meth(None, *field_args).field
+        obj["field"] = factory_meth("", **factory_kw).field
         return MultiModalFieldElem(**obj)
 
     def _decode_nested_tensors(self, obj: Any) -> NestedTensors:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b6a814522..a50360ab0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1097,7 +1097,6 @@ class GPUModelRunner(
             device=self.device,
             pin_memory=self.pin_memory,
             merge_by_field_config=model.merge_by_field_config,
-            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             mm_kwargs_combined.update(mm_kwargs_group)
 
@@ -2109,7 +2108,6 @@ class GPUModelRunner(
             mm_kwargs,
             device=self.device,
             pin_memory=self.pin_memory,
-            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             curr_group_outputs: list[torch.Tensor] = []
 
@@ -2135,7 +2133,6 @@ class GPUModelRunner(
                             [video_mm_kwargs_item],
                             device=self.device,
                             pin_memory=self.pin_memory,
-                            multimodal_cpu_fields=model.multimodal_cpu_fields,
                         )
                     )
 
@@ -3887,14 +3884,12 @@ class GPUModelRunner(
         dummy_mm_item = dummy_mm_data[modality][0]
         dummy_mm_items = [dummy_mm_item] * max_items_per_batch
 
-        model = cast(SupportsMultiModal, self.model)
         return next(
             mm_kwargs_group
             for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
                 dummy_mm_items,
                 device=self.device,
                 pin_memory=self.pin_memory,
-                multimodal_cpu_fields=model.multimodal_cpu_fields,
             )
         )
 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 292f12969..283f21b77 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -969,7 +969,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             mm_kwargs,
             device=self.device,
             pin_memory=self.pin_memory,
-            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             # Run the encoder.
             # `curr_group_outputs` is either of the following:
@@ -2050,14 +2049,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         dummy_mm_item = dummy_mm_data[modality][0]
         dummy_mm_items = [dummy_mm_item] * max_items_per_batch
 
-        model = cast(SupportsMultiModal, self.model)
         return next(
             grouped_mm_kwargs
             for _, _, grouped_mm_kwargs in group_mm_kwargs_by_modality(
                 dummy_mm_items,
                 device=self.device,
                 pin_memory=self.pin_memory,
-                multimodal_cpu_fields=model.multimodal_cpu_fields,
             )
         )
 
-- 
GitLab


From 421125d03a110df7d49f84c7cf8ee9fa089d1dff Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Sat, 6 Dec 2025 14:34:34 -0800
Subject: [PATCH 165/258] [ez] move harmony utils to parser folder (#30117)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
---
 tests/entrypoints/openai/parser/__init__.py                 | 0
 tests/entrypoints/{ => openai/parser}/test_harmony_utils.py | 2 +-
 tests/entrypoints/openai/test_response_api_with_harmony.py  | 2 +-
 vllm/entrypoints/context.py                                 | 2 +-
 vllm/entrypoints/{ => openai/parser}/harmony_utils.py       | 0
 vllm/entrypoints/openai/serving_chat.py                     | 4 ++--
 vllm/entrypoints/openai/serving_responses.py                | 4 ++--
 vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py  | 2 +-
 vllm/reasoning/gptoss_reasoning_parser.py                   | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)
 create mode 100644 tests/entrypoints/openai/parser/__init__.py
 rename tests/entrypoints/{ => openai/parser}/test_harmony_utils.py (99%)
 rename vllm/entrypoints/{ => openai/parser}/harmony_utils.py (100%)

diff --git a/tests/entrypoints/openai/parser/__init__.py b/tests/entrypoints/openai/parser/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
similarity index 99%
rename from tests/entrypoints/test_harmony_utils.py
rename to tests/entrypoints/openai/parser/test_harmony_utils.py
index 82ff562d5..ae6f558f2 100644
--- a/tests/entrypoints/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -4,7 +4,7 @@
 from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem
 from openai_harmony import Author, Message, Role, TextContent
 
-from vllm.entrypoints.harmony_utils import (
+from vllm.entrypoints.openai.parser.harmony_utils import (
     has_custom_tools,
     parse_input_to_harmony_message,
     parse_output_message,
diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 8fd3545ec..6f2a50020 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -726,7 +726,7 @@ async def test_function_calling_required(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_system_message_with_tools(client: OpenAI, model_name: str):
-    from vllm.entrypoints.harmony_utils import get_system_message
+    from vllm.entrypoints.openai.parser.harmony_utils import get_system_message
 
     # Test with custom tools enabled - commentary channel should be available
     sys_msg = get_system_message(with_custom_tools=True)
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index f50c473d7..a484a437c 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -19,7 +19,7 @@ from vllm import envs
 from vllm.entrypoints.chat_utils import (
     ChatTemplateContentFormatOption,
 )
-from vllm.entrypoints.harmony_utils import (
+from vllm.entrypoints.openai.parser.harmony_utils import (
     get_encoding,
     get_streamable_parser_for_assistant,
     render_for_completion,
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
similarity index 100%
rename from vllm/entrypoints/harmony_utils.py
rename to vllm/entrypoints/openai/parser/harmony_utils.py
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 9b7bc461e..c6333d170 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -21,7 +21,8 @@ from vllm.entrypoints.chat_utils import (
     get_history_tool_calls_cnt,
     make_tool_call_id,
 )
-from vllm.entrypoints.harmony_utils import (
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.parser.harmony_utils import (
     get_developer_message,
     get_stop_tokens_for_assistant_actions,
     get_streamable_parser_for_assistant,
@@ -30,7 +31,6 @@ from vllm.entrypoints.harmony_utils import (
     parse_input_to_harmony_message,
     render_for_completion,
 )
-from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionLogProb,
     ChatCompletionLogProbs,
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 1eb1243e7..91616a78e 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -64,7 +64,8 @@ from vllm.entrypoints.context import (
     SimpleContext,
     StreamingHarmonyContext,
 )
-from vllm.entrypoints.harmony_utils import (
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.parser.harmony_utils import (
     construct_harmony_previous_input_messages,
     get_developer_message,
     get_stop_tokens_for_assistant_actions,
@@ -76,7 +77,6 @@ from vllm.entrypoints.harmony_utils import (
     parse_response_input,
     render_for_completion,
 )
-from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
     DeltaMessage,
     ErrorResponse,
diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
index 8bdf35d40..387e87f20 100644
--- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
@@ -4,7 +4,7 @@ import json
 from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
-from vllm.entrypoints.harmony_utils import parse_output_into_messages
+from vllm.entrypoints.openai.parser.harmony_utils import parse_output_into_messages
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     DeltaMessage,
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index fa45b1285..e0920ef31 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -5,7 +5,7 @@ from collections.abc import Sequence
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.harmony_utils import parse_chat_output
+from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
-- 
GitLab


From 8d3da4c79ddbd199d4185c505bcf3a5d7e7a3316 Mon Sep 17 00:00:00 2001
From: AuruTus <33182215+AuruTus@users.noreply.github.com>
Date: Sun, 7 Dec 2025 08:21:03 +0800
Subject: [PATCH 166/258] [MISC]: change NIXL compatibility hash logging level
 to debug (#30182)

---
 vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 7aa12e999..514b8534a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -189,7 +189,7 @@ def compute_nixl_compatibility_hash(
     }
 
     compat_hash = hash_factors(factors)
-    logger.info(
+    logger.debug(
         "NIXL compatibility hash: %s (model=%s, dtype=%s, num_kv_heads=%d, "
         "cache_dtype=%s, attn_backend=%s)",
         compat_hash,
-- 
GitLab


From cbedb703cc594632db796f9ca748ea4b7b4e8435 Mon Sep 17 00:00:00 2001
From: Yanan Cao <gmagogsfm@users.noreply.github.com>
Date: Sat, 6 Dec 2025 18:53:42 -0800
Subject: [PATCH 167/258] [Frontend] Remove confusing -O.xx flag error (#30169)

Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
---
 tests/utils_/test_argparse_utils.py | 19 -------------------
 vllm/utils/argparse_utils.py        |  7 -------
 2 files changed, 26 deletions(-)

diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py
index 6f24c77e0..fbc278404 100644
--- a/tests/utils_/test_argparse_utils.py
+++ b/tests/utils_/test_argparse_utils.py
@@ -458,22 +458,3 @@ def test_flat_product():
         (3, 4, "a", 5, 6),
         (3, 4, "b", 5, 6),
     ]
-
-
-def test_o_dotted_syntax_error():
-    """Test that -O.* dotted syntax raises a clear error message."""
-    parser = FlexibleArgumentParser()
-    parser.add_argument("-cc", "--compilation-config", type=json.loads)
-
-    # Test that -O.* syntax raises a clear ValueError
-    with pytest.raises(ValueError, match=r"The -O\.\* syntax is no longer supported"):
-        parser.parse_args(["-O.backend=eager"])
-
-    with pytest.raises(ValueError, match=r"Please use -cc\.\* instead"):
-        parser.parse_args(["-O.mode=2"])
-
-    with pytest.raises(
-        ValueError,
-        match=r"replace '-O\.cudagraph_mode=NONE' with '-cc\.cudagraph_mode=NONE'",
-    ):
-        parser.parse_args(["-O.cudagraph_mode=NONE"])
diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py
index 356f383cc..87ee6f54c 100644
--- a/vllm/utils/argparse_utils.py
+++ b/vllm/utils/argparse_utils.py
@@ -244,13 +244,6 @@ class FlexibleArgumentParser(ArgumentParser):
                 else:
                     key = pattern.sub(repl, arg, count=1)
                     processed_args.append(key)
-            elif arg.startswith("-O."):
-                # Provide clear error for deprecated -O.* syntax
-                raise ValueError(
-                    f"The -O.* syntax is no longer supported. "
-                    f"Please use -cc.* instead. "
-                    f"For example, replace '{arg}' with '{arg.replace('-O', '-cc', 1)}'"
-                )
             elif arg.startswith("-O") and arg != "-O":
                 # allow -O flag to be used without space, e.g. -O3 or -Odecode
                 # also handle -O=<optimization_level> here
-- 
GitLab


From dce6d229f7d405a4757aa8f0a76ba62f0e39eaa4 Mon Sep 17 00:00:00 2001
From: jeremyteboul <80506730+jeremyteboul@users.noreply.github.com>
Date: Sat, 6 Dec 2025 20:34:24 -0800
Subject: [PATCH 168/258] Support multiple image/audio embeddings per requests
 (#29988)

Signed-off-by: Jeremy Teboul <jeremyteboul@fb.com>
Co-authored-by: Jeremy Teboul <jeremyteboul@fb.com>
---
 docs/features/multimodal_inputs.md   |  10 +-
 tests/entrypoints/test_chat_utils.py | 178 +++++++++++++++++++++++++++
 vllm/entrypoints/chat_utils.py       |  30 ++---
 3 files changed, 198 insertions(+), 20 deletions(-)

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 0adb32a7a..c3fd726e9 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -445,7 +445,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
 
 For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features.
 
-#### Audio Embeddings
+#### Audio Embedding Inputs
 
 You can pass pre-computed audio embeddings similar to image embeddings:
 
@@ -892,5 +892,11 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
     ```
 
 !!! note
-    Only one message can contain `{"type": "image_embeds"}`.
+    Multiple messages can now contain `{"type": "image_embeds"}`, enabling you to pass multiple image embeddings in a single request (similar to regular images). The number of embeddings is limited by `--limit-mm-per-prompt`.
+
+    **Important**: The embedding shape format differs based on the number of embeddings:
+
+    - **Single embedding**: 3D tensor of shape `(1, feature_size, hidden_size)`
+    - **Multiple embeddings**: List of 2D tensors, each of shape `(feature_size, hidden_size)`
+
     If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 75be34820..527322c71 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -6,6 +6,7 @@ from collections.abc import Mapping
 from typing import Literal
 
 import pytest
+import torch
 from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
 
 from vllm.assets.audio import AudioAsset
@@ -915,6 +916,183 @@ async def test_parse_chat_messages_audio_embeds_async(
     _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
 
 
+def test_parse_chat_messages_multiple_image_embeds(
+    phi3v_model_config_image_embeds,
+):
+    """Test that multiple image_embeds in a single message are now supported.
+
+    This test validates the fix for the limitation that previously only allowed
+    one message with {'type': 'image_embeds'}. Now multiple image embeddings
+    can be provided in a single request, similar to regular images.
+    """
+    # Create two sample image embedding tensors
+    image_embedding_1 = torch.randn(256, 1024)
+    image_embedding_2 = torch.randn(128, 1024)
+
+    # Encode them as base64 using the convenience function
+    base64_image_embedding_1 = tensor2base64(image_embedding_1)
+    base64_image_embedding_2 = tensor2base64(image_embedding_2)
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_1,
+                    },
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_2,
+                    },
+                    {"type": "text", "text": "Describe these two images."},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
+        content_format="string",
+    )
+
+    # Verify conversation structure
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
+        }
+    ]
+
+    # Verify mm_data contains a list of embeddings (not a single embedding)
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 2
+
+    # Verify each embedding has the correct shape
+    assert isinstance(mm_data["image"][0], torch.Tensor)
+    assert mm_data["image"][0].shape == image_embedding_1.shape
+    assert isinstance(mm_data["image"][1], torch.Tensor)
+    assert mm_data["image"][1].shape == image_embedding_2.shape
+
+    # Verify UUIDs (None since we didn't provide any)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
+def test_parse_chat_messages_multiple_image_embeds_with_uuids(
+    phi3v_model_config_image_embeds,
+):
+    """Test multiple image_embeds with UUIDs.
+
+    This validates that UUIDs are properly tracked for multiple embeddings.
+    """
+    uuid1 = "image-uuid-1"
+    uuid2 = "image-uuid-2"
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": None,
+                        "uuid": uuid1,
+                    },
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": None,
+                        "uuid": uuid2,
+                    },
+                    {"type": "text", "text": "Compare these images."},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
+        content_format="string",
+    )
+
+    # Verify conversation structure
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nCompare these images.",
+        }
+    ]
+
+    # Verify mm_data contains a list with None values (UUID references)
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 2
+    assert mm_data["image"][0] is None
+    assert mm_data["image"][1] is None
+
+    # Verify UUIDs are correctly tracked
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[uuid1, uuid2])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_image_embeds_async(
+    phi3v_model_config_image_embeds,
+):
+    """Test multiple image_embeds with async parsing.
+
+    This validates the AsyncMultiModalItemTracker also supports multiple embeddings.
+    """
+    # Create two sample image embedding tensors
+    image_embedding_1 = torch.randn(200, 768)
+    image_embedding_2 = torch.randn(150, 768)
+
+    # Encode them as base64 using the convenience function
+    base64_image_embedding_1 = tensor2base64(image_embedding_1)
+    base64_image_embedding_2 = tensor2base64(image_embedding_2)
+
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_1,
+                    },
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_2,
+                    },
+                    {"type": "text", "text": "What do these images show?"},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
+        content_format="string",
+    )
+
+    # Verify conversation structure
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat do these images show?",
+        }
+    ]
+
+    # Await the future and verify mm_data
+    mm_data = await mm_future
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 2
+
+    # Verify each embedding has the correct shape
+    assert isinstance(mm_data["image"][0], torch.Tensor)
+    assert mm_data["image"][0].shape == image_embedding_1.shape
+    assert isinstance(mm_data["image"][1], torch.Tensor)
+    assert mm_data["image"][1].shape == image_embedding_2.shape
+
+    # Verify UUIDs
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
 @pytest.mark.asyncio
 async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
     phi3v_model_config_image_embeds,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 077fe681b..aceaa8bd4 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -694,16 +694,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
             raise ValueError("Mixing raw image and embedding inputs is not allowed")
 
         if "image_embeds" in uuids_by_modality:
-            image_embeds_uuids = uuids_by_modality["image_embeds"]
-            if len(image_embeds_uuids) > 1:
-                raise ValueError("Only one message can have {'type': 'image_embeds'}")
             mm_uuids["image"] = uuids_by_modality["image_embeds"]
         if "image" in uuids_by_modality:
             mm_uuids["image"] = uuids_by_modality["image"]  # UUIDs of images
         if "audio_embeds" in uuids_by_modality:
-            audio_embeds_uuids = uuids_by_modality["audio_embeds"]
-            if len(audio_embeds_uuids) > 1:
-                raise ValueError("Only one message can have {'type': 'audio_embeds'}")
             mm_uuids["audio"] = uuids_by_modality["audio_embeds"]
         if "audio" in uuids_by_modality:
             mm_uuids["audio"] = uuids_by_modality["audio"]  # UUIDs of audios
@@ -729,16 +723,16 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
 
         if "image_embeds" in items_by_modality:
             image_embeds_lst = items_by_modality["image_embeds"]
-            if len(image_embeds_lst) > 1:
-                raise ValueError("Only one message can have {'type': 'image_embeds'}")
-            mm_inputs["image"] = image_embeds_lst[0]
+            mm_inputs["image"] = (
+                image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0]
+            )
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio_embeds" in items_by_modality:
             audio_embeds_lst = items_by_modality["audio_embeds"]
-            if len(audio_embeds_lst) > 1:
-                raise ValueError("Only one message can have {'type': 'audio_embeds'}")
-            mm_inputs["audio"] = audio_embeds_lst[0]
+            mm_inputs["audio"] = (
+                audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0]
+            )
         if "audio" in items_by_modality:
             mm_inputs["audio"] = items_by_modality["audio"]  # A list of audios
         if "video" in items_by_modality:
@@ -771,16 +765,16 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
 
         if "image_embeds" in items_by_modality:
             image_embeds_lst = items_by_modality["image_embeds"]
-            if len(image_embeds_lst) > 1:
-                raise ValueError("Only one message can have {'type': 'image_embeds'}")
-            mm_inputs["image"] = image_embeds_lst[0]
+            mm_inputs["image"] = (
+                image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0]
+            )
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio_embeds" in items_by_modality:
             audio_embeds_lst = items_by_modality["audio_embeds"]
-            if len(audio_embeds_lst) > 1:
-                raise ValueError("Only one message can have {'type': 'audio_embeds'}")
-            mm_inputs["audio"] = audio_embeds_lst[0]
+            mm_inputs["audio"] = (
+                audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0]
+            )
         if "audio" in items_by_modality:
             mm_inputs["audio"] = items_by_modality["audio"]  # A list of audios
         if "video" in items_by_modality:
-- 
GitLab


From 17eb25e3271f8f22a0d8920a8115158495827cba Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Sat, 6 Dec 2025 23:44:50 -0500
Subject: [PATCH 169/258] [Perf] Enable cuda graph for deepepHT, 5.3%
 throughput improvement, 4.4% TTFT improvement (#29558)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tests/compile/test_config.py |  66 ++++++++++++++++++++-
 vllm/config/compilation.py   | 111 +++++++++++++++++++++++------------
 vllm/config/vllm.py          |   5 +-
 vllm/platforms/cuda.py       |  38 ------------
 4 files changed, 142 insertions(+), 78 deletions(-)

diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 8dd6959a0..0e91cf525 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -10,7 +10,7 @@ from pydantic import ValidationError
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
-from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
+from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.config.compilation import CompilationMode, PassConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.logger import _print_warning_once
@@ -235,6 +235,70 @@ def test_splitting_ops_dynamic():
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
 
 
+def test_moe_splitting_ops_deepep_ht_piecewise():
+    # Non-inductor, non-attn-fusion case: DeepEP HT with dp>1
+    # should add MoE ops to splitting_ops on top of attention ops.
+    config = VllmConfig(
+        parallel_config=ParallelConfig(
+            all2all_backend="deepep_high_throughput",
+            data_parallel_size=8,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+        ),
+    )
+    splitting_ops = config.compilation_config.splitting_ops
+    assert splitting_ops is not None
+    assert "vllm::moe_forward" in splitting_ops
+    assert "vllm::moe_forward_shared" in splitting_ops
+
+
+def test_moe_splitting_ops_deepep_ht_inductor_partition():
+    # Inductor partition case: user-provided splitting_ops should be
+    # preserved and MoE ops should be appended for DeepEP HT with dp>1.
+    config = VllmConfig(
+        parallel_config=ParallelConfig(
+            all2all_backend="deepep_high_throughput",
+            data_parallel_size=8,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor_graph_partition=True,
+            splitting_ops=[
+                "vllm::unified_attention",
+                "vllm::moe_forward",
+                "vllm::moe_forward_shared",
+            ],
+        ),
+    )
+    splitting_ops = config.compilation_config.splitting_ops
+    assert splitting_ops == [
+        "vllm::unified_attention",
+        "vllm::moe_forward",
+        "vllm::moe_forward_shared",
+    ]
+
+
+def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
+    # Pure attn-fusion case without inductor partition: even with
+    # DeepEP HT and dp>1, we should not re-enable piecewise compilation
+    # or add MoE ops into splitting_ops.
+    config = VllmConfig(
+        parallel_config=ParallelConfig(
+            all2all_backend="deepep_high_throughput",
+            data_parallel_size=8,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            pass_config={"enable_attn_fusion": True, "enable_noop": True},
+            custom_ops=["+quant_fp8"],
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        ),
+    )
+    assert config.compilation_config.splitting_ops == []
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
+
+
 def test_should_split():
     import torch
 
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 5f9e2cfdd..b79200f0e 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -966,7 +966,9 @@ class CompilationConfig:
         # May get recomputed in the model runner if adjustment is needed for spec-decode
         self.compute_bs_to_padded_graph_size()
 
-    def set_splitting_ops_for_v1(self):
+    def set_splitting_ops_for_v1(
+        self, all2all_backend: str | None = None, data_parallel_size: int | None = None
+    ):
         # To compatible with OOT hardware plugin platform (for example vllm-ascend)
         # which currently only supports sequence parallelism in eager mode.
         if self.mode != CompilationMode.VLLM_COMPILE:
@@ -981,50 +983,83 @@ class CompilationConfig:
             "mode is CompilationMode.VLLM_COMPILE"
         )
 
-        if self.use_inductor_graph_partition:
-            self.set_splitting_ops_for_inductor_graph_partition()
-            return
+        added_default_splitting_ops = False
 
-        if self.pass_config.fuse_attn_quant:
-            # here use_inductor_graph_partition is False
+        if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition:
             self.set_splitting_ops_for_attn_fusion()
-            return
-
-        if self.splitting_ops is None:
-            # NOTE: When using full cudagraph, instead of setting an empty
-            # list and capture the full cudagraph inside the flattened fx
-            # graph, we keep the piecewise fx graph structure but capture
-            # the full cudagraph outside the fx graph. This reduces some
-            # cpu overhead when the runtime batch_size is not cudagraph
-            # captured. see https://github.com/vllm-project/vllm/pull/20059
-            # for details. Make a copy to avoid mutating the class-level
-            # list via reference.
-            self.splitting_ops = list(self._attention_ops)
-        elif len(self.splitting_ops) == 0:
-            logger.warning_once("Using piecewise compilation with empty splitting_ops")
-            if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
+        else:
+            if self.splitting_ops is None:
+                # NOTE: When using full cudagraph, instead of setting an empty
+                # list and capture the full cudagraph inside the flattened fx
+                # graph, we keep the piecewise fx graph structure but capture
+                # the full cudagraph outside the fx graph. This reduces some
+                # cpu overhead when the runtime batch_size is not cudagraph
+                # captured. see https://github.com/vllm-project/vllm/pull/20059
+                # for details. Make a copy to avoid mutating the class-level
+                # list via reference.
+                self.splitting_ops = list(self._attention_ops)
+                added_default_splitting_ops = True
+            elif len(self.splitting_ops) == 0:
                 logger.warning_once(
-                    "Piecewise compilation with empty splitting_ops do not"
-                    "contains piecewise cudagraph. Setting cudagraph_"
-                    "mode to NONE. Hint: If you are using attention backends "
-                    "that support cudagraph, consider manually setting "
-                    "cudagraph_mode to FULL or FULL_DECODE_ONLY to enable "
-                    "full cudagraphs."
+                    "Using piecewise compilation with empty splitting_ops"
                 )
+                if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
+                    logger.warning_once(
+                        "Piecewise compilation with empty splitting_ops do not"
+                        "contains piecewise cudagraph. Setting cudagraph_"
+                        "mode to NONE. Hint: If you are using attention "
+                        "backends that support cudagraph, consider manually "
+                        "setting cudagraph_mode to FULL or FULL_DECODE_ONLY "
+                        "to enable full cudagraphs."
+                    )
+                    self.cudagraph_mode = CUDAGraphMode.NONE
+                elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+                    logger.warning_once(
+                        "Piecewise compilation with empty splitting_ops do "
+                        "not contains piecewise cudagraph. Setting "
+                        "cudagraph_mode to FULL."
+                    )
+                    self.cudagraph_mode = CUDAGraphMode.FULL
+                self.splitting_ops = []
+
+        # split MoE ops for cudagraph
+        moe_ops = [
+            "vllm::moe_forward",
+            "vllm::moe_forward_shared",
+        ]
+        backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
+        dp_size = data_parallel_size if data_parallel_size is not None else 1
+        need_moe_splitting = (
+            backend == "deepep_high_throughput"
+            and dp_size > 1
+            # pure attn-fusion without inductor partition deliberately disables
+            # piecewise graphs and MoE splitting.
+            and not (
+                self.pass_config.fuse_attn_quant
+                and not self.use_inductor_graph_partition
+            )
+        )
+
+        if need_moe_splitting and self.cudagraph_mode != CUDAGraphMode.NONE:
+            # if we just initialized default splitting_ops for this config,
+            # automatically append the MoE ops
+            if added_default_splitting_ops:
+                for op in moe_ops:
+                    if op not in self.splitting_ops:
+                        self.splitting_ops.append(op)
+
+            # make sure MoE ops are split out
+            if not any(op in self.splitting_ops for op in moe_ops):
                 self.cudagraph_mode = CUDAGraphMode.NONE
-            elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
                 logger.warning_once(
-                    "Piecewise compilation with empty splitting_ops do not "
-                    "contains piecewise cudagraph. Setting cudagraph_mode "
-                    "to FULL."
+                    "DeepEP high throughput backend with data_parallel_size > 1 "
+                    "requires splitting MoE ops from cudagraphs. Please ensure "
+                    "'vllm::moe_forward' or 'vllm::moe_forward_shared' are "
+                    "present in CompilationConfig.splitting_ops."
                 )
-                self.cudagraph_mode = CUDAGraphMode.FULL
-            self.splitting_ops = []
-
-    def set_splitting_ops_for_inductor_graph_partition(self):
-        assert self.use_inductor_graph_partition
-        if self.splitting_ops is None:
-            self.splitting_ops = list(self._attention_ops)
+            elif self.cudagraph_mode.has_full_cudagraphs():
+                # fall back to piecewise when MoE splitting is required.
+                self.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
     def set_splitting_ops_for_attn_fusion(self):
         assert self.pass_config.fuse_attn_quant
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index b99be1e5d..36e4bd159 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -813,7 +813,10 @@ class VllmConfig:
         ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
 
         # Do this after all the updates to compilation_config.mode
-        self.compilation_config.set_splitting_ops_for_v1()
+        self.compilation_config.set_splitting_ops_for_v1(
+            all2all_backend=self.parallel_config.all2all_backend,
+            data_parallel_size=self.parallel_config.data_parallel_size,
+        )
 
         if self.compilation_config.pass_config.enable_sp:
             # With pipeline parallelism or dynamo partitioning,
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 7e6ce6aee..37c95f486 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -232,44 +232,6 @@ class CudaPlatformBase(Platform):
                 logger.info(
                     "Forcing kv cache block size to 64 for FlashMLASparse backend."
                 )
-        # lazy import to avoid circular import
-        from vllm.config import CUDAGraphMode
-
-        compilation_config = vllm_config.compilation_config
-        if compilation_config.cudagraph_mode.has_full_cudagraphs():
-            # decode context parallel does not support full cudagraphs
-            if parallel_config.decode_context_parallel_size > 1:
-                logger.warning_once(
-                    "Decode context parallel (DCP) is enabled, which is "
-                    "incompatible with full CUDA graphs. "
-                    "Overriding cudagraph_mode to PIECEWISE."
-                )
-                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-            # prefill context parallel do not support full cudagraphs
-            elif parallel_config.prefill_context_parallel_size > 1:
-                logger.warning_once(
-                    "Prefill context parallel (PCP) is enabled, which is "
-                    "incompatible with full CUDA graphs. "
-                    "Overriding cudagraph_mode to PIECEWISE."
-                )
-                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-        if (
-            parallel_config.all2all_backend == "deepep_high_throughput"
-            and parallel_config.data_parallel_size > 1
-            and compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-        ):
-            # TODO: Piecewise Cuda graph might be enabled
-            # if torch compile cache key issue fixed
-            # See https://github.com/vllm-project/vllm/pull/25093
-            logger.info(
-                "WideEP: Disabling CUDA Graphs since DeepEP high-throughput "
-                "kernels are optimized for prefill and are incompatible with "
-                "CUDA Graphs. "
-                "In order to use CUDA Graphs for decode-optimized workloads, "
-                "use --all2all-backend with another option, such as "
-                "deepep_low_latency, pplx, or allgather_reducescatter."
-            )
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
     @classmethod
     def get_current_memory_usage(
-- 
GitLab


From a49d813fa88338a4409928a6e6d2ab3d0019f83b Mon Sep 17 00:00:00 2001
From: Luke <yq0536@gmail.com>
Date: Sat, 6 Dec 2025 23:13:14 -0800
Subject: [PATCH 170/258] Lazy loading to avoid importing all files (#29716)

Signed-off-by: Luke <yq0536@gmail.com>
---
 vllm/transformers_utils/configs/__init__.py | 90 ++++++++++++---------
 1 file changed, 52 insertions(+), 38 deletions(-)

diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 0e8d16788..e536ca852 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -10,46 +10,47 @@ Model configs may be defined in this directory for the following reasons:
   deepseek-ai/DeepSeek-V3.2-Exp.
 """
 
-from transformers import DeepseekV3Config
+from __future__ import annotations
 
-from vllm.transformers_utils.configs.afmoe import AfmoeConfig
-from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
-from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
-from vllm.transformers_utils.configs.eagle import EAGLEConfig
+import importlib
 
-# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
-# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
-# `FalconConfig` class from the official HuggingFace transformers library.
-from vllm.transformers_utils.configs.falcon import RWConfig
-from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
-from vllm.transformers_utils.configs.hunyuan_vl import (
-    HunYuanVLConfig,
-    HunYuanVLTextConfig,
-    HunYuanVLVisionConfig,
-)
-from vllm.transformers_utils.configs.jais import JAISConfig
-from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
-from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
-from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig
-from vllm.transformers_utils.configs.medusa import MedusaConfig
-from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
-from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
-from vllm.transformers_utils.configs.moonvit import MoonViTConfig
-from vllm.transformers_utils.configs.nemotron import NemotronConfig
-from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
-from vllm.transformers_utils.configs.olmo3 import Olmo3Config
-from vllm.transformers_utils.configs.ovis import OvisConfig
-from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
-from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
-from vllm.transformers_utils.configs.step3_vl import (
-    Step3TextConfig,
-    Step3VisionEncoderConfig,
-    Step3VLConfig,
-)
-from vllm.transformers_utils.configs.tarsier2 import Tarsier2Config
-from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+_CLASS_TO_MODULE: dict[str, str] = {
+    "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
+    "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
+    "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
+    "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
+    "EAGLEConfig": "vllm.transformers_utils.configs.eagle",
+    "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
+    "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+    # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+    # `FalconConfig` class from the official HuggingFace transformers library.
+    "RWConfig": "vllm.transformers_utils.configs.falcon",
+    "JAISConfig": "vllm.transformers_utils.configs.jais",
+    "Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe",
+    "MedusaConfig": "vllm.transformers_utils.configs.medusa",
+    "MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm",
+    "MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator",
+    "MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
+    "KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
+    "KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
+    "NemotronConfig": "vllm.transformers_utils.configs.nemotron",
+    "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
+    "Olmo3Config": "vllm.transformers_utils.configs.olmo3",
+    "OvisConfig": "vllm.transformers_utils.configs.ovis",
+    "RadioConfig": "vllm.transformers_utils.configs.radio",
+    "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
+    "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
+    "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
+    "Tarsier2Config": "vllm.transformers_utils.configs.tarsier2",
+    # Special case: DeepseekV3Config is from HuggingFace Transformers
+    "DeepseekV3Config": "transformers",
+}
 
 __all__ = [
     "AfmoeConfig",
@@ -84,3 +85,16 @@ __all__ = [
     "Qwen3NextConfig",
     "Tarsier2Config",
 ]
+
+
+def __getattr__(name: str):
+    if name in _CLASS_TO_MODULE:
+        module_name = _CLASS_TO_MODULE[name]
+        module = importlib.import_module(module_name)
+        return getattr(module, name)
+
+    raise AttributeError(f"module 'configs' has no attribute '{name}'")
+
+
+def __dir__():
+    return sorted(list(__all__))
-- 
GitLab


From 27f4c2fd46b99778d7ea19dfe7751fbaab615177 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 7 Dec 2025 15:15:42 +0800
Subject: [PATCH 171/258] [Renderer] Separate out `RendererConfig` from
 `ModelConfig` (#30145)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/contributing/model/transcription.md      |  12 +-
 .../distributed/test_sequence_parallelism.py  |   2 +
 tests/compile/test_functionalization.py       |   6 +-
 tests/compile/test_fusion.py                  |   6 +-
 tests/compile/test_fusion_attn.py             |   2 +
 tests/compile/test_pass_manager.py            |   8 +-
 tests/compile/test_qk_norm_rope_fusion.py     |   5 +-
 tests/distributed/test_kvlayout.py            |   3 +
 .../entrypoints/openai/test_chat_template.py  |  22 +-
 .../entrypoints/openai/test_lora_resolvers.py |  21 +-
 tests/entrypoints/openai/test_serving_chat.py |  28 ++-
 .../entrypoints/openai/test_serving_engine.py |   8 +-
 .../entrypoints/openai/test_serving_models.py |   8 +-
 tests/entrypoints/test_chat_utils.py          | 194 +++++++-----------
 tests/lora/test_lora_manager.py               |  14 +-
 tests/lora/test_worker.py                     |   2 +
 .../test_model_load_with_params.py            |  22 +-
 tests/models/language/pooling/test_gritlm.py  |   5 +-
 .../multimodal/processing/test_common.py      |  22 +-
 .../multimodal/processing/test_glm4_1v.py     |   4 +-
 .../multimodal/processing/test_h2ovl.py       |   2 +-
 .../multimodal/processing/test_idefics3.py    |   2 +-
 .../multimodal/processing/test_internvl.py    |   2 +-
 .../multimodal/processing/test_llama4.py      |   2 +-
 .../multimodal/processing/test_llava_next.py  |   6 +-
 .../processing/test_llava_onevision.py        |   6 +-
 .../processing/test_minimax_vl_01.py          |   4 +-
 .../multimodal/processing/test_mllama4.py     |   2 +-
 .../multimodal/processing/test_nemotron_vl.py |   2 +-
 .../multimodal/processing/test_phi3v.py       |   2 +-
 .../multimodal/processing/test_phi4mm.py      |   2 +-
 .../multimodal/processing/test_qwen2_vl.py    |   2 +-
 .../multimodal/processing/test_smolvlm.py     |   2 +-
 .../processing/test_tensor_schema.py          |  24 +--
 .../processing/test_transformers.py           |   5 +-
 tests/models/multimodal/test_mapping.py       |  33 +--
 tests/models/registry.py                      |  33 ++-
 tests/models/utils.py                         |  17 +-
 tests/multimodal/test_cache.py                |  27 ++-
 tests/multimodal/test_processing.py           |  24 ++-
 tests/multimodal/test_registry.py             |   4 +-
 tests/test_config.py                          | 131 +++++++-----
 tests/test_inputs.py                          |   7 +-
 tests/v1/attention/utils.py                   |   2 +
 tests/v1/core/test_kv_cache_utils.py          |  20 +-
 tests/v1/core/test_scheduler.py               |   2 +
 tests/v1/core/utils.py                        |   2 +
 tests/v1/engine/test_engine_core.py           |   2 +
 .../engine/test_process_multi_modal_uuids.py  |  24 ++-
 tests/v1/kv_connector/unit/utils.py           |   2 +
 tests/v1/spec_decode/test_eagle.py            |   2 +
 tests/v1/spec_decode/test_mtp.py              |   2 +
 tests/v1/spec_decode/test_ngram.py            |   2 +
 .../test_backend_guidance.py                  |  12 +-
 .../test_reasoning_structured_output.py       |  35 ++--
 tests/v1/tpu/worker/test_tpu_model_runner.py  |   2 +
 tests/v1/worker/test_gpu_model_runner.py      |   3 +
 vllm/config/__init__.py                       |   3 +
 vllm/config/model.py                          | 141 +++----------
 vllm/config/multimodal.py                     |   4 -
 vllm/config/renderer.py                       | 109 ++++++++++
 vllm/config/speculative.py                    |   5 -
 vllm/config/vllm.py                           |  25 ++-
 vllm/engine/arg_utils.py                      |  99 +++++----
 vllm/engine/protocol.py                       |   3 +-
 vllm/entrypoints/chat_utils.py                |  79 ++++---
 vllm/entrypoints/llm.py                       |  14 +-
 vllm/entrypoints/openai/api_server.py         |   2 +-
 vllm/entrypoints/openai/serving_completion.py |   2 +-
 vllm/entrypoints/openai/serving_engine.py     |  11 +-
 vllm/entrypoints/openai/serving_models.py     |   1 +
 vllm/entrypoints/openai/speech_to_text.py     |  10 +-
 vllm/entrypoints/pooling/pooling/serving.py   |   2 +-
 vllm/entrypoints/pooling/score/serving.py     |   4 +-
 vllm/entrypoints/score_utils.py               |  13 +-
 vllm/entrypoints/utils.py                     |   8 +-
 vllm/inputs/preprocess.py                     |   9 +-
 vllm/model_executor/models/adapters.py        |  20 +-
 vllm/model_executor/models/deepseek_ocr.py    |   4 +-
 vllm/model_executor/models/deepseek_vl2.py    |   4 +-
 vllm/model_executor/models/gemma3n_mm.py      |   8 +-
 vllm/model_executor/models/granite_speech.py  |  14 +-
 vllm/model_executor/models/gritlm.py          |  14 +-
 vllm/model_executor/models/interfaces.py      |  10 +-
 vllm/model_executor/models/interns1.py        |   2 +-
 .../model_executor/models/nano_nemotron_vl.py |  13 +-
 vllm/model_executor/models/nemotron_vl.py     |   2 +-
 vllm/model_executor/models/pixtral.py         |   2 +-
 vllm/model_executor/models/voxtral.py         |  22 +-
 vllm/model_executor/models/whisper.py         |  14 +-
 vllm/multimodal/cache.py                      |  22 +-
 vllm/multimodal/processing.py                 |  28 ++-
 vllm/multimodal/registry.py                   |  64 +++---
 vllm/tokenizers/registry.py                   |  24 +--
 vllm/transformers_utils/processor.py          |  28 ++-
 vllm/v1/core/encoder_cache_manager.py         |   8 +-
 vllm/v1/core/sched/scheduler.py               |   2 +-
 vllm/v1/engine/async_llm.py                   |   7 +-
 vllm/v1/engine/input_processor.py             |   7 +-
 vllm/v1/engine/llm_engine.py                  |   7 +-
 vllm/v1/spec_decode/eagle.py                  |   2 +-
 vllm/v1/structured_output/__init__.py         |  18 +-
 vllm/v1/worker/gpu_model_runner.py            |   7 +-
 vllm/v1/worker/tpu_model_runner.py            |   7 +-
 vllm/v1/worker/utils.py                       |  19 +-
 105 files changed, 971 insertions(+), 799 deletions(-)
 create mode 100644 vllm/config/renderer.py

diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index fca941acd..c56057890 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -22,7 +22,7 @@ Declare supported languages and capabilities:
     import torch
     from torch import nn
 
-    from vllm.config import ModelConfig, SpeechToTextConfig
+    from vllm.config import RendererConfig, SpeechToTextConfig
     from vllm.inputs.data import PromptType
     from vllm.model_executor.models.interfaces import SupportsTranscription
     
@@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model:
         @classmethod
         def get_speech_to_text_config(
             cls,
-            model_config: ModelConfig,
+            renderer_config: RendererConfig,
             task_type: Literal["transcribe", "translate"],
         ) -> SpeechToTextConfig:
             return SpeechToTextConfig(
@@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
             cls,
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
-            model_config: ModelConfig,
+            renderer_config: RendererConfig,
             language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
@@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
             cls,
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
-            model_config: ModelConfig,
+            renderer_config: RendererConfig,
             language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
@@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
             cls,
             audio_duration_s: float,
             stt_config: SpeechToTextConfig,
-            model_config: ModelConfig,
+            renderer_config: RendererConfig,
         ) -> int | None:
             # Return None if unknown; otherwise return an estimate.
             return int(audio_duration_s * stt_config.sample_rate // 320)  # example
@@ -216,7 +216,7 @@ Relevant server logic:
             prompt = self.model_cls.get_generation_prompt(
                 audio=chunk,
                 stt_config=self.asr_config,
-                model_config=self.model_config,
+                renderer_config=self.renderer_config,
                 language=language,
                 task_type=self.task_type,
                 request_prompt=request.prompt,
diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py
index d9fdc3acc..77d3a24d4 100644
--- a/tests/compile/distributed/test_sequence_parallelism.py
+++ b/tests/compile/distributed/test_sequence_parallelism.py
@@ -17,6 +17,7 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     PassConfig,
+    RendererConfig,
     VllmConfig,
     get_current_vllm_config,
     set_current_vllm_config,
@@ -276,6 +277,7 @@ def sequence_parallelism_pass_on_test_model(
 
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         device_config=device_config,
         compilation_config=compilation_config,
     )
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index 758591589..52d6fd1e5 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -15,6 +15,7 @@ from vllm.config import (
     CompilationConfig,
     ModelConfig,
     PassConfig,
+    RendererConfig,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -219,8 +220,11 @@ def test_fix_functionalization(
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
 
+    model_config = ModelConfig(dtype=dtype)
+
     vllm_config = VllmConfig(
-        model_config=ModelConfig(dtype=dtype),
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         compilation_config=CompilationConfig(
             custom_ops=["all"],
             pass_config=PassConfig(
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index d0ba8385f..bb4ee6b8e 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -15,6 +15,7 @@ from vllm.config import (
     CompilationMode,
     ModelConfig,
     PassConfig,
+    RendererConfig,
     VllmConfig,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -154,8 +155,11 @@ def test_fusion_rmsnorm_quant(
         custom_ops.append("+rms_norm")
     if enable_quant_fp8_custom_op:
         custom_ops.append("+quant_fp8")
+
+    model_config = ModelConfig(dtype=dtype)
     vllm_config = VllmConfig(
-        model_config=ModelConfig(dtype=dtype),
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops,
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index db95dff5e..f87825db2 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -24,6 +24,7 @@ from vllm.config import (
     CompilationMode,
     ModelConfig,
     PassConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     set_current_vllm_config,
@@ -325,6 +326,7 @@ def test_attention_quant_pattern(
     )
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         scheduler_config=SchedulerConfig(
             max_num_seqs=1024,
             max_model_len=model_config.max_model_len,
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index 6d0ba6b65..c95e9e3ff 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import ModelConfig, RendererConfig, VllmConfig
 
 
 # dummy custom pass that doesn't inherit
@@ -43,7 +43,11 @@ class ProperPass(InductorPass):
 )
 def test_pass_manager_uuid(callable):
     # Some passes need dtype to be set
-    config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
+    model_config = ModelConfig(dtype=torch.bfloat16)
+    config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+    )
 
     pass_manager = PostGradPassManager()
     pass_manager.configure(config)
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index e0968ac79..4d109015b 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -19,6 +19,7 @@ from vllm.config import (
     CompilationMode,
     ModelConfig,
     PassConfig,
+    RendererConfig,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -133,8 +134,10 @@ def test_qk_norm_rope_fusion(
     if enable_rope_custom_op:
         custom_ops.append("+rotary_embedding")
 
+    model_config = ModelConfig(dtype=dtype)
     vllm_config = VllmConfig(
-        model_config=ModelConfig(dtype=dtype),
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops,
diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py
index b190b2820..0d51a51a5 100644
--- a/tests/distributed/test_kvlayout.py
+++ b/tests/distributed/test_kvlayout.py
@@ -5,6 +5,7 @@ from vllm.config import (
     DeviceConfig,
     KVTransferConfig,
     ModelConfig,
+    RendererConfig,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -47,6 +48,7 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
     vllm_config = VllmConfig(
         device_config=DeviceConfig("cpu"),
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         kv_transfer_config=kv_transfer_config,
     )
     with set_current_vllm_config(vllm_config):
@@ -70,6 +72,7 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
     vllm_config = VllmConfig(
         device_config=DeviceConfig("cpu"),
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         kv_transfer_config=kv_transfer_config,
     )
     with set_current_vllm_config(vllm_config):
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index 77087ac21..b050cfdb5 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -3,7 +3,6 @@
 
 import pytest
 
-from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.tokenizers import get_tokenizer
@@ -107,24 +106,11 @@ def test_get_gen_prompt(
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    model_config = ModelConfig(
-        model,
-        tokenizer=model_info.tokenizer or model,
-        tokenizer_mode=model_info.tokenizer_mode,
-        trust_remote_code=model_info.trust_remote_code,
-        revision=model_info.revision,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
-    )
+    renderer_config = model_info.build_renderer_config(model)
 
-    # Initialize the tokenizer
     tokenizer = get_tokenizer(
-        tokenizer_name=model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
     template_content = load_chat_template(chat_template=template)
 
@@ -143,7 +129,7 @@ def test_get_gen_prompt(
         tokenizer=tokenizer,
         conversation=mock_request.messages,
         chat_template=mock_request.chat_template or template_content,
-        model_config=model_config,
+        renderer_config=renderer_config,
         tools=None,
         add_generation_prompt=mock_request.add_generation_prompt,
         continue_final_message=mock_request.continue_final_message,
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index ea6b3d812..7310c2610 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -33,26 +33,34 @@ class MockModelConfig:
     """Minimal mock ModelConfig for testing."""
 
     model: str = MODEL_NAME
-    tokenizer: str = MODEL_NAME
     trust_remote_code: bool = False
-    tokenizer_mode: str = "auto"
     max_model_len: int = 100
-    tokenizer_revision: str | None = None
     multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
     hf_config: MockHFConfig = field(default_factory=MockHFConfig)
     logits_processors: list[str] | None = None
     logits_processor_pattern: str | None = None
     diff_sampling_param: dict | None = None
-    allowed_local_media_path: str = ""
-    allowed_media_domains: list[str] | None = None
     encoder_config = None
     generation_config: str = "auto"
-    skip_tokenizer_init: bool = False
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
 
 
+@dataclass
+class MockRendererConfig:
+    """Minimal mock RendererConfig for testing."""
+
+    model_config: MockModelConfig
+
+    tokenizer: str = MODEL_NAME
+    tokenizer_mode: str = "auto"
+    tokenizer_revision: str | None = None
+    skip_tokenizer_init: bool = False
+    allowed_local_media_path: str = ""
+    allowed_media_domains: list[str] | None = None
+
+
 class MockLoRAResolver(LoRAResolver):
     async def resolve_lora(
         self, base_model_name: str, lora_name: str
@@ -114,6 +122,7 @@ def mock_serving_setup():
     mock_engine.add_lora.reset_mock()
 
     mock_engine.model_config = MockModelConfig()
+    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 9ea65f9fa..9df8f886e 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -346,27 +346,33 @@ class MockHFConfig:
 class MockModelConfig:
     task = "generate"
     runner_type = "generate"
-    tokenizer = MODEL_NAME
     trust_remote_code = False
-    tokenizer_mode = "auto"
     max_model_len = 100
-    tokenizer_revision = None
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
     logits_processors: list[str] | None = None
     logits_processor_pattern = None
     diff_sampling_param: dict | None = None
-    allowed_local_media_path: str = ""
-    allowed_media_domains: list[str] | None = None
     encoder_config = None
     generation_config: str = "auto"
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    skip_tokenizer_init = False
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
 
 
+@dataclass
+class MockRendererConfig:
+    model_config: MockModelConfig = field(default_factory=MockModelConfig)
+
+    tokenizer = MODEL_NAME
+    tokenizer_mode = "auto"
+    tokenizer_revision = None
+    skip_tokenizer_init = False
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    allowed_local_media_path: str = ""
+    allowed_media_domains: list[str] | None = None
+
+
 def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
     models = OpenAIServingModels(
         engine_client=engine,
@@ -399,6 +405,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
 @dataclass
 class MockEngine:
     model_config: MockModelConfig = field(default_factory=MockModelConfig)
+    renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig)
     input_processor: MagicMock = field(default_factory=MagicMock)
     io_processor: MagicMock = field(default_factory=MagicMock)
 
@@ -429,6 +436,7 @@ async def test_serving_chat_returns_correct_model_name():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = MockModelConfig()
+    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -459,6 +467,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = MockModelConfig()
+    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -492,6 +501,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
+    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -537,6 +547,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
+    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -583,6 +594,7 @@ async def test_serving_chat_could_load_correct_generation_config():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
+    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -629,6 +641,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
+    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -662,6 +675,7 @@ async def test_serving_chat_data_parallel_rank_extraction():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = MockModelConfig()
+    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py
index 956a06dc5..6ab0942b5 100644
--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -7,7 +7,7 @@ from unittest.mock import Mock
 
 import pytest
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.tokenizers import MistralTokenizer
@@ -19,10 +19,16 @@ def serving() -> OpenAIServing:
 
     # Create minimal mocks
     engine_client = Mock()
+
     model_config = Mock(spec=ModelConfig)
     model_config.max_model_len = 32768
+
+    renderer_config = Mock(spec=RendererConfig)
+    renderer_config.model_config = model_config
+
     models = Mock(spec=OpenAIServingModels)
     models.model_config = model_config
+    models.renderer_config = renderer_config
     models.input_processor = Mock()
     models.io_processor = Mock()
 
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index b585835a0..376df6cfe 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -6,7 +6,7 @@ from unittest.mock import MagicMock
 
 import pytest
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (
     ErrorResponse,
@@ -27,9 +27,15 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
 async def _async_serving_models_init() -> OpenAIServingModels:
     mock_engine_client = MagicMock(spec=EngineClient)
     # Set the max_model_len attribute to avoid missing attribute
+
     mock_model_config = MagicMock(spec=ModelConfig)
     mock_model_config.max_model_len = 2048
+
+    mock_renderer_config = MagicMock(spec=RendererConfig)
+    mock_renderer_config.model_config = mock_model_config
+
     mock_engine_client.model_config = mock_model_config
+    mock_engine_client.renderer_config = mock_renderer_config
     mock_engine_client.input_processor = MagicMock()
     mock_engine_client.io_processor = MagicMock()
 
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 527322c71..7b296eae7 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.entrypoints.chat_utils import (
     _try_extract_ast,
     apply_mistral_chat_template,
@@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system(
                 "content": [{"type": "text", "text": "Who are you?"}],
             },
         ],
-        mistral_model_config,
+        RendererConfig(model_config=mistral_model_config),
         content_format="string",
     )
     assert conversation == [
@@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system(
                 "content": [{"type": "text", "text": "Who are you?"}],
             },
         ],
-        mistral_model_config,
+        RendererConfig(model_config=mistral_model_config),
         content_format="openai",
     )
     assert conversation == [
@@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
                 ],
             }
         ],
-        phi3v_model_config_image_embeds,
+        RendererConfig(model_config=phi3v_model_config_image_embeds),
         content_format="string",
     )
 
@@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
                 ],
             }
         ],
-        audio_embeds_model_config,
+        RendererConfig(model_config=audio_embeds_model_config),
         content_format="string",
     )
 
@@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
                 ],
             }
         ],
-        audio_embeds_model_config,
+        RendererConfig(model_config=audio_embeds_model_config),
         content_format="string",
     )
 
@@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async(
                 ],
             }
         ],
-        audio_embeds_model_config,
+        RendererConfig(model_config=audio_embeds_model_config),
         content_format="string",
     )
 
@@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
                 ],
             }
         ],
-        phi3v_model_config_image_embeds,
+        RendererConfig(model_config=phi3v_model_config_image_embeds),
         content_format="string",
     )
 
@@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
     assert conversation == [
@@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
                 ],
             },
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
                 ],
             },
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format(
             {"role": "assistant", "content": "Some stuff."},
             {"role": "user", "content": "What about this one?"},
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="openai",
     )
 
@@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
                         ],
                     }
                 ],
-                phi3v_model_config,
+                RendererConfig(model_config=phi3v_model_config),
                 content_format="string",
             )
 
@@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
                         ],
                     },
                 ],
-                phi3v_model_config,
+                RendererConfig(model_config=phi3v_model_config),
                 content_format="string",
             )
 
@@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
                 ],
             }
         ],
-        phi3v_model_config,
+        RendererConfig(model_config=phi3v_model_config),
         content_format="string",
     )
 
@@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave(
                 ],
             }
         ],
-        phi3v_model_config_mm_interleaved,
+        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
                 ],
             }
         ],
-        phi3v_model_config_mm_interleaved,
+        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
                 ],
             }
         ],
-        phi3v_model_config_mm_interleaved,
+        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
                 ],
             },
         ],
-        phi3v_model_config_mm_interleaved,
+        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
                 ],
             },
         ],
-        phi3v_model_config_mm_interleaved,
+        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
                 ],
             },
         ],
-        qwen25omni_model_config_mm_interleaved,
+        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
                 ],
             },
         ],
-        qwen25omni_model_config_mm_interleaved,
+        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
                 ],
             },
         ],
-        qwen25omni_model_config_mm_interleaved,
+        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
                 ],
             },
         ],
-        qwen25omni_model_config_mm_interleaved,
+        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
         content_format="string",
     )
 
@@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
                     ],
                 }
             ],
-            phi3v_model_config_mm_interleaved,
+            RendererConfig(model_config=phi3v_model_config_mm_interleaved),
             content_format="string",
         )
 
@@ -1945,24 +1945,11 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    model_config = ModelConfig(
-        model,
-        tokenizer=model_info.tokenizer or model,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
-    )
-
-    # Build the tokenizer
+    renderer_config = model_info.build_renderer_config(model)
+
     tokenizer = get_tokenizer(
-        model,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     tools = (
@@ -1985,7 +1972,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
         tokenizer,
         chat_template=None,
         tools=tools,
-        model_config=model_config,
+        model_config=renderer_config.model_config,
     )
     assert isinstance(chat_template, str)
 
@@ -2047,24 +2034,11 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
         "enable_thinking": True,
     }
 
-    model_config = ModelConfig(
-        model,
-        tokenizer=model_info.tokenizer or model,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
-    )
-
-    # Build the tokenizer
+    renderer_config = model_info.build_renderer_config(model)
+
     tokenizer = get_tokenizer(
-        model,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     # Test detecting the tokenizer's chat_template
@@ -2072,7 +2046,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
         tokenizer,
         chat_template=None,
         tools=tools,
-        model_config=model_config,
+        model_config=renderer_config.model_config,
     )
     with pytest.raises(
         ValueError, match="Found unexpected chat template kwargs from request"
@@ -2143,23 +2117,11 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    model_config = ModelConfig(
-        model,
-        tokenizer=model_info.tokenizer or model,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
-    )
+    renderer_config = model_info.build_renderer_config(model)
 
     tokenizer = get_tokenizer(
-        model,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     # Test detecting the tokenizer's chat_template
@@ -2167,7 +2129,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         tokenizer,
         chat_template=None,
         tools=None,
-        model_config=model_config,
+        model_config=renderer_config.model_config,
     )
     assert isinstance(chat_template, str)
 
@@ -2181,7 +2143,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         None,
         "auto",
         tokenizer,
-        model_config=model_config,
+        renderer_config=renderer_config,
     )
 
     assert resolved_format == expected_format
@@ -2203,23 +2165,11 @@ def test_resolve_content_format_fallbacks(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    model_config = ModelConfig(
-        model,
-        tokenizer=model_info.tokenizer or model,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
-    )
+    renderer_config = model_info.build_renderer_config(model)
 
     tokenizer = get_tokenizer(
-        model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     # Test detecting the tokenizer's chat_template
@@ -2227,7 +2177,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
         tokenizer,
         chat_template=None,
         tools=None,
-        model_config=model_config,
+        model_config=renderer_config.model_config,
     )
     assert isinstance(chat_template, str)
 
@@ -2241,7 +2191,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
         None,
         "auto",
         tokenizer,
-        model_config=model_config,
+        renderer_config=renderer_config,
     )
 
     assert resolved_format == expected_format
@@ -2272,15 +2222,13 @@ def test_resolve_content_format_fallbacks(model, expected_format):
     ],
 )
 def test_resolve_content_format_examples(template_path, expected_format):
-    model_config = ModelConfig(
-        PHI3V_MODEL_ID,  # Dummy
-        tokenizer=PHI3V_MODEL_ID,  # Dummy
-        trust_remote_code=True,
-    )
+    model = PHI3V_MODEL_ID  # Dummy
+    model_config = ModelConfig(model, trust_remote_code=True)
+    renderer_config = RendererConfig(model_config=model_config, tokenizer=model)
 
     dummy_tokenizer = get_tokenizer(
-        PHI3V_MODEL_ID,  # Dummy
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
     dummy_tokenizer.chat_template = None
 
@@ -2297,7 +2245,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
         None,
         "auto",
         dummy_tokenizer,
-        model_config=model_config,
+        renderer_config=renderer_config,
     )
 
     assert resolved_format == expected_format
@@ -2332,7 +2280,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
 
     conversation_with_thinking, _, _ = parse_chat_messages(
         messages,
-        mistral_model_config,
+        RendererConfig(model_config=mistral_model_config),
         content_format="openai",
     )
 
@@ -2432,7 +2380,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
                 ],
             }
         ],
-        qwen2_audio_model_config,
+        RendererConfig(model_config=qwen2_audio_model_config),
         content_format="string",
     )
 
@@ -2466,7 +2414,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
                 ],
             }
         ],
-        qwen2_audio_model_config,
+        RendererConfig(model_config=qwen2_audio_model_config),
         content_format="string",
     )
 
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 081f14d6f..7158120fc 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -8,7 +8,7 @@ import torch
 from safetensors.torch import load_file
 from torch import nn
 
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import ModelConfig, RendererConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.lora.layers import (
     ColumnParallelLinearWithLoRA,
@@ -422,7 +422,11 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
     )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+        lora_config=lora_config,
+    )
 
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
@@ -525,7 +529,11 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
     )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+        lora_config=lora_config,
+    )
 
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 54059ec56..42d8c6202 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -11,6 +11,7 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     ParallelConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -43,6 +44,7 @@ def test_worker_apply_lora(qwen3_lora_files):
 
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         load_config=LoadConfig(
             download_dir=None,
             load_format="dummy",
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index 489ac1e64..e36867107 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -42,8 +42,10 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
             "Write a short story about a robot that dreams for the first time.\n"
         )
 
-        model_config = vllm_model.llm.llm_engine.model_config
-        model_tokenizer = vllm_model.llm.llm_engine.tokenizer
+        llm_engine = vllm_model.llm.llm_engine
+        model_config = llm_engine.model_config
+        renderer_config = llm_engine.renderer_config
+        tokenizer = llm_engine.tokenizer
 
         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -54,8 +56,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
         assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
-        assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
-        assert model_tokenizer.model_max_length == 512
+        assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5"
+        assert tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, BertEmbeddingModel)
@@ -86,8 +88,10 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
             "Write a short story about a robot that dreams for the first time.\n"
         )
 
-        model_config = vllm_model.llm.llm_engine.model_config
-        model_tokenizer = vllm_model.llm.llm_engine.tokenizer
+        llm_engine = vllm_model.llm.llm_engine
+        model_config = llm_engine.model_config
+        renderer_config = llm_engine.renderer_config
+        tokenizer = llm_engine.tokenizer
 
         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -98,8 +102,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
         assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
-        assert model_config.tokenizer == "intfloat/multilingual-e5-base"
-        assert model_tokenizer.model_max_length == 512
+        assert renderer_config.tokenizer == "intfloat/multilingual-e5-base"
+        assert tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, RobertaEmbeddingModel)
@@ -128,7 +132,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
             "Write a short story about a robot that dreams for the first time.\n"
         )
 
-        assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name
+        assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name
 
         def check_model(model):
             assert isinstance(model, RobertaEmbeddingModel)
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index 0adc9b5cf..11ee00358 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -6,7 +6,7 @@ import pytest
 from scipy.spatial.distance import cosine
 
 from vllm import LLM, SamplingParams
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 
 from ....utils import RemoteOpenAIServer
 
@@ -31,7 +31,8 @@ def test_find_array():
         dtype="bfloat16",
         seed=0,
     )
-    pooling = GritLMMeanPool(model_config=model_config)
+    renderer_config = RendererConfig(model_config=model_config)
+    pooling = GritLMMeanPool(renderer_config=renderer_config)
 
     arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
 
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 2e032ac4c..9b2b29b75 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -25,7 +25,6 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC
 from vllm.tokenizers import (
     MistralTokenizer,
     TokenizerLike,
-    cached_tokenizer_from_config,
 )
 
 from ....multimodal.utils import random_audio, random_image, random_video
@@ -212,31 +211,20 @@ def _test_processing_correctness(
     else:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
         model_id = model_id_or_arch
+
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    model_config = ModelConfig(
-        model_id,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
+    renderer_config = model_info.build_renderer_config(
+        model=model_id,
         # Ensure that the cache can fit all of the data
         mm_processor_cache_gb=2048,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
     )
+    model_config = renderer_config.model_config
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = model_cls._processor_factory
-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
+    ctx = InputProcessingContext.from_config(renderer_config)
     cache = MultiModalProcessorOnlyCache(model_config)
 
     processing_info = factories.info(ctx)
diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index 51071c935..fdc6352e2 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -40,7 +40,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {"fps": fps}
 
@@ -79,7 +79,7 @@ def test_video_loader_consistency(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {"fps": fps}
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index 1701d9dd8..1263d663e 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -162,7 +162,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     min_num = min_dynamic_patch if dynamic_image_size else 1
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index 351b9d018..bf12e79a7 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index b4994295d..51f0d2e89 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -116,7 +116,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     min_num = min_dynamic_patch if dynamic_image_size else 1
diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py
index b73246b68..04bc8d3f5 100644
--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -30,7 +30,7 @@ def test_processor_override(
         limit_mm_per_prompt={"image": num_imgs},
         mm_processor_cache_gb=mm_processor_cache_gb,
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     config = processor.info.get_hf_config()
     tokenizer = processor.info.get_tokenizer()
     hf_processor = processor.info.get_hf_processor()
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index ffe7ca17b..cd01002a3 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     info = processor.info
 
     seen_aspect_ratios = set[float]()
@@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
 
     image_ratios = [
         (171, 152),
@@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index f5c552fe6..be505d95a 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     info = processor.info
 
     seen_aspect_ratios = set[float]()
@@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
 
     image_ratios = [
         (171, 152),
@@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py
index 11e000123..17ac54fdd 100644
--- a/tests/models/multimodal/processing/test_minimax_vl_01.py
+++ b/tests/models/multimodal/processing/test_minimax_vl_01.py
@@ -24,7 +24,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     prompt = "<image>" * num_imgs
     image = Image.new("RGB", size=(364, 364))
     mm_data = {"image": [image] * num_imgs}
@@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
 
     image_ratios = [
         (171, 152),
diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index e5ff2d139..9a65e2ddc 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int):
         limit_mm_per_prompt=mm_counts,
     )
 
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     profiler = MultiModalProfiler(processor)
 
     decoder_dummy_data = profiler.get_decoder_dummy_data(
diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py
index 5311ab1b7..f3609743b 100644
--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@@ -118,7 +118,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     min_num = min_dynamic_patch if dynamic_image_size else 1
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index 8faff2611..f51bd9786 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py
index 5391555c2..271357b0d 100644
--- a/tests/models/multimodal/processing/test_phi4mm.py
+++ b/tests/models/multimodal/processing/test_phi4mm.py
@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index 9f4cdb678..d65a270a7 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -34,7 +34,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py
index 6f77d5516..e0e6264de 100644
--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 5d489549c..24959fa48 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -11,7 +11,7 @@ import pytest
 import torch.nn as nn
 from PIL import Image
 
-from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
+from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config
 from vllm.config.multimodal import (
     AudioDummyOptions,
     BaseDummyOptions,
@@ -31,7 +31,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.platforms import current_platform
-from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_dtype
 
@@ -150,7 +149,10 @@ def initialize_dummy_model(
         backend="nccl",
     )
     initialize_model_parallel(tensor_model_parallel_size=1)
-    vllm_config = VllmConfig(model_config=model_config)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+    )
     with set_current_vllm_config(vllm_config=vllm_config):
         with set_default_torch_dtype(model_config.dtype):
             model = model_cls(vllm_config=vllm_config)
@@ -182,19 +184,12 @@ def test_model_tensor_schema(model_id: str):
     else:
         dtype = model_info.dtype
 
-    model_config = ModelConfig(
+    renderer_config = model_info.build_renderer_config(
         model_id,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
         hf_overrides=hf_overrides_fn,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
         dtype=dtype,
     )
+    model_config = renderer_config.model_config
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     assert supports_multimodal(model_cls)
@@ -212,10 +207,7 @@ def test_model_tensor_schema(model_id: str):
     if not any(inputs_parse_methods):
         pytest.skip(f"{model_arch} does not support tensor schema validation.")
 
-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
+    ctx = InputProcessingContext.from_config(renderer_config)
     processing_info = factories.info(ctx)
     supported_mm_limits = processing_info.get_supported_mm_limits()
     limit_mm_per_prompt = {
diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py
index e2a2186f4..c9a90eb88 100644
--- a/tests/models/multimodal/processing/test_transformers.py
+++ b/tests/models/multimodal/processing/test_transformers.py
@@ -3,7 +3,7 @@
 import pytest
 
 from vllm.assets.image import ImageAsset
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
@@ -13,8 +13,9 @@ def test_multimodal_processor(model_id):
         model=model_id,
         model_impl="transformers",
     )
+    renderer_config = RendererConfig(model_config=model_config)
 
-    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
 
     image_pil = ImageAsset("cherry_blossom").pil_image
     mm_data = {"image": image_pil}
diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py
index 0d2eaca95..73de6b5f7 100644
--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
@@ -7,7 +7,6 @@ import torch
 import transformers
 from transformers import AutoConfig, PreTrainedModel
 
-from vllm.config import ModelConfig
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.transformers_utils.config import try_get_safetensors_metadata
@@ -50,37 +49,11 @@ def test_hf_model_weights_mapper(model_arch: str):
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    is_mistral_model = model_arch in [
-        "Mistral3ForConditionalGeneration",
-        "PixtralForConditionalGeneration",
-        "VoxtralForConditionalGeneration",
-    ]
-
-    if not is_mistral_model or model_info.tokenizer_mode == "mistral":
-        tokenizer_mode = model_info.tokenizer_mode
-    else:
-        tokenizer_mode = "hf"
-
-    model_id = model_info.default
-
-    model_config = ModelConfig(
-        model_id,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=tokenizer_mode,
-        config_format="hf",
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
-    )
+    model_config = model_info.build_model_config(config_format="hf")
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    original_weights = create_repo_dummy_weights(model_id)
-    hf_dummy_model = create_dummy_model(model_id, model_arch)
+    original_weights = create_repo_dummy_weights(model_config.model)
+    hf_dummy_model = create_dummy_model(model_config.model, model_arch)
     hf_converted_weights = hf_dummy_model.named_parameters()
     hf_converted_buffers = hf_dummy_model.named_buffers()
     mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 020cb7493..e2cb5bcbc 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -9,7 +9,8 @@ import pytest
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION
 
-from vllm.config.model import ModelDType, TokenizerMode
+from vllm.config.model import ModelConfig, ModelDType
+from vllm.config.renderer import RendererConfig, TokenizerMode
 
 
 @dataclass(frozen=True)
@@ -170,6 +171,36 @@ class _HfExamplesInfo:
             else:
                 pytest.skip(msg)
 
+    def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig:
+        if model is None:
+            model = self.default
+
+        return ModelConfig(
+            **{
+                "model": model,
+                "revision": self.revision,
+                "trust_remote_code": self.trust_remote_code,
+                "hf_overrides": self.hf_overrides,
+                "enable_prompt_embeds": self.require_embed_inputs,
+                "enable_mm_embeds": self.require_embed_inputs,
+                "enforce_eager": self.enforce_eager,
+                "dtype": self.dtype,
+                **kwargs,
+            }
+        )
+
+    def build_renderer_config(
+        self, model: str | None = None, **kwargs
+    ) -> RendererConfig:
+        model_config = self.build_model_config(model, **kwargs)
+
+        return RendererConfig(
+            model_config=model_config,
+            tokenizer=self.tokenizer or model_config.model,
+            tokenizer_mode=self.tokenizer_mode,
+            skip_tokenizer_init=self.require_embed_inputs,
+        )
+
 
 _TEXT_GENERATION_EXAMPLE_MODELS = {
     # [Decoder-only]
diff --git a/tests/models/utils.py b/tests/models/utils.py
index d84b4b820..87292cc45 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -13,7 +13,6 @@ from transformers import PretrainedConfig
 from vllm.config.model import ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
-from vllm.tokenizers import cached_tokenizer_from_config
 
 from .. import ci_envs
 from .registry import HF_EXAMPLE_MODELS
@@ -296,30 +295,18 @@ def build_model_context(
 
     model_config_kwargs = model_config_kwargs or {}
     limit_mm_per_prompt = limit_mm_per_prompt or {}
-    model_config = ModelConfig(
+    renderer_config = model_info.build_renderer_config(
         model_id,
         runner=runner,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
         dtype=dtype,
         seed=0,
         mm_processor_kwargs=mm_processor_kwargs,
         limit_mm_per_prompt=limit_mm_per_prompt,
         mm_processor_cache_gb=mm_processor_cache_gb,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
         **model_config_kwargs,
     )
 
-    return InputProcessingContext(
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
+    return InputProcessingContext.from_config(renderer_config)
 
 
 def check_embeddings_close(
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index e641b1111..ce16d9013 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -6,7 +6,7 @@ import numpy as np
 import pytest
 import torch
 
-from vllm.config import ModelConfig, ParallelConfig, VllmConfig
+from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import (
     BaseMultiModalProcessorCache,
@@ -110,11 +110,14 @@ def _create_vllm_config(
     mm_processor_cache_gb: float,
     enable_ipc: bool,
 ):
+    model_config = ModelConfig(
+        model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+        mm_processor_cache_gb=mm_processor_cache_gb,
+    )
+
     return VllmConfig(
-        model_config=ModelConfig(
-            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-            mm_processor_cache_gb=mm_processor_cache_gb,
-        ),
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
     )
 
@@ -506,13 +509,15 @@ def _run_test_cache_eviction_shm(
 
 
 def test_cache_eviction_shm_cache():
+    model_config = ModelConfig(
+        model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+        mm_processor_cache_type="shm",
+        mm_shm_cache_max_object_size_mb=6,
+        mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
+    )
     vllm_config = VllmConfig(
-        model_config=ModelConfig(
-            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-            mm_processor_cache_type="shm",
-            mm_shm_cache_max_object_size_mb=6,
-            mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
-        ),
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
     )
     sender_cache = ShmObjectStoreSenderCache(vllm_config)
     receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 262ea42e4..adff57252 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -7,7 +7,7 @@ from contextlib import nullcontext
 import numpy as np
 import pytest
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.processing import (
     InputProcessingContext,
@@ -920,8 +920,9 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
         model=model_id,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
+    renderer_config = RendererConfig(model_config=model_config)
 
-    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
     processor._supported_mm_limits = {"image": num_supported}
 
     profiler = MultiModalProfiler(processor)
@@ -955,8 +956,9 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
         model=model_id,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
+    renderer_config = RendererConfig(model_config=model_config)
 
-    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
 
     rng = np.random.RandomState(0)
     image = random_image(rng, min_wh=128, max_wh=256)
@@ -1012,11 +1014,13 @@ def test_hf_processor_init_kwargs(
     inference_kwargs,
     expected_kwargs,
 ):
-    ctx = InputProcessingContext(
-        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
-        tokenizer=None,
+    model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
+    renderer_config = RendererConfig(
+        model_config=model_config,
+        tokenizer=model_id,
     )
 
+    ctx = InputProcessingContext.from_config(renderer_config)
     processor = ctx.get_hf_processor(
         DummyProcessor,  # type: ignore[arg-type]
         **inference_kwargs,
@@ -1045,11 +1049,13 @@ def test_hf_processor_call_kwargs(
     inference_kwargs,
     expected_kwargs,
 ):
-    ctx = InputProcessingContext(
-        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
-        tokenizer=None,
+    model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
+    renderer_config = RendererConfig(
+        model_config=model_config,
+        tokenizer=model_id,
     )
 
+    ctx = InputProcessingContext.from_config(renderer_config)
     processor = ctx.get_hf_processor(DummyProcessor)  # type: ignore[arg-type]
 
     result = ctx.call_hf_processor(processor, {}, inference_kwargs)
diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py
index 3b01bda7f..8127fac09 100644
--- a/tests/multimodal/test_registry.py
+++ b/tests/multimodal/test_registry.py
@@ -31,4 +31,6 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
         model_id,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected
+    assert (
+        MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected
+    )
diff --git a/tests/test_config.py b/tests/test_config.py
index 203447cd5..7464fcd1e 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -13,6 +13,7 @@ from vllm.config import (
     CompilationConfig,
     ModelConfig,
     PoolerConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     update_config,
@@ -476,27 +477,41 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
         ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
     ],
 )
-def test_get_and_verify_max_len(
+def test_recalculate_max_model_len(
     model_id, max_model_len, expected_max_len, should_raise
 ):
-    """Test get_and_verify_max_len with different configurations."""
+    """Test recalculate_max_model_len with different configurations."""
     model_config = ModelConfig(model_id)
 
     if should_raise:
         with pytest.raises(ValueError):
-            model_config.get_and_verify_max_len(max_model_len)
+            model_config.recalculate_max_model_len(
+                max_model_len,
+                tokenizer=model_id,
+                tokenizer_revision=None,
+            )
     else:
-        actual_max_len = model_config.get_and_verify_max_len(max_model_len)
-        assert actual_max_len == expected_max_len
+        model_config.recalculate_max_model_len(
+            max_model_len,
+            tokenizer=model_id,
+            tokenizer_revision=None,
+        )
+        assert model_config.max_model_len == expected_max_len
 
 
-class MockConfig:
-    """Simple mock object for testing maybe_pull_model_tokenizer_for_runai"""
+class MockModelConfig:
+    """Simple mock object for testing maybe_pull_model_for_runai"""
 
-    def __init__(self, model: str, tokenizer: str):
+    def __init__(self, model: str):
         self.model = model
-        self.tokenizer = tokenizer
-        self.model_weights = None
+
+
+class MockRendererConfig:
+    """Simple mock object for testing maybe_pull_tokenizer_for_runai"""
+
+    def __init__(self, model_config: MockModelConfig):
+        self.model_config = model_config
+        self.tokenizer = model_config.model
 
 
 @pytest.mark.parametrize(
@@ -514,59 +529,65 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
     mock_pull_files.return_value = None
 
     # Create first mock and run the method
-    config1 = MockConfig(model=s3_url, tokenizer=s3_url)
-    ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
+    model_config1 = MockModelConfig(model=s3_url)
+    renderer_config1 = MockRendererConfig(model_config=model_config1)
+    ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url)
+    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url)
 
     # Check that model and tokenizer point to existing directories
-    assert os.path.exists(config1.model), (
-        f"Model directory does not exist: {config1.model}"
+    assert os.path.exists(model_config1.model), (
+        f"Model directory does not exist: {model_config1.model}"
     )
-    assert os.path.isdir(config1.model), (
-        f"Model path is not a directory: {config1.model}"
+    assert os.path.isdir(model_config1.model), (
+        f"Model path is not a directory: {model_config1.model}"
     )
-    assert os.path.exists(config1.tokenizer), (
-        f"Tokenizer directory does not exist: {config1.tokenizer}"
+    assert os.path.exists(renderer_config1.tokenizer), (
+        f"Tokenizer directory does not exist: {renderer_config1.tokenizer}"
     )
-    assert os.path.isdir(config1.tokenizer), (
-        f"Tokenizer path is not a directory: {config1.tokenizer}"
+    assert os.path.isdir(renderer_config1.tokenizer), (
+        f"Tokenizer path is not a directory: {renderer_config1.tokenizer}"
     )
 
     # Verify that the paths are different from the original S3 URL
-    assert config1.model != s3_url, "Model path should be converted to local directory"
-    assert config1.tokenizer != s3_url, (
+    assert model_config1.model != s3_url, (
+        "Model path should be converted to local directory"
+    )
+    assert renderer_config1.tokenizer != s3_url, (
         "Tokenizer path should be converted to local directory"
     )
 
     # Store the original paths
-    created_model_dir = config1.model
-    create_tokenizer_dir = config1.tokenizer
+    created_model_dir = model_config1.model
+    create_tokenizer_dir = renderer_config1.tokenizer
 
     # Create a new mock and run the method with the same S3 URL
-    config2 = MockConfig(model=s3_url, tokenizer=s3_url)
-    ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
+    model_config2 = MockModelConfig(model=s3_url)
+    renderer_config2 = MockRendererConfig(model_config=model_config2)
+    ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url)
+    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url)
 
     # Check that the new directories exist
-    assert os.path.exists(config2.model), (
-        f"Model directory does not exist: {config2.model}"
+    assert os.path.exists(model_config2.model), (
+        f"Model directory does not exist: {model_config2.model}"
     )
-    assert os.path.isdir(config2.model), (
-        f"Model path is not a directory: {config2.model}"
+    assert os.path.isdir(model_config2.model), (
+        f"Model path is not a directory: {model_config2.model}"
     )
-    assert os.path.exists(config2.tokenizer), (
-        f"Tokenizer directory does not exist: {config2.tokenizer}"
+    assert os.path.exists(renderer_config2.tokenizer), (
+        f"Tokenizer directory does not exist: {renderer_config2.tokenizer}"
     )
-    assert os.path.isdir(config2.tokenizer), (
-        f"Tokenizer path is not a directory: {config2.tokenizer}"
+    assert os.path.isdir(renderer_config2.tokenizer), (
+        f"Tokenizer path is not a directory: {renderer_config2.tokenizer}"
     )
 
     # Verify that the paths are deterministic (same as before)
-    assert config2.model == created_model_dir, (
+    assert model_config2.model == created_model_dir, (
         f"Model paths are not deterministic. "
-        f"Original: {created_model_dir}, New: {config2.model}"
+        f"Original: {created_model_dir}, New: {model_config2.model}"
     )
-    assert config2.tokenizer == create_tokenizer_dir, (
+    assert renderer_config2.tokenizer == create_tokenizer_dir, (
         f"Tokenizer paths are not deterministic. "
-        f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
+        f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}"
     )
 
 
@@ -580,28 +601,36 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
     s3_url2 = "s3://example-bucket-2/model/"
 
     # Create mocks with different S3 URLs and run the method
-    config1 = MockConfig(model=s3_url1, tokenizer=s3_url1)
-    ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1)
+    model_config1 = MockModelConfig(model=s3_url1)
+    renderer_config1 = MockRendererConfig(model_config=model_config1)
+    ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1)
+    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1)
 
-    config2 = MockConfig(model=s3_url2, tokenizer=s3_url2)
-    ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2)
+    model_config2 = MockModelConfig(model=s3_url2)
+    renderer_config2 = MockRendererConfig(model_config=model_config2)
+    ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2)
+    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2)
 
     # Verify that different URLs produce different directories
-    assert config1.model != config2.model, (
+    assert model_config1.model != model_config2.model, (
         f"Different S3 URLs should create different model directories. "
-        f"URL1 model: {config1.model}, URL2 model: {config2.model}"
+        f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}"
     )
-    assert config1.tokenizer != config2.tokenizer, (
+    assert renderer_config1.tokenizer != renderer_config2.tokenizer, (
         f"Different S3 URLs should create different tokenizer directories. "
-        f"URL1 tokenizer: {config1.tokenizer}, "
-        f"URL2 tokenizer: {config2.tokenizer}"
+        f"URL1 tokenizer: {renderer_config1.tokenizer}, "
+        f"URL2 tokenizer: {renderer_config2.tokenizer}"
     )
 
     # Verify that both sets of directories exist
-    assert os.path.exists(config1.model) and os.path.isdir(config1.model)
-    assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
-    assert os.path.exists(config2.model) and os.path.isdir(config2.model)
-    assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
+    assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model)
+    assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir(
+        renderer_config1.tokenizer
+    )
+    assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model)
+    assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir(
+        renderer_config2.tokenizer
+    )
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index c4339827d..48fd076ab 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.inputs import zip_enc_dec_prompts
 from vllm.inputs.parse import parse_raw_prompts
 from vllm.inputs.preprocess import InputPreprocessor
@@ -108,8 +108,9 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
 )
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_config(model_config)
-    input_preprocessor = InputPreprocessor(model_config, tokenizer)
+    renderer_config = RendererConfig(model_config=model_config)
+    tokenizer = init_tokenizer_from_config(renderer_config)
+    input_preprocessor = InputPreprocessor(renderer_config, tokenizer)
 
     # HF processor adds sep token
     sep_token_id = tokenizer.vocab[tokenizer.sep_token]
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 6cab129c1..49307e3e5 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -16,6 +16,7 @@ from vllm.config import (
     LoadConfig,
     ModelConfig,
     ParallelConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -216,6 +217,7 @@ def create_vllm_config(
 
     return VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         parallel_config=parallel_config,
         scheduler_config=scheduler_config,
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index fd5cf6d3e..4a414bca5 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -8,7 +8,7 @@ import pytest
 import torch
 
 import vllm.v1.core.kv_cache_utils as kv_cache_utils
-from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import (
     MultiModalFeatureSpec,
@@ -667,7 +667,10 @@ def test_metrics_empty_stats():
 
 def test_get_kv_cache_configs_multiple_workers():
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+    )
 
     ref_kv_cache_spec = new_kv_cache_spec()
     same_kv_cache_specs = [
@@ -1136,6 +1139,7 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
 
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         scheduler_config=scheduler_config,
     )
 
@@ -1175,6 +1179,7 @@ def test_get_max_concurrency_for_kv_cache_config():
 
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         scheduler_config=scheduler_config,
     )
 
@@ -1293,7 +1298,10 @@ def test_allocate_with_lookahead():
 def test_get_kv_cache_config_one_worker():
     # pass max_model_len to pass check_enough_kv_cache_memory
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+    )
 
     mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
     # all layers are full attention -> single group
@@ -1584,7 +1592,11 @@ def test_get_kv_cache_config_one_worker():
 
 def test_get_kv_cache_configs_attention_free():
     kv_cache_specs: dict[str, KVCacheSpec] = {}
-    vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
+    model_config = ModelConfig(max_model_len=16)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+    )
     kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
     assert kv_cache_configs == [
         KVCacheConfig(
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index c6c4a5085..1505415a6 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -11,6 +11,7 @@ from vllm.config import (
     ECTransferConfig,
     KVTransferConfig,
     ModelConfig,
+    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -1563,6 +1564,7 @@ def create_scheduler_with_priority(
     vllm_config = VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         speculative_config=speculative_config,
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index f5ba613d3..086885c29 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -9,6 +9,7 @@ from vllm.config import (
     ECTransferConfig,
     KVTransferConfig,
     ModelConfig,
+    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -132,6 +133,7 @@ def create_scheduler(
     vllm_config = VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         speculative_config=speculative_config,
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 48be8c15a..c606100a1 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -15,6 +15,7 @@ from vllm.config import (
     ECTransferConfig,
     KVTransferConfig,
     ModelConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -522,6 +523,7 @@ def test_encoder_instance_zero_kv_cache(
 
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
         kv_transfer_config=kv_transfer_config,
diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py
index 1b11b8af4..85fab3a85 100644
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -5,7 +5,14 @@ import pytest
 
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    MultiModalConfig,
+    RendererConfig,
+    VllmConfig,
+)
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import input_processor as input_processor_mod
 from vllm.v1.engine.input_processor import InputProcessor
@@ -44,22 +51,21 @@ def _mock_input_processor(
     monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
 
     model_config = ModelConfig(
-        skip_tokenizer_init=True,
         max_model_len=128,
         mm_processor_cache_gb=mm_cache_gb,
         generation_config="vllm",
-        tokenizer="dummy",
     )
+    model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
 
-    # Minimal multimodal_config to satisfy references in
-    # Processor.process_inputs.
-    class _MockMMConfig:
-        def __init__(self, gb: float):
-            self.mm_processor_cache_gb = gb
+    renderer_config = RendererConfig(
+        model_config=model_config,
+        tokenizer="dummy",
+        skip_tokenizer_init=True,
+    )
 
-    model_config.multimodal_config = _MockMMConfig(mm_cache_gb)  # type: ignore[attr-defined]
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=renderer_config,
         cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
         device_config=DeviceConfig(device="cpu"),
     )
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 58f1a7282..768b338b5 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -15,6 +15,7 @@ from vllm.config import (
     DeviceConfig,
     KVTransferConfig,
     ModelConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -127,6 +128,7 @@ def create_vllm_config(
     return VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         device_config=DeviceConfig("cpu"),
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 616e57de3..888ea0169 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -19,6 +19,7 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     ParallelConfig,
+    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -61,6 +62,7 @@ def _create_proposer(
 
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=CacheConfig(),
         speculative_config=speculative_config,
         device_config=DeviceConfig(device=current_platform.device_type),
diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py
index 3b8813ceb..4483c8243 100644
--- a/tests/v1/spec_decode/test_mtp.py
+++ b/tests/v1/spec_decode/test_mtp.py
@@ -18,6 +18,7 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     ParallelConfig,
+    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -46,6 +47,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
 
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=CacheConfig(),
         speculative_config=speculative_config,
         device_config=DeviceConfig(device=current_platform.device_type),
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index 6bc412abe..2e365e08a 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -4,6 +4,7 @@ import numpy as np
 
 from vllm.config import (
     ModelConfig,
+    RendererConfig,
     SpeculativeConfig,
     VllmConfig,
 )
@@ -69,6 +70,7 @@ def test_ngram_proposer():
         return NgramProposer(
             vllm_config=VllmConfig(
                 model_config=model_config,
+                renderer_config=RendererConfig(model_config=model_config),
                 speculative_config=SpeculativeConfig(
                     prompt_lookup_min=min_n,
                     prompt_lookup_max=max_n,
diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py
index 4c01560fc..baef2459f 100644
--- a/tests/v1/structured_output/test_backend_guidance.py
+++ b/tests/v1/structured_output/test_backend_guidance.py
@@ -6,7 +6,7 @@ from concurrent.futures import Future
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.config import StructuredOutputsConfig, VllmConfig
+from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig
 from vllm.config.model import ModelConfig
 from vllm.config.parallel import ParallelConfig
 from vllm.config.speculative import SpeculativeConfig
@@ -72,8 +72,11 @@ def test_backend_guidance_rollback_terminated():
 def test_grammar_bitmask_with_specdec():
     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
     prompt = tokenizer.encode('{"a": "b"}')
+
+    model_config = ModelConfig(tokenizer=TOKENIZER)
     vllm_config = VllmConfig(
-        model_config=ModelConfig(tokenizer=TOKENIZER),
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
         structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
         speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
     )
@@ -137,8 +140,11 @@ def test_grammar_init_async_and_sync(async_grammar):
 
     # Use "external_launcher" for sync mode, None for async mode
     executor_backend = None if async_grammar else "external_launcher"
+
+    model_config = ModelConfig(tokenizer=TOKENIZER)
     vllm_config = VllmConfig(
-        model_config=ModelConfig(tokenizer=TOKENIZER),
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
         structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
         parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
     )
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
index 70047a993..5901d38d1 100644
--- a/tests/v1/structured_output/test_reasoning_structured_output.py
+++ b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -7,7 +7,7 @@ from unittest.mock import Mock
 
 import pytest
 
-from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
 from vllm.reasoning import ReasoningParser
 from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager
@@ -17,19 +17,26 @@ class TestReasoningStructuredOutput:
     """Test reasoning-aware structured output functionality."""
 
     @pytest.fixture
-    def mock_model_config(self):
-        """Create a mock ModelConfig."""
-        config = Mock(spec=ModelConfig)
-        config.skip_tokenizer_init = True  # Skip tokenizer init to avoid network calls
-        config.get_vocab_size = Mock(return_value=50000)
+    def mock_renderer_config(self):
+        """Create a mock RendererConfig."""
+        renderer_config = Mock(spec=RendererConfig)
+        renderer_config.skip_tokenizer_init = (
+            True  # Skip tokenizer init to avoid network calls
+        )
+
+        model_config = Mock(spec=ModelConfig)
+        model_config.get_vocab_size = Mock(return_value=50000)
+        model_config.trust_remote_code = False
         # Add missing runner_type attribute that tokenizer initialization expects
-        config.runner_type = "generate"
+        model_config.runner_type = "generate"
+        renderer_config.model_config = model_config
+
         # Add other attributes that tokenizer initialization might need
-        config.tokenizer = "test-tokenizer"
-        config.tokenizer_mode = "auto"
-        config.trust_remote_code = False
-        config.tokenizer_revision = None
-        return config
+        renderer_config.tokenizer = "test-tokenizer"
+        renderer_config.tokenizer_mode = "auto"
+        renderer_config.tokenizer_revision = None
+
+        return renderer_config
 
     @pytest.fixture
     def mock_scheduler_config(self):
@@ -39,10 +46,10 @@ class TestReasoningStructuredOutput:
         return config
 
     @pytest.fixture
-    def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
+    def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config):
         """Create a mock VllmConfig."""
         config = Mock(spec=VllmConfig)
-        config.model_config = mock_model_config
+        config.renderer_config = mock_renderer_config
         config.scheduler_config = mock_scheduler_config
         config.structured_outputs_config = Mock()
         config.structured_outputs_config.reasoning_parser = None
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index cfc06666e..080d23863 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -7,6 +7,7 @@ from vllm.attention.layer import Attention
 from vllm.config import (
     CacheConfig,
     ModelConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     set_current_vllm_config,
@@ -45,6 +46,7 @@ def get_vllm_config():
     )
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
     )
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 7b8c4268a..464e3ab99 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -13,6 +13,7 @@ from vllm.config import (
     CacheConfig,
     ModelConfig,
     ParallelConfig,
+    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     set_current_vllm_config,
@@ -101,6 +102,7 @@ def get_vllm_config():
     parallel_config = ParallelConfig()
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
@@ -811,6 +813,7 @@ def test_hybrid_attention_mamba_tensor_shapes():
     attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
     vllm_config = VllmConfig(
         model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 0f84f3ca9..a4f9fd8d2 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig
 from vllm.config.observability import ObservabilityConfig
 from vllm.config.parallel import EPLBConfig, ParallelConfig
 from vllm.config.pooler import PoolerConfig
+from vllm.config.renderer import RendererConfig
 from vllm.config.scheduler import SchedulerConfig
 from vllm.config.speculative import SpeculativeConfig
 from vllm.config.speech_to_text import SpeechToTextConfig
@@ -81,6 +82,8 @@ __all__ = [
     "ParallelConfig",
     # From vllm.config.pooler
     "PoolerConfig",
+    # From vllm.config.renderer
+    "RendererConfig",
     # From vllm.config.scheduler
     "SchedulerConfig",
     # From vllm.config.speculative
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 509a9c5e1..b0d4fb8e0 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -36,7 +36,6 @@ from vllm.transformers_utils.config import (
     uses_xdrope_dim,
 )
 from vllm.transformers_utils.gguf_utils import (
-    is_gguf,
     is_remote_gguf,
     maybe_patch_hf_config_from_gguf,
     split_remote_gguf,
@@ -83,7 +82,6 @@ TaskOption = Literal[
     "transcription",
     "draft",
 ]
-TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
     "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -131,18 +129,6 @@ class ModelConfig:
 
     Note that the model may support other tasks using the same model runner.
     """
-    tokenizer: SkipValidation[str] = None  # type: ignore
-    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
-    name or path will be used."""
-    tokenizer_mode: TokenizerMode | str = "auto"
-    """Tokenizer mode:\n
-    - "auto" will use the tokenizer from `mistral_common` for Mistral models
-    if available, otherwise it will use the "hf" tokenizer.\n
-    - "hf" will use the fast tokenizer if available.\n
-    - "slow" will always use the slow tokenizer.\n
-    - "mistral" will always use the tokenizer from `mistral_common`.\n
-    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
-    - Other custom values can be supported via plugins."""
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
     and tokenizer."""
@@ -168,13 +154,6 @@ class ModelConfig:
     hf_config_path: str | None = None
     """Name or path of the Hugging Face config to use. If unspecified, model
     name or path will be used."""
-    allowed_local_media_path: str = ""
-    """Allowing API requests to read local images or videos from directories
-    specified by the server file system. This is a security risk. Should only
-    be enabled in trusted environments."""
-    allowed_media_domains: list[str] | None = None
-    """If set, only media URLs that belong to this domain can be used for
-    multi-modal inputs. """
     revision: str | None = None
     """The specific model version to use. It can be a branch name, a tag name,
     or a commit id. If unspecified, will use the default version."""
@@ -182,10 +161,6 @@ class ModelConfig:
     """The specific revision to use for the model code on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    tokenizer_revision: str | None = None
-    """The specific revision to use for the tokenizer on the Hugging Face Hub.
-    It can be a branch name, a tag name, or a commit id. If unspecified, will
-    use the default version."""
     max_model_len: SkipValidation[int] = None  # type: ignore
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
@@ -230,10 +205,6 @@ class ModelConfig:
     preventing potential numerical issues. Note that even if this is set to
     False, cascade attention will be only used when the heuristic tells that
     it's beneficial."""
-    skip_tokenizer_init: bool = False
-    """Skip initialization of tokenizer and detokenizer. Expects valid
-    `prompt_token_ids` and `None` for prompt from the input. The generated
-    output will contain token ids."""
     enable_prompt_embeds: bool = False
     """If `True`, enables passing text embeddings as inputs via the
     `prompt_embeds` key.
@@ -294,8 +265,6 @@ class ModelConfig:
     logits_processors: list[str | type[LogitsProcessor]] | None = None
     """One or more logits processors' fully-qualified class names or class
     definitions"""
-    io_processor_plugin: str | None = None
-    """IOProcessor plugin name to load at model startup"""
 
     # Pooler config
     pooler_config: PoolerConfig | None = None
@@ -308,7 +277,6 @@ class ModelConfig:
     from the architecture of `self.model`."""
     limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
     enable_mm_embeds: InitVar[bool | None] = None
-    media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
     mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
     mm_processor_cache_gb: InitVar[float | None] = None
     mm_processor_cache_type: InitVar[MMCacheType | None] = None
@@ -335,18 +303,12 @@ class ModelConfig:
             "runner",
             "convert",
             "task",
-            "tokenizer",
-            "tokenizer_mode",
             "seed",
             "hf_config_path",
-            "allowed_local_media_path",
-            "allowed_media_domains",
-            "tokenizer_revision",
             "spec_target_max_model_len",
             "enforce_eager",
             "logprobs_mode",
             "disable_cascade_attn",
-            "skip_tokenizer_init",
             "served_model_name",
             "config_format",
             "hf_token",
@@ -354,11 +316,9 @@ class ModelConfig:
             "logits_processor_pattern",
             "override_attention_dtype",
             "logits_processors",
-            "io_processor_plugin",
             "pooler_config",
             "multimodal_config",
             "limit_mm_per_prompt",
-            "media_io_kwargs",
             "mm_processor_kwargs",
             "mm_processor_cache_gb",
             "mm_processor_cache_type",
@@ -423,7 +383,6 @@ class ModelConfig:
         # Multimodal config init vars
         limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
         enable_mm_embeds: bool | None,
-        media_io_kwargs: dict[str, dict[str, Any]] | None,
         mm_processor_kwargs: dict[str, Any] | None,
         mm_processor_cache_gb: float | None,
         mm_processor_cache_type: MMCacheType | None,
@@ -438,13 +397,8 @@ class ModelConfig:
         self.served_model_name = get_served_model_name(
             self.model, self.served_model_name
         )
-        self.model = maybe_model_redirect(self.model)
-        # The tokenizer is consistent with the model by default.
-        if self.tokenizer is None:
-            self.tokenizer = self.model
-        if self.tokenizer_revision is None:
-            self.tokenizer_revision = self.revision
-        self.tokenizer = maybe_model_redirect(self.tokenizer)
+        self.original_model = self.model
+        self.model = maybe_model_redirect(self.original_model)
 
         if isinstance(self.hf_config_path, str):
             self.hf_config_path = maybe_model_redirect(self.hf_config_path)
@@ -465,7 +419,7 @@ class ModelConfig:
                     hf_overrides_kw[key] = value
             hf_overrides_fn = None
 
-        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
+        self.maybe_pull_model_for_runai(self.model)
 
         from vllm.platforms import current_platform
 
@@ -648,7 +602,8 @@ class ModelConfig:
         )
 
         self.original_max_model_len = self.max_model_len
-        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
+        self.recalculate_max_model_len(self.original_max_model_len)
+
         # Init multimodal config if needed
         if self._model_info.supports_multimodal:
             if (
@@ -664,7 +619,6 @@ class ModelConfig:
             mm_config_kwargs = dict(
                 limit_per_prompt=limit_mm_per_prompt,
                 enable_mm_embeds=enable_mm_embeds,
-                media_io_kwargs=media_io_kwargs,
                 mm_processor_kwargs=mm_processor_kwargs,
                 mm_processor_cache_gb=mm_processor_cache_gb,
                 mm_processor_cache_type=mm_processor_cache_type,
@@ -682,16 +636,8 @@ class ModelConfig:
 
             self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
 
-        # Multimodal GGUF models must use original repo for mm processing
-        if is_gguf(self.tokenizer) and self.is_multimodal_model:
-            raise ValueError(
-                "Loading a multimodal GGUF model needs to use original "
-                "tokenizer. Please specify the unquantized hf model's "
-                "repo name or path using the --tokenizer argument."
-            )
-
         if self.disable_sliding_window:
-            # Set after get_and_verify_max_len to ensure that max_model_len
+            # Set after recalculate_max_model_len to ensure that max_model_len
             # can be correctly capped to sliding window size
             self.hf_text_config.sliding_window = None
 
@@ -715,10 +661,9 @@ class ModelConfig:
 
     @model_validator(mode="after")
     def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
-        if not isinstance(self.tokenizer, str):
-            raise ValueError("tokenizer must be a string after __post_init__.")
         if not isinstance(self.max_model_len, int):
             raise ValueError("max_model_len must be an integer after __post_init__.")
+
         return self
 
     def _get_transformers_backend_cls(self) -> str:
@@ -767,49 +712,17 @@ class ModelConfig:
         """The architecture vllm actually used."""
         return self._architecture
 
-    def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
-        """Pull model/tokenizer from Object Storage to temporary
-        directory when needed.
-
-        Args:
-            model: Model name or path
-            tokenizer: Tokenizer name or path
-        """
-
-        if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
+    def maybe_pull_model_for_runai(self, model: str) -> None:
+        """Pull model from Object Storage to temporary directory when needed."""
+        if not is_runai_obj_uri(model):
             return
 
-        if is_runai_obj_uri(model):
-            object_storage_model = ObjectStorageModel(url=model)
-            object_storage_model.pull_files(
-                model, allow_pattern=["*.model", "*.py", "*.json"]
-            )
-            self.model_weights = model
-            self.model = object_storage_model.dir
-
-            # If tokenizer is same as model, download to same directory
-            if model == tokenizer:
-                object_storage_model.pull_files(
-                    model,
-                    ignore_pattern=[
-                        "*.pt",
-                        "*.safetensors",
-                        "*.bin",
-                        "*.tensors",
-                        "*.pth",
-                    ],
-                )
-                self.tokenizer = object_storage_model.dir
-                return
-
-        # Only download tokenizer if needed and not already handled
-        if is_runai_obj_uri(tokenizer):
-            object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
-            object_storage_tokenizer.pull_files(
-                model,
-                ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
-            )
-            self.tokenizer = object_storage_tokenizer.dir
+        object_storage_model = ObjectStorageModel(url=model)
+        object_storage_model.pull_files(
+            model, allow_pattern=["*.model", "*.py", "*.json"]
+        )
+        self.model_weights = model
+        self.model = object_storage_model.dir
 
     def _get_encoder_config(self):
         model = self.model
@@ -1712,30 +1625,38 @@ class ModelConfig:
             return dense_modules[-1]["out_features"]
         return self.get_hidden_size()
 
-    def get_and_verify_max_len(self, max_model_len: int):
+    def recalculate_max_model_len(
+        self,
+        original_max_model_len: int | None,
+        *,
+        tokenizer: str | None = None,
+        tokenizer_revision: str | None = None,
+    ) -> None:
         # Consider max_model_len in tokenizer_config only when
         # pooling models use absolute position_embedding.
+        # NOTE: For simplicity we assume `args.model == args.tokenizer`
+        # since this is
         tokenizer_config = None
         if (
             self.runner_type == "pooling"
             and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
         ):
             tokenizer_config = try_get_tokenizer_config(
-                self.tokenizer,
+                tokenizer or self.model,
                 trust_remote_code=self.trust_remote_code,
-                revision=self.tokenizer_revision,
+                revision=tokenizer_revision or self.revision,
             )
-        max_model_len = _get_and_verify_max_len(
+
+        self.max_model_len = _get_and_verify_max_len(
             hf_config=self.hf_text_config,
             tokenizer_config=tokenizer_config,
-            max_model_len=max_model_len,
+            max_model_len=original_max_model_len,
             disable_sliding_window=self.disable_sliding_window,
             sliding_window=self.get_sliding_window(),
             spec_target_max_model_len=self.spec_target_max_model_len,
             encoder_config=self.encoder_config,
         )
-        logger.info("Using max model len %s", max_model_len)
-        return max_model_len
+        logger.info("Using max model len %s", self.max_model_len)
 
     @property
     def attn_type(self) -> AttnTypeStr:
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index 8a2936de9..37e2f6b4d 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -79,10 +79,6 @@ class MultiModalConfig:
 
     WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
     Only enable this flag for trusted users!"""
-    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
-    """Additional args passed to process media inputs, keyed by modalities.
-    For example, to set num_frames for video, set
-    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
     mm_processor_kwargs: dict[str, object] | None = None
     """Arguments to be forwarded to the model's processor for multi-modal data,
     e.g., image processor. Overrides for the multi-modal processor obtained
diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py
new file mode 100644
index 000000000..36a922b93
--- /dev/null
+++ b/vllm/config/renderer.py
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Literal
+
+from pydantic import Field, SkipValidation
+from pydantic.dataclasses import dataclass
+
+from vllm.config.model import ModelConfig
+from vllm.config.utils import config
+from vllm.transformers_utils.gguf_utils import is_gguf
+from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
+from vllm.transformers_utils.utils import maybe_model_redirect
+
+TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
+
+
+@config
+@dataclass
+class RendererConfig:
+    """Configuration for the renderer."""
+
+    # NOTE: In reality, this is a required argument.
+    # We provide a dummy default value here to generate the CLI args.
+    model_config: SkipValidation[ModelConfig] = None  # type: ignore
+    """Provides model context to the renderer."""
+
+    tokenizer: str = ""
+    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
+    name or path will be used."""
+    tokenizer_mode: TokenizerMode | str = "auto"
+    """Tokenizer mode:\n
+    - "auto" will use the tokenizer from `mistral_common` for Mistral models
+    if available, otherwise it will use the "hf" tokenizer.\n
+    - "hf" will use the fast tokenizer if available.\n
+    - "slow" will always use the slow tokenizer.\n
+    - "mistral" will always use the tokenizer from `mistral_common`.\n
+    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
+    - Other custom values can be supported via plugins."""
+    tokenizer_revision: str | None = None
+    """The specific revision to use for the tokenizer on the Hugging Face Hub.
+    It can be a branch name, a tag name, or a commit id. If unspecified, will
+    use the default version."""
+    skip_tokenizer_init: bool = False
+    """Skip initialization of tokenizer and detokenizer. Expects valid
+    `prompt_token_ids` and `None` for prompt from the input. The generated
+    output will contain token ids."""
+
+    io_processor_plugin: str | None = None
+    """IOProcessor plugin name to load at model startup."""
+
+    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
+    """Additional args passed to process media inputs, keyed by modalities.
+    For example, to set num_frames for video, set
+    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
+    allowed_local_media_path: str = ""
+    """Allowing API requests to read local images or videos from directories
+    specified by the server file system. This is a security risk. Should only
+    be enabled in trusted environments."""
+    allowed_media_domains: list[str] | None = None
+    """If set, only media URLs that belong to this domain can be used for
+    multi-modal inputs. """
+
+    @property
+    def trust_remote_code(self) -> bool:
+        return self.model_config.trust_remote_code
+
+    def __post_init__(self) -> None:
+        model_config = self.model_config
+
+        # The tokenizer is consistent with the model by default.
+        if not self.tokenizer:
+            self.tokenizer = (
+                ModelConfig.model
+                if model_config is None
+                else model_config.original_model
+            )
+        if not self.tokenizer_revision:
+            self.tokenizer_revision = (
+                ModelConfig.revision if model_config is None else model_config.revision
+            )
+
+        self.original_tokenizer = self.tokenizer
+        self.tokenizer = maybe_model_redirect(self.original_tokenizer)
+        self.maybe_pull_tokenizer_for_runai(self.tokenizer)
+
+        # Multimodal GGUF models must use original repo for mm processing
+        is_multimodal_model = (
+            ModelConfig.is_multimodal_model
+            if model_config is None
+            else model_config.is_multimodal_model
+        )
+        if is_gguf(self.tokenizer) and is_multimodal_model:
+            raise ValueError(
+                "Loading a multimodal GGUF model needs to use original "
+                "tokenizer. Please specify the unquantized hf model's "
+                "repo name or path using the --tokenizer argument."
+            )
+
+    def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None:
+        """Pull tokenizer from Object Storage to temporary directory when needed."""
+        if not is_runai_obj_uri(tokenizer):
+            return
+
+        object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
+        object_storage_tokenizer.pull_files(
+            tokenizer,
+            ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
+        )
+        self.tokenizer = object_storage_tokenizer.dir
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index bf533bf14..63b63eac9 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -322,16 +322,11 @@ class SpeculativeConfig:
                 self.draft_model_config = ModelConfig(
                     model=self.model,
                     runner="draft",
-                    tokenizer=self.target_model_config.tokenizer,
-                    tokenizer_mode=self.target_model_config.tokenizer_mode,
                     trust_remote_code=self.target_model_config.trust_remote_code,
-                    allowed_local_media_path=self.target_model_config.allowed_local_media_path,
-                    allowed_media_domains=self.target_model_config.allowed_media_domains,
                     dtype=self.target_model_config.dtype,
                     seed=self.target_model_config.seed,
                     revision=self.revision,
                     code_revision=self.code_revision,
-                    tokenizer_revision=self.target_model_config.tokenizer_revision,
                     spec_target_max_model_len=self.target_model_config.max_model_len,
                     quantization=self.quantization,
                     enforce_eager=self.target_model_config.enforce_eager,
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 36e4bd159..417797c44 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -39,6 +39,7 @@ from .lora import LoRAConfig
 from .model import ModelConfig
 from .observability import ObservabilityConfig
 from .parallel import ParallelConfig
+from .renderer import RendererConfig
 from .scheduler import SchedulerConfig
 from .speculative import SpeculativeConfig
 from .structured_outputs import StructuredOutputsConfig
@@ -181,6 +182,8 @@ class VllmConfig:
     # try to download a model
     model_config: ModelConfig = Field(default=None)
     """Model configuration."""
+    renderer_config: RendererConfig = Field(default_factory=RendererConfig)
+    """Renderer configuration."""
     cache_config: CacheConfig = Field(default_factory=CacheConfig)
     """Cache configuration."""
     parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
@@ -741,7 +744,7 @@ class VllmConfig:
             from vllm.multimodal import MULTIMODAL_REGISTRY
 
             self.scheduler_config.max_num_encoder_input_tokens = (
-                MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
+                MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config)
             )
             logger.debug(
                 "Encoder-decoder model detected: setting "
@@ -1186,11 +1189,13 @@ class VllmConfig:
             computed_compile_ranges_split_points
         )
 
-    def recalculate_max_model_len(self, max_model_len: int):
-        # Can only be called in try_verify_and_update_config
-        model_config = self.model_config
-        max_model_len = model_config.get_and_verify_max_len(max_model_len)
-        self.model_config.max_model_len = max_model_len
+    def recalculate_max_model_len(self, original_max_model_len: int | None) -> None:
+        # Can only be called during try_verify_and_update_config
+        self.model_config.recalculate_max_model_len(
+            original_max_model_len,
+            tokenizer=self.renderer_config.tokenizer,
+            tokenizer_revision=self.renderer_config.tokenizer_revision,
+        )
 
     def try_verify_and_update_config(self):
         if self.model_config is None:
@@ -1264,11 +1269,11 @@ class VllmConfig:
         return (
             f"model={self.model_config.model!r}, "
             f"speculative_config={self.speculative_config!r}, "
-            f"tokenizer={self.model_config.tokenizer!r}, "
-            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
-            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
+            f"tokenizer={self.renderer_config.tokenizer!r}, "
+            f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, "
+            f"tokenizer_mode={self.renderer_config.tokenizer_mode}, "
             f"revision={self.model_config.revision}, "
-            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
+            f"tokenizer_revision={self.renderer_config.tokenizer_revision}, "
             f"trust_remote_code={self.model_config.trust_remote_code}, "
             f"dtype={self.model_config.dtype}, "
             f"max_seq_len={self.model_config.max_model_len}, "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ceac5407a..bd398abb0 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -71,11 +71,11 @@ from vllm.config.model import (
     ModelDType,
     RunnerOption,
     TaskOption,
-    TokenizerMode,
 )
 from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
 from vllm.config.observability import DetailedTraceModules
 from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
+from vllm.config.renderer import RendererConfig, TokenizerMode
 from vllm.config.scheduler import SchedulerPolicy
 from vllm.config.utils import get_field
 from vllm.config.vllm import OptimizationLevel
@@ -355,17 +355,12 @@ class EngineArgs:
 
     model: str = ModelConfig.model
     served_model_name: str | list[str] | None = ModelConfig.served_model_name
-    tokenizer: str | None = ModelConfig.tokenizer
     hf_config_path: str | None = ModelConfig.hf_config_path
     runner: RunnerOption = ModelConfig.runner
     convert: ConvertOption = ModelConfig.convert
     task: TaskOption | None = ModelConfig.task
-    skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
     enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
-    tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
-    allowed_local_media_path: str = ModelConfig.allowed_local_media_path
-    allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
     download_dir: str | None = LoadConfig.download_dir
     safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
     load_format: str | LoadFormats = LoadConfig.load_format
@@ -449,7 +444,6 @@ class EngineArgs:
     code_revision: str | None = ModelConfig.code_revision
     hf_token: bool | str | None = ModelConfig.hf_token
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
-    tokenizer_revision: str | None = ModelConfig.tokenizer_revision
     quantization: QuantizationMethods | None = ModelConfig.quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
@@ -458,9 +452,6 @@ class EngineArgs:
     )
     enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
     interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
-    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
-        MultiModalConfig, "media_io_kwargs"
-    )
     mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
     disable_mm_preprocessor_cache: bool = False  # DEPRECATED
     mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
@@ -474,9 +465,19 @@ class EngineArgs:
     mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
         MultiModalConfig.mm_encoder_attn_backend
     )
-    io_processor_plugin: str | None = None
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
     video_pruning_rate: float = MultiModalConfig.video_pruning_rate
+    # Renderer fields
+    tokenizer: str | None = None
+    tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode
+    tokenizer_revision: str | None = RendererConfig.tokenizer_revision
+    skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init
+    io_processor_plugin: str | None = None
+    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
+        RendererConfig, "media_io_kwargs"
+    )
+    allowed_local_media_path: str = RendererConfig.allowed_local_media_path
+    allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains
     # LoRA fields
     enable_lora: bool = False
     max_loras: int = LoRAConfig.max_loras
@@ -627,25 +628,14 @@ class EngineArgs:
         model_group.add_argument("--runner", **model_kwargs["runner"])
         model_group.add_argument("--convert", **model_kwargs["convert"])
         model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
-        model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
-        model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
         model_group.add_argument(
             "--trust-remote-code", **model_kwargs["trust_remote_code"]
         )
         model_group.add_argument("--dtype", **model_kwargs["dtype"])
         model_group.add_argument("--seed", **model_kwargs["seed"])
         model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
-        model_group.add_argument(
-            "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
-        )
-        model_group.add_argument(
-            "--allowed-media-domains", **model_kwargs["allowed_media_domains"]
-        )
         model_group.add_argument("--revision", **model_kwargs["revision"])
         model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
-        model_group.add_argument(
-            "--tokenizer-revision", **model_kwargs["tokenizer_revision"]
-        )
         model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
         model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
         model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
@@ -657,9 +647,6 @@ class EngineArgs:
         model_group.add_argument(
             "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
         )
-        model_group.add_argument(
-            "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
-        )
         model_group.add_argument(
             "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
         )
@@ -698,8 +685,34 @@ class EngineArgs:
         model_group.add_argument(
             "--logits-processors", **model_kwargs["logits_processors"]
         )
-        model_group.add_argument(
-            "--io-processor-plugin", **model_kwargs["io_processor_plugin"]
+
+        # Renderer arguments
+        renderer_kwargs = get_kwargs(RendererConfig)
+        renderer_group = parser.add_argument_group(
+            title="RendererConfig",
+            description=RendererConfig.__doc__,
+        )
+        renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"])
+        renderer_group.add_argument(
+            "--tokenizer-mode", **renderer_kwargs["tokenizer_mode"]
+        )
+        renderer_group.add_argument(
+            "--tokenizer-revision", **renderer_kwargs["tokenizer_revision"]
+        )
+        renderer_group.add_argument(
+            "--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"]
+        )
+        renderer_group.add_argument(
+            "--media-io-kwargs", **renderer_kwargs["media_io_kwargs"]
+        )
+        renderer_group.add_argument(
+            "--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"]
+        )
+        renderer_group.add_argument(
+            "--allowed-media-domains", **renderer_kwargs["allowed_media_domains"]
+        )
+        renderer_group.add_argument(
+            "--io-processor-plugin", **renderer_kwargs["io_processor_plugin"]
         )
 
         # Model loading arguments
@@ -949,9 +962,6 @@ class EngineArgs:
         multimodal_group.add_argument(
             "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
         )
-        multimodal_group.add_argument(
-            "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
-        )
         multimodal_group.add_argument(
             "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
         )
@@ -1255,18 +1265,13 @@ class EngineArgs:
             runner=self.runner,
             convert=self.convert,
             task=self.task,
-            tokenizer=self.tokenizer,
-            tokenizer_mode=self.tokenizer_mode,
             trust_remote_code=self.trust_remote_code,
-            allowed_local_media_path=self.allowed_local_media_path,
-            allowed_media_domains=self.allowed_media_domains,
             dtype=self.dtype,
             seed=self.seed,
             revision=self.revision,
             code_revision=self.code_revision,
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
-            tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
             quantization=self.quantization,
             enforce_eager=self.enforce_eager,
@@ -1274,13 +1279,11 @@ class EngineArgs:
             logprobs_mode=self.logprobs_mode,
             disable_sliding_window=self.disable_sliding_window,
             disable_cascade_attn=self.disable_cascade_attn,
-            skip_tokenizer_init=self.skip_tokenizer_init,
             enable_prompt_embeds=self.enable_prompt_embeds,
             served_model_name=self.served_model_name,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
             enable_mm_embeds=self.enable_mm_embeds,
             interleave_mm_strings=self.interleave_mm_strings,
-            media_io_kwargs=self.media_io_kwargs,
             skip_mm_profiling=self.skip_mm_profiling,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
@@ -1298,7 +1301,6 @@ class EngineArgs:
             override_attention_dtype=self.override_attention_dtype,
             logits_processors=self.logits_processors,
             video_pruning_rate=self.video_pruning_rate,
-            io_processor_plugin=self.io_processor_plugin,
         )
 
     def validate_tensorizer_args(self):
@@ -1394,9 +1396,25 @@ class EngineArgs:
             )
 
         model_config = self.create_model_config()
-        self.model = model_config.model
-        self.tokenizer = model_config.tokenizer
+        renderer_config = RendererConfig(
+            model_config=model_config,
+            tokenizer=self.tokenizer or "",
+            tokenizer_mode=self.tokenizer_mode,
+            tokenizer_revision=self.tokenizer_revision,
+            skip_tokenizer_init=self.skip_tokenizer_init,
+            io_processor_plugin=self.io_processor_plugin,
+            media_io_kwargs=self.media_io_kwargs,
+            allowed_local_media_path=self.allowed_local_media_path,
+            allowed_media_domains=self.allowed_media_domains,
+        )
 
+        model_config.recalculate_max_model_len(
+            model_config.original_max_model_len,
+            tokenizer=renderer_config.tokenizer,
+            tokenizer_revision=renderer_config.tokenizer_revision,
+        )
+
+        self.model = model_config.model
         self._check_feature_supported(model_config)
         self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
         self._set_default_max_num_seqs_and_batched_tokens_args(
@@ -1768,6 +1786,7 @@ class EngineArgs:
             )
         config = VllmConfig(
             model_config=model_config,
+            renderer_config=renderer_config,
             cache_config=cache_config,
             parallel_config=parallel_config,
             scheduler_config=scheduler_config,
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index d94951a0c..7b60e7f89 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from typing import Any
 
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import ModelConfig, RendererConfig, VllmConfig
 from vllm.inputs.data import PromptType
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, RequestOutput
@@ -22,6 +22,7 @@ class EngineClient(ABC):
     """Protocol class for Clients to Engine"""
 
     vllm_config: VllmConfig
+    renderer_config: RendererConfig
     model_config: ModelConfig
     input_processor: InputProcessor
     io_processor: IOProcessor | None
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index aceaa8bd4..5ad256c2f 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor
 from typing_extensions import Required, TypedDict
 
 from vllm import envs
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
@@ -452,9 +452,10 @@ This is needed because `lru_cache` does not cache when an exception happens.
 
 def _try_get_processor_chat_template(
     tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
-    model_config: ModelConfig,
+    *,
+    trust_remote_code: bool,
 ) -> str | None:
-    cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
+    cache_key = (tokenizer.name_or_path, trust_remote_code)
     if cache_key in _PROCESSOR_CHAT_TEMPLATES:
         return _PROCESSOR_CHAT_TEMPLATES[cache_key]
 
@@ -466,7 +467,7 @@ def _try_get_processor_chat_template(
                 PreTrainedTokenizerFast,
                 ProcessorMixin,
             ),
-            trust_remote_code=model_config.trust_remote_code,
+            trust_remote_code=trust_remote_code,
         )
         if (
             isinstance(processor, ProcessorMixin)
@@ -499,7 +500,10 @@ def resolve_hf_chat_template(
 
     # 2nd priority: AutoProcessor chat template, unless tool calling is enabled
     if tools is None:
-        chat_template = _try_get_processor_chat_template(tokenizer, model_config)
+        chat_template = _try_get_processor_chat_template(
+            tokenizer,
+            trust_remote_code=model_config.trust_remote_code,
+        )
         if chat_template is not None:
             return chat_template
 
@@ -513,10 +517,10 @@ def resolve_hf_chat_template(
             exc_info=True,
         )
 
-    # 4th priority: Predefined fallbacks
+    # 4th priority: Predefined fallbacks]
     path = get_chat_template_fallback_path(
         model_type=model_config.hf_config.model_type,
-        tokenizer_name_or_path=model_config.tokenizer,
+        tokenizer_name_or_path=tokenizer.name_or_path,
     )
     if path is not None:
         logger.info_once(
@@ -538,14 +542,14 @@ def _resolve_chat_template_content_format(
     tools: list[dict[str, Any]] | None,
     tokenizer: TokenizerLike | None,
     *,
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
 ) -> _ChatTemplateContentFormat:
     if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
         hf_chat_template = resolve_hf_chat_template(
             tokenizer,
             chat_template=chat_template,
             tools=tools,
-            model_config=model_config,
+            model_config=renderer_config.model_config,
         )
     else:
         hf_chat_template = None
@@ -595,7 +599,7 @@ def resolve_chat_template_content_format(
     given_format: ChatTemplateContentFormatOption,
     tokenizer: TokenizerLike | None,
     *,
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
 ) -> _ChatTemplateContentFormat:
     if given_format != "auto":
         return given_format
@@ -604,7 +608,7 @@ def resolve_chat_template_content_format(
         chat_template,
         tools,
         tokenizer,
-        model_config=model_config,
+        renderer_config=renderer_config,
     )
 
     _log_chat_template_content_format(
@@ -627,32 +631,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     maximum per prompt.
     """
 
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, renderer_config: RendererConfig):
         super().__init__()
 
-        self._model_config = model_config
+        self._renderer_config = renderer_config
 
         self._items_by_modality = defaultdict[str, list[_T | None]](list)
         self._uuids_by_modality = defaultdict[str, list[str | None]](list)
 
     @property
-    def model_config(self) -> ModelConfig:
-        return self._model_config
+    def renderer_config(self) -> RendererConfig:
+        return self._renderer_config
 
     @cached_property
     def model_cls(self) -> type[SupportsMultiModal]:
         from vllm.model_executor.model_loader import get_model_cls
 
-        model_cls = get_model_cls(self.model_config)
+        model_cls = get_model_cls(self.renderer_config.model_config)
         return cast(type[SupportsMultiModal], model_cls)
 
     @property
     def allowed_local_media_path(self):
-        return self._model_config.allowed_local_media_path
+        return self._renderer_config.allowed_local_media_path
 
     @property
     def allowed_media_domains(self):
-        return self._model_config.allowed_media_domains
+        return self._renderer_config.allowed_media_domains
 
     @property
     def mm_registry(self):
@@ -660,7 +664,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
     @cached_property
     def mm_processor(self):
-        return self.mm_registry.create_processor(self.model_config)
+        return self.mm_registry.create_processor(self.renderer_config)
 
     def add(
         self,
@@ -851,19 +855,20 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
-        multimodal_config = self._tracker.model_config.multimodal_config
-        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
-
         self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
             envs.VLLM_MEDIA_CONNECTOR,
-            media_io_kwargs=media_io_kwargs,
+            media_io_kwargs=self.renderer_config.media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
+    @property
+    def renderer_config(self) -> RendererConfig:
+        return self._tracker.renderer_config
+
     @property
     def model_config(self) -> ModelConfig:
-        return self._tracker.model_config
+        return self.renderer_config.model_config
 
     def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image = self._connector.fetch_image(image_url) if image_url else None
@@ -963,18 +968,20 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
-        multimodal_config = self._tracker.model_config.multimodal_config
-        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
         self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
             envs.VLLM_MEDIA_CONNECTOR,
-            media_io_kwargs=media_io_kwargs,
+            media_io_kwargs=self.renderer_config.media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
+    @property
+    def renderer_config(self) -> RendererConfig:
+        return self._tracker.renderer_config
+
     @property
     def model_config(self) -> ModelConfig:
-        return self._tracker.model_config
+        return self.renderer_config.model_config
 
     def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image_coro = self._connector.fetch_image_async(image_url) if image_url else None
@@ -1604,15 +1611,17 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
 
 def parse_chat_messages(
     messages: list[ChatCompletionMessageParam],
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
     MultiModalDataDict | None,
     MultiModalUUIDDict | None,
 ]:
+    model_config = renderer_config.model_config
+
     conversation: list[ConversationMessage] = []
-    mm_tracker = MultiModalItemTracker(model_config)
+    mm_tracker = MultiModalItemTracker(renderer_config)
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
@@ -1635,15 +1644,17 @@ def parse_chat_messages(
 
 def parse_chat_messages_futures(
     messages: list[ChatCompletionMessageParam],
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
     Awaitable[MultiModalDataDict | None],
     MultiModalUUIDDict | None,
 ]:
+    model_config = renderer_config.model_config
+
     conversation: list[ConversationMessage] = []
-    mm_tracker = AsyncMultiModalItemTracker(model_config)
+    mm_tracker = AsyncMultiModalItemTracker(renderer_config)
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
@@ -1748,14 +1759,14 @@ def apply_hf_chat_template(
     chat_template: str | None,
     tools: list[dict[str, Any]] | None,
     *,
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
     **kwargs: Any,
 ) -> str:
     hf_chat_template = resolve_hf_chat_template(
         tokenizer,
         chat_template=chat_template,
         tools=tools,
-        model_config=model_config,
+        model_config=renderer_config.model_config,
     )
 
     if hf_chat_template is None:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 913324fd5..6b3cb26af 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -29,8 +29,8 @@ from vllm.config.model import (
     HfOverrides,
     ModelDType,
     RunnerOption,
-    TokenizerMode,
 )
+from vllm.config.renderer import TokenizerMode
 from vllm.engine.arg_utils import EngineArgs
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
@@ -343,6 +343,7 @@ class LLM:
         logger.info("Supported tasks: %s", supported_tasks)
         self.supported_tasks = supported_tasks
 
+        self.renderer_config = self.llm_engine.renderer_config
         self.model_config = self.llm_engine.model_config
         self.input_processor = self.llm_engine.input_processor
         self.io_processor = self.llm_engine.io_processor
@@ -808,13 +809,13 @@ class LLM:
             list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
 
         tokenizer = self.get_tokenizer()
-        model_config = self.model_config
+        renderer_config = self.renderer_config
         resolved_content_format = resolve_chat_template_content_format(
             chat_template,
             tools,
             chat_template_content_format,
             tokenizer,
-            model_config=model_config,
+            renderer_config=renderer_config,
         )
 
         _chat_template_kwargs: dict[str, Any] = dict(
@@ -833,7 +834,7 @@ class LLM:
             # the chat message parsing for it.
             conversation, mm_data, mm_uuids = parse_chat_messages(
                 msgs,
-                model_config,
+                renderer_config,
                 content_format=resolved_content_format,
             )
 
@@ -847,7 +848,7 @@ class LLM:
                 prompt_str = apply_hf_chat_template(
                     tokenizer=tokenizer,
                     conversation=conversation,
-                    model_config=model_config,
+                    renderer_config=renderer_config,
                     **_chat_template_kwargs,
                 )
                 # Special tokens are already included in chat templates so
@@ -1290,6 +1291,7 @@ class LLM:
         lora_request: list[LoRARequest] | LoRARequest | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
+        renderer_config = self.renderer_config
         model_config = self.model_config
 
         if isinstance(tokenizer, MistralTokenizer):
@@ -1317,7 +1319,7 @@ class LLM:
 
         for q, d in input_pairs:
             _, engine_prompt = get_score_prompt(
-                model_config=model_config,
+                renderer_config=renderer_config,
                 data_1=q,
                 data_2=d,
                 tokenizer=tokenizer,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 7be601d82..d77d611a2 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1099,7 +1099,7 @@ async def init_app_state(
     logger.info("Supported tasks: %s", supported_tasks)
 
     resolved_chat_template = await process_chat_template(
-        args.chat_template, engine_client, vllm_config.model_config
+        args.chat_template, engine_client, vllm_config.renderer_config
     )
 
     if args.tool_server == "demo":
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 3e421e21e..a9e72fb00 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            if self.model_config.skip_tokenizer_init:
+            if self.renderer_config.skip_tokenizer_init:
                 tokenizer = None
             else:
                 tokenizer = await self.engine_client.get_tokenizer()
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 99936f588..d887cf48d 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -291,6 +291,7 @@ class OpenAIServing:
 
         self.input_processor = self.models.input_processor
         self.io_processor = self.models.io_processor
+        self.renderer_config = self.models.renderer_config
         self.model_config = self.models.model_config
         self.max_model_len = self.model_config.max_model_len
 
@@ -1100,18 +1101,18 @@ class OpenAIServing:
         Sequence[RequestPrompt],
         list[EngineTokensPrompt],
     ]:
-        model_config = self.model_config
+        renderer_config = self.renderer_config
 
         resolved_content_format = resolve_chat_template_content_format(
             chat_template,
             tool_dicts,
             chat_template_content_format,
             tokenizer,
-            model_config=model_config,
+            renderer_config=renderer_config,
         )
         conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
             messages,
-            model_config,
+            renderer_config,
             content_format=resolved_content_format,
         )
 
@@ -1138,14 +1139,14 @@ class OpenAIServing:
             request_prompt = tokenizer.apply_chat_template(
                 conversation=conversation,
                 messages=messages,
-                model_config=model_config,
+                model_config=renderer_config.model_config,
                 **_chat_template_kwargs,
             )
         else:
             request_prompt = apply_hf_chat_template(
                 tokenizer=tokenizer,
                 conversation=conversation,
-                model_config=model_config,
+                renderer_config=renderer_config,
                 **_chat_template_kwargs,
             )
 
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 953398a9a..ec65e6593 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -71,6 +71,7 @@ class OpenAIServingModels:
 
         self.input_processor = self.engine_client.input_processor
         self.io_processor = self.engine_client.io_processor
+        self.renderer_config = self.engine_client.renderer_config
         self.model_config = self.engine_client.model_config
         self.max_model_len = self.model_config.max_model_len
 
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index cea9924eb..5fd79eed1 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing):
         self.task_type = task_type
 
         self.asr_config = self.model_cls.get_speech_to_text_config(
-            self.model_config, task_type
+            self.renderer_config, task_type
         )
 
         self.enable_force_include_usage = enable_force_include_usage
@@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing):
             self.tokenizer = cast(
                 PreTrainedTokenizerBase,
                 get_tokenizer(
-                    tokenizer_name=self.model_config.tokenizer,
-                    tokenizer_mode=self.model_config.tokenizer_mode,
+                    tokenizer_name=self.renderer_config.tokenizer,
+                    tokenizer_mode=self.renderer_config.tokenizer_mode,
                 ),
             )
 
@@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing):
             prompt = self.model_cls.get_generation_prompt(
                 audio=chunk,
                 stt_config=self.asr_config,
-                model_config=self.model_config,
+                renderer_config=self.renderer_config,
                 language=language,
                 task_type=self.task_type,
                 request_prompt=request.prompt,
@@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing):
                     if res.prompt_token_ids is not None:
                         num_prompt_tokens = len(res.prompt_token_ids)
                         if audio_tokens := self.model_cls.get_num_audio_tokens(
-                            audio_duration_s, self.asr_config, self.model_config
+                            audio_duration_s, self.asr_config, self.renderer_config
                         ):
                             num_prompt_tokens += audio_tokens
 
diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py
index 7fb767e26..cd28ccba9 100644
--- a/vllm/entrypoints/pooling/pooling/serving.py
+++ b/vllm/entrypoints/pooling/pooling/serving.py
@@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            if self.model_config.skip_tokenizer_init:
+            if self.renderer_config.skip_tokenizer_init:
                 tokenizer = None
             else:
                 tokenizer = await self.engine_client.get_tokenizer()
diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py
index e5a667830..f657fcefd 100644
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -160,10 +160,8 @@ class ServingScores(OpenAIServing):
         data_1: str | ScoreContentPartParam,
         data_2: str | ScoreContentPartParam,
     ) -> tuple[str, TokensPrompt]:
-        model_config = self.model_config
-
         full_prompt, engine_prompt = get_score_prompt(
-            model_config=model_config,
+            renderer_config=self.renderer_config,
             data_1=data_1,
             data_2=data_2,
             tokenizer=tokenizer,
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 072ddd4c9..561adbe45 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast
 from torch.nn import CosineSimilarity
 from typing_extensions import Required, TypedDict
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.entrypoints.chat_utils import (
     BaseMultiModalItemTracker,
     ChatCompletionContentPartImageEmbedsParam,
@@ -88,9 +88,9 @@ def _validate_score_input_lens(
 def parse_score_data(
     data_1: str | ScoreContentPartParam,
     data_2: str | ScoreContentPartParam,
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
 ) -> tuple[str, str, MultiModalDataDict | None]:
-    mm_tracker = MultiModalItemTracker(model_config)
+    mm_tracker = MultiModalItemTracker(renderer_config)
 
     content_1 = _parse_score_content(data_1, mm_tracker)
     content_2 = _parse_score_content(data_2, mm_tracker)
@@ -176,7 +176,7 @@ def post_process_tokens(
 
 
 def get_score_prompt(
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
     tokenizer: TokenizerLike,
     tokenization_kwargs: dict[str, Any],
     data_1: str | ScoreContentPartParam,
@@ -185,11 +185,14 @@ def get_score_prompt(
     prompt_1, prompt_2, mm_data = parse_score_data(
         data_1,
         data_2,
-        model_config,
+        renderer_config,
     )
+
     from vllm.model_executor.model_loader import get_model_cls
 
+    model_config = renderer_config.model_config
     model = get_model_cls(model_config)
+
     if supports_score_template(model):
         full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
         prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index daeeb995b..a81f73ac9 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -13,7 +13,7 @@ from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.background import BackgroundTask, BackgroundTasks
 
-from vllm.config import ModelConfig
+from vllm.config import RendererConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (
@@ -288,7 +288,7 @@ def process_lora_modules(
 async def process_chat_template(
     args_chat_template: Path | str | None,
     engine_client: EngineClient,
-    model_config: ModelConfig,
+    renderer_config: RendererConfig,
 ) -> str | None:
     resolved_chat_template = load_chat_template(args_chat_template)
     if resolved_chat_template is not None:
@@ -305,7 +305,7 @@ async def process_chat_template(
                 tokenizer=tokenizer,
                 chat_template=None,
                 tools=None,
-                model_config=model_config,
+                model_config=renderer_config.model_config,
             )
 
             if hf_chat_template != resolved_chat_template:
@@ -314,6 +314,6 @@ async def process_chat_template(
                     "It is different from official chat template '%s'. "
                     "This discrepancy may lead to performance degradation.",
                     resolved_chat_template,
-                    model_config.model,
+                    renderer_config.model_config.model,
                 )
     return resolved_chat_template
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 0372b06d0..f534d102f 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -6,7 +6,7 @@ from typing import Any, cast
 
 from typing_extensions import assert_never
 
-from vllm.config import ModelConfig
+from vllm.config import RendererConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
@@ -45,14 +45,15 @@ logger = init_logger(__name__)
 class InputPreprocessor:
     def __init__(
         self,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         tokenizer: TokenizerLike | None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         mm_processor_cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
         super().__init__()
 
-        self.model_config = model_config
+        self.renderer_config = renderer_config
+        self.model_config = renderer_config.model_config
         self.tokenizer = tokenizer
         self.mm_registry = mm_registry
         self.mm_processor_cache = mm_processor_cache
@@ -231,7 +232,7 @@ class InputPreprocessor:
     def _get_mm_processor(self) -> BaseMultiModalProcessor:
         if not hasattr(self, "_mm_processor"):
             self._mm_processor = self.mm_registry.create_processor(
-                self.model_config,
+                self.renderer_config,
                 tokenizer=self.tokenizer,
                 cache=self.mm_processor_cache,
             )
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 007d847ac..a2700bd5a 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-    model_config = model.vllm_config.model_config
+    renderer_config = model.vllm_config.renderer_config
     quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
 
@@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
     from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
-        model_config.tokenizer,
-        revision=model_config.tokenizer_revision,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        revision=renderer_config.tokenizer_revision,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
@@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-    model_config = model.vllm_config.model_config
+    renderer_config = model.vllm_config.renderer_config
     quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
 
@@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
-        model_config.tokenizer,
-        revision=model_config.tokenizer_revision,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        revision=renderer_config.tokenizer_revision,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index 1f07381c0..bd4724749 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
 
-        model_config = vllm_config.model_config
-        tokenizer = cached_tokenizer_from_config(model_config)
+        renderer_config = vllm_config.renderer_config
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
 
         self.sam_model = build_sam_vit_b()
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 9f8faf9ed..be03e1df8 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
 
-        model_config = vllm_config.model_config
-        tokenizer = cached_tokenizer_from_config(model_config)
+        renderer_config = vllm_config.renderer_config
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
 
         self.vision = self._init_vision_module(
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 7036118ad..f82529d84 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -18,7 +18,7 @@ from transformers.models.gemma3n import (
 )
 from transformers.models.siglip import SiglipImageProcessorFast
 
-from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration(
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         language: Optional[str],
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -798,7 +798,9 @@ class Gemma3nForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
         return SpeechToTextConfig(
             # Let's set this to 30 as suggested in the docs for now, although
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index a4e50f408..96645f20b 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -34,7 +34,7 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import BatchFeature, PretrainedConfig
 
-from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration(
         else:
             raise ValueError(f"Unsupported task type {task_type}")
 
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         chat = [dict(role="user", content=user_prompt)]
         prompt = tokenizer.apply_chat_template(
             chat,
@@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """Get the number of audio tokens for an audio duration in sec."""
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
         hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
         proj_win_size = processor.audio_processor.projector_window_size
         ds_rate = processor.audio_processor.projector_downsample_rate
@@ -903,7 +903,9 @@ class GraniteSpeechForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
         """Get the stt config for this model."""
         # Default settings are reasonable for this model and we don't currently
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 2aba626a7..b9f3ac8ae 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import RendererConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import (
     DispatchPooler,
@@ -29,12 +29,12 @@ logger = init_logger(__name__)
 class GritLMMeanPool(nn.Module):
     """As `MeanPool`, but only includes non-instruction tokens."""
 
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, renderer_config: RendererConfig):
         super().__init__()
 
-        self.model_config = model_config
+        self.renderer_config = renderer_config
 
-        tokenizer = cached_tokenizer_from_config(self.model_config)
+        tokenizer = cached_tokenizer_from_config(self.renderer_config)
 
         # Collect the tokens needed for pattern matching.
         # "▁<" is different from "_<". The former uses "▁" to indicate that
@@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module):
 
 
 class GritLMPooler(Pooler):
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, renderer_config: RendererConfig):
         super().__init__()
 
-        self.pooling = GritLMMeanPool(model_config)
+        self.pooling = GritLMMeanPool(renderer_config)
         self.head = PoolerHead(PoolerNormalize())
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM):
             self.pooler = DispatchPooler(
                 {
                     "token_embed": Pooler.for_token_embed(pooler_config),
-                    "embed": GritLMPooler(vllm_config.model_config),
+                    "embed": GritLMPooler(vllm_config.renderer_config),
                 }
             )
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 607ff5583..4df91aaf8 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -19,7 +19,7 @@ from torch import Tensor
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
 
-from vllm.config import ModelConfig, SpeechToTextConfig
+from vllm.config import RendererConfig, SpeechToTextConfig
 from vllm.inputs import TokensPrompt
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -887,7 +887,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -930,7 +930,9 @@ class SupportsTranscription(Protocol):
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
+        cls,
+        renderer_config: RendererConfig,
+        task_type: Literal["transcribe", "translate"],
     ) -> SpeechToTextConfig:
         """Get the speech to text config for the ASR model."""
         ...
@@ -940,7 +942,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index 18985cefb..d75637da1 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
         hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
         hf_processor.video_processor = cached_video_processor_from_config(
-            self.ctx.model_config,
+            self.ctx.renderer_config,
             processor_cls=InternVLVideoProcessor,
             size=hf_processor.image_processor.size,
             **kwargs,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 6dfab595e..4daaefd0c 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1169,16 +1169,17 @@ class NemotronH_Nano_VL_V2(
         self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
 
         self.config = config
-        self.model_config = vllm_config.model_config
 
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
-        tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
-        self._img_start_token_ids = tokenizer.encode(
+        self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
+        self._img_start_token_ids = self._tokenizer.encode(
             IMG_START, add_special_tokens=False
         )
-        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
-        self._img_context_token_ids = tokenizer.encode(
+        self._img_end_token_ids = self._tokenizer.encode(
+            IMG_END, add_special_tokens=False
+        )
+        self._img_context_token_ids = self._tokenizer.encode(
             IMG_CONTEXT, add_special_tokens=False
         )
 
@@ -1364,7 +1365,7 @@ class NemotronH_Nano_VL_V2(
         input_embeds for the LLM.
         """
         device = video_embeddings.device
-        tokenizer = cached_tokenizer_from_config(self.model_config)
+        tokenizer = self._tokenizer
 
         # Generate video replacement token IDs using get_video_repl
         # This tokenizes each frame separator independently, then uses pre-tokenized
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 391980fc6..797793e65 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
 
     def get_image_processor(self, **kwargs: object):
         return cached_image_processor_from_config(
-            self.ctx.model_config,
+            self.ctx.renderer_config,
             **kwargs,
         )
 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index faf2d80d2..ebe743fa8 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -193,7 +193,7 @@ class PixtralProcessorAdapter:
 
 class PixtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")
 
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 7b408248e..0acd564e2 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
 from transformers import BatchFeature, TensorType, WhisperConfig
 from transformers.tokenization_utils_base import TextInput
 
-from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -176,7 +176,7 @@ class VoxtralProcessorAdapter:
 
 class VoxtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")
 
@@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration(
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
+        self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
 
         # update quant config to so that ignored module and target module names
         # match the vLLM model names
@@ -450,9 +450,11 @@ class VoxtralForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         audio_config = tokenizer.instruct.audio_encoder.audio_config
         max_audio_clip_s = audio_config.chunk_length_s
         sample_rate = audio_config.sampling_rate
@@ -468,17 +470,17 @@ class VoxtralForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,  # not needed here
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
         to_language: str | None,
     ) -> PromptType:
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         audio = Audio(audio, int(stt_config.sample_rate), format="wav")  # lossless
         req = TranscriptionRequest(
-            model=model_config.model,
+            model=renderer_config.model_config.model,
             audio=RawAudio.from_audio(audio),
             language=language,
         )
@@ -494,14 +496,14 @@ class VoxtralForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
         This is used for estimating the amount of processing for this audio.
         """
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         adapter = VoxtralProcessorAdapter(tokenizer)
         return adapter.get_num_audio_tokens(
             int(audio_duration_s * stt_config.sample_rate)
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index b2feff133..6f526e395 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
 from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.attention.layers.cross_attention import CrossAttention
-from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
@@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,  # not needed here
+        renderer_config: RendererConfig,  # not needed here
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -847,9 +847,11 @@ class WhisperForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
 
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
@@ -861,9 +863,9 @@ class WhisperForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
         hop_length = processor.feature_extractor.hop_length
         assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 67bdf5e15..9c838fe67 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -31,7 +31,7 @@ from .inputs import (
 )
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.config import ModelConfig, RendererConfig, VllmConfig
 
     from .processing import ResolvedPromptUpdate
     from .registry import MultiModalRegistry
@@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
 
 
 def _enable_processor_cache(
-    model_config: "ModelConfig",
+    renderer_config: "RendererConfig",
     mm_registry: "MultiModalRegistry",
 ) -> bool:
-    if not mm_registry.supports_multimodal_inputs(model_config):
+    if not mm_registry.supports_multimodal_inputs(renderer_config):
         return False
 
-    mm_config = model_config.get_multimodal_config()
+    mm_config = renderer_config.model_config.get_multimodal_config()
     return mm_config.mm_processor_cache_gb > 0
 
 
@@ -599,7 +599,7 @@ def processor_cache_from_config(
     """Return a `BaseMultiModalProcessorCache`, if enabled."""
     model_config = vllm_config.model_config
 
-    if not _enable_processor_cache(model_config, mm_registry):
+    if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
         return None
 
     if not _enable_ipc_cache(vllm_config):
@@ -611,14 +611,14 @@ def processor_cache_from_config(
 
 
 def processor_only_cache_from_config(
-    model_config: "ModelConfig",
+    renderer_config: "RendererConfig",
     mm_registry: "MultiModalRegistry",
 ):
     """Return a `MultiModalProcessorOnlyCache`, if enabled."""
-    if not _enable_processor_cache(model_config, mm_registry):
+    if not _enable_processor_cache(renderer_config, mm_registry):
         return None
 
-    return MultiModalProcessorOnlyCache(model_config)
+    return MultiModalProcessorOnlyCache(renderer_config.model_config)
 
 
 class BaseMultiModalReceiverCache(
@@ -787,7 +787,7 @@ def engine_receiver_cache_from_config(
     """
     model_config = vllm_config.model_config
 
-    if not _enable_processor_cache(model_config, mm_registry):
+    if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
         return None
 
     if not _enable_ipc_cache(vllm_config):
@@ -809,9 +809,7 @@ def worker_receiver_cache_from_config(
     Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
     mm_processor_cache_type=="shm".
     """
-    model_config = vllm_config.model_config
-
-    if not _enable_processor_cache(model_config, mm_registry):
+    if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
         return None
 
     if not _enable_ipc_cache(vllm_config):
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 039077378..81ceb76a4 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -23,7 +23,7 @@ import torch
 from typing_extensions import TypeVar, assert_never
 
 from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
 from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
@@ -53,7 +53,7 @@ if TYPE_CHECKING:
     from transformers.feature_extraction_utils import BatchFeature
     from transformers.processing_utils import ProcessorMixin
 
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, RendererConfig
 
     from .cache import BaseMultiModalProcessorCache
     from .profiling import BaseDummyInputsBuilder
@@ -63,6 +63,7 @@ else:
     ProcessorMixin = object
 
     ModelConfig = object
+    RendererConfig = object
 
     BaseMultiModalProcessorCache = object
 
@@ -945,12 +946,29 @@ class InputProcessingContext:
     modify the inputs.
     """
 
-    model_config: ModelConfig
-    """The configuration of the model."""
+    renderer_config: RendererConfig
+    """The configuration of the renderer."""
 
     tokenizer: TokenizerLike | None
     """The tokenizer used to tokenize the inputs."""
 
+    @classmethod
+    def from_config(
+        cls,
+        renderer_config: RendererConfig,
+        *,
+        tokenizer: TokenizerLike | None = None,
+    ):
+        if tokenizer is None and not renderer_config.skip_tokenizer_init:
+            tokenizer = cached_tokenizer_from_config(renderer_config)
+
+        return cls(renderer_config, tokenizer)
+
+    @property
+    def model_config(self) -> ModelConfig:
+        """The configuration of the model."""
+        return self.renderer_config.model_config
+
     def get_tokenizer(self) -> TokenizerLike:
         if self.tokenizer is None:
             raise ValueError(
@@ -1047,7 +1065,7 @@ class InputProcessingContext:
             typ = ProcessorMixin
 
         return cached_processor_from_config(
-            self.model_config,
+            self.renderer_config,
             processor_cls=typ,
             tokenizer=self.tokenizer,
             **kwargs,
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 00a84f9de..e49aaa504 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
 
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
+from vllm.tokenizers import TokenizerLike
 
 from .cache import BaseMultiModalProcessorCache
 from .processing import (
@@ -22,7 +22,7 @@ from .profiling import (
 )
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, RendererConfig
     from vllm.model_executor.models.interfaces import SupportsMultiModal
 
 logger = init_logger(__name__)
@@ -114,17 +114,18 @@ class MultiModalRegistry:
 
         return mm_options if len(mm_options) > 0 else None
 
-    def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool:
+    def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool:
         """
         Checks if the model supports multimodal inputs.
         Returns True if the model is multimodal with any non-zero supported
         modalities, otherwise returns False, effectively running in
         text-only mode.
         """
+        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             return False
 
-        info = self._create_processing_info(model_config, tokenizer=None)
+        info = self._create_processing_info(renderer_config, tokenizer=None)
         supported_modalities = info.get_supported_mm_limits()
 
         mm_config = model_config.get_multimodal_config()
@@ -144,7 +145,7 @@ class MultiModalRegistry:
 
     def get_max_tokens_per_item_by_modality(
         self,
-        model_config: "ModelConfig",
+        renderer_config: "RendererConfig",
         *,
         cache: BaseMultiModalProcessorCache | None = None,
         profiler_limits: Mapping[str, int] | None = None,
@@ -153,10 +154,11 @@ class MultiModalRegistry:
         Get the maximum number of tokens per data item from each modality based
         on underlying model configuration.
         """
+        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             return {}
 
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(renderer_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         seq_len = model_config.max_model_len
@@ -171,7 +173,7 @@ class MultiModalRegistry:
 
     def get_mm_limits_per_prompt(
         self,
-        model_config: "ModelConfig",
+        renderer_config: "RendererConfig",
         *,
         cache: BaseMultiModalProcessorCache | None = None,
     ) -> Mapping[str, int]:
@@ -179,10 +181,11 @@ class MultiModalRegistry:
         Get the maximum number of multi-modal input instances for each modality
         that are allowed per prompt for a model class.
         """
+        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             return {}
 
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(renderer_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
         return profiler.get_mm_limits()
 
@@ -228,30 +231,21 @@ class MultiModalRegistry:
         assert hasattr(model_cls, "_processor_factory")
         return cast("SupportsMultiModal", model_cls)
 
-    def _create_processing_ctx(
-        self,
-        model_config: "ModelConfig",
-        tokenizer: TokenizerLike | None = None,
-    ) -> InputProcessingContext:
-        if tokenizer is None and not model_config.skip_tokenizer_init:
-            tokenizer = cached_tokenizer_from_config(model_config)
-
-        return InputProcessingContext(model_config, tokenizer)
-
     def _create_processing_info(
         self,
-        model_config: "ModelConfig",
+        renderer_config: "RendererConfig",
         *,
         tokenizer: TokenizerLike | None = None,
     ) -> BaseProcessingInfo:
-        model_cls = self._get_model_cls(model_config)
+        model_cls = self._get_model_cls(renderer_config.model_config)
         factories = model_cls._processor_factory
-        ctx = self._create_processing_ctx(model_config, tokenizer)
+
+        ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
         return factories.info(ctx)
 
     def create_processor(
         self,
-        model_config: "ModelConfig",
+        renderer_config: "RendererConfig",
         *,
         tokenizer: TokenizerLike | None = None,
         cache: BaseMultiModalProcessorCache | None = None,
@@ -259,19 +253,19 @@ class MultiModalRegistry:
         """
         Create a multi-modal processor for a specific model and tokenizer.
         """
+        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             raise ValueError(f"{model_config.model} is not a multimodal model")
 
         model_cls = self._get_model_cls(model_config)
         factories = model_cls._processor_factory
 
-        ctx = self._create_processing_ctx(model_config, tokenizer)
-
+        ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
         return factories.build_processor(ctx, cache=cache)
 
     def get_decoder_dummy_data(
         self,
-        model_config: "ModelConfig",
+        renderer_config: "RendererConfig",
         seq_len: int,
         mm_counts: Mapping[str, int] | None = None,
         *,
@@ -280,15 +274,15 @@ class MultiModalRegistry:
         """
         Create dummy data for profiling the memory usage of a model.
 
-        The model is identified by `model_config`.
+        The model is identified by `renderer_config`.
         """
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(renderer_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         # Extract configurable options from multimodal config.
         # Only include modalities that use advanced option types so legacy
         # count-only behavior remains unchanged.
-        mm_options = self._extract_mm_options(model_config)
+        mm_options = self._extract_mm_options(renderer_config.model_config)
 
         dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options)
 
@@ -304,7 +298,7 @@ class MultiModalRegistry:
 
     def get_encoder_dummy_data(
         self,
-        model_config: "ModelConfig",
+        renderer_config: "RendererConfig",
         seq_len: int,
         mm_counts: Mapping[str, int] | None = None,
         *,
@@ -313,15 +307,15 @@ class MultiModalRegistry:
         """
         Create dummy data for profiling the memory usage of a model.
 
-        The model is identified by `model_config`.
+        The model is identified by `renderer_config`.
         """
-        processor = self.create_processor(model_config, cache=cache)
+        processor = self.create_processor(renderer_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         # Extract configurable options from multimodal config.
         # Only include modalities that use advanced option types so legacy
         # count-only behavior remains unchanged.
-        mm_options = self._extract_mm_options(model_config)
+        mm_options = self._extract_mm_options(renderer_config.model_config)
 
         dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options)
 
@@ -336,13 +330,15 @@ class MultiModalRegistry:
 
         return dummy_data
 
-    def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int:
+    def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int:
         """
         Get the maximum length of the encoder input for encoder-decoder models.
         """
+        model_config = renderer_config.model_config
         if not model_config.is_encoder_decoder:
             return 0
-        max_tokens = self.get_max_tokens_per_item_by_modality(model_config)
+
+        max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config)
         if not max_tokens:
             # TODO - this function assumes encoder-decoder models are
             # multimodal. This will need to change when adding support for more
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index 1d44feeee..c9575511a 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
 from .protocol import TokenizerLike
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import RendererConfig
 
 logger = init_logger(__name__)
 
@@ -205,18 +205,18 @@ def get_tokenizer(
 cached_get_tokenizer = lru_cache(get_tokenizer)
 
 
-def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
+def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs):
     return cached_get_tokenizer(
-        model_config.tokenizer,
-        tokenizer_mode=model_config.tokenizer_mode,
-        revision=model_config.tokenizer_revision,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        revision=renderer_config.tokenizer_revision,
+        trust_remote_code=renderer_config.trust_remote_code,
         **kwargs,
     )
 
 
-def init_tokenizer_from_config(model_config: "ModelConfig"):
-    runner_type = model_config.runner_type
+def init_tokenizer_from_config(renderer_config: "RendererConfig"):
+    runner_type = renderer_config.model_config.runner_type
     if runner_type == "generate" or runner_type == "draft":
         truncation_side = "left"
     elif runner_type == "pooling":
@@ -225,9 +225,9 @@ def init_tokenizer_from_config(model_config: "ModelConfig"):
         assert_never(runner_type)
 
     return get_tokenizer(
-        model_config.tokenizer,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
-        revision=model_config.tokenizer_revision,
+        renderer_config.tokenizer,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        trust_remote_code=renderer_config.trust_remote_code,
+        revision=renderer_config.tokenizer_revision,
         truncation_side=truncation_side,
     )
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index e9864b0c1..bdebd2686 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, RendererConfig
 
 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
 _V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
@@ -233,17 +233,18 @@ def cached_get_processor_without_dynamic_kwargs(
 
 
 def cached_processor_from_config(
-    model_config: "ModelConfig",
+    renderer_config: "RendererConfig",
     processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
+    model_config = renderer_config.model_config
     if is_gguf(model_config.model):
-        assert not is_gguf(model_config.tokenizer), (
+        assert not is_gguf(renderer_config.tokenizer), (
             "For multimodal GGUF models, the original tokenizer "
             "should be used to correctly load processor."
         )
-        model = model_config.tokenizer
-        revision = model_config.tokenizer_revision
+        model = renderer_config.tokenizer
+        revision = renderer_config.tokenizer_revision
     else:
         model = model_config.model
         revision = model_config.revision
@@ -297,9 +298,11 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor)
 
 
 def cached_feature_extractor_from_config(
-    model_config: "ModelConfig",
+    renderer_config: "RendererConfig",
     **kwargs: Any,
 ):
+    model_config = renderer_config.model_config
+
     return cached_get_feature_extractor(
         model_config.model,
         revision=model_config.revision,
@@ -348,16 +351,17 @@ cached_get_image_processor = lru_cache(get_image_processor)
 
 
 def cached_image_processor_from_config(
-    model_config: "ModelConfig",
+    renderer_config: "RendererConfig",
     **kwargs: Any,
 ):
+    model_config = renderer_config.model_config
     if is_gguf(model_config.model):
-        assert not is_gguf(model_config.tokenizer), (
+        assert not is_gguf(renderer_config.tokenizer), (
             "For multimodal GGUF models, the original tokenizer "
             "should be used to correctly load image processor."
         )
-        model = model_config.tokenizer
-        revision = model_config.tokenizer_revision
+        model = renderer_config.tokenizer
+        revision = renderer_config.tokenizer_revision
     else:
         model = model_config.model
         revision = model_config.revision
@@ -411,10 +415,12 @@ cached_get_video_processor = lru_cache(get_video_processor)
 
 
 def cached_video_processor_from_config(
-    model_config: "ModelConfig",
+    renderer_config: "RendererConfig",
     processor_cls: type[_V] | None = None,
     **kwargs: Any,
 ):
+    model_config = renderer_config.model_config
+
     return cached_get_video_processor(
         model_config.model,
         revision=model_config.revision,
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 3959e9a59..21315b85f 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry
 from vllm.v1.request import Request
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, SchedulerConfig
+    from vllm.config import RendererConfig, SchedulerConfig
 
 logger = init_logger(__name__)
 
@@ -250,7 +250,7 @@ class EncoderCacheManager:
 
 
 def compute_encoder_budget(
-    model_config: "ModelConfig",
+    renderer_config: "RendererConfig",
     scheduler_config: "SchedulerConfig",
     mm_registry: MultiModalRegistry,
 ) -> tuple[int, int]:
@@ -263,9 +263,9 @@ def compute_encoder_budget(
         - Space budget for encoder cache size, measured in number of tokens
             from the input sequence.
     """
-    if mm_registry.supports_multimodal_inputs(model_config):
+    if mm_registry.supports_multimodal_inputs(renderer_config):
         max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
-            model_config
+            renderer_config
         )
 
         return compute_mm_encoder_budget(
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 0a8efa2fd..96073efc5 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface):
         # This can be changed when we make encoder cache for embedding caching
         # across requests.
         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
-            model_config=vllm_config.model_config,
+            renderer_config=vllm_config.renderer_config,
             scheduler_config=vllm_config.scheduler_config,
             mm_registry=mm_registry,
         )
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index fd7e04dc0..b76f9c059 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -91,6 +91,7 @@ class AsyncLLM(EngineClient):
         # Ensure we can serialize custom transformer configs
         maybe_register_config_serialize_by_value()
 
+        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.vllm_config = vllm_config
         self.observability_config = vllm_config.observability_config
@@ -108,15 +109,15 @@ class AsyncLLM(EngineClient):
                 "enabling logging without default stat loggers."
             )
 
-        if self.model_config.skip_tokenizer_init:
+        if self.renderer_config.skip_tokenizer_init:
             tokenizer = None
         else:
-            tokenizer = init_tokenizer_from_config(self.model_config)
+            tokenizer = init_tokenizer_from_config(self.renderer_config)
 
         self.input_processor = InputProcessor(self.vllm_config, tokenizer)
         self.io_processor = get_io_processor(
             self.vllm_config,
-            self.model_config.io_processor_plugin,
+            self.renderer_config.io_processor_plugin,
         )
 
         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index e6a94f4e3..a2f6ba5be 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -43,6 +43,7 @@ class InputProcessor:
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ) -> None:
         self.vllm_config = vllm_config
+        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
@@ -54,7 +55,7 @@ class InputProcessor:
         self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
 
         self.input_preprocessor = InputPreprocessor(
-            self.model_config,
+            self.renderer_config,
             tokenizer,
             mm_registry,
             mm_processor_cache=self.mm_processor_cache,
@@ -252,7 +253,7 @@ class InputProcessor:
         if not params.structured_outputs or not self.structured_outputs_config:
             return
 
-        if self.model_config.skip_tokenizer_init and params.structured_outputs:
+        if self.renderer_config.skip_tokenizer_init and params.structured_outputs:
             raise ValueError(
                 "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'"  # noqa: E501
             )
@@ -582,7 +583,7 @@ class InputProcessor:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    model_config,
+                    self.renderer_config,
                     tokenizer=tokenizer,
                 )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 4c3129100..ba0e1cf25 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -60,6 +60,7 @@ class LLMEngine:
     ) -> None:
         self.vllm_config = vllm_config
         self.observability_config = vllm_config.observability_config
+        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
 
@@ -83,15 +84,15 @@ class LLMEngine:
             self.dp_group = None
         self.should_execute_dummy_batch = False
 
-        if self.model_config.skip_tokenizer_init:
+        if self.renderer_config.skip_tokenizer_init:
             tokenizer = None
         else:
-            tokenizer = init_tokenizer_from_config(self.model_config)
+            tokenizer = init_tokenizer_from_config(self.renderer_config)
 
         self.input_processor = InputProcessor(self.vllm_config, tokenizer)
         self.io_processor = get_io_processor(
             self.vllm_config,
-            self.model_config.io_processor_plugin,
+            self.renderer_config.io_processor_plugin,
         )
 
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 31428db2d..797641851 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -85,7 +85,7 @@ class EagleProposer:
         # Multi-modal data support
         self.mm_registry = MULTIMODAL_REGISTRY
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            vllm_config.model_config
+            vllm_config.renderer_config
         )
 
         self.attn_metadata_builder: AttentionMetadataBuilder | None = None
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 5ee88178c..36aa3d9bb 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -63,7 +63,7 @@ class StructuredOutputManager:
             max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
             self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
 
-        if not self.vllm_config.model_config.skip_tokenizer_init:
+        if not vllm_config.renderer_config.skip_tokenizer_init:
             # The default max_workers if not specified is the number of
             # CPUs * 5, which is way too high since these tasks are CPU-bound,
             # not I/O bound. We also know we would never dominate CPU usage
@@ -71,21 +71,15 @@ class StructuredOutputManager:
             # of CPUs.
             max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
             self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = init_tokenizer_from_config(
-                model_config=self.vllm_config.model_config
-            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
+            self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config)
+            reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
             reasoning_parser_plugin = (
-                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+                vllm_config.structured_outputs_config.reasoning_parser_plugin
             )
             if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
                 ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
 
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
+            reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
             if reasoning_parser:
                 reasoner_cls = ReasoningParserManager.get_reasoning_parser(
                     reasoning_parser
@@ -93,7 +87,7 @@ class StructuredOutputManager:
                 self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
 
         self.enable_in_reasoning = (
-            self.vllm_config.structured_outputs_config.enable_in_reasoning
+            vllm_config.structured_outputs_config.enable_in_reasoning
         )
 
     def grammar_init(self, request: Request) -> None:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a50360ab0..b3c8d4da2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -271,6 +271,7 @@ class GPUModelRunner(
         device: torch.device,
     ):
         self.vllm_config = vllm_config
+        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
@@ -335,7 +336,7 @@ class GPUModelRunner(
         self.uses_mrope = model_config.uses_mrope
         self.uses_xdrope_dim = model_config.uses_xdrope_dim
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            model_config
+            self.renderer_config
         )
 
         if self.model_config.is_encoder_decoder:
@@ -558,7 +559,7 @@ class GPUModelRunner(
 
         self.mm_budget = (
             MultiModalBudget(
-                self.model_config,
+                self.renderer_config,
                 self.scheduler_config,
                 self.mm_registry,
             )
@@ -3873,7 +3874,7 @@ class GPUModelRunner(
         assert self.mm_budget is not None
 
         dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
-            model_config=self.model_config,
+            renderer_config=self.renderer_config,
             seq_len=self.max_model_len,
             mm_counts={modality: 1},
             cache=self.mm_budget.cache,
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 283f21b77..7e2a6af68 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -143,6 +143,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         original_parallel_config: ParallelConfig | None = None,
     ):
         self.vllm_config = vllm_config
+        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
@@ -222,7 +223,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            model_config
+            self.renderer_config
         )
         # TODO: Support M-RoPE (e.g, Qwen2-VL)
         assert not self.uses_mrope, "TPU does not support M-RoPE yet."
@@ -353,7 +354,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         self.mm_budget = (
             MultiModalBudget(
-                self.model_config,
+                self.renderer_config,
                 self.scheduler_config,
                 self.mm_registry,
             )
@@ -2038,7 +2039,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         assert self.mm_budget is not None
 
         dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
-            model_config=self.model_config,
+            renderer_config=self.renderer_config,
             seq_len=self.max_model_len,
             mm_counts={modality: 1},
             cache=self.mm_budget.cache,
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 0b0e2006d..44418b998 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import RendererConfig, SchedulerConfig, VllmConfig
 from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
@@ -23,24 +23,29 @@ class MultiModalBudget:
 
     def __init__(
         self,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         scheduler_config: SchedulerConfig,
         mm_registry: MultiModalRegistry,
     ) -> None:
         super().__init__()
 
-        self.model_config = model_config
+        self.renderer_config = renderer_config
+        self.model_config = renderer_config.model_config
         self.scheduler_config = scheduler_config
         self.mm_registry = mm_registry
-        self.cache = cache = processor_only_cache_from_config(model_config, mm_registry)
+        self.cache = cache = processor_only_cache_from_config(
+            renderer_config, mm_registry
+        )
 
-        self.max_model_len = model_config.max_model_len
+        self.max_model_len = self.model_config.max_model_len
         self.max_num_reqs = scheduler_config.max_num_seqs
 
-        self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache)
+        self.mm_limits = mm_registry.get_mm_limits_per_prompt(
+            renderer_config, cache=cache
+        )
 
         max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
-            model_config,
+            renderer_config,
             cache=cache,
             profiler_limits=self.mm_limits,
         )
-- 
GitLab


From e83b7e379c11bf136c1b96bc6a67b6d2207cfde4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 7 Dec 2025 16:00:22 +0800
Subject: [PATCH 172/258] Revert "[Renderer] Separate out `RendererConfig` from
 `ModelConfig` (#30145)" (#30199)

---
 docs/contributing/model/transcription.md      |  12 +-
 .../distributed/test_sequence_parallelism.py  |   2 -
 tests/compile/test_functionalization.py       |   6 +-
 tests/compile/test_fusion.py                  |   6 +-
 tests/compile/test_fusion_attn.py             |   2 -
 tests/compile/test_pass_manager.py            |   8 +-
 tests/compile/test_qk_norm_rope_fusion.py     |   5 +-
 tests/distributed/test_kvlayout.py            |   3 -
 .../entrypoints/openai/test_chat_template.py  |  22 +-
 .../entrypoints/openai/test_lora_resolvers.py |  21 +-
 tests/entrypoints/openai/test_serving_chat.py |  28 +--
 .../entrypoints/openai/test_serving_engine.py |   8 +-
 .../entrypoints/openai/test_serving_models.py |   8 +-
 tests/entrypoints/test_chat_utils.py          | 194 +++++++++++-------
 tests/lora/test_lora_manager.py               |  14 +-
 tests/lora/test_worker.py                     |   2 -
 .../test_model_load_with_params.py            |  22 +-
 tests/models/language/pooling/test_gritlm.py  |   5 +-
 .../multimodal/processing/test_common.py      |  22 +-
 .../multimodal/processing/test_glm4_1v.py     |   4 +-
 .../multimodal/processing/test_h2ovl.py       |   2 +-
 .../multimodal/processing/test_idefics3.py    |   2 +-
 .../multimodal/processing/test_internvl.py    |   2 +-
 .../multimodal/processing/test_llama4.py      |   2 +-
 .../multimodal/processing/test_llava_next.py  |   6 +-
 .../processing/test_llava_onevision.py        |   6 +-
 .../processing/test_minimax_vl_01.py          |   4 +-
 .../multimodal/processing/test_mllama4.py     |   2 +-
 .../multimodal/processing/test_nemotron_vl.py |   2 +-
 .../multimodal/processing/test_phi3v.py       |   2 +-
 .../multimodal/processing/test_phi4mm.py      |   2 +-
 .../multimodal/processing/test_qwen2_vl.py    |   2 +-
 .../multimodal/processing/test_smolvlm.py     |   2 +-
 .../processing/test_tensor_schema.py          |  24 ++-
 .../processing/test_transformers.py           |   5 +-
 tests/models/multimodal/test_mapping.py       |  33 ++-
 tests/models/registry.py                      |  33 +--
 tests/models/utils.py                         |  17 +-
 tests/multimodal/test_cache.py                |  27 +--
 tests/multimodal/test_processing.py           |  24 +--
 tests/multimodal/test_registry.py             |   4 +-
 tests/test_config.py                          | 131 +++++-------
 tests/test_inputs.py                          |   7 +-
 tests/v1/attention/utils.py                   |   2 -
 tests/v1/core/test_kv_cache_utils.py          |  20 +-
 tests/v1/core/test_scheduler.py               |   2 -
 tests/v1/core/utils.py                        |   2 -
 tests/v1/engine/test_engine_core.py           |   2 -
 .../engine/test_process_multi_modal_uuids.py  |  24 +--
 tests/v1/kv_connector/unit/utils.py           |   2 -
 tests/v1/spec_decode/test_eagle.py            |   2 -
 tests/v1/spec_decode/test_mtp.py              |   2 -
 tests/v1/spec_decode/test_ngram.py            |   2 -
 .../test_backend_guidance.py                  |  12 +-
 .../test_reasoning_structured_output.py       |  35 ++--
 tests/v1/tpu/worker/test_tpu_model_runner.py  |   2 -
 tests/v1/worker/test_gpu_model_runner.py      |   3 -
 vllm/config/__init__.py                       |   3 -
 vllm/config/model.py                          | 141 ++++++++++---
 vllm/config/multimodal.py                     |   4 +
 vllm/config/renderer.py                       | 109 ----------
 vllm/config/speculative.py                    |   5 +
 vllm/config/vllm.py                           |  25 +--
 vllm/engine/arg_utils.py                      |  99 ++++-----
 vllm/engine/protocol.py                       |   3 +-
 vllm/entrypoints/chat_utils.py                |  79 +++----
 vllm/entrypoints/llm.py                       |  14 +-
 vllm/entrypoints/openai/api_server.py         |   2 +-
 vllm/entrypoints/openai/serving_completion.py |   2 +-
 vllm/entrypoints/openai/serving_engine.py     |  11 +-
 vllm/entrypoints/openai/serving_models.py     |   1 -
 vllm/entrypoints/openai/speech_to_text.py     |  10 +-
 vllm/entrypoints/pooling/pooling/serving.py   |   2 +-
 vllm/entrypoints/pooling/score/serving.py     |   4 +-
 vllm/entrypoints/score_utils.py               |  13 +-
 vllm/entrypoints/utils.py                     |   8 +-
 vllm/inputs/preprocess.py                     |   9 +-
 vllm/model_executor/models/adapters.py        |  20 +-
 vllm/model_executor/models/deepseek_ocr.py    |   4 +-
 vllm/model_executor/models/deepseek_vl2.py    |   4 +-
 vllm/model_executor/models/gemma3n_mm.py      |   8 +-
 vllm/model_executor/models/granite_speech.py  |  14 +-
 vllm/model_executor/models/gritlm.py          |  14 +-
 vllm/model_executor/models/interfaces.py      |  10 +-
 vllm/model_executor/models/interns1.py        |   2 +-
 .../model_executor/models/nano_nemotron_vl.py |  13 +-
 vllm/model_executor/models/nemotron_vl.py     |   2 +-
 vllm/model_executor/models/pixtral.py         |   2 +-
 vllm/model_executor/models/voxtral.py         |  22 +-
 vllm/model_executor/models/whisper.py         |  14 +-
 vllm/multimodal/cache.py                      |  22 +-
 vllm/multimodal/processing.py                 |  28 +--
 vllm/multimodal/registry.py                   |  64 +++---
 vllm/tokenizers/registry.py                   |  24 +--
 vllm/transformers_utils/processor.py          |  28 +--
 vllm/v1/core/encoder_cache_manager.py         |   8 +-
 vllm/v1/core/sched/scheduler.py               |   2 +-
 vllm/v1/engine/async_llm.py                   |   7 +-
 vllm/v1/engine/input_processor.py             |   7 +-
 vllm/v1/engine/llm_engine.py                  |   7 +-
 vllm/v1/spec_decode/eagle.py                  |   2 +-
 vllm/v1/structured_output/__init__.py         |  18 +-
 vllm/v1/worker/gpu_model_runner.py            |   7 +-
 vllm/v1/worker/tpu_model_runner.py            |   7 +-
 vllm/v1/worker/utils.py                       |  19 +-
 105 files changed, 799 insertions(+), 971 deletions(-)
 delete mode 100644 vllm/config/renderer.py

diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index c56057890..fca941acd 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -22,7 +22,7 @@ Declare supported languages and capabilities:
     import torch
     from torch import nn
 
-    from vllm.config import RendererConfig, SpeechToTextConfig
+    from vllm.config import ModelConfig, SpeechToTextConfig
     from vllm.inputs.data import PromptType
     from vllm.model_executor.models.interfaces import SupportsTranscription
     
@@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model:
         @classmethod
         def get_speech_to_text_config(
             cls,
-            renderer_config: RendererConfig,
+            model_config: ModelConfig,
             task_type: Literal["transcribe", "translate"],
         ) -> SpeechToTextConfig:
             return SpeechToTextConfig(
@@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
             cls,
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
-            renderer_config: RendererConfig,
+            model_config: ModelConfig,
             language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
@@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
             cls,
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
-            renderer_config: RendererConfig,
+            model_config: ModelConfig,
             language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
@@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
             cls,
             audio_duration_s: float,
             stt_config: SpeechToTextConfig,
-            renderer_config: RendererConfig,
+            model_config: ModelConfig,
         ) -> int | None:
             # Return None if unknown; otherwise return an estimate.
             return int(audio_duration_s * stt_config.sample_rate // 320)  # example
@@ -216,7 +216,7 @@ Relevant server logic:
             prompt = self.model_cls.get_generation_prompt(
                 audio=chunk,
                 stt_config=self.asr_config,
-                renderer_config=self.renderer_config,
+                model_config=self.model_config,
                 language=language,
                 task_type=self.task_type,
                 request_prompt=request.prompt,
diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py
index 77d3a24d4..d9fdc3acc 100644
--- a/tests/compile/distributed/test_sequence_parallelism.py
+++ b/tests/compile/distributed/test_sequence_parallelism.py
@@ -17,7 +17,6 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     PassConfig,
-    RendererConfig,
     VllmConfig,
     get_current_vllm_config,
     set_current_vllm_config,
@@ -277,7 +276,6 @@ def sequence_parallelism_pass_on_test_model(
 
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         device_config=device_config,
         compilation_config=compilation_config,
     )
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index 52d6fd1e5..758591589 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -15,7 +15,6 @@ from vllm.config import (
     CompilationConfig,
     ModelConfig,
     PassConfig,
-    RendererConfig,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -220,11 +219,8 @@ def test_fix_functionalization(
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
 
-    model_config = ModelConfig(dtype=dtype)
-
     vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
+        model_config=ModelConfig(dtype=dtype),
         compilation_config=CompilationConfig(
             custom_ops=["all"],
             pass_config=PassConfig(
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index bb4ee6b8e..d0ba8385f 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -15,7 +15,6 @@ from vllm.config import (
     CompilationMode,
     ModelConfig,
     PassConfig,
-    RendererConfig,
     VllmConfig,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -155,11 +154,8 @@ def test_fusion_rmsnorm_quant(
         custom_ops.append("+rms_norm")
     if enable_quant_fp8_custom_op:
         custom_ops.append("+quant_fp8")
-
-    model_config = ModelConfig(dtype=dtype)
     vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
+        model_config=ModelConfig(dtype=dtype),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops,
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index f87825db2..db95dff5e 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -24,7 +24,6 @@ from vllm.config import (
     CompilationMode,
     ModelConfig,
     PassConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     set_current_vllm_config,
@@ -326,7 +325,6 @@ def test_attention_quant_pattern(
     )
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         scheduler_config=SchedulerConfig(
             max_num_seqs=1024,
             max_model_len=model_config.max_model_len,
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index c95e9e3ff..6d0ba6b65 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
-from vllm.config import ModelConfig, RendererConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 
 
 # dummy custom pass that doesn't inherit
@@ -43,11 +43,7 @@ class ProperPass(InductorPass):
 )
 def test_pass_manager_uuid(callable):
     # Some passes need dtype to be set
-    model_config = ModelConfig(dtype=torch.bfloat16)
-    config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
-    )
+    config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
 
     pass_manager = PostGradPassManager()
     pass_manager.configure(config)
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index 4d109015b..e0968ac79 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -19,7 +19,6 @@ from vllm.config import (
     CompilationMode,
     ModelConfig,
     PassConfig,
-    RendererConfig,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -134,10 +133,8 @@ def test_qk_norm_rope_fusion(
     if enable_rope_custom_op:
         custom_ops.append("+rotary_embedding")
 
-    model_config = ModelConfig(dtype=dtype)
     vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
+        model_config=ModelConfig(dtype=dtype),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops,
diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py
index 0d51a51a5..b190b2820 100644
--- a/tests/distributed/test_kvlayout.py
+++ b/tests/distributed/test_kvlayout.py
@@ -5,7 +5,6 @@ from vllm.config import (
     DeviceConfig,
     KVTransferConfig,
     ModelConfig,
-    RendererConfig,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -48,7 +47,6 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
     vllm_config = VllmConfig(
         device_config=DeviceConfig("cpu"),
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         kv_transfer_config=kv_transfer_config,
     )
     with set_current_vllm_config(vllm_config):
@@ -72,7 +70,6 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
     vllm_config = VllmConfig(
         device_config=DeviceConfig("cpu"),
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         kv_transfer_config=kv_transfer_config,
     )
     with set_current_vllm_config(vllm_config):
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index b050cfdb5..77087ac21 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.tokenizers import get_tokenizer
@@ -106,11 +107,24 @@ def test_get_gen_prompt(
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    renderer_config = model_info.build_renderer_config(model)
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        revision=model_info.revision,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype,
+    )
 
+    # Initialize the tokenizer
     tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        trust_remote_code=renderer_config.trust_remote_code,
+        tokenizer_name=model_config.tokenizer,
+        trust_remote_code=model_config.trust_remote_code,
     )
     template_content = load_chat_template(chat_template=template)
 
@@ -129,7 +143,7 @@ def test_get_gen_prompt(
         tokenizer=tokenizer,
         conversation=mock_request.messages,
         chat_template=mock_request.chat_template or template_content,
-        renderer_config=renderer_config,
+        model_config=model_config,
         tools=None,
         add_generation_prompt=mock_request.add_generation_prompt,
         continue_final_message=mock_request.continue_final_message,
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index 7310c2610..ea6b3d812 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -33,34 +33,26 @@ class MockModelConfig:
     """Minimal mock ModelConfig for testing."""
 
     model: str = MODEL_NAME
+    tokenizer: str = MODEL_NAME
     trust_remote_code: bool = False
+    tokenizer_mode: str = "auto"
     max_model_len: int = 100
+    tokenizer_revision: str | None = None
     multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
     hf_config: MockHFConfig = field(default_factory=MockHFConfig)
     logits_processors: list[str] | None = None
     logits_processor_pattern: str | None = None
     diff_sampling_param: dict | None = None
+    allowed_local_media_path: str = ""
+    allowed_media_domains: list[str] | None = None
     encoder_config = None
     generation_config: str = "auto"
+    skip_tokenizer_init: bool = False
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
 
 
-@dataclass
-class MockRendererConfig:
-    """Minimal mock RendererConfig for testing."""
-
-    model_config: MockModelConfig
-
-    tokenizer: str = MODEL_NAME
-    tokenizer_mode: str = "auto"
-    tokenizer_revision: str | None = None
-    skip_tokenizer_init: bool = False
-    allowed_local_media_path: str = ""
-    allowed_media_domains: list[str] | None = None
-
-
 class MockLoRAResolver(LoRAResolver):
     async def resolve_lora(
         self, base_model_name: str, lora_name: str
@@ -122,7 +114,6 @@ def mock_serving_setup():
     mock_engine.add_lora.reset_mock()
 
     mock_engine.model_config = MockModelConfig()
-    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 9df8f886e..9ea65f9fa 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -346,33 +346,27 @@ class MockHFConfig:
 class MockModelConfig:
     task = "generate"
     runner_type = "generate"
+    tokenizer = MODEL_NAME
     trust_remote_code = False
+    tokenizer_mode = "auto"
     max_model_len = 100
+    tokenizer_revision = None
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
     logits_processors: list[str] | None = None
     logits_processor_pattern = None
     diff_sampling_param: dict | None = None
+    allowed_local_media_path: str = ""
+    allowed_media_domains: list[str] | None = None
     encoder_config = None
     generation_config: str = "auto"
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    skip_tokenizer_init = False
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
 
 
-@dataclass
-class MockRendererConfig:
-    model_config: MockModelConfig = field(default_factory=MockModelConfig)
-
-    tokenizer = MODEL_NAME
-    tokenizer_mode = "auto"
-    tokenizer_revision = None
-    skip_tokenizer_init = False
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    allowed_local_media_path: str = ""
-    allowed_media_domains: list[str] | None = None
-
-
 def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
     models = OpenAIServingModels(
         engine_client=engine,
@@ -405,7 +399,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
 @dataclass
 class MockEngine:
     model_config: MockModelConfig = field(default_factory=MockModelConfig)
-    renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig)
     input_processor: MagicMock = field(default_factory=MagicMock)
     io_processor: MagicMock = field(default_factory=MagicMock)
 
@@ -436,7 +429,6 @@ async def test_serving_chat_returns_correct_model_name():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = MockModelConfig()
-    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -467,7 +459,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = MockModelConfig()
-    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -501,7 +492,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
-    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -547,7 +537,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
-    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -594,7 +583,6 @@ async def test_serving_chat_could_load_correct_generation_config():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
-    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -641,7 +629,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = mock_model_config
-    mock_engine.renderer_config = MockRendererConfig(mock_model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
@@ -675,7 +662,6 @@ async def test_serving_chat_data_parallel_rank_extraction():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = MockModelConfig()
-    mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
     mock_engine.input_processor = MagicMock()
     mock_engine.io_processor = MagicMock()
 
diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py
index 6ab0942b5..956a06dc5 100644
--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -7,7 +7,7 @@ from unittest.mock import Mock
 
 import pytest
 
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.tokenizers import MistralTokenizer
@@ -19,16 +19,10 @@ def serving() -> OpenAIServing:
 
     # Create minimal mocks
     engine_client = Mock()
-
     model_config = Mock(spec=ModelConfig)
     model_config.max_model_len = 32768
-
-    renderer_config = Mock(spec=RendererConfig)
-    renderer_config.model_config = model_config
-
     models = Mock(spec=OpenAIServingModels)
     models.model_config = model_config
-    models.renderer_config = renderer_config
     models.input_processor = Mock()
     models.io_processor = Mock()
 
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 376df6cfe..b585835a0 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -6,7 +6,7 @@ from unittest.mock import MagicMock
 
 import pytest
 
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (
     ErrorResponse,
@@ -27,15 +27,9 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
 async def _async_serving_models_init() -> OpenAIServingModels:
     mock_engine_client = MagicMock(spec=EngineClient)
     # Set the max_model_len attribute to avoid missing attribute
-
     mock_model_config = MagicMock(spec=ModelConfig)
     mock_model_config.max_model_len = 2048
-
-    mock_renderer_config = MagicMock(spec=RendererConfig)
-    mock_renderer_config.model_config = mock_model_config
-
     mock_engine_client.model_config = mock_model_config
-    mock_engine_client.renderer_config = mock_renderer_config
     mock_engine_client.input_processor = MagicMock()
     mock_engine_client.io_processor = MagicMock()
 
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 7b296eae7..527322c71 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (
     _try_extract_ast,
     apply_mistral_chat_template,
@@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system(
                 "content": [{"type": "text", "text": "Who are you?"}],
             },
         ],
-        RendererConfig(model_config=mistral_model_config),
+        mistral_model_config,
         content_format="string",
     )
     assert conversation == [
@@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system(
                 "content": [{"type": "text", "text": "Who are you?"}],
             },
         ],
-        RendererConfig(model_config=mistral_model_config),
+        mistral_model_config,
         content_format="openai",
     )
     assert conversation == [
@@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config_image_embeds),
+        phi3v_model_config_image_embeds,
         content_format="string",
     )
 
@@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
                 ],
             }
         ],
-        RendererConfig(model_config=audio_embeds_model_config),
+        audio_embeds_model_config,
         content_format="string",
     )
 
@@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
                 ],
             }
         ],
-        RendererConfig(model_config=audio_embeds_model_config),
+        audio_embeds_model_config,
         content_format="string",
     )
 
@@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async(
                 ],
             }
         ],
-        RendererConfig(model_config=audio_embeds_model_config),
+        audio_embeds_model_config,
         content_format="string",
     )
 
@@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config_image_embeds),
+        phi3v_model_config_image_embeds,
         content_format="string",
     )
 
@@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
     assert conversation == [
@@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
                 ],
             },
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
                 ],
             },
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format(
             {"role": "assistant", "content": "Some stuff."},
             {"role": "user", "content": "What about this one?"},
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="openai",
     )
 
@@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
                         ],
                     }
                 ],
-                RendererConfig(model_config=phi3v_model_config),
+                phi3v_model_config,
                 content_format="string",
             )
 
@@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
                         ],
                     },
                 ],
-                RendererConfig(model_config=phi3v_model_config),
+                phi3v_model_config,
                 content_format="string",
             )
 
@@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config),
+        phi3v_model_config,
         content_format="string",
     )
 
@@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
+        phi3v_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
+        phi3v_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
                 ],
             }
         ],
-        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
+        phi3v_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
                 ],
             },
         ],
-        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
+        phi3v_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
                 ],
             },
         ],
-        RendererConfig(model_config=phi3v_model_config_mm_interleaved),
+        phi3v_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
                 ],
             },
         ],
-        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
+        qwen25omni_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
                 ],
             },
         ],
-        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
+        qwen25omni_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
                 ],
             },
         ],
-        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
+        qwen25omni_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
                 ],
             },
         ],
-        RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
+        qwen25omni_model_config_mm_interleaved,
         content_format="string",
     )
 
@@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
                     ],
                 }
             ],
-            RendererConfig(model_config=phi3v_model_config_mm_interleaved),
+            phi3v_model_config_mm_interleaved,
             content_format="string",
         )
 
@@ -1945,11 +1945,24 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    renderer_config = model_info.build_renderer_config(model)
-
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype,
+    )
+
+    # Build the tokenizer
     tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        trust_remote_code=renderer_config.trust_remote_code,
+        model,
+        trust_remote_code=model_config.trust_remote_code,
     )
 
     tools = (
@@ -1972,7 +1985,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
         tokenizer,
         chat_template=None,
         tools=tools,
-        model_config=renderer_config.model_config,
+        model_config=model_config,
     )
     assert isinstance(chat_template, str)
 
@@ -2034,11 +2047,24 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
         "enable_thinking": True,
     }
 
-    renderer_config = model_info.build_renderer_config(model)
-
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype,
+    )
+
+    # Build the tokenizer
     tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        trust_remote_code=renderer_config.trust_remote_code,
+        model,
+        trust_remote_code=model_config.trust_remote_code,
     )
 
     # Test detecting the tokenizer's chat_template
@@ -2046,7 +2072,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
         tokenizer,
         chat_template=None,
         tools=tools,
-        model_config=renderer_config.model_config,
+        model_config=model_config,
     )
     with pytest.raises(
         ValueError, match="Found unexpected chat template kwargs from request"
@@ -2117,11 +2143,23 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    renderer_config = model_info.build_renderer_config(model)
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype,
+    )
 
     tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        trust_remote_code=renderer_config.trust_remote_code,
+        model,
+        trust_remote_code=model_config.trust_remote_code,
     )
 
     # Test detecting the tokenizer's chat_template
@@ -2129,7 +2167,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         tokenizer,
         chat_template=None,
         tools=None,
-        model_config=renderer_config.model_config,
+        model_config=model_config,
     )
     assert isinstance(chat_template, str)
 
@@ -2143,7 +2181,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         None,
         "auto",
         tokenizer,
-        renderer_config=renderer_config,
+        model_config=model_config,
     )
 
     assert resolved_format == expected_format
@@ -2165,11 +2203,23 @@ def test_resolve_content_format_fallbacks(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
 
-    renderer_config = model_info.build_renderer_config(model)
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype,
+    )
 
     tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        trust_remote_code=renderer_config.trust_remote_code,
+        model_config.tokenizer,
+        trust_remote_code=model_config.trust_remote_code,
     )
 
     # Test detecting the tokenizer's chat_template
@@ -2177,7 +2227,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
         tokenizer,
         chat_template=None,
         tools=None,
-        model_config=renderer_config.model_config,
+        model_config=model_config,
     )
     assert isinstance(chat_template, str)
 
@@ -2191,7 +2241,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
         None,
         "auto",
         tokenizer,
-        renderer_config=renderer_config,
+        model_config=model_config,
     )
 
     assert resolved_format == expected_format
@@ -2222,13 +2272,15 @@ def test_resolve_content_format_fallbacks(model, expected_format):
     ],
 )
 def test_resolve_content_format_examples(template_path, expected_format):
-    model = PHI3V_MODEL_ID  # Dummy
-    model_config = ModelConfig(model, trust_remote_code=True)
-    renderer_config = RendererConfig(model_config=model_config, tokenizer=model)
+    model_config = ModelConfig(
+        PHI3V_MODEL_ID,  # Dummy
+        tokenizer=PHI3V_MODEL_ID,  # Dummy
+        trust_remote_code=True,
+    )
 
     dummy_tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        trust_remote_code=renderer_config.trust_remote_code,
+        PHI3V_MODEL_ID,  # Dummy
+        trust_remote_code=model_config.trust_remote_code,
     )
     dummy_tokenizer.chat_template = None
 
@@ -2245,7 +2297,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
         None,
         "auto",
         dummy_tokenizer,
-        renderer_config=renderer_config,
+        model_config=model_config,
     )
 
     assert resolved_format == expected_format
@@ -2280,7 +2332,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
 
     conversation_with_thinking, _, _ = parse_chat_messages(
         messages,
-        RendererConfig(model_config=mistral_model_config),
+        mistral_model_config,
         content_format="openai",
     )
 
@@ -2380,7 +2432,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
                 ],
             }
         ],
-        RendererConfig(model_config=qwen2_audio_model_config),
+        qwen2_audio_model_config,
         content_format="string",
     )
 
@@ -2414,7 +2466,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
                 ],
             }
         ],
-        RendererConfig(model_config=qwen2_audio_model_config),
+        qwen2_audio_model_config,
         content_format="string",
     )
 
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 7158120fc..081f14d6f 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -8,7 +8,7 @@ import torch
 from safetensors.torch import load_file
 from torch import nn
 
-from vllm.config import ModelConfig, RendererConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.lora.layers import (
     ColumnParallelLinearWithLoRA,
@@ -422,11 +422,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
     )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
-        lora_config=lora_config,
-    )
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
 
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
@@ -529,11 +525,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
     )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
-        lora_config=lora_config,
-    )
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
 
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 42d8c6202..54059ec56 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -11,7 +11,6 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     ParallelConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -44,7 +43,6 @@ def test_worker_apply_lora(qwen3_lora_files):
 
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         load_config=LoadConfig(
             download_dir=None,
             load_format="dummy",
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index e36867107..489ac1e64 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -42,10 +42,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
             "Write a short story about a robot that dreams for the first time.\n"
         )
 
-        llm_engine = vllm_model.llm.llm_engine
-        model_config = llm_engine.model_config
-        renderer_config = llm_engine.renderer_config
-        tokenizer = llm_engine.tokenizer
+        model_config = vllm_model.llm.llm_engine.model_config
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer
 
         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -56,8 +54,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
         assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
-        assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5"
-        assert tokenizer.model_max_length == 512
+        assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
+        assert model_tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, BertEmbeddingModel)
@@ -88,10 +86,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
             "Write a short story about a robot that dreams for the first time.\n"
         )
 
-        llm_engine = vllm_model.llm.llm_engine
-        model_config = llm_engine.model_config
-        renderer_config = llm_engine.renderer_config
-        tokenizer = llm_engine.tokenizer
+        model_config = vllm_model.llm.llm_engine.model_config
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer
 
         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -102,8 +98,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
         assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
-        assert renderer_config.tokenizer == "intfloat/multilingual-e5-base"
-        assert tokenizer.model_max_length == 512
+        assert model_config.tokenizer == "intfloat/multilingual-e5-base"
+        assert model_tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, RobertaEmbeddingModel)
@@ -132,7 +128,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
             "Write a short story about a robot that dreams for the first time.\n"
         )
 
-        assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name
+        assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name
 
         def check_model(model):
             assert isinstance(model, RobertaEmbeddingModel)
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index 11ee00358..0adc9b5cf 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -6,7 +6,7 @@ import pytest
 from scipy.spatial.distance import cosine
 
 from vllm import LLM, SamplingParams
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 
 from ....utils import RemoteOpenAIServer
 
@@ -31,8 +31,7 @@ def test_find_array():
         dtype="bfloat16",
         seed=0,
     )
-    renderer_config = RendererConfig(model_config=model_config)
-    pooling = GritLMMeanPool(renderer_config=renderer_config)
+    pooling = GritLMMeanPool(model_config=model_config)
 
     arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
 
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 9b2b29b75..2e032ac4c 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -25,6 +25,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC
 from vllm.tokenizers import (
     MistralTokenizer,
     TokenizerLike,
+    cached_tokenizer_from_config,
 )
 
 from ....multimodal.utils import random_audio, random_image, random_video
@@ -211,20 +212,31 @@ def _test_processing_correctness(
     else:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
         model_id = model_id_or_arch
-
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    renderer_config = model_info.build_renderer_config(
-        model=model_id,
+    model_config = ModelConfig(
+        model_id,
+        tokenizer=model_info.tokenizer or model_id,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
         # Ensure that the cache can fit all of the data
         mm_processor_cache_gb=2048,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype,
     )
-    model_config = renderer_config.model_config
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = model_cls._processor_factory
-    ctx = InputProcessingContext.from_config(renderer_config)
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_tokenizer_from_config(model_config),
+    )
     cache = MultiModalProcessorOnlyCache(model_config)
 
     processing_info = factories.info(ctx)
diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index fdc6352e2..51071c935 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -40,7 +40,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {"fps": fps}
 
@@ -79,7 +79,7 @@ def test_video_loader_consistency(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {"fps": fps}
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index 1263d663e..1701d9dd8 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -162,7 +162,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     min_num = min_dynamic_patch if dynamic_image_size else 1
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index bf12e79a7..351b9d018 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index 51f0d2e89..b4994295d 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -116,7 +116,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     min_num = min_dynamic_patch if dynamic_image_size else 1
diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py
index 04bc8d3f5..b73246b68 100644
--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -30,7 +30,7 @@ def test_processor_override(
         limit_mm_per_prompt={"image": num_imgs},
         mm_processor_cache_gb=mm_processor_cache_gb,
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     config = processor.info.get_hf_config()
     tokenizer = processor.info.get_tokenizer()
     hf_processor = processor.info.get_hf_processor()
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index cd01002a3..ffe7ca17b 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     info = processor.info
 
     seen_aspect_ratios = set[float]()
@@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
     image_ratios = [
         (171, 152),
@@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index be505d95a..f5c552fe6 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     info = processor.info
 
     seen_aspect_ratios = set[float]()
@@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
     image_ratios = [
         (171, 152),
@@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py
index 17ac54fdd..11e000123 100644
--- a/tests/models/multimodal/processing/test_minimax_vl_01.py
+++ b/tests/models/multimodal/processing/test_minimax_vl_01.py
@@ -24,7 +24,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     prompt = "<image>" * num_imgs
     image = Image.new("RGB", size=(364, 364))
     mm_data = {"image": [image] * num_imgs}
@@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
     image_ratios = [
         (171, 152),
diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index 9a65e2ddc..e5ff2d139 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int):
         limit_mm_per_prompt=mm_counts,
     )
 
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     profiler = MultiModalProfiler(processor)
 
     decoder_dummy_data = profiler.get_decoder_dummy_data(
diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py
index f3609743b..5311ab1b7 100644
--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@@ -118,7 +118,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     min_num = min_dynamic_patch if dynamic_image_size else 1
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index f51bd9786..8faff2611 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py
index 271357b0d..5391555c2 100644
--- a/tests/models/multimodal/processing/test_phi4mm.py
+++ b/tests/models/multimodal/processing/test_phi4mm.py
@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index d65a270a7..9f4cdb678 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -34,7 +34,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py
index e0e6264de..6f77d5516 100644
--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
 
     # Build the image str / prompt based on the number of images we pass
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 24959fa48..5d489549c 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -11,7 +11,7 @@ import pytest
 import torch.nn as nn
 from PIL import Image
 
-from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config
+from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
 from vllm.config.multimodal import (
     AudioDummyOptions,
     BaseDummyOptions,
@@ -31,6 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.platforms import current_platform
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_dtype
 
@@ -149,10 +150,7 @@ def initialize_dummy_model(
         backend="nccl",
     )
     initialize_model_parallel(tensor_model_parallel_size=1)
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
-    )
+    vllm_config = VllmConfig(model_config=model_config)
     with set_current_vllm_config(vllm_config=vllm_config):
         with set_default_torch_dtype(model_config.dtype):
             model = model_cls(vllm_config=vllm_config)
@@ -184,12 +182,19 @@ def test_model_tensor_schema(model_id: str):
     else:
         dtype = model_info.dtype
 
-    renderer_config = model_info.build_renderer_config(
+    model_config = ModelConfig(
         model_id,
+        tokenizer=model_info.tokenizer or model_id,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
         hf_overrides=hf_overrides_fn,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
         dtype=dtype,
     )
-    model_config = renderer_config.model_config
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     assert supports_multimodal(model_cls)
@@ -207,7 +212,10 @@ def test_model_tensor_schema(model_id: str):
     if not any(inputs_parse_methods):
         pytest.skip(f"{model_arch} does not support tensor schema validation.")
 
-    ctx = InputProcessingContext.from_config(renderer_config)
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_tokenizer_from_config(model_config),
+    )
     processing_info = factories.info(ctx)
     supported_mm_limits = processing_info.get_supported_mm_limits()
     limit_mm_per_prompt = {
diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py
index c9a90eb88..e2a2186f4 100644
--- a/tests/models/multimodal/processing/test_transformers.py
+++ b/tests/models/multimodal/processing/test_transformers.py
@@ -3,7 +3,7 @@
 import pytest
 
 from vllm.assets.image import ImageAsset
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
@@ -13,9 +13,8 @@ def test_multimodal_processor(model_id):
         model=model_id,
         model_impl="transformers",
     )
-    renderer_config = RendererConfig(model_config=model_config)
 
-    mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
 
     image_pil = ImageAsset("cherry_blossom").pil_image
     mm_data = {"image": image_pil}
diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py
index 73de6b5f7..0d2eaca95 100644
--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
@@ -7,6 +7,7 @@ import torch
 import transformers
 from transformers import AutoConfig, PreTrainedModel
 
+from vllm.config import ModelConfig
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.transformers_utils.config import try_get_safetensors_metadata
@@ -49,11 +50,37 @@ def test_hf_model_weights_mapper(model_arch: str):
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    model_config = model_info.build_model_config(config_format="hf")
+    is_mistral_model = model_arch in [
+        "Mistral3ForConditionalGeneration",
+        "PixtralForConditionalGeneration",
+        "VoxtralForConditionalGeneration",
+    ]
+
+    if not is_mistral_model or model_info.tokenizer_mode == "mistral":
+        tokenizer_mode = model_info.tokenizer_mode
+    else:
+        tokenizer_mode = "hf"
+
+    model_id = model_info.default
+
+    model_config = ModelConfig(
+        model_id,
+        tokenizer=model_info.tokenizer or model_id,
+        tokenizer_mode=tokenizer_mode,
+        config_format="hf",
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype,
+    )
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    original_weights = create_repo_dummy_weights(model_config.model)
-    hf_dummy_model = create_dummy_model(model_config.model, model_arch)
+    original_weights = create_repo_dummy_weights(model_id)
+    hf_dummy_model = create_dummy_model(model_id, model_arch)
     hf_converted_weights = hf_dummy_model.named_parameters()
     hf_converted_buffers = hf_dummy_model.named_buffers()
     mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
diff --git a/tests/models/registry.py b/tests/models/registry.py
index e2cb5bcbc..020cb7493 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -9,8 +9,7 @@ import pytest
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION
 
-from vllm.config.model import ModelConfig, ModelDType
-from vllm.config.renderer import RendererConfig, TokenizerMode
+from vllm.config.model import ModelDType, TokenizerMode
 
 
 @dataclass(frozen=True)
@@ -171,36 +170,6 @@ class _HfExamplesInfo:
             else:
                 pytest.skip(msg)
 
-    def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig:
-        if model is None:
-            model = self.default
-
-        return ModelConfig(
-            **{
-                "model": model,
-                "revision": self.revision,
-                "trust_remote_code": self.trust_remote_code,
-                "hf_overrides": self.hf_overrides,
-                "enable_prompt_embeds": self.require_embed_inputs,
-                "enable_mm_embeds": self.require_embed_inputs,
-                "enforce_eager": self.enforce_eager,
-                "dtype": self.dtype,
-                **kwargs,
-            }
-        )
-
-    def build_renderer_config(
-        self, model: str | None = None, **kwargs
-    ) -> RendererConfig:
-        model_config = self.build_model_config(model, **kwargs)
-
-        return RendererConfig(
-            model_config=model_config,
-            tokenizer=self.tokenizer or model_config.model,
-            tokenizer_mode=self.tokenizer_mode,
-            skip_tokenizer_init=self.require_embed_inputs,
-        )
-
 
 _TEXT_GENERATION_EXAMPLE_MODELS = {
     # [Decoder-only]
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 87292cc45..d84b4b820 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -13,6 +13,7 @@ from transformers import PretrainedConfig
 from vllm.config.model import ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
+from vllm.tokenizers import cached_tokenizer_from_config
 
 from .. import ci_envs
 from .registry import HF_EXAMPLE_MODELS
@@ -295,18 +296,30 @@ def build_model_context(
 
     model_config_kwargs = model_config_kwargs or {}
     limit_mm_per_prompt = limit_mm_per_prompt or {}
-    renderer_config = model_info.build_renderer_config(
+    model_config = ModelConfig(
         model_id,
         runner=runner,
+        tokenizer=model_info.tokenizer or model_id,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
         dtype=dtype,
         seed=0,
         mm_processor_kwargs=mm_processor_kwargs,
         limit_mm_per_prompt=limit_mm_per_prompt,
         mm_processor_cache_gb=mm_processor_cache_gb,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.require_embed_inputs,
+        enable_prompt_embeds=model_info.require_embed_inputs,
+        enable_mm_embeds=model_info.require_embed_inputs,
+        enforce_eager=model_info.enforce_eager,
         **model_config_kwargs,
     )
 
-    return InputProcessingContext.from_config(renderer_config)
+    return InputProcessingContext(
+        model_config,
+        tokenizer=cached_tokenizer_from_config(model_config),
+    )
 
 
 def check_embeddings_close(
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index ce16d9013..e641b1111 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -6,7 +6,7 @@ import numpy as np
 import pytest
 import torch
 
-from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig
+from vllm.config import ModelConfig, ParallelConfig, VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import (
     BaseMultiModalProcessorCache,
@@ -110,14 +110,11 @@ def _create_vllm_config(
     mm_processor_cache_gb: float,
     enable_ipc: bool,
 ):
-    model_config = ModelConfig(
-        model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-        mm_processor_cache_gb=mm_processor_cache_gb,
-    )
-
     return VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
+        model_config=ModelConfig(
+            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+            mm_processor_cache_gb=mm_processor_cache_gb,
+        ),
         parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
     )
 
@@ -509,15 +506,13 @@ def _run_test_cache_eviction_shm(
 
 
 def test_cache_eviction_shm_cache():
-    model_config = ModelConfig(
-        model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-        mm_processor_cache_type="shm",
-        mm_shm_cache_max_object_size_mb=6,
-        mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
-    )
     vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
+        model_config=ModelConfig(
+            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+            mm_processor_cache_type="shm",
+            mm_shm_cache_max_object_size_mb=6,
+            mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
+        ),
     )
     sender_cache = ShmObjectStoreSenderCache(vllm_config)
     receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index adff57252..262ea42e4 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -7,7 +7,7 @@ from contextlib import nullcontext
 import numpy as np
 import pytest
 
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.processing import (
     InputProcessingContext,
@@ -920,9 +920,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
         model=model_id,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    renderer_config = RendererConfig(model_config=model_config)
 
-    processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
     processor._supported_mm_limits = {"image": num_supported}
 
     profiler = MultiModalProfiler(processor)
@@ -956,9 +955,8 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
         model=model_id,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    renderer_config = RendererConfig(model_config=model_config)
 
-    processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
 
     rng = np.random.RandomState(0)
     image = random_image(rng, min_wh=128, max_wh=256)
@@ -1014,13 +1012,11 @@ def test_hf_processor_init_kwargs(
     inference_kwargs,
     expected_kwargs,
 ):
-    model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
-    renderer_config = RendererConfig(
-        model_config=model_config,
-        tokenizer=model_id,
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=None,
     )
 
-    ctx = InputProcessingContext.from_config(renderer_config)
     processor = ctx.get_hf_processor(
         DummyProcessor,  # type: ignore[arg-type]
         **inference_kwargs,
@@ -1049,13 +1045,11 @@ def test_hf_processor_call_kwargs(
     inference_kwargs,
     expected_kwargs,
 ):
-    model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
-    renderer_config = RendererConfig(
-        model_config=model_config,
-        tokenizer=model_id,
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=None,
     )
 
-    ctx = InputProcessingContext.from_config(renderer_config)
     processor = ctx.get_hf_processor(DummyProcessor)  # type: ignore[arg-type]
 
     result = ctx.call_hf_processor(processor, {}, inference_kwargs)
diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py
index 8127fac09..3b01bda7f 100644
--- a/tests/multimodal/test_registry.py
+++ b/tests/multimodal/test_registry.py
@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
         model_id,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    assert (
-        MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected
-    )
+    assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected
diff --git a/tests/test_config.py b/tests/test_config.py
index 7464fcd1e..203447cd5 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -13,7 +13,6 @@ from vllm.config import (
     CompilationConfig,
     ModelConfig,
     PoolerConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     update_config,
@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
         ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
     ],
 )
-def test_recalculate_max_model_len(
+def test_get_and_verify_max_len(
     model_id, max_model_len, expected_max_len, should_raise
 ):
-    """Test recalculate_max_model_len with different configurations."""
+    """Test get_and_verify_max_len with different configurations."""
     model_config = ModelConfig(model_id)
 
     if should_raise:
         with pytest.raises(ValueError):
-            model_config.recalculate_max_model_len(
-                max_model_len,
-                tokenizer=model_id,
-                tokenizer_revision=None,
-            )
+            model_config.get_and_verify_max_len(max_model_len)
     else:
-        model_config.recalculate_max_model_len(
-            max_model_len,
-            tokenizer=model_id,
-            tokenizer_revision=None,
-        )
-        assert model_config.max_model_len == expected_max_len
+        actual_max_len = model_config.get_and_verify_max_len(max_model_len)
+        assert actual_max_len == expected_max_len
 
 
-class MockModelConfig:
-    """Simple mock object for testing maybe_pull_model_for_runai"""
+class MockConfig:
+    """Simple mock object for testing maybe_pull_model_tokenizer_for_runai"""
 
-    def __init__(self, model: str):
+    def __init__(self, model: str, tokenizer: str):
         self.model = model
-
-
-class MockRendererConfig:
-    """Simple mock object for testing maybe_pull_tokenizer_for_runai"""
-
-    def __init__(self, model_config: MockModelConfig):
-        self.model_config = model_config
-        self.tokenizer = model_config.model
+        self.tokenizer = tokenizer
+        self.model_weights = None
 
 
 @pytest.mark.parametrize(
@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
     mock_pull_files.return_value = None
 
     # Create first mock and run the method
-    model_config1 = MockModelConfig(model=s3_url)
-    renderer_config1 = MockRendererConfig(model_config=model_config1)
-    ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url)
-    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url)
+    config1 = MockConfig(model=s3_url, tokenizer=s3_url)
+    ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
 
     # Check that model and tokenizer point to existing directories
-    assert os.path.exists(model_config1.model), (
-        f"Model directory does not exist: {model_config1.model}"
+    assert os.path.exists(config1.model), (
+        f"Model directory does not exist: {config1.model}"
     )
-    assert os.path.isdir(model_config1.model), (
-        f"Model path is not a directory: {model_config1.model}"
+    assert os.path.isdir(config1.model), (
+        f"Model path is not a directory: {config1.model}"
     )
-    assert os.path.exists(renderer_config1.tokenizer), (
-        f"Tokenizer directory does not exist: {renderer_config1.tokenizer}"
+    assert os.path.exists(config1.tokenizer), (
+        f"Tokenizer directory does not exist: {config1.tokenizer}"
     )
-    assert os.path.isdir(renderer_config1.tokenizer), (
-        f"Tokenizer path is not a directory: {renderer_config1.tokenizer}"
+    assert os.path.isdir(config1.tokenizer), (
+        f"Tokenizer path is not a directory: {config1.tokenizer}"
     )
 
     # Verify that the paths are different from the original S3 URL
-    assert model_config1.model != s3_url, (
-        "Model path should be converted to local directory"
-    )
-    assert renderer_config1.tokenizer != s3_url, (
+    assert config1.model != s3_url, "Model path should be converted to local directory"
+    assert config1.tokenizer != s3_url, (
         "Tokenizer path should be converted to local directory"
     )
 
     # Store the original paths
-    created_model_dir = model_config1.model
-    create_tokenizer_dir = renderer_config1.tokenizer
+    created_model_dir = config1.model
+    create_tokenizer_dir = config1.tokenizer
 
     # Create a new mock and run the method with the same S3 URL
-    model_config2 = MockModelConfig(model=s3_url)
-    renderer_config2 = MockRendererConfig(model_config=model_config2)
-    ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url)
-    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url)
+    config2 = MockConfig(model=s3_url, tokenizer=s3_url)
+    ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
 
     # Check that the new directories exist
-    assert os.path.exists(model_config2.model), (
-        f"Model directory does not exist: {model_config2.model}"
+    assert os.path.exists(config2.model), (
+        f"Model directory does not exist: {config2.model}"
     )
-    assert os.path.isdir(model_config2.model), (
-        f"Model path is not a directory: {model_config2.model}"
+    assert os.path.isdir(config2.model), (
+        f"Model path is not a directory: {config2.model}"
     )
-    assert os.path.exists(renderer_config2.tokenizer), (
-        f"Tokenizer directory does not exist: {renderer_config2.tokenizer}"
+    assert os.path.exists(config2.tokenizer), (
+        f"Tokenizer directory does not exist: {config2.tokenizer}"
     )
-    assert os.path.isdir(renderer_config2.tokenizer), (
-        f"Tokenizer path is not a directory: {renderer_config2.tokenizer}"
+    assert os.path.isdir(config2.tokenizer), (
+        f"Tokenizer path is not a directory: {config2.tokenizer}"
     )
 
     # Verify that the paths are deterministic (same as before)
-    assert model_config2.model == created_model_dir, (
+    assert config2.model == created_model_dir, (
         f"Model paths are not deterministic. "
-        f"Original: {created_model_dir}, New: {model_config2.model}"
+        f"Original: {created_model_dir}, New: {config2.model}"
     )
-    assert renderer_config2.tokenizer == create_tokenizer_dir, (
+    assert config2.tokenizer == create_tokenizer_dir, (
         f"Tokenizer paths are not deterministic. "
-        f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}"
+        f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
     )
 
 
@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
     s3_url2 = "s3://example-bucket-2/model/"
 
     # Create mocks with different S3 URLs and run the method
-    model_config1 = MockModelConfig(model=s3_url1)
-    renderer_config1 = MockRendererConfig(model_config=model_config1)
-    ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1)
-    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1)
+    config1 = MockConfig(model=s3_url1, tokenizer=s3_url1)
+    ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1)
 
-    model_config2 = MockModelConfig(model=s3_url2)
-    renderer_config2 = MockRendererConfig(model_config=model_config2)
-    ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2)
-    RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2)
+    config2 = MockConfig(model=s3_url2, tokenizer=s3_url2)
+    ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2)
 
     # Verify that different URLs produce different directories
-    assert model_config1.model != model_config2.model, (
+    assert config1.model != config2.model, (
         f"Different S3 URLs should create different model directories. "
-        f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}"
+        f"URL1 model: {config1.model}, URL2 model: {config2.model}"
     )
-    assert renderer_config1.tokenizer != renderer_config2.tokenizer, (
+    assert config1.tokenizer != config2.tokenizer, (
         f"Different S3 URLs should create different tokenizer directories. "
-        f"URL1 tokenizer: {renderer_config1.tokenizer}, "
-        f"URL2 tokenizer: {renderer_config2.tokenizer}"
+        f"URL1 tokenizer: {config1.tokenizer}, "
+        f"URL2 tokenizer: {config2.tokenizer}"
     )
 
     # Verify that both sets of directories exist
-    assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model)
-    assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir(
-        renderer_config1.tokenizer
-    )
-    assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model)
-    assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir(
-        renderer_config2.tokenizer
-    )
+    assert os.path.exists(config1.model) and os.path.isdir(config1.model)
+    assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
+    assert os.path.exists(config2.model) and os.path.isdir(config2.model)
+    assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index 48fd076ab..c4339827d 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.inputs import zip_enc_dec_prompts
 from vllm.inputs.parse import parse_raw_prompts
 from vllm.inputs.preprocess import InputPreprocessor
@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
 )
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
-    renderer_config = RendererConfig(model_config=model_config)
-    tokenizer = init_tokenizer_from_config(renderer_config)
-    input_preprocessor = InputPreprocessor(renderer_config, tokenizer)
+    tokenizer = init_tokenizer_from_config(model_config)
+    input_preprocessor = InputPreprocessor(model_config, tokenizer)
 
     # HF processor adds sep token
     sep_token_id = tokenizer.vocab[tokenizer.sep_token]
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 49307e3e5..6cab129c1 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -16,7 +16,6 @@ from vllm.config import (
     LoadConfig,
     ModelConfig,
     ParallelConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -217,7 +216,6 @@ def create_vllm_config(
 
     return VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         parallel_config=parallel_config,
         scheduler_config=scheduler_config,
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 4a414bca5..fd5cf6d3e 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -8,7 +8,7 @@ import pytest
 import torch
 
 import vllm.v1.core.kv_cache_utils as kv_cache_utils
-from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
+from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import (
     MultiModalFeatureSpec,
@@ -667,10 +667,7 @@ def test_metrics_empty_stats():
 
 def test_get_kv_cache_configs_multiple_workers():
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
-    )
+    vllm_config = VllmConfig(model_config=model_config)
 
     ref_kv_cache_spec = new_kv_cache_spec()
     same_kv_cache_specs = [
@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
 
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         scheduler_config=scheduler_config,
     )
 
@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
 
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         scheduler_config=scheduler_config,
     )
 
@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
 def test_get_kv_cache_config_one_worker():
     # pass max_model_len to pass check_enough_kv_cache_memory
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
-    )
+    vllm_config = VllmConfig(model_config=model_config)
 
     mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
     # all layers are full attention -> single group
@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
 
 def test_get_kv_cache_configs_attention_free():
     kv_cache_specs: dict[str, KVCacheSpec] = {}
-    model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
-    )
+    vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
     kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
     assert kv_cache_configs == [
         KVCacheConfig(
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 1505415a6..c6c4a5085 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -11,7 +11,6 @@ from vllm.config import (
     ECTransferConfig,
     KVTransferConfig,
     ModelConfig,
-    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
     vllm_config = VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         speculative_config=speculative_config,
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index 086885c29..f5ba613d3 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -9,7 +9,6 @@ from vllm.config import (
     ECTransferConfig,
     KVTransferConfig,
     ModelConfig,
-    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -133,7 +132,6 @@ def create_scheduler(
     vllm_config = VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         speculative_config=speculative_config,
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index c606100a1..48be8c15a 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -15,7 +15,6 @@ from vllm.config import (
     ECTransferConfig,
     KVTransferConfig,
     ModelConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
 
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
         kv_transfer_config=kv_transfer_config,
diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py
index 85fab3a85..1b11b8af4 100644
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -5,14 +5,7 @@ import pytest
 
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import (
-    CacheConfig,
-    DeviceConfig,
-    ModelConfig,
-    MultiModalConfig,
-    RendererConfig,
-    VllmConfig,
-)
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import input_processor as input_processor_mod
 from vllm.v1.engine.input_processor import InputProcessor
@@ -51,21 +44,22 @@ def _mock_input_processor(
     monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
 
     model_config = ModelConfig(
+        skip_tokenizer_init=True,
         max_model_len=128,
         mm_processor_cache_gb=mm_cache_gb,
         generation_config="vllm",
-    )
-    model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
-
-    renderer_config = RendererConfig(
-        model_config=model_config,
         tokenizer="dummy",
-        skip_tokenizer_init=True,
     )
 
+    # Minimal multimodal_config to satisfy references in
+    # Processor.process_inputs.
+    class _MockMMConfig:
+        def __init__(self, gb: float):
+            self.mm_processor_cache_gb = gb
+
+    model_config.multimodal_config = _MockMMConfig(mm_cache_gb)  # type: ignore[attr-defined]
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=renderer_config,
         cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
         device_config=DeviceConfig(device="cpu"),
     )
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 768b338b5..58f1a7282 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -15,7 +15,6 @@ from vllm.config import (
     DeviceConfig,
     KVTransferConfig,
     ModelConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
 )
@@ -128,7 +127,6 @@ def create_vllm_config(
     return VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         device_config=DeviceConfig("cpu"),
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 888ea0169..616e57de3 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -19,7 +19,6 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     ParallelConfig,
-    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -62,7 +61,6 @@ def _create_proposer(
 
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=CacheConfig(),
         speculative_config=speculative_config,
         device_config=DeviceConfig(device=current_platform.device_type),
diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py
index 4483c8243..3b8813ceb 100644
--- a/tests/v1/spec_decode/test_mtp.py
+++ b/tests/v1/spec_decode/test_mtp.py
@@ -18,7 +18,6 @@ from vllm.config import (
     DeviceConfig,
     ModelConfig,
     ParallelConfig,
-    RendererConfig,
     SchedulerConfig,
     SpeculativeConfig,
     VllmConfig,
@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
 
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=CacheConfig(),
         speculative_config=speculative_config,
         device_config=DeviceConfig(device=current_platform.device_type),
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index 2e365e08a..6bc412abe 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -4,7 +4,6 @@ import numpy as np
 
 from vllm.config import (
     ModelConfig,
-    RendererConfig,
     SpeculativeConfig,
     VllmConfig,
 )
@@ -70,7 +69,6 @@ def test_ngram_proposer():
         return NgramProposer(
             vllm_config=VllmConfig(
                 model_config=model_config,
-                renderer_config=RendererConfig(model_config=model_config),
                 speculative_config=SpeculativeConfig(
                     prompt_lookup_min=min_n,
                     prompt_lookup_max=max_n,
diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py
index baef2459f..4c01560fc 100644
--- a/tests/v1/structured_output/test_backend_guidance.py
+++ b/tests/v1/structured_output/test_backend_guidance.py
@@ -6,7 +6,7 @@ from concurrent.futures import Future
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig
+from vllm.config import StructuredOutputsConfig, VllmConfig
 from vllm.config.model import ModelConfig
 from vllm.config.parallel import ParallelConfig
 from vllm.config.speculative import SpeculativeConfig
@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
 def test_grammar_bitmask_with_specdec():
     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
     prompt = tokenizer.encode('{"a": "b"}')
-
-    model_config = ModelConfig(tokenizer=TOKENIZER)
     vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
+        model_config=ModelConfig(tokenizer=TOKENIZER),
         structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
         speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
     )
@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
 
     # Use "external_launcher" for sync mode, None for async mode
     executor_backend = None if async_grammar else "external_launcher"
-
-    model_config = ModelConfig(tokenizer=TOKENIZER)
     vllm_config = VllmConfig(
-        model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
+        model_config=ModelConfig(tokenizer=TOKENIZER),
         structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
         parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
     )
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
index 5901d38d1..70047a993 100644
--- a/tests/v1/structured_output/test_reasoning_structured_output.py
+++ b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -7,7 +7,7 @@ from unittest.mock import Mock
 
 import pytest
 
-from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
+from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
 from vllm.reasoning import ReasoningParser
 from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager
@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
     """Test reasoning-aware structured output functionality."""
 
     @pytest.fixture
-    def mock_renderer_config(self):
-        """Create a mock RendererConfig."""
-        renderer_config = Mock(spec=RendererConfig)
-        renderer_config.skip_tokenizer_init = (
-            True  # Skip tokenizer init to avoid network calls
-        )
-
-        model_config = Mock(spec=ModelConfig)
-        model_config.get_vocab_size = Mock(return_value=50000)
-        model_config.trust_remote_code = False
+    def mock_model_config(self):
+        """Create a mock ModelConfig."""
+        config = Mock(spec=ModelConfig)
+        config.skip_tokenizer_init = True  # Skip tokenizer init to avoid network calls
+        config.get_vocab_size = Mock(return_value=50000)
         # Add missing runner_type attribute that tokenizer initialization expects
-        model_config.runner_type = "generate"
-        renderer_config.model_config = model_config
-
+        config.runner_type = "generate"
         # Add other attributes that tokenizer initialization might need
-        renderer_config.tokenizer = "test-tokenizer"
-        renderer_config.tokenizer_mode = "auto"
-        renderer_config.tokenizer_revision = None
-
-        return renderer_config
+        config.tokenizer = "test-tokenizer"
+        config.tokenizer_mode = "auto"
+        config.trust_remote_code = False
+        config.tokenizer_revision = None
+        return config
 
     @pytest.fixture
     def mock_scheduler_config(self):
@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
         return config
 
     @pytest.fixture
-    def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config):
+    def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
         """Create a mock VllmConfig."""
         config = Mock(spec=VllmConfig)
-        config.renderer_config = mock_renderer_config
+        config.model_config = mock_model_config
         config.scheduler_config = mock_scheduler_config
         config.structured_outputs_config = Mock()
         config.structured_outputs_config.reasoning_parser = None
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 080d23863..cfc06666e 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
 from vllm.config import (
     CacheConfig,
     ModelConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     set_current_vllm_config,
@@ -46,7 +45,6 @@ def get_vllm_config():
     )
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
     )
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 464e3ab99..7b8c4268a 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -13,7 +13,6 @@ from vllm.config import (
     CacheConfig,
     ModelConfig,
     ParallelConfig,
-    RendererConfig,
     SchedulerConfig,
     VllmConfig,
     set_current_vllm_config,
@@ -102,7 +101,6 @@ def get_vllm_config():
     parallel_config = ParallelConfig()
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
     attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
     vllm_config = VllmConfig(
         model_config=model_config,
-        renderer_config=RendererConfig(model_config=model_config),
         cache_config=cache_config,
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index a4f9fd8d2..0f84f3ca9 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
 from vllm.config.observability import ObservabilityConfig
 from vllm.config.parallel import EPLBConfig, ParallelConfig
 from vllm.config.pooler import PoolerConfig
-from vllm.config.renderer import RendererConfig
 from vllm.config.scheduler import SchedulerConfig
 from vllm.config.speculative import SpeculativeConfig
 from vllm.config.speech_to_text import SpeechToTextConfig
@@ -82,8 +81,6 @@ __all__ = [
     "ParallelConfig",
     # From vllm.config.pooler
     "PoolerConfig",
-    # From vllm.config.renderer
-    "RendererConfig",
     # From vllm.config.scheduler
     "SchedulerConfig",
     # From vllm.config.speculative
diff --git a/vllm/config/model.py b/vllm/config/model.py
index b0d4fb8e0..509a9c5e1 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
     uses_xdrope_dim,
 )
 from vllm.transformers_utils.gguf_utils import (
+    is_gguf,
     is_remote_gguf,
     maybe_patch_hf_config_from_gguf,
     split_remote_gguf,
@@ -82,6 +83,7 @@ TaskOption = Literal[
     "transcription",
     "draft",
 ]
+TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
     "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -129,6 +131,18 @@ class ModelConfig:
 
     Note that the model may support other tasks using the same model runner.
     """
+    tokenizer: SkipValidation[str] = None  # type: ignore
+    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
+    name or path will be used."""
+    tokenizer_mode: TokenizerMode | str = "auto"
+    """Tokenizer mode:\n
+    - "auto" will use the tokenizer from `mistral_common` for Mistral models
+    if available, otherwise it will use the "hf" tokenizer.\n
+    - "hf" will use the fast tokenizer if available.\n
+    - "slow" will always use the slow tokenizer.\n
+    - "mistral" will always use the tokenizer from `mistral_common`.\n
+    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
+    - Other custom values can be supported via plugins."""
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
     and tokenizer."""
@@ -154,6 +168,13 @@ class ModelConfig:
     hf_config_path: str | None = None
     """Name or path of the Hugging Face config to use. If unspecified, model
     name or path will be used."""
+    allowed_local_media_path: str = ""
+    """Allowing API requests to read local images or videos from directories
+    specified by the server file system. This is a security risk. Should only
+    be enabled in trusted environments."""
+    allowed_media_domains: list[str] | None = None
+    """If set, only media URLs that belong to this domain can be used for
+    multi-modal inputs. """
     revision: str | None = None
     """The specific model version to use. It can be a branch name, a tag name,
     or a commit id. If unspecified, will use the default version."""
@@ -161,6 +182,10 @@ class ModelConfig:
     """The specific revision to use for the model code on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
+    tokenizer_revision: str | None = None
+    """The specific revision to use for the tokenizer on the Hugging Face Hub.
+    It can be a branch name, a tag name, or a commit id. If unspecified, will
+    use the default version."""
     max_model_len: SkipValidation[int] = None  # type: ignore
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
@@ -205,6 +230,10 @@ class ModelConfig:
     preventing potential numerical issues. Note that even if this is set to
     False, cascade attention will be only used when the heuristic tells that
     it's beneficial."""
+    skip_tokenizer_init: bool = False
+    """Skip initialization of tokenizer and detokenizer. Expects valid
+    `prompt_token_ids` and `None` for prompt from the input. The generated
+    output will contain token ids."""
     enable_prompt_embeds: bool = False
     """If `True`, enables passing text embeddings as inputs via the
     `prompt_embeds` key.
@@ -265,6 +294,8 @@ class ModelConfig:
     logits_processors: list[str | type[LogitsProcessor]] | None = None
     """One or more logits processors' fully-qualified class names or class
     definitions"""
+    io_processor_plugin: str | None = None
+    """IOProcessor plugin name to load at model startup"""
 
     # Pooler config
     pooler_config: PoolerConfig | None = None
@@ -277,6 +308,7 @@ class ModelConfig:
     from the architecture of `self.model`."""
     limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
     enable_mm_embeds: InitVar[bool | None] = None
+    media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
     mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
     mm_processor_cache_gb: InitVar[float | None] = None
     mm_processor_cache_type: InitVar[MMCacheType | None] = None
@@ -303,12 +335,18 @@ class ModelConfig:
             "runner",
             "convert",
             "task",
+            "tokenizer",
+            "tokenizer_mode",
             "seed",
             "hf_config_path",
+            "allowed_local_media_path",
+            "allowed_media_domains",
+            "tokenizer_revision",
             "spec_target_max_model_len",
             "enforce_eager",
             "logprobs_mode",
             "disable_cascade_attn",
+            "skip_tokenizer_init",
             "served_model_name",
             "config_format",
             "hf_token",
@@ -316,9 +354,11 @@ class ModelConfig:
             "logits_processor_pattern",
             "override_attention_dtype",
             "logits_processors",
+            "io_processor_plugin",
             "pooler_config",
             "multimodal_config",
             "limit_mm_per_prompt",
+            "media_io_kwargs",
             "mm_processor_kwargs",
             "mm_processor_cache_gb",
             "mm_processor_cache_type",
@@ -383,6 +423,7 @@ class ModelConfig:
         # Multimodal config init vars
         limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
         enable_mm_embeds: bool | None,
+        media_io_kwargs: dict[str, dict[str, Any]] | None,
         mm_processor_kwargs: dict[str, Any] | None,
         mm_processor_cache_gb: float | None,
         mm_processor_cache_type: MMCacheType | None,
@@ -397,8 +438,13 @@ class ModelConfig:
         self.served_model_name = get_served_model_name(
             self.model, self.served_model_name
         )
-        self.original_model = self.model
-        self.model = maybe_model_redirect(self.original_model)
+        self.model = maybe_model_redirect(self.model)
+        # The tokenizer is consistent with the model by default.
+        if self.tokenizer is None:
+            self.tokenizer = self.model
+        if self.tokenizer_revision is None:
+            self.tokenizer_revision = self.revision
+        self.tokenizer = maybe_model_redirect(self.tokenizer)
 
         if isinstance(self.hf_config_path, str):
             self.hf_config_path = maybe_model_redirect(self.hf_config_path)
@@ -419,7 +465,7 @@ class ModelConfig:
                     hf_overrides_kw[key] = value
             hf_overrides_fn = None
 
-        self.maybe_pull_model_for_runai(self.model)
+        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
 
         from vllm.platforms import current_platform
 
@@ -602,8 +648,7 @@ class ModelConfig:
         )
 
         self.original_max_model_len = self.max_model_len
-        self.recalculate_max_model_len(self.original_max_model_len)
-
+        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
         # Init multimodal config if needed
         if self._model_info.supports_multimodal:
             if (
@@ -619,6 +664,7 @@ class ModelConfig:
             mm_config_kwargs = dict(
                 limit_per_prompt=limit_mm_per_prompt,
                 enable_mm_embeds=enable_mm_embeds,
+                media_io_kwargs=media_io_kwargs,
                 mm_processor_kwargs=mm_processor_kwargs,
                 mm_processor_cache_gb=mm_processor_cache_gb,
                 mm_processor_cache_type=mm_processor_cache_type,
@@ -636,8 +682,16 @@ class ModelConfig:
 
             self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
 
+        # Multimodal GGUF models must use original repo for mm processing
+        if is_gguf(self.tokenizer) and self.is_multimodal_model:
+            raise ValueError(
+                "Loading a multimodal GGUF model needs to use original "
+                "tokenizer. Please specify the unquantized hf model's "
+                "repo name or path using the --tokenizer argument."
+            )
+
         if self.disable_sliding_window:
-            # Set after recalculate_max_model_len to ensure that max_model_len
+            # Set after get_and_verify_max_len to ensure that max_model_len
             # can be correctly capped to sliding window size
             self.hf_text_config.sliding_window = None
 
@@ -661,9 +715,10 @@ class ModelConfig:
 
     @model_validator(mode="after")
     def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
+        if not isinstance(self.tokenizer, str):
+            raise ValueError("tokenizer must be a string after __post_init__.")
         if not isinstance(self.max_model_len, int):
             raise ValueError("max_model_len must be an integer after __post_init__.")
-
         return self
 
     def _get_transformers_backend_cls(self) -> str:
@@ -712,17 +767,49 @@ class ModelConfig:
         """The architecture vllm actually used."""
         return self._architecture
 
-    def maybe_pull_model_for_runai(self, model: str) -> None:
-        """Pull model from Object Storage to temporary directory when needed."""
-        if not is_runai_obj_uri(model):
+    def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
+        """Pull model/tokenizer from Object Storage to temporary
+        directory when needed.
+
+        Args:
+            model: Model name or path
+            tokenizer: Tokenizer name or path
+        """
+
+        if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
             return
 
-        object_storage_model = ObjectStorageModel(url=model)
-        object_storage_model.pull_files(
-            model, allow_pattern=["*.model", "*.py", "*.json"]
-        )
-        self.model_weights = model
-        self.model = object_storage_model.dir
+        if is_runai_obj_uri(model):
+            object_storage_model = ObjectStorageModel(url=model)
+            object_storage_model.pull_files(
+                model, allow_pattern=["*.model", "*.py", "*.json"]
+            )
+            self.model_weights = model
+            self.model = object_storage_model.dir
+
+            # If tokenizer is same as model, download to same directory
+            if model == tokenizer:
+                object_storage_model.pull_files(
+                    model,
+                    ignore_pattern=[
+                        "*.pt",
+                        "*.safetensors",
+                        "*.bin",
+                        "*.tensors",
+                        "*.pth",
+                    ],
+                )
+                self.tokenizer = object_storage_model.dir
+                return
+
+        # Only download tokenizer if needed and not already handled
+        if is_runai_obj_uri(tokenizer):
+            object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
+            object_storage_tokenizer.pull_files(
+                model,
+                ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
+            )
+            self.tokenizer = object_storage_tokenizer.dir
 
     def _get_encoder_config(self):
         model = self.model
@@ -1625,38 +1712,30 @@ class ModelConfig:
             return dense_modules[-1]["out_features"]
         return self.get_hidden_size()
 
-    def recalculate_max_model_len(
-        self,
-        original_max_model_len: int | None,
-        *,
-        tokenizer: str | None = None,
-        tokenizer_revision: str | None = None,
-    ) -> None:
+    def get_and_verify_max_len(self, max_model_len: int):
         # Consider max_model_len in tokenizer_config only when
         # pooling models use absolute position_embedding.
-        # NOTE: For simplicity we assume `args.model == args.tokenizer`
-        # since this is
         tokenizer_config = None
         if (
             self.runner_type == "pooling"
             and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
         ):
             tokenizer_config = try_get_tokenizer_config(
-                tokenizer or self.model,
+                self.tokenizer,
                 trust_remote_code=self.trust_remote_code,
-                revision=tokenizer_revision or self.revision,
+                revision=self.tokenizer_revision,
             )
-
-        self.max_model_len = _get_and_verify_max_len(
+        max_model_len = _get_and_verify_max_len(
             hf_config=self.hf_text_config,
             tokenizer_config=tokenizer_config,
-            max_model_len=original_max_model_len,
+            max_model_len=max_model_len,
             disable_sliding_window=self.disable_sliding_window,
             sliding_window=self.get_sliding_window(),
             spec_target_max_model_len=self.spec_target_max_model_len,
             encoder_config=self.encoder_config,
         )
-        logger.info("Using max model len %s", self.max_model_len)
+        logger.info("Using max model len %s", max_model_len)
+        return max_model_len
 
     @property
     def attn_type(self) -> AttnTypeStr:
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index 37e2f6b4d..8a2936de9 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -79,6 +79,10 @@ class MultiModalConfig:
 
     WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
     Only enable this flag for trusted users!"""
+    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
+    """Additional args passed to process media inputs, keyed by modalities.
+    For example, to set num_frames for video, set
+    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
     mm_processor_kwargs: dict[str, object] | None = None
     """Arguments to be forwarded to the model's processor for multi-modal data,
     e.g., image processor. Overrides for the multi-modal processor obtained
diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py
deleted file mode 100644
index 36a922b93..000000000
--- a/vllm/config/renderer.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Literal
-
-from pydantic import Field, SkipValidation
-from pydantic.dataclasses import dataclass
-
-from vllm.config.model import ModelConfig
-from vllm.config.utils import config
-from vllm.transformers_utils.gguf_utils import is_gguf
-from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
-from vllm.transformers_utils.utils import maybe_model_redirect
-
-TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
-
-
-@config
-@dataclass
-class RendererConfig:
-    """Configuration for the renderer."""
-
-    # NOTE: In reality, this is a required argument.
-    # We provide a dummy default value here to generate the CLI args.
-    model_config: SkipValidation[ModelConfig] = None  # type: ignore
-    """Provides model context to the renderer."""
-
-    tokenizer: str = ""
-    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
-    name or path will be used."""
-    tokenizer_mode: TokenizerMode | str = "auto"
-    """Tokenizer mode:\n
-    - "auto" will use the tokenizer from `mistral_common` for Mistral models
-    if available, otherwise it will use the "hf" tokenizer.\n
-    - "hf" will use the fast tokenizer if available.\n
-    - "slow" will always use the slow tokenizer.\n
-    - "mistral" will always use the tokenizer from `mistral_common`.\n
-    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
-    - Other custom values can be supported via plugins."""
-    tokenizer_revision: str | None = None
-    """The specific revision to use for the tokenizer on the Hugging Face Hub.
-    It can be a branch name, a tag name, or a commit id. If unspecified, will
-    use the default version."""
-    skip_tokenizer_init: bool = False
-    """Skip initialization of tokenizer and detokenizer. Expects valid
-    `prompt_token_ids` and `None` for prompt from the input. The generated
-    output will contain token ids."""
-
-    io_processor_plugin: str | None = None
-    """IOProcessor plugin name to load at model startup."""
-
-    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
-    """Additional args passed to process media inputs, keyed by modalities.
-    For example, to set num_frames for video, set
-    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
-    allowed_local_media_path: str = ""
-    """Allowing API requests to read local images or videos from directories
-    specified by the server file system. This is a security risk. Should only
-    be enabled in trusted environments."""
-    allowed_media_domains: list[str] | None = None
-    """If set, only media URLs that belong to this domain can be used for
-    multi-modal inputs. """
-
-    @property
-    def trust_remote_code(self) -> bool:
-        return self.model_config.trust_remote_code
-
-    def __post_init__(self) -> None:
-        model_config = self.model_config
-
-        # The tokenizer is consistent with the model by default.
-        if not self.tokenizer:
-            self.tokenizer = (
-                ModelConfig.model
-                if model_config is None
-                else model_config.original_model
-            )
-        if not self.tokenizer_revision:
-            self.tokenizer_revision = (
-                ModelConfig.revision if model_config is None else model_config.revision
-            )
-
-        self.original_tokenizer = self.tokenizer
-        self.tokenizer = maybe_model_redirect(self.original_tokenizer)
-        self.maybe_pull_tokenizer_for_runai(self.tokenizer)
-
-        # Multimodal GGUF models must use original repo for mm processing
-        is_multimodal_model = (
-            ModelConfig.is_multimodal_model
-            if model_config is None
-            else model_config.is_multimodal_model
-        )
-        if is_gguf(self.tokenizer) and is_multimodal_model:
-            raise ValueError(
-                "Loading a multimodal GGUF model needs to use original "
-                "tokenizer. Please specify the unquantized hf model's "
-                "repo name or path using the --tokenizer argument."
-            )
-
-    def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None:
-        """Pull tokenizer from Object Storage to temporary directory when needed."""
-        if not is_runai_obj_uri(tokenizer):
-            return
-
-        object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
-        object_storage_tokenizer.pull_files(
-            tokenizer,
-            ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
-        )
-        self.tokenizer = object_storage_tokenizer.dir
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 63b63eac9..bf533bf14 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -322,11 +322,16 @@ class SpeculativeConfig:
                 self.draft_model_config = ModelConfig(
                     model=self.model,
                     runner="draft",
+                    tokenizer=self.target_model_config.tokenizer,
+                    tokenizer_mode=self.target_model_config.tokenizer_mode,
                     trust_remote_code=self.target_model_config.trust_remote_code,
+                    allowed_local_media_path=self.target_model_config.allowed_local_media_path,
+                    allowed_media_domains=self.target_model_config.allowed_media_domains,
                     dtype=self.target_model_config.dtype,
                     seed=self.target_model_config.seed,
                     revision=self.revision,
                     code_revision=self.code_revision,
+                    tokenizer_revision=self.target_model_config.tokenizer_revision,
                     spec_target_max_model_len=self.target_model_config.max_model_len,
                     quantization=self.quantization,
                     enforce_eager=self.target_model_config.enforce_eager,
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 417797c44..36e4bd159 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -39,7 +39,6 @@ from .lora import LoRAConfig
 from .model import ModelConfig
 from .observability import ObservabilityConfig
 from .parallel import ParallelConfig
-from .renderer import RendererConfig
 from .scheduler import SchedulerConfig
 from .speculative import SpeculativeConfig
 from .structured_outputs import StructuredOutputsConfig
@@ -182,8 +181,6 @@ class VllmConfig:
     # try to download a model
     model_config: ModelConfig = Field(default=None)
     """Model configuration."""
-    renderer_config: RendererConfig = Field(default_factory=RendererConfig)
-    """Renderer configuration."""
     cache_config: CacheConfig = Field(default_factory=CacheConfig)
     """Cache configuration."""
     parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
@@ -744,7 +741,7 @@ class VllmConfig:
             from vllm.multimodal import MULTIMODAL_REGISTRY
 
             self.scheduler_config.max_num_encoder_input_tokens = (
-                MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config)
+                MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
             )
             logger.debug(
                 "Encoder-decoder model detected: setting "
@@ -1189,13 +1186,11 @@ class VllmConfig:
             computed_compile_ranges_split_points
         )
 
-    def recalculate_max_model_len(self, original_max_model_len: int | None) -> None:
-        # Can only be called during try_verify_and_update_config
-        self.model_config.recalculate_max_model_len(
-            original_max_model_len,
-            tokenizer=self.renderer_config.tokenizer,
-            tokenizer_revision=self.renderer_config.tokenizer_revision,
-        )
+    def recalculate_max_model_len(self, max_model_len: int):
+        # Can only be called in try_verify_and_update_config
+        model_config = self.model_config
+        max_model_len = model_config.get_and_verify_max_len(max_model_len)
+        self.model_config.max_model_len = max_model_len
 
     def try_verify_and_update_config(self):
         if self.model_config is None:
@@ -1269,11 +1264,11 @@ class VllmConfig:
         return (
             f"model={self.model_config.model!r}, "
             f"speculative_config={self.speculative_config!r}, "
-            f"tokenizer={self.renderer_config.tokenizer!r}, "
-            f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, "
-            f"tokenizer_mode={self.renderer_config.tokenizer_mode}, "
+            f"tokenizer={self.model_config.tokenizer!r}, "
+            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
+            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
             f"revision={self.model_config.revision}, "
-            f"tokenizer_revision={self.renderer_config.tokenizer_revision}, "
+            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
             f"trust_remote_code={self.model_config.trust_remote_code}, "
             f"dtype={self.model_config.dtype}, "
             f"max_seq_len={self.model_config.max_model_len}, "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bd398abb0..ceac5407a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -71,11 +71,11 @@ from vllm.config.model import (
     ModelDType,
     RunnerOption,
     TaskOption,
+    TokenizerMode,
 )
 from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
 from vllm.config.observability import DetailedTraceModules
 from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
-from vllm.config.renderer import RendererConfig, TokenizerMode
 from vllm.config.scheduler import SchedulerPolicy
 from vllm.config.utils import get_field
 from vllm.config.vllm import OptimizationLevel
@@ -355,12 +355,17 @@ class EngineArgs:
 
     model: str = ModelConfig.model
     served_model_name: str | list[str] | None = ModelConfig.served_model_name
+    tokenizer: str | None = ModelConfig.tokenizer
     hf_config_path: str | None = ModelConfig.hf_config_path
     runner: RunnerOption = ModelConfig.runner
     convert: ConvertOption = ModelConfig.convert
     task: TaskOption | None = ModelConfig.task
+    skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
     enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
+    tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
+    allowed_local_media_path: str = ModelConfig.allowed_local_media_path
+    allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
     download_dir: str | None = LoadConfig.download_dir
     safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
     load_format: str | LoadFormats = LoadConfig.load_format
@@ -444,6 +449,7 @@ class EngineArgs:
     code_revision: str | None = ModelConfig.code_revision
     hf_token: bool | str | None = ModelConfig.hf_token
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
+    tokenizer_revision: str | None = ModelConfig.tokenizer_revision
     quantization: QuantizationMethods | None = ModelConfig.quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
@@ -452,6 +458,9 @@ class EngineArgs:
     )
     enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
     interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
+    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
+        MultiModalConfig, "media_io_kwargs"
+    )
     mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
     disable_mm_preprocessor_cache: bool = False  # DEPRECATED
     mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
@@ -465,19 +474,9 @@ class EngineArgs:
     mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
         MultiModalConfig.mm_encoder_attn_backend
     )
+    io_processor_plugin: str | None = None
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
     video_pruning_rate: float = MultiModalConfig.video_pruning_rate
-    # Renderer fields
-    tokenizer: str | None = None
-    tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode
-    tokenizer_revision: str | None = RendererConfig.tokenizer_revision
-    skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init
-    io_processor_plugin: str | None = None
-    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
-        RendererConfig, "media_io_kwargs"
-    )
-    allowed_local_media_path: str = RendererConfig.allowed_local_media_path
-    allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains
     # LoRA fields
     enable_lora: bool = False
     max_loras: int = LoRAConfig.max_loras
@@ -628,14 +627,25 @@ class EngineArgs:
         model_group.add_argument("--runner", **model_kwargs["runner"])
         model_group.add_argument("--convert", **model_kwargs["convert"])
         model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
+        model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
+        model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
         model_group.add_argument(
             "--trust-remote-code", **model_kwargs["trust_remote_code"]
         )
         model_group.add_argument("--dtype", **model_kwargs["dtype"])
         model_group.add_argument("--seed", **model_kwargs["seed"])
         model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
+        model_group.add_argument(
+            "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
+        )
+        model_group.add_argument(
+            "--allowed-media-domains", **model_kwargs["allowed_media_domains"]
+        )
         model_group.add_argument("--revision", **model_kwargs["revision"])
         model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
+        model_group.add_argument(
+            "--tokenizer-revision", **model_kwargs["tokenizer_revision"]
+        )
         model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
         model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
         model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
@@ -647,6 +657,9 @@ class EngineArgs:
         model_group.add_argument(
             "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
         )
+        model_group.add_argument(
+            "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
+        )
         model_group.add_argument(
             "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
         )
@@ -685,34 +698,8 @@ class EngineArgs:
         model_group.add_argument(
             "--logits-processors", **model_kwargs["logits_processors"]
         )
-
-        # Renderer arguments
-        renderer_kwargs = get_kwargs(RendererConfig)
-        renderer_group = parser.add_argument_group(
-            title="RendererConfig",
-            description=RendererConfig.__doc__,
-        )
-        renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"])
-        renderer_group.add_argument(
-            "--tokenizer-mode", **renderer_kwargs["tokenizer_mode"]
-        )
-        renderer_group.add_argument(
-            "--tokenizer-revision", **renderer_kwargs["tokenizer_revision"]
-        )
-        renderer_group.add_argument(
-            "--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"]
-        )
-        renderer_group.add_argument(
-            "--media-io-kwargs", **renderer_kwargs["media_io_kwargs"]
-        )
-        renderer_group.add_argument(
-            "--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"]
-        )
-        renderer_group.add_argument(
-            "--allowed-media-domains", **renderer_kwargs["allowed_media_domains"]
-        )
-        renderer_group.add_argument(
-            "--io-processor-plugin", **renderer_kwargs["io_processor_plugin"]
+        model_group.add_argument(
+            "--io-processor-plugin", **model_kwargs["io_processor_plugin"]
         )
 
         # Model loading arguments
@@ -962,6 +949,9 @@ class EngineArgs:
         multimodal_group.add_argument(
             "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
         )
+        multimodal_group.add_argument(
+            "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
+        )
         multimodal_group.add_argument(
             "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
         )
@@ -1265,13 +1255,18 @@ class EngineArgs:
             runner=self.runner,
             convert=self.convert,
             task=self.task,
+            tokenizer=self.tokenizer,
+            tokenizer_mode=self.tokenizer_mode,
             trust_remote_code=self.trust_remote_code,
+            allowed_local_media_path=self.allowed_local_media_path,
+            allowed_media_domains=self.allowed_media_domains,
             dtype=self.dtype,
             seed=self.seed,
             revision=self.revision,
             code_revision=self.code_revision,
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
+            tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
             quantization=self.quantization,
             enforce_eager=self.enforce_eager,
@@ -1279,11 +1274,13 @@ class EngineArgs:
             logprobs_mode=self.logprobs_mode,
             disable_sliding_window=self.disable_sliding_window,
             disable_cascade_attn=self.disable_cascade_attn,
+            skip_tokenizer_init=self.skip_tokenizer_init,
             enable_prompt_embeds=self.enable_prompt_embeds,
             served_model_name=self.served_model_name,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
             enable_mm_embeds=self.enable_mm_embeds,
             interleave_mm_strings=self.interleave_mm_strings,
+            media_io_kwargs=self.media_io_kwargs,
             skip_mm_profiling=self.skip_mm_profiling,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
@@ -1301,6 +1298,7 @@ class EngineArgs:
             override_attention_dtype=self.override_attention_dtype,
             logits_processors=self.logits_processors,
             video_pruning_rate=self.video_pruning_rate,
+            io_processor_plugin=self.io_processor_plugin,
         )
 
     def validate_tensorizer_args(self):
@@ -1396,25 +1394,9 @@ class EngineArgs:
             )
 
         model_config = self.create_model_config()
-        renderer_config = RendererConfig(
-            model_config=model_config,
-            tokenizer=self.tokenizer or "",
-            tokenizer_mode=self.tokenizer_mode,
-            tokenizer_revision=self.tokenizer_revision,
-            skip_tokenizer_init=self.skip_tokenizer_init,
-            io_processor_plugin=self.io_processor_plugin,
-            media_io_kwargs=self.media_io_kwargs,
-            allowed_local_media_path=self.allowed_local_media_path,
-            allowed_media_domains=self.allowed_media_domains,
-        )
-
-        model_config.recalculate_max_model_len(
-            model_config.original_max_model_len,
-            tokenizer=renderer_config.tokenizer,
-            tokenizer_revision=renderer_config.tokenizer_revision,
-        )
-
         self.model = model_config.model
+        self.tokenizer = model_config.tokenizer
+
         self._check_feature_supported(model_config)
         self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
         self._set_default_max_num_seqs_and_batched_tokens_args(
@@ -1786,7 +1768,6 @@ class EngineArgs:
             )
         config = VllmConfig(
             model_config=model_config,
-            renderer_config=renderer_config,
             cache_config=cache_config,
             parallel_config=parallel_config,
             scheduler_config=scheduler_config,
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 7b60e7f89..d94951a0c 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from typing import Any
 
-from vllm.config import ModelConfig, RendererConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.inputs.data import PromptType
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, RequestOutput
@@ -22,7 +22,6 @@ class EngineClient(ABC):
     """Protocol class for Clients to Engine"""
 
     vllm_config: VllmConfig
-    renderer_config: RendererConfig
     model_config: ModelConfig
     input_processor: InputProcessor
     io_processor: IOProcessor | None
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 5ad256c2f..aceaa8bd4 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor
 from typing_extensions import Required, TypedDict
 
 from vllm import envs
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
@@ -452,10 +452,9 @@ This is needed because `lru_cache` does not cache when an exception happens.
 
 def _try_get_processor_chat_template(
     tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
-    *,
-    trust_remote_code: bool,
+    model_config: ModelConfig,
 ) -> str | None:
-    cache_key = (tokenizer.name_or_path, trust_remote_code)
+    cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
     if cache_key in _PROCESSOR_CHAT_TEMPLATES:
         return _PROCESSOR_CHAT_TEMPLATES[cache_key]
 
@@ -467,7 +466,7 @@ def _try_get_processor_chat_template(
                 PreTrainedTokenizerFast,
                 ProcessorMixin,
             ),
-            trust_remote_code=trust_remote_code,
+            trust_remote_code=model_config.trust_remote_code,
         )
         if (
             isinstance(processor, ProcessorMixin)
@@ -500,10 +499,7 @@ def resolve_hf_chat_template(
 
     # 2nd priority: AutoProcessor chat template, unless tool calling is enabled
     if tools is None:
-        chat_template = _try_get_processor_chat_template(
-            tokenizer,
-            trust_remote_code=model_config.trust_remote_code,
-        )
+        chat_template = _try_get_processor_chat_template(tokenizer, model_config)
         if chat_template is not None:
             return chat_template
 
@@ -517,10 +513,10 @@ def resolve_hf_chat_template(
             exc_info=True,
         )
 
-    # 4th priority: Predefined fallbacks]
+    # 4th priority: Predefined fallbacks
     path = get_chat_template_fallback_path(
         model_type=model_config.hf_config.model_type,
-        tokenizer_name_or_path=tokenizer.name_or_path,
+        tokenizer_name_or_path=model_config.tokenizer,
     )
     if path is not None:
         logger.info_once(
@@ -542,14 +538,14 @@ def _resolve_chat_template_content_format(
     tools: list[dict[str, Any]] | None,
     tokenizer: TokenizerLike | None,
     *,
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
 ) -> _ChatTemplateContentFormat:
     if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
         hf_chat_template = resolve_hf_chat_template(
             tokenizer,
             chat_template=chat_template,
             tools=tools,
-            model_config=renderer_config.model_config,
+            model_config=model_config,
         )
     else:
         hf_chat_template = None
@@ -599,7 +595,7 @@ def resolve_chat_template_content_format(
     given_format: ChatTemplateContentFormatOption,
     tokenizer: TokenizerLike | None,
     *,
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
 ) -> _ChatTemplateContentFormat:
     if given_format != "auto":
         return given_format
@@ -608,7 +604,7 @@ def resolve_chat_template_content_format(
         chat_template,
         tools,
         tokenizer,
-        renderer_config=renderer_config,
+        model_config=model_config,
     )
 
     _log_chat_template_content_format(
@@ -631,32 +627,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     maximum per prompt.
     """
 
-    def __init__(self, renderer_config: RendererConfig):
+    def __init__(self, model_config: ModelConfig):
         super().__init__()
 
-        self._renderer_config = renderer_config
+        self._model_config = model_config
 
         self._items_by_modality = defaultdict[str, list[_T | None]](list)
         self._uuids_by_modality = defaultdict[str, list[str | None]](list)
 
     @property
-    def renderer_config(self) -> RendererConfig:
-        return self._renderer_config
+    def model_config(self) -> ModelConfig:
+        return self._model_config
 
     @cached_property
     def model_cls(self) -> type[SupportsMultiModal]:
         from vllm.model_executor.model_loader import get_model_cls
 
-        model_cls = get_model_cls(self.renderer_config.model_config)
+        model_cls = get_model_cls(self.model_config)
         return cast(type[SupportsMultiModal], model_cls)
 
     @property
     def allowed_local_media_path(self):
-        return self._renderer_config.allowed_local_media_path
+        return self._model_config.allowed_local_media_path
 
     @property
     def allowed_media_domains(self):
-        return self._renderer_config.allowed_media_domains
+        return self._model_config.allowed_media_domains
 
     @property
     def mm_registry(self):
@@ -664,7 +660,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
     @cached_property
     def mm_processor(self):
-        return self.mm_registry.create_processor(self.renderer_config)
+        return self.mm_registry.create_processor(self.model_config)
 
     def add(
         self,
@@ -855,20 +851,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
+        multimodal_config = self._tracker.model_config.multimodal_config
+        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
+
         self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
             envs.VLLM_MEDIA_CONNECTOR,
-            media_io_kwargs=self.renderer_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
-    @property
-    def renderer_config(self) -> RendererConfig:
-        return self._tracker.renderer_config
-
     @property
     def model_config(self) -> ModelConfig:
-        return self.renderer_config.model_config
+        return self._tracker.model_config
 
     def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image = self._connector.fetch_image(image_url) if image_url else None
@@ -968,20 +963,18 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
+        multimodal_config = self._tracker.model_config.multimodal_config
+        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
         self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
             envs.VLLM_MEDIA_CONNECTOR,
-            media_io_kwargs=self.renderer_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
-    @property
-    def renderer_config(self) -> RendererConfig:
-        return self._tracker.renderer_config
-
     @property
     def model_config(self) -> ModelConfig:
-        return self.renderer_config.model_config
+        return self._tracker.model_config
 
     def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image_coro = self._connector.fetch_image_async(image_url) if image_url else None
@@ -1611,17 +1604,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
 
 def parse_chat_messages(
     messages: list[ChatCompletionMessageParam],
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
     MultiModalDataDict | None,
     MultiModalUUIDDict | None,
 ]:
-    model_config = renderer_config.model_config
-
     conversation: list[ConversationMessage] = []
-    mm_tracker = MultiModalItemTracker(renderer_config)
+    mm_tracker = MultiModalItemTracker(model_config)
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
@@ -1644,17 +1635,15 @@ def parse_chat_messages(
 
 def parse_chat_messages_futures(
     messages: list[ChatCompletionMessageParam],
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
     Awaitable[MultiModalDataDict | None],
     MultiModalUUIDDict | None,
 ]:
-    model_config = renderer_config.model_config
-
     conversation: list[ConversationMessage] = []
-    mm_tracker = AsyncMultiModalItemTracker(renderer_config)
+    mm_tracker = AsyncMultiModalItemTracker(model_config)
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
@@ -1759,14 +1748,14 @@ def apply_hf_chat_template(
     chat_template: str | None,
     tools: list[dict[str, Any]] | None,
     *,
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
     **kwargs: Any,
 ) -> str:
     hf_chat_template = resolve_hf_chat_template(
         tokenizer,
         chat_template=chat_template,
         tools=tools,
-        model_config=renderer_config.model_config,
+        model_config=model_config,
     )
 
     if hf_chat_template is None:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 6b3cb26af..913324fd5 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -29,8 +29,8 @@ from vllm.config.model import (
     HfOverrides,
     ModelDType,
     RunnerOption,
+    TokenizerMode,
 )
-from vllm.config.renderer import TokenizerMode
 from vllm.engine.arg_utils import EngineArgs
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
@@ -343,7 +343,6 @@ class LLM:
         logger.info("Supported tasks: %s", supported_tasks)
         self.supported_tasks = supported_tasks
 
-        self.renderer_config = self.llm_engine.renderer_config
         self.model_config = self.llm_engine.model_config
         self.input_processor = self.llm_engine.input_processor
         self.io_processor = self.llm_engine.io_processor
@@ -809,13 +808,13 @@ class LLM:
             list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
 
         tokenizer = self.get_tokenizer()
-        renderer_config = self.renderer_config
+        model_config = self.model_config
         resolved_content_format = resolve_chat_template_content_format(
             chat_template,
             tools,
             chat_template_content_format,
             tokenizer,
-            renderer_config=renderer_config,
+            model_config=model_config,
         )
 
         _chat_template_kwargs: dict[str, Any] = dict(
@@ -834,7 +833,7 @@ class LLM:
             # the chat message parsing for it.
             conversation, mm_data, mm_uuids = parse_chat_messages(
                 msgs,
-                renderer_config,
+                model_config,
                 content_format=resolved_content_format,
             )
 
@@ -848,7 +847,7 @@ class LLM:
                 prompt_str = apply_hf_chat_template(
                     tokenizer=tokenizer,
                     conversation=conversation,
-                    renderer_config=renderer_config,
+                    model_config=model_config,
                     **_chat_template_kwargs,
                 )
                 # Special tokens are already included in chat templates so
@@ -1291,7 +1290,6 @@ class LLM:
         lora_request: list[LoRARequest] | LoRARequest | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
-        renderer_config = self.renderer_config
         model_config = self.model_config
 
         if isinstance(tokenizer, MistralTokenizer):
@@ -1319,7 +1317,7 @@ class LLM:
 
         for q, d in input_pairs:
             _, engine_prompt = get_score_prompt(
-                renderer_config=renderer_config,
+                model_config=model_config,
                 data_1=q,
                 data_2=d,
                 tokenizer=tokenizer,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index d77d611a2..7be601d82 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1099,7 +1099,7 @@ async def init_app_state(
     logger.info("Supported tasks: %s", supported_tasks)
 
     resolved_chat_template = await process_chat_template(
-        args.chat_template, engine_client, vllm_config.renderer_config
+        args.chat_template, engine_client, vllm_config.model_config
     )
 
     if args.tool_server == "demo":
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index a9e72fb00..3e421e21e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            if self.renderer_config.skip_tokenizer_init:
+            if self.model_config.skip_tokenizer_init:
                 tokenizer = None
             else:
                 tokenizer = await self.engine_client.get_tokenizer()
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index d887cf48d..99936f588 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -291,7 +291,6 @@ class OpenAIServing:
 
         self.input_processor = self.models.input_processor
         self.io_processor = self.models.io_processor
-        self.renderer_config = self.models.renderer_config
         self.model_config = self.models.model_config
         self.max_model_len = self.model_config.max_model_len
 
@@ -1101,18 +1100,18 @@ class OpenAIServing:
         Sequence[RequestPrompt],
         list[EngineTokensPrompt],
     ]:
-        renderer_config = self.renderer_config
+        model_config = self.model_config
 
         resolved_content_format = resolve_chat_template_content_format(
             chat_template,
             tool_dicts,
             chat_template_content_format,
             tokenizer,
-            renderer_config=renderer_config,
+            model_config=model_config,
         )
         conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
             messages,
-            renderer_config,
+            model_config,
             content_format=resolved_content_format,
         )
 
@@ -1139,14 +1138,14 @@ class OpenAIServing:
             request_prompt = tokenizer.apply_chat_template(
                 conversation=conversation,
                 messages=messages,
-                model_config=renderer_config.model_config,
+                model_config=model_config,
                 **_chat_template_kwargs,
             )
         else:
             request_prompt = apply_hf_chat_template(
                 tokenizer=tokenizer,
                 conversation=conversation,
-                renderer_config=renderer_config,
+                model_config=model_config,
                 **_chat_template_kwargs,
             )
 
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index ec65e6593..953398a9a 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -71,7 +71,6 @@ class OpenAIServingModels:
 
         self.input_processor = self.engine_client.input_processor
         self.io_processor = self.engine_client.io_processor
-        self.renderer_config = self.engine_client.renderer_config
         self.model_config = self.engine_client.model_config
         self.max_model_len = self.model_config.max_model_len
 
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 5fd79eed1..cea9924eb 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing):
         self.task_type = task_type
 
         self.asr_config = self.model_cls.get_speech_to_text_config(
-            self.renderer_config, task_type
+            self.model_config, task_type
         )
 
         self.enable_force_include_usage = enable_force_include_usage
@@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing):
             self.tokenizer = cast(
                 PreTrainedTokenizerBase,
                 get_tokenizer(
-                    tokenizer_name=self.renderer_config.tokenizer,
-                    tokenizer_mode=self.renderer_config.tokenizer_mode,
+                    tokenizer_name=self.model_config.tokenizer,
+                    tokenizer_mode=self.model_config.tokenizer_mode,
                 ),
             )
 
@@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing):
             prompt = self.model_cls.get_generation_prompt(
                 audio=chunk,
                 stt_config=self.asr_config,
-                renderer_config=self.renderer_config,
+                model_config=self.model_config,
                 language=language,
                 task_type=self.task_type,
                 request_prompt=request.prompt,
@@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing):
                     if res.prompt_token_ids is not None:
                         num_prompt_tokens = len(res.prompt_token_ids)
                         if audio_tokens := self.model_cls.get_num_audio_tokens(
-                            audio_duration_s, self.asr_config, self.renderer_config
+                            audio_duration_s, self.asr_config, self.model_config
                         ):
                             num_prompt_tokens += audio_tokens
 
diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py
index cd28ccba9..7fb767e26 100644
--- a/vllm/entrypoints/pooling/pooling/serving.py
+++ b/vllm/entrypoints/pooling/pooling/serving.py
@@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            if self.renderer_config.skip_tokenizer_init:
+            if self.model_config.skip_tokenizer_init:
                 tokenizer = None
             else:
                 tokenizer = await self.engine_client.get_tokenizer()
diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py
index f657fcefd..e5a667830 100644
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -160,8 +160,10 @@ class ServingScores(OpenAIServing):
         data_1: str | ScoreContentPartParam,
         data_2: str | ScoreContentPartParam,
     ) -> tuple[str, TokensPrompt]:
+        model_config = self.model_config
+
         full_prompt, engine_prompt = get_score_prompt(
-            renderer_config=self.renderer_config,
+            model_config=model_config,
             data_1=data_1,
             data_2=data_2,
             tokenizer=tokenizer,
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 561adbe45..072ddd4c9 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast
 from torch.nn import CosineSimilarity
 from typing_extensions import Required, TypedDict
 
-from vllm.config import ModelConfig, RendererConfig
+from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (
     BaseMultiModalItemTracker,
     ChatCompletionContentPartImageEmbedsParam,
@@ -88,9 +88,9 @@ def _validate_score_input_lens(
 def parse_score_data(
     data_1: str | ScoreContentPartParam,
     data_2: str | ScoreContentPartParam,
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
 ) -> tuple[str, str, MultiModalDataDict | None]:
-    mm_tracker = MultiModalItemTracker(renderer_config)
+    mm_tracker = MultiModalItemTracker(model_config)
 
     content_1 = _parse_score_content(data_1, mm_tracker)
     content_2 = _parse_score_content(data_2, mm_tracker)
@@ -176,7 +176,7 @@ def post_process_tokens(
 
 
 def get_score_prompt(
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
     tokenizer: TokenizerLike,
     tokenization_kwargs: dict[str, Any],
     data_1: str | ScoreContentPartParam,
@@ -185,14 +185,11 @@ def get_score_prompt(
     prompt_1, prompt_2, mm_data = parse_score_data(
         data_1,
         data_2,
-        renderer_config,
+        model_config,
     )
-
     from vllm.model_executor.model_loader import get_model_cls
 
-    model_config = renderer_config.model_config
     model = get_model_cls(model_config)
-
     if supports_score_template(model):
         full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
         prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index a81f73ac9..daeeb995b 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -13,7 +13,7 @@ from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.background import BackgroundTask, BackgroundTasks
 
-from vllm.config import RendererConfig
+from vllm.config import ModelConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (
@@ -288,7 +288,7 @@ def process_lora_modules(
 async def process_chat_template(
     args_chat_template: Path | str | None,
     engine_client: EngineClient,
-    renderer_config: RendererConfig,
+    model_config: ModelConfig,
 ) -> str | None:
     resolved_chat_template = load_chat_template(args_chat_template)
     if resolved_chat_template is not None:
@@ -305,7 +305,7 @@ async def process_chat_template(
                 tokenizer=tokenizer,
                 chat_template=None,
                 tools=None,
-                model_config=renderer_config.model_config,
+                model_config=model_config,
             )
 
             if hf_chat_template != resolved_chat_template:
@@ -314,6 +314,6 @@ async def process_chat_template(
                     "It is different from official chat template '%s'. "
                     "This discrepancy may lead to performance degradation.",
                     resolved_chat_template,
-                    renderer_config.model_config.model,
+                    model_config.model,
                 )
     return resolved_chat_template
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index f534d102f..0372b06d0 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -6,7 +6,7 @@ from typing import Any, cast
 
 from typing_extensions import assert_never
 
-from vllm.config import RendererConfig
+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
@@ -45,15 +45,14 @@ logger = init_logger(__name__)
 class InputPreprocessor:
     def __init__(
         self,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
         tokenizer: TokenizerLike | None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         mm_processor_cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
         super().__init__()
 
-        self.renderer_config = renderer_config
-        self.model_config = renderer_config.model_config
+        self.model_config = model_config
         self.tokenizer = tokenizer
         self.mm_registry = mm_registry
         self.mm_processor_cache = mm_processor_cache
@@ -232,7 +231,7 @@ class InputPreprocessor:
     def _get_mm_processor(self) -> BaseMultiModalProcessor:
         if not hasattr(self, "_mm_processor"):
             self._mm_processor = self.mm_registry.create_processor(
-                self.renderer_config,
+                self.model_config,
                 tokenizer=self.tokenizer,
                 cache=self.mm_processor_cache,
             )
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index a2700bd5a..007d847ac 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-    renderer_config = model.vllm_config.renderer_config
+    model_config = model.vllm_config.model_config
     quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
 
@@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
     from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        revision=renderer_config.tokenizer_revision,
-        tokenizer_mode=renderer_config.tokenizer_mode,
-        trust_remote_code=renderer_config.trust_remote_code,
+        model_config.tokenizer,
+        revision=model_config.tokenizer_revision,
+        tokenizer_mode=model_config.tokenizer_mode,
+        trust_remote_code=model_config.trust_remote_code,
     )
 
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
@@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-    renderer_config = model.vllm_config.renderer_config
+    model_config = model.vllm_config.model_config
     quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
 
@@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
-        renderer_config.tokenizer,
-        revision=renderer_config.tokenizer_revision,
-        tokenizer_mode=renderer_config.tokenizer_mode,
-        trust_remote_code=renderer_config.trust_remote_code,
+        model_config.tokenizer,
+        revision=model_config.tokenizer_revision,
+        tokenizer_mode=model_config.tokenizer_mode,
+        trust_remote_code=model_config.trust_remote_code,
     )
 
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index bd4724749..1f07381c0 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
 
-        renderer_config = vllm_config.renderer_config
-        tokenizer = cached_tokenizer_from_config(renderer_config)
+        model_config = vllm_config.model_config
+        tokenizer = cached_tokenizer_from_config(model_config)
         self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
 
         self.sam_model = build_sam_vit_b()
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index be03e1df8..9f8faf9ed 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
 
-        renderer_config = vllm_config.renderer_config
-        tokenizer = cached_tokenizer_from_config(renderer_config)
+        model_config = vllm_config.model_config
+        tokenizer = cached_tokenizer_from_config(model_config)
         self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
 
         self.vision = self._init_vision_module(
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index f82529d84..7036118ad 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -18,7 +18,7 @@ from transformers.models.gemma3n import (
 )
 from transformers.models.siglip import SiglipImageProcessorFast
 
-from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration(
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
         language: Optional[str],
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -798,9 +798,7 @@ class Gemma3nForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls,
-        renderer_config: RendererConfig,
-        task_type: str,
+        cls, model_config: ModelConfig, task_type: str
     ) -> SpeechToTextConfig:
         return SpeechToTextConfig(
             # Let's set this to 30 as suggested in the docs for now, although
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 96645f20b..a4e50f408 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -34,7 +34,7 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import BatchFeature, PretrainedConfig
 
-from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration(
         else:
             raise ValueError(f"Unsupported task type {task_type}")
 
-        tokenizer = cached_tokenizer_from_config(renderer_config)
+        tokenizer = cached_tokenizer_from_config(model_config)
         chat = [dict(role="user", content=user_prompt)]
         prompt = tokenizer.apply_chat_template(
             chat,
@@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
     ) -> int | None:
         """Get the number of audio tokens for an audio duration in sec."""
-        processor = cached_processor_from_config(renderer_config)
+        processor = cached_processor_from_config(model_config)
         hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
         proj_win_size = processor.audio_processor.projector_window_size
         ds_rate = processor.audio_processor.projector_downsample_rate
@@ -903,9 +903,7 @@ class GraniteSpeechForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls,
-        renderer_config: RendererConfig,
-        task_type: str,
+        cls, model_config: ModelConfig, task_type: str
     ) -> SpeechToTextConfig:
         """Get the stt config for this model."""
         # Default settings are reasonable for this model and we don't currently
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index b9f3ac8ae..2aba626a7 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import RendererConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import (
     DispatchPooler,
@@ -29,12 +29,12 @@ logger = init_logger(__name__)
 class GritLMMeanPool(nn.Module):
     """As `MeanPool`, but only includes non-instruction tokens."""
 
-    def __init__(self, renderer_config: RendererConfig):
+    def __init__(self, model_config: ModelConfig):
         super().__init__()
 
-        self.renderer_config = renderer_config
+        self.model_config = model_config
 
-        tokenizer = cached_tokenizer_from_config(self.renderer_config)
+        tokenizer = cached_tokenizer_from_config(self.model_config)
 
         # Collect the tokens needed for pattern matching.
         # "▁<" is different from "_<". The former uses "▁" to indicate that
@@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module):
 
 
 class GritLMPooler(Pooler):
-    def __init__(self, renderer_config: RendererConfig):
+    def __init__(self, model_config: ModelConfig):
         super().__init__()
 
-        self.pooling = GritLMMeanPool(renderer_config)
+        self.pooling = GritLMMeanPool(model_config)
         self.head = PoolerHead(PoolerNormalize())
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM):
             self.pooler = DispatchPooler(
                 {
                     "token_embed": Pooler.for_token_embed(pooler_config),
-                    "embed": GritLMPooler(vllm_config.renderer_config),
+                    "embed": GritLMPooler(vllm_config.model_config),
                 }
             )
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 4df91aaf8..607ff5583 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -19,7 +19,7 @@ from torch import Tensor
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
 
-from vllm.config import RendererConfig, SpeechToTextConfig
+from vllm.config import ModelConfig, SpeechToTextConfig
 from vllm.inputs import TokensPrompt
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -887,7 +887,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -930,9 +930,7 @@ class SupportsTranscription(Protocol):
 
     @classmethod
     def get_speech_to_text_config(
-        cls,
-        renderer_config: RendererConfig,
-        task_type: Literal["transcribe", "translate"],
+        cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
     ) -> SpeechToTextConfig:
         """Get the speech to text config for the ASR model."""
         ...
@@ -942,7 +940,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index d75637da1..18985cefb 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
         hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
         hf_processor.video_processor = cached_video_processor_from_config(
-            self.ctx.renderer_config,
+            self.ctx.model_config,
             processor_cls=InternVLVideoProcessor,
             size=hf_processor.image_processor.size,
             **kwargs,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 4daaefd0c..6dfab595e 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1169,17 +1169,16 @@ class NemotronH_Nano_VL_V2(
         self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
 
         self.config = config
+        self.model_config = vllm_config.model_config
 
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
-        self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
-        self._img_start_token_ids = self._tokenizer.encode(
+        tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
+        self._img_start_token_ids = tokenizer.encode(
             IMG_START, add_special_tokens=False
         )
-        self._img_end_token_ids = self._tokenizer.encode(
-            IMG_END, add_special_tokens=False
-        )
-        self._img_context_token_ids = self._tokenizer.encode(
+        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
+        self._img_context_token_ids = tokenizer.encode(
             IMG_CONTEXT, add_special_tokens=False
         )
 
@@ -1365,7 +1364,7 @@ class NemotronH_Nano_VL_V2(
         input_embeds for the LLM.
         """
         device = video_embeddings.device
-        tokenizer = self._tokenizer
+        tokenizer = cached_tokenizer_from_config(self.model_config)
 
         # Generate video replacement token IDs using get_video_repl
         # This tokenizes each frame separator independently, then uses pre-tokenized
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 797793e65..391980fc6 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
 
     def get_image_processor(self, **kwargs: object):
         return cached_image_processor_from_config(
-            self.ctx.renderer_config,
+            self.ctx.model_config,
             **kwargs,
         )
 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index ebe743fa8..faf2d80d2 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -193,7 +193,7 @@ class PixtralProcessorAdapter:
 
 class PixtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")
 
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 0acd564e2..7b408248e 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
 from transformers import BatchFeature, TensorType, WhisperConfig
 from transformers.tokenization_utils_base import TextInput
 
-from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -176,7 +176,7 @@ class VoxtralProcessorAdapter:
 
 class VoxtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")
 
@@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration(
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
+        self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
 
         # update quant config to so that ignored module and target module names
         # match the vLLM model names
@@ -450,11 +450,9 @@ class VoxtralForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls,
-        renderer_config: RendererConfig,
-        task_type: str,
+        cls, model_config: ModelConfig, task_type: str
     ) -> SpeechToTextConfig:
-        tokenizer = cached_tokenizer_from_config(renderer_config)
+        tokenizer = cached_tokenizer_from_config(model_config)
         audio_config = tokenizer.instruct.audio_encoder.audio_config
         max_audio_clip_s = audio_config.chunk_length_s
         sample_rate = audio_config.sampling_rate
@@ -470,17 +468,17 @@ class VoxtralForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        renderer_config: RendererConfig,  # not needed here
+        model_config: ModelConfig,
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
         to_language: str | None,
     ) -> PromptType:
-        tokenizer = cached_tokenizer_from_config(renderer_config)
+        tokenizer = cached_tokenizer_from_config(model_config)
         audio = Audio(audio, int(stt_config.sample_rate), format="wav")  # lossless
         req = TranscriptionRequest(
-            model=renderer_config.model_config.model,
+            model=model_config.model,
             audio=RawAudio.from_audio(audio),
             language=language,
         )
@@ -496,14 +494,14 @@ class VoxtralForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
         This is used for estimating the amount of processing for this audio.
         """
-        tokenizer = cached_tokenizer_from_config(renderer_config)
+        tokenizer = cached_tokenizer_from_config(model_config)
         adapter = VoxtralProcessorAdapter(tokenizer)
         return adapter.get_num_audio_tokens(
             int(audio_duration_s * stt_config.sample_rate)
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 6f526e395..b2feff133 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
 from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.attention.layers.cross_attention import CrossAttention
-from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
@@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        renderer_config: RendererConfig,  # not needed here
+        model_config: ModelConfig,  # not needed here
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -847,11 +847,9 @@ class WhisperForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls,
-        renderer_config: RendererConfig,
-        task_type: str,
+        cls, model_config: ModelConfig, task_type: str
     ) -> SpeechToTextConfig:
-        processor = cached_processor_from_config(renderer_config)
+        processor = cached_processor_from_config(model_config)
 
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
@@ -863,9 +861,9 @@ class WhisperForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
     ) -> int | None:
-        processor = cached_processor_from_config(renderer_config)
+        processor = cached_processor_from_config(model_config)
         hop_length = processor.feature_extractor.hop_length
         assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 9c838fe67..67bdf5e15 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -31,7 +31,7 @@ from .inputs import (
 )
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, RendererConfig, VllmConfig
+    from vllm.config import ModelConfig, VllmConfig
 
     from .processing import ResolvedPromptUpdate
     from .registry import MultiModalRegistry
@@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
 
 
 def _enable_processor_cache(
-    renderer_config: "RendererConfig",
+    model_config: "ModelConfig",
     mm_registry: "MultiModalRegistry",
 ) -> bool:
-    if not mm_registry.supports_multimodal_inputs(renderer_config):
+    if not mm_registry.supports_multimodal_inputs(model_config):
         return False
 
-    mm_config = renderer_config.model_config.get_multimodal_config()
+    mm_config = model_config.get_multimodal_config()
     return mm_config.mm_processor_cache_gb > 0
 
 
@@ -599,7 +599,7 @@ def processor_cache_from_config(
     """Return a `BaseMultiModalProcessorCache`, if enabled."""
     model_config = vllm_config.model_config
 
-    if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
+    if not _enable_processor_cache(model_config, mm_registry):
         return None
 
     if not _enable_ipc_cache(vllm_config):
@@ -611,14 +611,14 @@ def processor_cache_from_config(
 
 
 def processor_only_cache_from_config(
-    renderer_config: "RendererConfig",
+    model_config: "ModelConfig",
     mm_registry: "MultiModalRegistry",
 ):
     """Return a `MultiModalProcessorOnlyCache`, if enabled."""
-    if not _enable_processor_cache(renderer_config, mm_registry):
+    if not _enable_processor_cache(model_config, mm_registry):
         return None
 
-    return MultiModalProcessorOnlyCache(renderer_config.model_config)
+    return MultiModalProcessorOnlyCache(model_config)
 
 
 class BaseMultiModalReceiverCache(
@@ -787,7 +787,7 @@ def engine_receiver_cache_from_config(
     """
     model_config = vllm_config.model_config
 
-    if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
+    if not _enable_processor_cache(model_config, mm_registry):
         return None
 
     if not _enable_ipc_cache(vllm_config):
@@ -809,7 +809,9 @@ def worker_receiver_cache_from_config(
     Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
     mm_processor_cache_type=="shm".
     """
-    if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
+    model_config = vllm_config.model_config
+
+    if not _enable_processor_cache(model_config, mm_registry):
         return None
 
     if not _enable_ipc_cache(vllm_config):
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 81ceb76a4..039077378 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -23,7 +23,7 @@ import torch
 from typing_extensions import TypeVar, assert_never
 
 from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
+from vllm.tokenizers import TokenizerLike
 from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
@@ -53,7 +53,7 @@ if TYPE_CHECKING:
     from transformers.feature_extraction_utils import BatchFeature
     from transformers.processing_utils import ProcessorMixin
 
-    from vllm.config import ModelConfig, RendererConfig
+    from vllm.config import ModelConfig
 
     from .cache import BaseMultiModalProcessorCache
     from .profiling import BaseDummyInputsBuilder
@@ -63,7 +63,6 @@ else:
     ProcessorMixin = object
 
     ModelConfig = object
-    RendererConfig = object
 
     BaseMultiModalProcessorCache = object
 
@@ -946,29 +945,12 @@ class InputProcessingContext:
     modify the inputs.
     """
 
-    renderer_config: RendererConfig
-    """The configuration of the renderer."""
+    model_config: ModelConfig
+    """The configuration of the model."""
 
     tokenizer: TokenizerLike | None
     """The tokenizer used to tokenize the inputs."""
 
-    @classmethod
-    def from_config(
-        cls,
-        renderer_config: RendererConfig,
-        *,
-        tokenizer: TokenizerLike | None = None,
-    ):
-        if tokenizer is None and not renderer_config.skip_tokenizer_init:
-            tokenizer = cached_tokenizer_from_config(renderer_config)
-
-        return cls(renderer_config, tokenizer)
-
-    @property
-    def model_config(self) -> ModelConfig:
-        """The configuration of the model."""
-        return self.renderer_config.model_config
-
     def get_tokenizer(self) -> TokenizerLike:
         if self.tokenizer is None:
             raise ValueError(
@@ -1065,7 +1047,7 @@ class InputProcessingContext:
             typ = ProcessorMixin
 
         return cached_processor_from_config(
-            self.renderer_config,
+            self.model_config,
             processor_cls=typ,
             tokenizer=self.tokenizer,
             **kwargs,
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index e49aaa504..00a84f9de 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
 
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
 
 from .cache import BaseMultiModalProcessorCache
 from .processing import (
@@ -22,7 +22,7 @@ from .profiling import (
 )
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, RendererConfig
+    from vllm.config import ModelConfig
     from vllm.model_executor.models.interfaces import SupportsMultiModal
 
 logger = init_logger(__name__)
@@ -114,18 +114,17 @@ class MultiModalRegistry:
 
         return mm_options if len(mm_options) > 0 else None
 
-    def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool:
+    def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool:
         """
         Checks if the model supports multimodal inputs.
         Returns True if the model is multimodal with any non-zero supported
         modalities, otherwise returns False, effectively running in
         text-only mode.
         """
-        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             return False
 
-        info = self._create_processing_info(renderer_config, tokenizer=None)
+        info = self._create_processing_info(model_config, tokenizer=None)
         supported_modalities = info.get_supported_mm_limits()
 
         mm_config = model_config.get_multimodal_config()
@@ -145,7 +144,7 @@ class MultiModalRegistry:
 
     def get_max_tokens_per_item_by_modality(
         self,
-        renderer_config: "RendererConfig",
+        model_config: "ModelConfig",
         *,
         cache: BaseMultiModalProcessorCache | None = None,
         profiler_limits: Mapping[str, int] | None = None,
@@ -154,11 +153,10 @@ class MultiModalRegistry:
         Get the maximum number of tokens per data item from each modality based
         on underlying model configuration.
         """
-        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             return {}
 
-        processor = self.create_processor(renderer_config, cache=cache)
+        processor = self.create_processor(model_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         seq_len = model_config.max_model_len
@@ -173,7 +171,7 @@ class MultiModalRegistry:
 
     def get_mm_limits_per_prompt(
         self,
-        renderer_config: "RendererConfig",
+        model_config: "ModelConfig",
         *,
         cache: BaseMultiModalProcessorCache | None = None,
     ) -> Mapping[str, int]:
@@ -181,11 +179,10 @@ class MultiModalRegistry:
         Get the maximum number of multi-modal input instances for each modality
         that are allowed per prompt for a model class.
         """
-        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             return {}
 
-        processor = self.create_processor(renderer_config, cache=cache)
+        processor = self.create_processor(model_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
         return profiler.get_mm_limits()
 
@@ -231,21 +228,30 @@ class MultiModalRegistry:
         assert hasattr(model_cls, "_processor_factory")
         return cast("SupportsMultiModal", model_cls)
 
+    def _create_processing_ctx(
+        self,
+        model_config: "ModelConfig",
+        tokenizer: TokenizerLike | None = None,
+    ) -> InputProcessingContext:
+        if tokenizer is None and not model_config.skip_tokenizer_init:
+            tokenizer = cached_tokenizer_from_config(model_config)
+
+        return InputProcessingContext(model_config, tokenizer)
+
     def _create_processing_info(
         self,
-        renderer_config: "RendererConfig",
+        model_config: "ModelConfig",
         *,
         tokenizer: TokenizerLike | None = None,
     ) -> BaseProcessingInfo:
-        model_cls = self._get_model_cls(renderer_config.model_config)
+        model_cls = self._get_model_cls(model_config)
         factories = model_cls._processor_factory
-
-        ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
+        ctx = self._create_processing_ctx(model_config, tokenizer)
         return factories.info(ctx)
 
     def create_processor(
         self,
-        renderer_config: "RendererConfig",
+        model_config: "ModelConfig",
         *,
         tokenizer: TokenizerLike | None = None,
         cache: BaseMultiModalProcessorCache | None = None,
@@ -253,19 +259,19 @@ class MultiModalRegistry:
         """
         Create a multi-modal processor for a specific model and tokenizer.
         """
-        model_config = renderer_config.model_config
         if not model_config.is_multimodal_model:
             raise ValueError(f"{model_config.model} is not a multimodal model")
 
         model_cls = self._get_model_cls(model_config)
         factories = model_cls._processor_factory
 
-        ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
+        ctx = self._create_processing_ctx(model_config, tokenizer)
+
         return factories.build_processor(ctx, cache=cache)
 
     def get_decoder_dummy_data(
         self,
-        renderer_config: "RendererConfig",
+        model_config: "ModelConfig",
         seq_len: int,
         mm_counts: Mapping[str, int] | None = None,
         *,
@@ -274,15 +280,15 @@ class MultiModalRegistry:
         """
         Create dummy data for profiling the memory usage of a model.
 
-        The model is identified by `renderer_config`.
+        The model is identified by `model_config`.
         """
-        processor = self.create_processor(renderer_config, cache=cache)
+        processor = self.create_processor(model_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         # Extract configurable options from multimodal config.
         # Only include modalities that use advanced option types so legacy
         # count-only behavior remains unchanged.
-        mm_options = self._extract_mm_options(renderer_config.model_config)
+        mm_options = self._extract_mm_options(model_config)
 
         dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options)
 
@@ -298,7 +304,7 @@ class MultiModalRegistry:
 
     def get_encoder_dummy_data(
         self,
-        renderer_config: "RendererConfig",
+        model_config: "ModelConfig",
         seq_len: int,
         mm_counts: Mapping[str, int] | None = None,
         *,
@@ -307,15 +313,15 @@ class MultiModalRegistry:
         """
         Create dummy data for profiling the memory usage of a model.
 
-        The model is identified by `renderer_config`.
+        The model is identified by `model_config`.
         """
-        processor = self.create_processor(renderer_config, cache=cache)
+        processor = self.create_processor(model_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
 
         # Extract configurable options from multimodal config.
         # Only include modalities that use advanced option types so legacy
         # count-only behavior remains unchanged.
-        mm_options = self._extract_mm_options(renderer_config.model_config)
+        mm_options = self._extract_mm_options(model_config)
 
         dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options)
 
@@ -330,15 +336,13 @@ class MultiModalRegistry:
 
         return dummy_data
 
-    def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int:
+    def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int:
         """
         Get the maximum length of the encoder input for encoder-decoder models.
         """
-        model_config = renderer_config.model_config
         if not model_config.is_encoder_decoder:
             return 0
-
-        max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config)
+        max_tokens = self.get_max_tokens_per_item_by_modality(model_config)
         if not max_tokens:
             # TODO - this function assumes encoder-decoder models are
             # multimodal. This will need to change when adding support for more
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index c9575511a..1d44feeee 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
 from .protocol import TokenizerLike
 
 if TYPE_CHECKING:
-    from vllm.config import RendererConfig
+    from vllm.config import ModelConfig
 
 logger = init_logger(__name__)
 
@@ -205,18 +205,18 @@ def get_tokenizer(
 cached_get_tokenizer = lru_cache(get_tokenizer)
 
 
-def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs):
+def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
     return cached_get_tokenizer(
-        renderer_config.tokenizer,
-        tokenizer_mode=renderer_config.tokenizer_mode,
-        revision=renderer_config.tokenizer_revision,
-        trust_remote_code=renderer_config.trust_remote_code,
+        model_config.tokenizer,
+        tokenizer_mode=model_config.tokenizer_mode,
+        revision=model_config.tokenizer_revision,
+        trust_remote_code=model_config.trust_remote_code,
         **kwargs,
     )
 
 
-def init_tokenizer_from_config(renderer_config: "RendererConfig"):
-    runner_type = renderer_config.model_config.runner_type
+def init_tokenizer_from_config(model_config: "ModelConfig"):
+    runner_type = model_config.runner_type
     if runner_type == "generate" or runner_type == "draft":
         truncation_side = "left"
     elif runner_type == "pooling":
@@ -225,9 +225,9 @@ def init_tokenizer_from_config(renderer_config: "RendererConfig"):
         assert_never(runner_type)
 
     return get_tokenizer(
-        renderer_config.tokenizer,
-        tokenizer_mode=renderer_config.tokenizer_mode,
-        trust_remote_code=renderer_config.trust_remote_code,
-        revision=renderer_config.tokenizer_revision,
+        model_config.tokenizer,
+        tokenizer_mode=model_config.tokenizer_mode,
+        trust_remote_code=model_config.trust_remote_code,
+        revision=model_config.tokenizer_revision,
         truncation_side=truncation_side,
     )
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index bdebd2686..e9864b0c1 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, RendererConfig
+    from vllm.config import ModelConfig
 
 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
 _V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
@@ -233,18 +233,17 @@ def cached_get_processor_without_dynamic_kwargs(
 
 
 def cached_processor_from_config(
-    renderer_config: "RendererConfig",
+    model_config: "ModelConfig",
     processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
-    model_config = renderer_config.model_config
     if is_gguf(model_config.model):
-        assert not is_gguf(renderer_config.tokenizer), (
+        assert not is_gguf(model_config.tokenizer), (
             "For multimodal GGUF models, the original tokenizer "
             "should be used to correctly load processor."
         )
-        model = renderer_config.tokenizer
-        revision = renderer_config.tokenizer_revision
+        model = model_config.tokenizer
+        revision = model_config.tokenizer_revision
     else:
         model = model_config.model
         revision = model_config.revision
@@ -298,11 +297,9 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor)
 
 
 def cached_feature_extractor_from_config(
-    renderer_config: "RendererConfig",
+    model_config: "ModelConfig",
     **kwargs: Any,
 ):
-    model_config = renderer_config.model_config
-
     return cached_get_feature_extractor(
         model_config.model,
         revision=model_config.revision,
@@ -351,17 +348,16 @@ cached_get_image_processor = lru_cache(get_image_processor)
 
 
 def cached_image_processor_from_config(
-    renderer_config: "RendererConfig",
+    model_config: "ModelConfig",
     **kwargs: Any,
 ):
-    model_config = renderer_config.model_config
     if is_gguf(model_config.model):
-        assert not is_gguf(renderer_config.tokenizer), (
+        assert not is_gguf(model_config.tokenizer), (
             "For multimodal GGUF models, the original tokenizer "
             "should be used to correctly load image processor."
         )
-        model = renderer_config.tokenizer
-        revision = renderer_config.tokenizer_revision
+        model = model_config.tokenizer
+        revision = model_config.tokenizer_revision
     else:
         model = model_config.model
         revision = model_config.revision
@@ -415,12 +411,10 @@ cached_get_video_processor = lru_cache(get_video_processor)
 
 
 def cached_video_processor_from_config(
-    renderer_config: "RendererConfig",
+    model_config: "ModelConfig",
     processor_cls: type[_V] | None = None,
     **kwargs: Any,
 ):
-    model_config = renderer_config.model_config
-
     return cached_get_video_processor(
         model_config.model,
         revision=model_config.revision,
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 21315b85f..3959e9a59 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry
 from vllm.v1.request import Request
 
 if TYPE_CHECKING:
-    from vllm.config import RendererConfig, SchedulerConfig
+    from vllm.config import ModelConfig, SchedulerConfig
 
 logger = init_logger(__name__)
 
@@ -250,7 +250,7 @@ class EncoderCacheManager:
 
 
 def compute_encoder_budget(
-    renderer_config: "RendererConfig",
+    model_config: "ModelConfig",
     scheduler_config: "SchedulerConfig",
     mm_registry: MultiModalRegistry,
 ) -> tuple[int, int]:
@@ -263,9 +263,9 @@ def compute_encoder_budget(
         - Space budget for encoder cache size, measured in number of tokens
             from the input sequence.
     """
-    if mm_registry.supports_multimodal_inputs(renderer_config):
+    if mm_registry.supports_multimodal_inputs(model_config):
         max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
-            renderer_config
+            model_config
         )
 
         return compute_mm_encoder_budget(
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 96073efc5..0a8efa2fd 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface):
         # This can be changed when we make encoder cache for embedding caching
         # across requests.
         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
-            renderer_config=vllm_config.renderer_config,
+            model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             mm_registry=mm_registry,
         )
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b76f9c059..fd7e04dc0 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -91,7 +91,6 @@ class AsyncLLM(EngineClient):
         # Ensure we can serialize custom transformer configs
         maybe_register_config_serialize_by_value()
 
-        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.vllm_config = vllm_config
         self.observability_config = vllm_config.observability_config
@@ -109,15 +108,15 @@ class AsyncLLM(EngineClient):
                 "enabling logging without default stat loggers."
             )
 
-        if self.renderer_config.skip_tokenizer_init:
+        if self.model_config.skip_tokenizer_init:
             tokenizer = None
         else:
-            tokenizer = init_tokenizer_from_config(self.renderer_config)
+            tokenizer = init_tokenizer_from_config(self.model_config)
 
         self.input_processor = InputProcessor(self.vllm_config, tokenizer)
         self.io_processor = get_io_processor(
             self.vllm_config,
-            self.renderer_config.io_processor_plugin,
+            self.model_config.io_processor_plugin,
         )
 
         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index a2f6ba5be..e6a94f4e3 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -43,7 +43,6 @@ class InputProcessor:
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ) -> None:
         self.vllm_config = vllm_config
-        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
@@ -55,7 +54,7 @@ class InputProcessor:
         self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
 
         self.input_preprocessor = InputPreprocessor(
-            self.renderer_config,
+            self.model_config,
             tokenizer,
             mm_registry,
             mm_processor_cache=self.mm_processor_cache,
@@ -253,7 +252,7 @@ class InputProcessor:
         if not params.structured_outputs or not self.structured_outputs_config:
             return
 
-        if self.renderer_config.skip_tokenizer_init and params.structured_outputs:
+        if self.model_config.skip_tokenizer_init and params.structured_outputs:
             raise ValueError(
                 "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'"  # noqa: E501
             )
@@ -583,7 +582,7 @@ class InputProcessor:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    self.renderer_config,
+                    model_config,
                     tokenizer=tokenizer,
                 )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index ba0e1cf25..4c3129100 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -60,7 +60,6 @@ class LLMEngine:
     ) -> None:
         self.vllm_config = vllm_config
         self.observability_config = vllm_config.observability_config
-        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
 
@@ -84,15 +83,15 @@ class LLMEngine:
             self.dp_group = None
         self.should_execute_dummy_batch = False
 
-        if self.renderer_config.skip_tokenizer_init:
+        if self.model_config.skip_tokenizer_init:
             tokenizer = None
         else:
-            tokenizer = init_tokenizer_from_config(self.renderer_config)
+            tokenizer = init_tokenizer_from_config(self.model_config)
 
         self.input_processor = InputProcessor(self.vllm_config, tokenizer)
         self.io_processor = get_io_processor(
             self.vllm_config,
-            self.renderer_config.io_processor_plugin,
+            self.model_config.io_processor_plugin,
         )
 
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 797641851..31428db2d 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -85,7 +85,7 @@ class EagleProposer:
         # Multi-modal data support
         self.mm_registry = MULTIMODAL_REGISTRY
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            vllm_config.renderer_config
+            vllm_config.model_config
         )
 
         self.attn_metadata_builder: AttentionMetadataBuilder | None = None
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 36aa3d9bb..5ee88178c 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -63,7 +63,7 @@ class StructuredOutputManager:
             max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
             self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
 
-        if not vllm_config.renderer_config.skip_tokenizer_init:
+        if not self.vllm_config.model_config.skip_tokenizer_init:
             # The default max_workers if not specified is the number of
             # CPUs * 5, which is way too high since these tasks are CPU-bound,
             # not I/O bound. We also know we would never dominate CPU usage
@@ -71,15 +71,21 @@ class StructuredOutputManager:
             # of CPUs.
             max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
             self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config)
-            reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
+            self.tokenizer = init_tokenizer_from_config(
+                model_config=self.vllm_config.model_config
+            )
+            reasoning_parser = (
+                self.vllm_config.structured_outputs_config.reasoning_parser
+            )
             reasoning_parser_plugin = (
-                vllm_config.structured_outputs_config.reasoning_parser_plugin
+                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
             )
             if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
                 ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
 
-            reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser
+            reasoning_parser = (
+                self.vllm_config.structured_outputs_config.reasoning_parser
+            )
             if reasoning_parser:
                 reasoner_cls = ReasoningParserManager.get_reasoning_parser(
                     reasoning_parser
@@ -87,7 +93,7 @@ class StructuredOutputManager:
                 self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
 
         self.enable_in_reasoning = (
-            vllm_config.structured_outputs_config.enable_in_reasoning
+            self.vllm_config.structured_outputs_config.enable_in_reasoning
         )
 
     def grammar_init(self, request: Request) -> None:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b3c8d4da2..a50360ab0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -271,7 +271,6 @@ class GPUModelRunner(
         device: torch.device,
     ):
         self.vllm_config = vllm_config
-        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
@@ -336,7 +335,7 @@ class GPUModelRunner(
         self.uses_mrope = model_config.uses_mrope
         self.uses_xdrope_dim = model_config.uses_xdrope_dim
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            self.renderer_config
+            model_config
         )
 
         if self.model_config.is_encoder_decoder:
@@ -559,7 +558,7 @@ class GPUModelRunner(
 
         self.mm_budget = (
             MultiModalBudget(
-                self.renderer_config,
+                self.model_config,
                 self.scheduler_config,
                 self.mm_registry,
             )
@@ -3874,7 +3873,7 @@ class GPUModelRunner(
         assert self.mm_budget is not None
 
         dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
-            renderer_config=self.renderer_config,
+            model_config=self.model_config,
             seq_len=self.max_model_len,
             mm_counts={modality: 1},
             cache=self.mm_budget.cache,
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 7e2a6af68..283f21b77 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -143,7 +143,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         original_parallel_config: ParallelConfig | None = None,
     ):
         self.vllm_config = vllm_config
-        self.renderer_config = vllm_config.renderer_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
@@ -223,7 +222,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            self.renderer_config
+            model_config
         )
         # TODO: Support M-RoPE (e.g, Qwen2-VL)
         assert not self.uses_mrope, "TPU does not support M-RoPE yet."
@@ -354,7 +353,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         self.mm_budget = (
             MultiModalBudget(
-                self.renderer_config,
+                self.model_config,
                 self.scheduler_config,
                 self.mm_registry,
             )
@@ -2039,7 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         assert self.mm_budget is not None
 
         dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
-            renderer_config=self.renderer_config,
+            model_config=self.model_config,
             seq_len=self.max_model_len,
             mm_counts={modality: 1},
             cache=self.mm_budget.cache,
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 44418b998..0b0e2006d 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.config import RendererConfig, SchedulerConfig, VllmConfig
+from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
 from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
@@ -23,29 +23,24 @@ class MultiModalBudget:
 
     def __init__(
         self,
-        renderer_config: RendererConfig,
+        model_config: ModelConfig,
         scheduler_config: SchedulerConfig,
         mm_registry: MultiModalRegistry,
     ) -> None:
         super().__init__()
 
-        self.renderer_config = renderer_config
-        self.model_config = renderer_config.model_config
+        self.model_config = model_config
         self.scheduler_config = scheduler_config
         self.mm_registry = mm_registry
-        self.cache = cache = processor_only_cache_from_config(
-            renderer_config, mm_registry
-        )
+        self.cache = cache = processor_only_cache_from_config(model_config, mm_registry)
 
-        self.max_model_len = self.model_config.max_model_len
+        self.max_model_len = model_config.max_model_len
         self.max_num_reqs = scheduler_config.max_num_seqs
 
-        self.mm_limits = mm_registry.get_mm_limits_per_prompt(
-            renderer_config, cache=cache
-        )
+        self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache)
 
         max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
-            renderer_config,
+            model_config,
             cache=cache,
             profiler_limits=self.mm_limits,
         )
-- 
GitLab


From 1b0482b9d1b8a082ff94a95516ec78349de6f515 Mon Sep 17 00:00:00 2001
From: Yifan Qiao <yifanqiao@berkeley.edu>
Date: Sun, 7 Dec 2025 00:39:21 -0800
Subject: [PATCH 173/258] [Misc][Core] Remove unused `req_index` increment in
 scheduler (#30176)

Signed-off-by: Yifan Qiao <yifanqiao@berkeley.edu>
---
 vllm/v1/core/sched/scheduler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 0a8efa2fd..d858e8400 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -607,7 +607,6 @@ class Scheduler(SchedulerInterface):
 
                 self._update_connector_prefix_cache_stats(request)
 
-                req_index += 1
                 self.running.append(request)
                 if self.log_stats:
                     request.record_event(
-- 
GitLab


From 879ddb09c3b7b5a21f75e46ded148ce70f1c486e Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Date: Sun, 7 Dec 2025 17:58:47 +0800
Subject: [PATCH 174/258] [Kernel][MoE] optimize `moe_align_block_size`
 (#29642)

Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../kernels/benchmark_moe_align_block_size.py |  21 ++-
 csrc/moe/moe_align_sum_kernels.cu             | 152 ++++++++++++------
 csrc/moe/moe_ops.h                            |   3 +-
 csrc/moe/torch_bindings.cpp                   |   3 +-
 tests/kernels/moe/test_moe.py                 |  16 +-
 .../kernels/moe/test_moe_align_block_size.py  |  22 ++-
 vllm/_custom_ops.py                           |   2 +
 .../layers/fused_moe/fused_marlin_moe.py      |   6 +-
 .../layers/fused_moe/fused_moe.py             |   9 +-
 .../layers/fused_moe/moe_align_block_size.py  |  24 ++-
 10 files changed, 195 insertions(+), 63 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index f540cff62..5f9a131f7 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
 topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+    itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)
 
 
 @triton.testing.perf_report(
     triton.testing.Benchmark(
-        x_names=["num_tokens", "num_experts", "topk"],
+        x_names=["num_tokens", "num_experts", "topk", "ep_size"],
         x_vals=configs,
         line_arg="provider",
         line_vals=["vllm"],
@@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
         args={},
     )
 )
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
     """Benchmark function for Triton."""
     block_size = 256
+    torch.cuda.manual_seed_all(0)
     topk_ids = get_topk_ids(num_tokens, num_experts, topk)
 
+    e_map = None
+    if ep_size != 1:
+        local_e = num_experts // ep_size
+        e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+        e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+        e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
     quantiles = [0.5, 0.2, 0.8]
 
     if provider == "vllm":
         ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+            lambda: moe_align_block_size(
+                topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+            ),
             quantiles=quantiles,
         )
 
diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index b3d0c0aa5..ddcdcc38b 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -83,14 +83,22 @@ template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts,
     int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
-    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
+    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
+    bool has_expert_map) {
   extern __shared__ int32_t shared_counts[];
 
-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[it] = numel;
+  // Use a separate threadblock to fill sorted_token_ids.
+  // This is safe since the current kernel does not use sorted_token_ids.
+  if (blockIdx.x == 1) {
+    // Initialize sorted_token_ids with numel
+    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
+         it += blockDim.x) {
+      sorted_token_ids[it] = numel;
+    }
+    return;
   }
 
   const int warp_id = threadIdx.x / WARP_SIZE;
@@ -112,6 +120,11 @@ __global__ void moe_align_block_size_kernel(
     if (expert_id >= num_experts) {
       continue;
     }
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid experts
+      if (expert_id == -1) continue;
+    }
     int warp_idx = expert_id / experts_per_warp;
     int expert_offset = expert_id % experts_per_warp;
     atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@@ -163,7 +176,8 @@ template <typename scalar_t>
 __global__ void count_and_sort_expert_tokens_kernel(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
-    size_t numel, int32_t num_experts) {
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    bool has_expert_map) {
   const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t stride = blockDim.x * gridDim.x;
 
@@ -172,6 +186,11 @@ __global__ void count_and_sort_expert_tokens_kernel(
     if (expert_id >= num_experts) {
       continue;
     }
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid experts
+      if (expert_id == -1) continue;
+    }
     int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
     sorted_token_ids[rank_post_pad] = i;
   }
@@ -193,50 +212,69 @@ __global__ void moe_sum_kernel(
   }
 }
 
-template <typename scalar_t>
+template <typename scalar_t, int32_t fill_threads>
 __global__ void moe_align_block_size_small_batch_expert_kernel(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
-    int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[it] = numel;
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
+    size_t numel, int32_t max_num_tokens_padded, bool has_expert_map) {
+  // Use an additional group of threads to fill sorted_token_ids.
+  // Since the current kernel will use sorted_token_ids afterward,
+  // we fill sorted_token_ids within the same threadblock to make
+  // synchronization easier.
+  if (threadIdx.x < fill_threads) {
+    // Initialize sorted_token_ids with numel
+    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
+         it += fill_threads) {
+      sorted_token_ids[it] = numel;
+    }
+    // Three __syncthreads() corresponding to the other threads
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    return;
   }
 
-  const size_t tid = threadIdx.x;
-  const size_t stride = blockDim.x;
+  const size_t tid = threadIdx.x - fill_threads;
+  const size_t stride = blockDim.x - fill_threads;
 
   extern __shared__ int32_t shared_mem[];
   int32_t* cumsum = shared_mem;
   int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
 
   for (int i = 0; i < num_experts; ++i) {
-    tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
+    tokens_cnts[(tid + 1) * num_experts + i] = 0;
   }
 
   for (size_t i = tid; i < numel; i += stride) {
-    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
+    int32_t expert_id = topk_ids[i];
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid expert
+      if (expert_id == -1) continue;
+    }
+    ++tokens_cnts[(tid + 1) * num_experts + expert_id];
   }
 
   __syncthreads();
 
-  if (threadIdx.x < num_experts) {
-    tokens_cnts[threadIdx.x] = 0;
-    for (int i = 1; i <= blockDim.x; ++i) {
-      tokens_cnts[i * num_experts + threadIdx.x] +=
-          tokens_cnts[(i - 1) * num_experts + threadIdx.x];
+  if (tid < num_experts) {
+    tokens_cnts[tid] = 0;
+    for (int i = 1; i <= stride; ++i) {
+      tokens_cnts[i * num_experts + tid] +=
+          tokens_cnts[(i - 1) * num_experts + tid];
     }
   }
 
   __syncthreads();
 
-  if (threadIdx.x == 0) {
+  if (tid == 0) {
     cumsum[0] = 0;
     for (int i = 1; i <= num_experts; ++i) {
       cumsum[i] =
           cumsum[i - 1] +
-          CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
+          CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
               block_size;
     }
     *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
@@ -244,26 +282,30 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
 
   __syncthreads();
 
-  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
+  if (tid < num_experts) {
+    for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
+      expert_ids[i / block_size] = tid;
     }
   }
 
   // Fill remaining expert_ids with 0
-  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
+  const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
   const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
+  for (size_t i = fill_start_idx; i < expert_ids_size; i += stride) {
     expert_ids[i] = 0;
   }
 
   for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i];
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid expert
+      if (expert_id == -1) continue;
+    }
     int32_t rank_post_pad =
-        tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
+        tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
     sorted_token_ids[rank_post_pad] = i;
-    ++tokens_cnts[threadIdx.x * num_experts + expert_id];
+    ++tokens_cnts[tid * num_experts + expert_id];
   }
 }
 
@@ -275,7 +317,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad) {
+                          torch::Tensor num_tokens_post_pad,
+                          std::optional<torch::Tensor> maybe_expert_map) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   int64_t padded_num_experts =
@@ -287,14 +330,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   // BlockScan uses 1024 threads and assigns one thread per expert.
   TORCH_CHECK(padded_num_experts < 1024,
               "padded_num_experts must be less than 1024");
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+  bool has_expert_map = maybe_expert_map.has_value();
+  torch::Tensor expert_map;
+  if (has_expert_map) {
+    expert_map = maybe_expert_map.value();
+  } else {
+    expert_map = torch::empty({0}, options_int);
+  }
 
   VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
       topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
         // calc needed amount of shared mem for `cumsum` tensors
-        auto options_int =
-            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
-        torch::Tensor cumsum_buffer =
-            torch::empty({num_experts + 1}, options_int);
         bool small_batch_expert_mode =
             (topk_ids.numel() < 1024) && (num_experts <= 64);
 
@@ -304,30 +352,41 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               ((threads + 1) * num_experts + (num_experts + 1)) *
               sizeof(int32_t);
 
+          // threadIdx.x >= fill_threads: counting experts and aligning
+          // threadIdx.x < fill_threads: filling sorted_token_ids
+          constexpr int32_t fill_threads = 256;
           auto small_batch_expert_kernel =
               vllm::moe::moe_align_block_size_small_batch_expert_kernel<
-                  scalar_t>;
-          small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
+                  scalar_t, fill_threads>;
+          small_batch_expert_kernel<<<1, fill_threads + threads,
+                                      shared_mem_size, stream>>>(
               topk_ids.data_ptr<scalar_t>(),
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
-              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), sorted_token_ids.size(0));
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              expert_map.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel(), sorted_token_ids.size(0), has_expert_map);
         } else {
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);
           auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
 
           size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
           size_t shared_mem_size =
               num_warps * experts_per_warp * sizeof(int32_t);
 
-          align_kernel<<<1, threads, shared_mem_size, stream>>>(
+          // launch two threadblocks
+          // blockIdx.x == 0: counting experts and aligning
+          // blockIdx.x == 1: filling sorted_token_ids
+          align_kernel<<<2, threads, shared_mem_size, stream>>>(
               topk_ids.data_ptr<scalar_t>(),
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
-              num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
-              padded_num_experts, experts_per_warp, block_size,
-              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(),
-              sorted_token_ids.size(0));
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
+              experts_per_warp, block_size, topk_ids.numel(),
+              cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
+              has_expert_map);
 
           const int block_threads = std::min(256, (int)threads);
           const int num_blocks =
@@ -340,7 +399,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
           sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
               topk_ids.data_ptr<scalar_t>(),
               sorted_token_ids.data_ptr<int32_t>(),
-              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
+              cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
+              topk_ids.numel(), num_experts, has_expert_map);
         }
       });
 }
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 11c6875f7..4c7accf03 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -11,7 +11,8 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad);
+                          torch::Tensor num_tokens_post_pad,
+                          std::optional<torch::Tensor> maybe_expert_map);
 
 void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                   int64_t block_size,
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index e0a828072..fca57c31c 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "moe_align_block_size(Tensor topk_ids, int num_experts,"
       "                     int block_size, Tensor! sorted_token_ids,"
       "                     Tensor! experts_ids,"
-      "                     Tensor! num_tokens_post_pad) -> ()");
+      "                     Tensor! num_tokens_post_pad,"
+      "                     Tensor? maybe_expert_map) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
 
   // Aligning the number of tokens to be processed by each expert such
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index bacf6f37f..82659276a 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -955,9 +955,22 @@ def test_fused_marlin_moe_with_bias(m):
     torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0)
 
 
-def test_moe_align_block_size_opcheck():
+@pytest.mark.parametrize("ep_size", [1, 2])
+def test_moe_align_block_size_opcheck(ep_size):
     num_experts = 4
     block_size = 4
+
+    expert_map = None
+    if ep_size != 1:
+        local_num_experts = num_experts // ep_size
+        expert_ids = torch.randint(
+            0, num_experts, (local_num_experts,), device="cuda", dtype=torch.int32
+        )
+        expert_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+        expert_map[expert_ids] = torch.arange(
+            local_num_experts, device="cuda", dtype=torch.int32
+        )
+
     topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda")
 
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
@@ -980,6 +993,7 @@ def test_moe_align_block_size_opcheck():
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            expert_map,
         ),
     )
 
diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py
index 8975f00bd..1abfc11fb 100644
--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
@@ -106,6 +106,8 @@ def torch_moe_align_block_size(
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
     if pad_sorted_ids:
         max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+    if topk_ids.numel() < num_experts:
+        max_num_tokens_padded = topk_ids.numel() * block_size
 
     flattened_token_indices = torch.arange(
         topk_ids.numel(), device=topk_ids.device, dtype=torch.int32
@@ -126,6 +128,8 @@ def torch_moe_align_block_size(
     )
     for expert_id in range(num_experts):
         original_count = expert_token_counts[expert_id]
+        if expert_map is not None and expert_map[expert_id] == -1:
+            continue
         if original_count > 0:
             expert_padded_counts[expert_id] = (
                 (original_count + block_size - 1) // block_size
@@ -143,6 +147,9 @@ def torch_moe_align_block_size(
     current_pos = 0
     current_block = 0
     for expert_id in range(num_experts):
+        if expert_map is not None and expert_map[expert_id] == -1:
+            continue
+
         expert_mask = sorted_expert_ids == expert_id
         expert_tokens = sorted_token_indices[expert_mask]
         num_expert_tokens = expert_tokens.shape[0]
@@ -153,7 +160,13 @@ def torch_moe_align_block_size(
             )
 
             expert_blocks_needed = expert_padded_counts[expert_id] // block_size
-            expert_ids[current_block : current_block + expert_blocks_needed] = expert_id
+
+            expert_id_new = expert_id
+            if expert_map is not None:
+                expert_id_new = expert_map[expert_id]
+            expert_ids[current_block : current_block + expert_blocks_needed] = (
+                expert_id_new
+            )
 
             current_pos += expert_padded_counts[expert_id]
             current_block += expert_blocks_needed
@@ -163,8 +176,6 @@ def torch_moe_align_block_size(
         [total_padded_tokens], dtype=torch.int32, device=topk_ids.device
     )
 
-    if expert_map is not None:
-        expert_ids = expert_map[expert_ids]
     return sorted_token_ids, expert_ids, num_tokens_post_pad
 
 
@@ -229,9 +240,9 @@ def test_moe_align_block_size(
     )
 
 
-@pytest.mark.parametrize("m", [16, 32])
+@pytest.mark.parametrize("m", [16, 32, 2048])
 @pytest.mark.parametrize("topk", [2, 4])
-@pytest.mark.parametrize("num_experts", [8])
+@pytest.mark.parametrize("num_experts", [8, 64])
 @pytest.mark.parametrize("block_size", [64])
 @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
 def test_moe_align_block_size_with_expert_map(
@@ -253,6 +264,7 @@ def test_moe_align_block_size_with_expert_map(
         block_size=block_size,
         num_experts=num_experts,
         expert_map=expert_map,
+        ignore_invalid_experts=True,
     )
     golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
         torch_moe_align_block_size(
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index e60158898..94e275452 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1877,6 +1877,7 @@ def moe_align_block_size(
     sorted_token_ids: torch.Tensor,
     experts_ids: torch.Tensor,
     num_tokens_post_pad: torch.Tensor,
+    expert_map: torch.Tensor | None = None,
 ) -> None:
     torch.ops._moe_C.moe_align_block_size(
         topk_ids,
@@ -1885,6 +1886,7 @@ def moe_align_block_size(
         sorted_token_ids,
         experts_ids,
         num_tokens_post_pad,
+        expert_map,
     )
 
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 9c377db72..92d72b756 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -316,7 +316,11 @@ def fused_marlin_moe(
     if global_num_experts == -1:
         global_num_experts = E
     sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-        topk_ids, block_size_m, global_num_experts, expert_map
+        topk_ids,
+        block_size_m,
+        global_num_experts,
+        expert_map,
+        ignore_invalid_experts=True,
     )
 
     assert activation is not None
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index df208eae2..f3c158ee2 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1887,7 +1887,11 @@ def fused_experts_impl(
         )
 
         sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-            curr_topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map
+            curr_topk_ids,
+            config["BLOCK_SIZE_M"],
+            global_num_experts,
+            expert_map,
+            ignore_invalid_experts=True,
         )
 
         invoke_fused_moe_kernel(
@@ -1946,6 +1950,9 @@ def fused_experts_impl(
             block_shape=block_shape,
         )
 
+        if expert_map is not None:
+            intermediate_cache3.zero_()
+
         invoke_fused_moe_kernel(
             qintermediate_cache2,
             w2,
diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
index 7f6155997..7fc8bfcf8 100644
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -14,6 +14,7 @@ def moe_align_block_size(
     num_experts: int,
     expert_map: torch.Tensor | None = None,
     pad_sorted_ids: bool = False,
+    ignore_invalid_experts: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Aligns the token distribution across experts to be compatible with block
@@ -35,7 +36,13 @@ def moe_align_block_size(
         expert parallel shard. If the expert is not in the current expert
         parallel shard, the mapping is set to -1.
     - pad_sorted_ids: A flag indicating whether the sorted_token_ids length
-      should be padded to a multiple of block_size,
+        should be padded to a multiple of block_size,
+    - ignore_invalid_experts: A flag indicating whether to ignore invalid
+        experts. When False, all expert_ids in topk_ids will participate in
+        counting and ranking, but invalid experts in expert_ids will be marked
+        as -1. When True, all invalid expert_ids in topk_ids will be ignored
+        and will not participate in counting or ranking, and there will be no
+        -1 in expert_ids.
 
     Returns:
     - sorted_token_ids: A tensor containing the sorted token indices according
@@ -67,6 +74,10 @@ def moe_align_block_size(
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
     if pad_sorted_ids:
         max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+    if topk_ids.numel() < num_experts:
+        max_num_tokens_padded = min(
+            topk_ids.numel() * block_size, max_num_tokens_padded
+        )
     sorted_ids = torch.empty(
         (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
     )
@@ -77,9 +88,16 @@ def moe_align_block_size(
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
 
     ops.moe_align_block_size(
-        topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        expert_map if ignore_invalid_experts else None,
     )
-    if expert_map is not None:
+
+    if expert_map is not None and not ignore_invalid_experts:
         expert_ids = expert_map[expert_ids]
 
     return sorted_ids, expert_ids, num_tokens_post_pad
-- 
GitLab


From b0f4866a77f22720a5e295cbcffee5cbe9fc2b56 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sun, 7 Dec 2025 20:27:11 +0800
Subject: [PATCH 175/258] [CI/Build]Temporary workaround for
 test_default_mm_loras timeout (#30202)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_default_mm_loras.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/lora/test_default_mm_loras.py b/tests/lora/test_default_mm_loras.py
index 407b29fdd..1d16862b3 100644
--- a/tests/lora/test_default_mm_loras.py
+++ b/tests/lora/test_default_mm_loras.py
@@ -13,6 +13,7 @@ from huggingface_hub import snapshot_download
 from vllm.lora.request import LoRARequest
 
 from ..conftest import AudioTestAssets, VllmRunner
+from ..utils import create_new_process_for_each_test
 
 MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
 AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
@@ -60,6 +61,7 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs)
         assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix)
 
 
+@create_new_process_for_each_test()
 def test_active_default_mm_lora(
     vllm_runner: type[VllmRunner],
     audio_assets: AudioTestAssets,
@@ -74,6 +76,7 @@ def test_active_default_mm_lora(
     )
 
 
+@create_new_process_for_each_test()
 def test_inactive_default_mm_lora(
     vllm_runner: type[VllmRunner],
     audio_assets: AudioTestAssets,
@@ -89,6 +92,7 @@ def test_inactive_default_mm_lora(
     )
 
 
+@create_new_process_for_each_test()
 def test_default_mm_lora_succeeds_with_redundant_lora_request(
     vllm_runner: type[VllmRunner],
     audio_assets: AudioTestAssets,
@@ -103,6 +107,7 @@ def test_default_mm_lora_succeeds_with_redundant_lora_request(
     )
 
 
+@create_new_process_for_each_test()
 def test_default_mm_lora_fails_with_overridden_lora_request(
     vllm_runner: type[VllmRunner],
     audio_assets: AudioTestAssets,
@@ -118,6 +123,7 @@ def test_default_mm_lora_fails_with_overridden_lora_request(
     )
 
 
+@create_new_process_for_each_test()
 def test_default_mm_lora_does_not_expand_string_reqs(vllm_runner):
     class MockEngineException(Exception):
         pass
-- 
GitLab


From 541a2ef892720489f770569417bc1bc4436dbb21 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Sun, 7 Dec 2025 07:31:14 -0500
Subject: [PATCH 176/258] [Perf] Deepgemm fused layout kernel for activations,
 4.3% throughput improvement, 10.7% TTFT improvement. (#29546)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 csrc/ops.h                                    |   8 +
 .../w8a8/fp8/per_token_group_quant.cu         | 185 ++++++++++++++++++
 csrc/torch_bindings.cpp                       |   9 +
 .../layers/fused_moe/deep_gemm_moe.py         |  41 ++--
 .../layers/quantization/utils/fp8_utils.py    |  80 +++++++-
 5 files changed, 311 insertions(+), 12 deletions(-)

diff --git a/csrc/ops.h b/csrc/ops.h
index 4bb7857b1..d302f0491 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -299,6 +299,14 @@ void per_token_group_quant_int8(const torch::Tensor& input,
                                 torch::Tensor& output_q,
                                 torch::Tensor& output_s, int64_t group_size,
                                 double eps, double int8_min, double int8_max);
+
+// Fused activation quantisation + DeepGEMM-compatible UE8M0-packed scales.
+void per_token_group_quant_8bit_packed(const torch::Tensor& input,
+                                       torch::Tensor& output_q,
+                                       torch::Tensor& output_s_packed,
+                                       int64_t group_size, double eps,
+                                       double min_8bit, double max_8bit);
+
 #endif
 
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
diff --git a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
index e3ab0676b..f9ac874c4 100644
--- a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
@@ -206,6 +206,191 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
 #undef LAUNCH_KERNEL
 }
 
+template <typename T, typename DST_DTYPE>
+__global__ void per_token_group_quant_8bit_packed_kernel(
+    const T* __restrict__ input, void* __restrict__ output_q,
+    unsigned int* __restrict__ output_s_packed, const int group_size,
+    const int num_groups, const int groups_per_block, const int groups_per_row,
+    const int mn, const int tma_aligned_mn, const float eps,
+    const float min_8bit, const float max_8bit) {
+  const int threads_per_group = 16;
+  const int64_t local_group_id = threadIdx.x / threads_per_group;
+  const int lane_id = threadIdx.x % threads_per_group;
+
+  const int64_t block_group_id = blockIdx.x * groups_per_block;
+  const int64_t global_group_id = block_group_id + local_group_id;
+  if (global_group_id >= num_groups) {
+    return;
+  }
+
+  const int64_t block_group_offset = global_group_id * group_size;
+
+  float local_absmax = eps;
+
+  const T* group_input = input + block_group_offset;
+  DST_DTYPE* group_output =
+      static_cast<DST_DTYPE*>(output_q) + block_group_offset;
+
+  // shared memory to cache each group's data to avoid double DRAM reads.
+  extern __shared__ __align__(16) char smem_raw[];
+  T* smem = reinterpret_cast<T*>(smem_raw);
+  T* smem_group = smem + local_group_id * group_size;
+
+  constexpr int vec_size = 16 / sizeof(T);
+  using vec_t = vllm::vec_n_t<T, vec_size>;
+
+  // copy global -> shared & compute absmax
+  auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
+    float abs_v = fabsf(static_cast<float>(src));
+    local_absmax = fmaxf(local_absmax, abs_v);
+    dst = src;
+  };
+
+  vllm::vectorize_with_alignment<vec_size>(
+      group_input,        // in
+      smem_group,         // out (shared)
+      group_size,         // elements per group
+      lane_id,            // thread id
+      threads_per_group,  // stride in group
+      scalar_op_cache);   // scalar handler
+
+  local_absmax = GroupReduceMax(local_absmax);
+
+  float y_s = local_absmax / max_8bit;
+  y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
+
+  // pack 4 scales into a uint32
+  if (lane_id == 0) {
+    // map flat group id to 2D indices (mn_idx, sf_k_idx)
+    const int sf_k_idx = static_cast<int>(global_group_id % groups_per_row);
+    const int mn_idx = static_cast<int>(global_group_id / groups_per_row);
+
+    if (mn_idx < mn) {
+      // each uint32 in output_s_packed stores 4 packed scales
+      const int sf_k_pack_idx = sf_k_idx / 4;
+      const int pos = sf_k_idx % 4;
+
+      // reinterpret the UE8M0 scale y_s as IEEE bits, extract the 8-bit
+      // exponent, and place it into the correct byte of the 32-bit word.
+      const unsigned int bits = __float_as_uint(y_s);
+      const unsigned int exponent = (bits >> 23u) & 0xffu;
+      const unsigned int contrib = exponent << (pos * 8u);
+
+      const int out_idx = sf_k_pack_idx * tma_aligned_mn + mn_idx;
+      // atomically OR 8-bit exponent into the packed scales buffer
+      atomicOr(output_s_packed + out_idx, contrib);
+    }
+  }
+
+  __syncthreads();
+
+  // quantize shared -> global 8-bit
+  auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
+    float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
+    dst = DST_DTYPE(q);
+  };
+
+  vllm::vectorize_with_alignment<vec_size>(
+      smem_group,         // in (shared)
+      group_output,       // out (global quant tensor)
+      group_size,         // elements
+      lane_id,            // tid
+      threads_per_group,  // stride
+      scalar_op_quant);   // scalar handler
+}
+
+void per_token_group_quant_8bit_packed(const torch::Tensor& input,
+                                       torch::Tensor& output_q,
+                                       torch::Tensor& output_s_packed,
+                                       int64_t group_size, double eps,
+                                       double min_8bit, double max_8bit) {
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(output_q.is_contiguous());
+
+  const int64_t k = input.size(-1);
+  TORCH_CHECK(k % group_size == 0, "Last dimension (", k,
+              ") must be divisible by group_size (", group_size, ").");
+
+  const int64_t mn = input.numel() / k;
+  const int64_t groups_per_row = k / group_size;
+  const int64_t num_groups = mn * groups_per_row;
+
+  TORCH_CHECK(output_s_packed.dim() == 2,
+              "output_s_packed must be 2D, got dim=", output_s_packed.dim(),
+              ".");
+
+  const int64_t k_num_packed_sfk = (groups_per_row + 3) / 4;
+  const int64_t tma_aligned_mn = ((mn + 3) / 4) * 4;
+
+  TORCH_CHECK(output_s_packed.scalar_type() == at::ScalarType::Int,
+              "output_s_packed must have dtype int32 for UE8M0-packed scales.");
+  // DeepGEMM expects SFA scales in MN-major form with shape
+  // [mn, ceil_div(K, 128 * 4)] and TMA-aligned stride on the last
+  // dimension.
+  TORCH_CHECK(output_s_packed.size(0) == mn &&
+                  output_s_packed.size(1) == k_num_packed_sfk,
+              "output_s_packed shape must be [", mn, ", ", k_num_packed_sfk,
+              "], but got [", output_s_packed.size(0), ", ",
+              output_s_packed.size(1), "].");
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  constexpr int THREADS_PER_GROUP = 16;
+
+  int groups_per_block = 1;
+
+  if (num_groups % 16 == 0) {
+    groups_per_block = 16;
+  } else if (num_groups % 8 == 0) {
+    groups_per_block = 8;
+  } else if (num_groups % 4 == 0) {
+    groups_per_block = 4;
+  } else if (num_groups % 2 == 0) {
+    groups_per_block = 2;
+  }
+
+  auto dst_type = output_q.scalar_type();
+  const int num_blocks = num_groups / groups_per_block;
+  const int num_threads = groups_per_block * THREADS_PER_GROUP;
+
+  // zero-initialize packed scales, since we use atomicOr to accumulate
+  // exponents from different groups.
+  output_s_packed.zero_();
+
+#define LAUNCH_PACKED_KERNEL(T, DST_DTYPE)                                \
+  do {                                                                    \
+    dim3 grid(num_blocks);                                                \
+    dim3 block(num_threads);                                              \
+    size_t smem_bytes =                                                   \
+        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);   \
+    per_token_group_quant_8bit_packed_kernel<T, DST_DTYPE>                \
+        <<<grid, block, smem_bytes, stream>>>(                            \
+            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(), \
+            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),  \
+            static_cast<int>(group_size), static_cast<int>(num_groups),   \
+            groups_per_block, static_cast<int>(groups_per_row),           \
+            static_cast<int>(mn), static_cast<int>(tma_aligned_mn),       \
+            static_cast<float>(eps), static_cast<float>(min_8bit),        \
+            static_cast<float>(max_8bit));                                \
+  } while (0)
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "per_token_group_quant_8bit_packed", ([&] {
+        if (dst_type == at::ScalarType::Float8_e4m3fn) {
+          LAUNCH_PACKED_KERNEL(scalar_t, __nv_fp8_e4m3);
+        } else if (dst_type == at::ScalarType::Char) {
+          LAUNCH_PACKED_KERNEL(scalar_t, int8_t);
+        } else {
+          TORCH_CHECK(
+              false,
+              "per_token_group_quant_8bit_packed only supports FP8/INT8 "
+              "outputs.");
+        }
+      }));
+
+#undef LAUNCH_PACKED_KERNEL
+}
+
 void per_token_group_quant_fp8(const torch::Tensor& input,
                                torch::Tensor& output_q, torch::Tensor& output_s,
                                int64_t group_size, double eps, double fp8_min,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 914227838..23ac1d9ab 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -617,6 +617,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("per_token_group_fp8_quant", torch::kCUDA,
            &per_token_group_quant_fp8);
 
+  // Compute per-token-group 8-bit quantized tensor and UE8M0-packed,
+  // TMA-aligned scales for DeepGEMM.
+  ops.def(
+      "per_token_group_fp8_quant_packed(Tensor input, Tensor! output_q, "
+      "Tensor! output_s_packed, int group_size, float eps, float fp8_min, "
+      "float fp8_max) -> ()");
+  ops.impl("per_token_group_fp8_quant_packed", torch::kCUDA,
+           &per_token_group_quant_8bit_packed);
+
   // Compute per-token-group INT8 quantized tensor and scaling factor.
   ops.def(
       "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 9f47e692d..4a64736ed 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -23,9 +23,11 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
+    per_token_group_quant_fp8_packed_for_deepgemm,
     silu_mul_per_token_group_quant_fp8_colmajor,
 )
 from vllm.utils.deep_gemm import (
+    DeepGemmQuantScaleFMT,
     get_mk_alignment_for_contiguous_layout,
     m_grouped_fp8_gemm_nt_contiguous,
 )
@@ -157,23 +159,40 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def _act_mul_quant(
         self, input: torch.Tensor, output: torch.Tensor, activation: str
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if activation == "silu":
-            return silu_mul_per_token_group_quant_fp8_colmajor(
-                input=input, output=output
-            )
-        else:
-            # This is a fallback path. If we find ourselves using any activation other
-            # than silu, we should add that activation to
-            # silu_mul_per_token_group_quant_fp8_colmajor kernel as it is much faster.
+        assert self.block_shape is not None
+        block_k = self.block_shape[1]
+        scale_fmt = DeepGemmQuantScaleFMT.from_oracle()
+
+        # 1. DeepGemm UE8M0: use packed per-token-group quant
+        if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
             M_sum, N = input.size()
             act_out = torch.empty(
                 (M_sum, N // 2), dtype=input.dtype, device=input.device
             )
             self.activation(activation, act_out, input)
-            assert self.block_shape is not None
-            return per_token_group_quant_fp8(
-                act_out, self.block_shape[1], column_major_scales=True, out_q=output
+            a2q, a2q_scale = per_token_group_quant_fp8_packed_for_deepgemm(
+                act_out,
+                block_k,
+                out_q=output,
             )
+            return a2q, a2q_scale
+
+        # 2. Hopper / non‑E8M0: prefer the fused SiLU+mul+quant kernel
+        if activation == "silu":
+            use_ue8m0 = scale_fmt == DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
+            return silu_mul_per_token_group_quant_fp8_colmajor(
+                input=input,
+                output=output,
+                use_ue8m0=use_ue8m0,
+            )
+
+        # 3. fallback path for non-SiLU activations in non‑UE8M0 cases.
+        M_sum, N = input.size()
+        act_out = torch.empty((M_sum, N // 2), dtype=input.dtype, device=input.device)
+        self.activation(activation, act_out, input)
+        return per_token_group_quant_fp8(
+            act_out, block_k, column_major_scales=True, out_q=output
+        )
 
     def apply(
         self,
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 6e73833d1..7e1bda863 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -269,7 +269,11 @@ class W8A8BlockFp8LinearOp:
         weight_scale: torch.Tensor,
     ) -> torch.Tensor:
         assert self.deepgemm_input_quant_op is not None
-        q_input, input_scale = self.deepgemm_input_quant_op(input_2d)
+        q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
+            input_2d,
+            group_size=self.act_quant_group_shape.col,
+            use_ue8m0=True,
+        )
         output = torch.empty(
             (q_input.shape[0], weight.shape[0]),
             dtype=torch.bfloat16,
@@ -791,6 +795,80 @@ def per_token_group_quant_fp8(
     return x_q, x_s
 
 
+def per_token_group_quant_fp8_packed_for_deepgemm(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    use_ue8m0: bool | None = None,
+    out_q: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """FP8 per-token-group quantization for DeepGEMM.
+
+    Returns:
+        (x_q, x_s_packed)
+            x_q: FP8 activations, same shape as `x`.
+            x_s_packed: Int32 tensor with logical shape
+                        [mn, ceil(num_groups_per_row / 4)], laid out with
+                        TMA-aligned stride along the packed-K dimension
+    """
+    if use_ue8m0 is None:
+        use_ue8m0 = is_deep_gemm_e8m0_used()
+    # for DeepGEMM UE8M0-packed layout we *require* UE8M0 scales.
+    assert use_ue8m0, (
+        "per_token_group_quant_fp8_packed_for_deepgemm requires UE8M0 scales."
+    )
+
+    dtype = current_platform.fp8_dtype()
+    assert x.shape[-1] % group_size == 0, (
+        f"the last dimension of `x` {x.shape[-1]} must be divisible "
+        f"by `group_size` {group_size}"
+    )
+    assert x.stride(-1) == 1, "`x` groups must be contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min, fp8_max = finfo.min, finfo.max
+
+    # compute DeepGEMM-style packed scale tensor shape.
+    hidden_dim = x.shape[-1]
+    mn = x.numel() // hidden_dim
+    num_groups_per_row = hidden_dim // group_size
+    k_num_packed_sf_k = (num_groups_per_row + 3) // 4
+    tma_aligned_mn = ((mn + 3) // 4) * 4
+
+    x_s_packed = torch.empty_strided(
+        (mn, k_num_packed_sf_k),
+        (1, tma_aligned_mn),
+        device=x.device,
+        dtype=torch.int32,
+    )
+
+    # CUDA kernel path only (DeepGEMM + E8M0 is CUDA-specific).
+    assert current_platform.is_cuda(), (
+        "per_token_group_quant_fp8_packed_for_deepgemm is only valid on CUDA "
+        "platforms using DeepGEMM."
+    )
+
+    x_contiguous = x.contiguous()
+    if out_q is not None:
+        x_q_local = out_q
+    else:
+        x_q_local = torch.empty_like(x_contiguous, device=x.device, dtype=dtype)
+
+    torch.ops._C.per_token_group_fp8_quant_packed(
+        x_contiguous,
+        x_q_local,
+        x_s_packed,
+        group_size,
+        eps,
+        fp8_min,
+        fp8_max,
+    )
+
+    # return a tensor with the original logical shape.
+    x_q = x_q_local.view_as(x)
+    return x_q, x_s_packed
+
+
 @triton.jit
 def _w8a8_triton_block_scaled_mm(
     # Pointers to inputs and output
-- 
GitLab


From b952f4d3c31068de3a98d680261be9a5caa04caa Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 7 Dec 2025 23:51:36 +0800
Subject: [PATCH 177/258] [v1] Add PrefixLM support to FlexAttention backend
 (#27938)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 docs/models/supported_models.md               | 20 ------
 .../multimodal/generation/test_common.py      |  1 -
 .../vllm_add_dummy_platform/dummy_platform.py |  1 +
 vllm/attention/backends/abstract.py           |  9 +++
 vllm/attention/layer.py                       |  5 ++
 vllm/attention/selector.py                    |  5 ++
 vllm/config/model.py                          | 14 +++++
 vllm/multimodal/inputs.py                     | 25 ++++++++
 vllm/platforms/cpu.py                         |  1 +
 vllm/platforms/cuda.py                        | 19 ++++++
 vllm/platforms/interface.py                   |  1 +
 vllm/platforms/rocm.py                        |  1 +
 vllm/platforms/tpu.py                         |  5 +-
 vllm/platforms/xpu.py                         |  3 +-
 vllm/v1/attention/backends/flex_attention.py  | 62 +++++++++++++++++++
 vllm/v1/worker/gpu_model_runner.py            | 26 +++++++-
 16 files changed, 173 insertions(+), 25 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 1089de87b..ec3ba4474 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -740,23 +740,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
-!!! warning
-    Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
-    However, there are differences in how they handle text + image inputs:
-
-    V0 correctly implements the model's attention pattern:
-    - Uses bidirectional attention between the image tokens corresponding to the same image
-    - Uses causal attention for other tokens
-    - Implemented via (naive) PyTorch SDPA with masking tensors
-    - Note: May use significant memory for long prompts with image
-
-    V1 currently uses a simplified attention pattern:
-    - Uses causal attention for all tokens, including image tokens
-    - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
-    - Will be updated in the future to support the correct behavior
-
-    This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-
 !!! note
     `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
     MobileNet-v5 vision backbone.
@@ -776,9 +759,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
     The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
     For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
 
-!!! warning
-    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
-
 !!! note
     For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
 
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index fd26b838a..c5a0b6748 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -382,7 +382,6 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForImageTextToText,
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
-        num_logprobs=10,
     ),
     "glm4v": VLMTestInfo(
         models=["zai-org/glm-4v-9b"],
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index a80617a36..8448003e7 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -30,5 +30,6 @@ class DummyPlatform(Platform):
         use_mla,
         has_sink,
         use_sparse,
+        use_mm_prefix,
     ):
         return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 84cca8e68..03f4c4030 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -166,6 +166,10 @@ class AttentionBackend(ABC):
     def supports_sink(cls) -> bool:
         return False
 
+    @classmethod
+    def supports_mm_prefix(cls) -> bool:
+        return False
+
     @classmethod
     def is_sparse(cls) -> bool:
         return False
@@ -207,6 +211,7 @@ class AttentionBackend(ABC):
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
+        use_mm_prefix: bool,
         device_capability: "DeviceCapability",
         attn_type: str,
     ) -> list[str]:
@@ -219,6 +224,10 @@ class AttentionBackend(ABC):
             invalid_reasons.append("kv_cache_dtype not supported")
         if not cls.supports_block_size(block_size):
             invalid_reasons.append("block_size not supported")
+        if use_mm_prefix and not cls.supports_mm_prefix():
+            invalid_reasons.append(
+                "partial multimodal token full attention not supported"
+            )
         if use_mla != cls.is_mla():
             if use_mla:
                 invalid_reasons.append("MLA not supported")
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 8a522deed..340b161ea 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -230,6 +230,10 @@ class Attention(nn.Module, AttentionLayerBase):
         self.sliding_window = sliding_window
         self.has_sink = extra_impl_args.get("sinks") is not None
 
+        # NOTE: model_config may be None during certain tests
+        model_config = vllm_config.model_config
+        self.use_mm_prefix = model_config is not None and model_config.is_mm_prefix_lm
+
         # During model initialization, the default dtype is set as the model
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
@@ -241,6 +245,7 @@ class Attention(nn.Module, AttentionLayerBase):
                 block_size,
                 use_mla=False,
                 has_sink=self.has_sink,
+                use_mm_prefix=self.use_mm_prefix,
                 attn_type=attn_type,
             )
         else:
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index aeb130dfe..f6aba271d 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -27,6 +27,7 @@ def get_attn_backend(
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
+    use_mm_prefix: bool = False,
     attn_type: str | None = None,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
@@ -52,6 +53,7 @@ def get_attn_backend(
         use_mla=use_mla,
         has_sink=has_sink,
         use_sparse=use_sparse,
+        use_mm_prefix=use_mm_prefix,
         attn_type=attn_type,
     )
 
@@ -66,6 +68,7 @@ def _cached_get_attn_backend(
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
+    use_mm_prefix: bool = False,
     attn_type: str | None = None,
 ) -> type[AttentionBackend]:
     from vllm.platforms import current_platform
@@ -87,6 +90,7 @@ def _cached_get_attn_backend(
             use_mla,
             has_sink,
             use_sparse,
+            use_mm_prefix,
             attn_type,
         )
     else:
@@ -99,6 +103,7 @@ def _cached_get_attn_backend(
             use_mla,
             has_sink,
             use_sparse,
+            use_mm_prefix,
             attn_type,
         )
     if not attention_cls:
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 509a9c5e1..583904a94 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -4,6 +4,7 @@
 import warnings
 from collections.abc import Callable
 from dataclasses import InitVar, field
+from functools import cached_property
 from typing import TYPE_CHECKING, Any, Literal, cast, get_args
 
 import torch
@@ -1217,6 +1218,19 @@ class ModelConfig:
             )
         return False
 
+    @cached_property
+    def is_mm_prefix_lm(self) -> bool:
+        """Whether to use bidirectional attention for mm positions."""
+        MM_PREFIX_LM_MODELS = (
+            "gemma3",
+            # TODO(Isotr0py): Disable paligemma for now before
+            # we supports soft cap attention for FlexAttention
+            # "paligemma",
+        )
+        if not hasattr(self.hf_config, "model_type"):
+            return False
+        return self.hf_config.model_type in MM_PREFIX_LM_MODELS
+
     def get_head_size(self) -> int:
         # TODO remove hard code
         if self.is_deepseek_mla:
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index d9118f5b9..2ed66554e 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -175,6 +175,31 @@ class PlaceholderRange:
 
         return int(self.is_embed.sum().item())
 
+    def extract_embeds_range(self) -> list[tuple[int, int]]:
+        """Extract the start and end indices of the embedded region in prompt.
+
+        For example, given `PlaceholderRange(offset=2, length=5)` and
+        `is_embed = [False, True, False, True, True]`, the output is
+        `[(1 + offset, 1 + offset), (3 + offset, 4 + offset)]`.
+
+        Returns:
+            A tuple `(start, end)` representing the start and end
+            indices (inclusive) of the embedded region.
+            Returns full placeholder range if `is_embed` is `None`.
+        """
+        if self.is_embed is None:
+            return [(self.offset, self.offset + self.length)]
+
+        mask_i = self.is_embed.int()
+        starts = torch.nonzero(
+            torch.diff(mask_i, prepend=mask_i.new_zeros(1)) == 1
+        ).flatten()
+        ends = torch.nonzero(
+            torch.diff(mask_i, append=mask_i.new_zeros(1)) == -1
+        ).flatten()
+        ranges = torch.stack((starts, ends), dim=1) + self.offset
+        return [tuple(x) for x in ranges.tolist()]
+
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, self.__class__):
             return False
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index a2518d5fd..a49b6e92d 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -133,6 +133,7 @@ class CpuPlatform(Platform):
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
+        use_mm_prefix: bool,
         attn_type: str | None = None,
     ) -> str:
         if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 37c95f486..39101c431 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -233,6 +233,20 @@ class CudaPlatformBase(Platform):
                     "Forcing kv cache block size to 64 for FlashMLASparse backend."
                 )
 
+        scheduler_config = vllm_config.scheduler_config
+        # Note: model_config may be None during testing
+        if (
+            model_config is not None
+            and model_config.is_mm_prefix_lm
+            and scheduler_config.is_multimodal_model
+            and not scheduler_config.disable_chunked_mm_input
+        ):
+            logger.warning(
+                "Forcing --disable_chunked_mm_input for models "
+                "with multimodal-bidirectional attention."
+            )
+            scheduler_config.disable_chunked_mm_input = True
+
     @classmethod
     def get_current_memory_usage(
         cls, device: torch.types.Device | None = None
@@ -268,6 +282,7 @@ class CudaPlatformBase(Platform):
         use_mla,
         has_sink,
         use_sparse,
+        use_mm_prefix,
         device_capability,
         attn_type,
     ) -> tuple[
@@ -289,6 +304,7 @@ class CudaPlatformBase(Platform):
                     use_mla,
                     has_sink,
                     use_sparse,
+                    use_mm_prefix,
                     device_capability,
                     attn_type,
                 )
@@ -312,6 +328,7 @@ class CudaPlatformBase(Platform):
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
+        use_mm_prefix: bool,
         attn_type: str | None = None,
     ) -> str:
         if attn_type is None:
@@ -332,6 +349,7 @@ class CudaPlatformBase(Platform):
                     use_mla,
                     has_sink,
                     use_sparse,
+                    use_mm_prefix,
                     device_capability,
                     attn_type,
                 )
@@ -356,6 +374,7 @@ class CudaPlatformBase(Platform):
             use_mla,
             has_sink,
             use_sparse,
+            use_mm_prefix,
             device_capability,
             attn_type,
         )
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 27c6fac09..f04e94e42 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -239,6 +239,7 @@ class Platform:
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
+        use_mm_prefix: bool,
         attn_type: str | None = None,
     ) -> str:
         """Get the attention backend class of a device."""
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 32c7f8e53..ff0fc7851 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -216,6 +216,7 @@ class RocmPlatform(Platform):
         use_mla,
         has_sink,
         use_sparse,
+        use_mm_prefix,
         attn_type: str | None = None,
     ) -> str:
         from vllm._aiter_ops import rocm_aiter_ops
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index cbc0a996f..d6998e7a3 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -62,8 +62,9 @@ class TpuPlatform(Platform):
         kv_cache_dtype: str | None,
         block_size: int,
         use_mla: bool,
-        has_sink,
-        use_sparse,
+        has_sink: bool,
+        use_sparse: bool,
+        use_mm_prefix: bool,
         attn_type: str | None = None,
     ) -> str:
         if use_sparse:
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 768714fb1..0a0575076 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -48,7 +48,8 @@ class XPUPlatform(Platform):
         block_size: int,
         use_mla: bool,
         has_sink: bool,
-        use_sparse,
+        use_sparse: bool,
+        use_mm_prefix: bool,
         attn_type: str | None = None,
     ) -> str:
         from vllm.v1.attention.backends.utils import set_kv_cache_layout
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index a2a6eeeb1..d8dbe4cba 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -17,6 +17,7 @@ from torch.nn.attention.flex_attention import (
     and_masks,
     create_block_mask,
     flex_attention,
+    or_masks,
 )
 
 from vllm.attention.backends.abstract import (
@@ -42,6 +43,7 @@ from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
 
+torch._dynamo.config.recompile_limit = 16
 create_block_mask_compiled = torch.compile(
     create_block_mask, fullgraph=True, mode="reduce-overhead"
 )
@@ -91,6 +93,11 @@ class FlexAttentionBackend(AttentionBackend):
         """FlexAttention supports both decoder and encoder-only attention."""
         return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
 
+    @classmethod
+    def supports_mm_prefix(cls) -> bool:
+        """FlexAttention supports full attention for image tokens."""
+        return True
+
     @staticmethod
     def get_impl_cls() -> type["FlexAttentionImpl"]:
         return FlexAttentionImpl
@@ -316,6 +323,7 @@ class FlexAttentionMetadata:
     kv_block_size: int = 16
     transformed_score_mod: _score_mod_signature | None = None
     sliding_window: int | None = None
+    mm_prefix_range: dict[int, list[tuple[int, int]]] | None = None
 
     @cached_property
     def logical_block_ids(self):
@@ -443,6 +451,45 @@ class FlexAttentionMetadata:
 
         return final_mask_mod if self.causal else sliding_window_mask_mod
 
+    def get_prefix_lm_mask_mod(self) -> _mask_mod_signature:
+        """Creates the prefix LM mask_mod function for FlexAttention."""
+
+        assert self.doc_ids is not None
+        request_lookup = self.doc_ids
+
+        def prefix_lm_mask_mod(
+            b: torch.Tensor,
+            h: torch.Tensor,
+            cu_q_idx: torch.Tensor,
+            q_idx: torch.Tensor,
+            kv_idx: torch.Tensor,
+        ):
+            mask = torch.zeros_like(q_idx, dtype=torch.bool)
+            for req, doc_range_lst in (self.mm_prefix_range or {}).items():
+                req_mask = request_lookup[cu_q_idx] == req
+                for start, end in doc_range_lst:
+                    doc_mask_q = (q_idx >= start) & (q_idx <= end)
+                    doc_mask_kv = (kv_idx >= start) & (kv_idx <= end)
+                    mask = mask | (req_mask & doc_mask_q & doc_mask_kv)
+            return mask
+
+        def final_mask_mod(
+            b: torch.Tensor,
+            h: torch.Tensor,
+            q_idx: torch.Tensor,
+            physical_kv_idx: torch.Tensor,
+        ) -> torch.Tensor:
+            (is_valid, logical_q_idx, logical_kv_idx) = (
+                self._convert_physical_to_logical(self.doc_ids, q_idx, physical_kv_idx)
+            )
+            return torch.where(
+                is_valid,
+                prefix_lm_mask_mod(b, h, q_idx, logical_q_idx, logical_kv_idx),
+                False,
+            )
+
+        return final_mask_mod
+
     def get_mask_mod(self):
         # Stage-1: initialize the base mask_mod
         # (causal mask for decoder or bidirectional mask for encoder)
@@ -456,6 +503,10 @@ class FlexAttentionMetadata:
             # Add sliding window mask for sliding window attention
             sliding_window_mask_mod = self.get_sliding_window_mask_mod()
             mask_mod = and_masks(mask_mod, sliding_window_mask_mod)
+        if self.mm_prefix_range:
+            # Add prefix LM mask for vision-language prefix LM attention
+            prefix_lm_mask_mod = self.get_prefix_lm_mask_mod()
+            mask_mod = or_masks(mask_mod, prefix_lm_mask_mod)
         return mask_mod
 
     def get_transformed_score_mod(self) -> _score_mod_signature | None:
@@ -709,6 +760,7 @@ class FlexAttentionImpl(AttentionImpl):
     sliding_window: int | None
     alibi_slopes: torch.Tensor | None
     logits_soft_cap: float | None
+    mm_prefix_range: dict[int, list[tuple[int, int]]] | None = None
 
     def __init__(
         self,
@@ -810,11 +862,21 @@ class FlexAttentionImpl(AttentionImpl):
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
+        needs_rebuild_block_mask = False
         if attn_metadata.sliding_window != self.sliding_window:
             attn_metadata.sliding_window = self.sliding_window
             if attn_metadata.direct_build:
                 # update mask mod in attention metadata
                 attn_metadata.mask_mod = attn_metadata.get_mask_mod()
+            needs_rebuild_block_mask = True
+
+        if self.mm_prefix_range != getattr(attn_metadata, "mm_prefix_range", None):
+            self.mm_prefix_range = attn_metadata.mm_prefix_range
+            attn_metadata.mask_mod = attn_metadata.get_mask_mod()
+            needs_rebuild_block_mask = True
+
+        if needs_rebuild_block_mask:
+            if attn_metadata.direct_build and attn_metadata.causal:
                 attn_metadata.block_mask = attn_metadata._build_block_mask_direct()
             else:
                 attn_metadata.block_mask = attn_metadata.build_block_mask()
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a50360ab0..22a3f9d8d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -48,7 +48,10 @@ from vllm.distributed.parallel_state import (
     is_global_first_rank,
     prepare_communication_buffer_for_model,
 )
-from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.forward_context import (
+    BatchDescriptor,
+    set_forward_context,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.rotary_embedding import (
@@ -329,6 +332,7 @@ class GPUModelRunner(
         self.use_alibi = model_config.uses_alibi
 
         self.cascade_attn_enabled = not self.model_config.disable_cascade_attn
+        self.is_mm_prefix_lm = self.model_config.is_mm_prefix_lm
 
         # Multi-modal data support
         self.mm_registry = MULTIMODAL_REGISTRY
@@ -1700,6 +1704,26 @@ class GPUModelRunner(
                     for layer_name in attn_group.layer_names:
                         attn_metadata[layer_name] = attn_metadata_i
 
+        if self.is_mm_prefix_lm:
+            req_doc_ranges = {}
+            for req_id in self.input_batch.req_ids:
+                image_doc_ranges = []
+                req_state = self.requests[req_id]
+                for mm_feature in req_state.mm_features:
+                    pos_info = mm_feature.mm_position
+                    img_doc_range = pos_info.extract_embeds_range()
+                    image_doc_ranges.extend(img_doc_range)
+                req_idx = self.input_batch.req_id_to_index[req_id]
+                req_doc_ranges[req_idx] = image_doc_ranges
+
+            if isinstance(attn_metadata, list):
+                for ub_metadata in attn_metadata:
+                    for _metadata in ub_metadata.values():
+                        _metadata.mm_prefix_range = req_doc_ranges  # type: ignore[attr-defined]
+            else:
+                for _metadata in attn_metadata.values():
+                    _metadata.mm_prefix_range = req_doc_ranges  # type: ignore[attr-defined]
+
         if spec_decode_common_attn_metadata is not None and (
             num_reqs != num_reqs_padded or num_tokens != num_tokens_padded
         ):
-- 
GitLab


From 0044c4038c571e7fb38acb008256f925dfe515f1 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Sun, 7 Dec 2025 10:53:51 -0500
Subject: [PATCH 178/258] [BugFix][DeepSeek-V3.2] Fix backend selection logic
 for Blackwell (#30195)

---
 vllm/platforms/cuda.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 39101c431..915392a41 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -182,8 +182,8 @@ class CudaPlatformBase(Platform):
 
             if vllm_config.attention_config.backend is None:
                 # Default case
-                if cls.is_device_capability(100):
-                    # Blackwell => Force CutlassMLA.
+                if cls.is_device_capability(100) and not use_sparse:
+                    # Blackwell => Force CutlassMLA (unless sparse, i.e. DSv3.2).
                     use_cutlass_mla = True
                     # Set the backend in AttentionConfig so it's used during
                     # backend selection
-- 
GitLab


From af0444bf40b7db2f3fb9fe1508d25ceba24cac87 Mon Sep 17 00:00:00 2001
From: ElizaWszola <ewszola@redhat.com>
Date: Sun, 7 Dec 2025 17:38:04 +0100
Subject: [PATCH 179/258] [Performance] Fused blockwise quant RMS norm (#27883)

Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
---
 .../fused_kernels/layernorm_rms_benchmarks.py |  89 +++-
 csrc/dispatch_utils.h                         |  18 +
 csrc/ops.h                                    |   7 +
 ...fused_layernorm_dynamic_per_token_quant.cu | 144 ++++++-
 .../fused_kernels/layernorm_utils.cuh         | 402 ++++++++++++++----
 csrc/torch_bindings.cpp                       |   8 +
 tests/compile/test_fusion.py                  |  69 ++-
 .../core/test_fused_quant_layernorm.py        |  82 +++-
 vllm/_custom_ops.py                           |  40 ++
 vllm/compilation/fusion.py                    | 172 +++++++-
 vllm/compilation/matcher_utils.py             |  44 +-
 .../layers/quantization/utils/fp8_utils.py    |   2 +-
 .../layers/quantization/utils/quant_utils.py  |   6 +
 vllm/utils/deep_gemm.py                       |  17 +
 14 files changed, 946 insertions(+), 154 deletions(-)

diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index d809bf1db..fb3329975 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@ from tqdm import tqdm
 
 import vllm._custom_ops as ops
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+)
 
 
 @dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
     hidden_size: int
     add_residual: bool
     dtype: torch.dtype
+    group_size: list[int]
 
     def description(self):
         return (
@@ -29,6 +33,7 @@ class bench_params_t:
             f"x D {self.hidden_size} "
             f"x R {self.add_residual} "
             f"x DT {self.dtype}"
+            f"x GS {self.group_size}"
         )
 
 
@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
     HIDDEN_SIZES = list(range(1024, 8129, 1024))
     ADD_RESIDUAL = [True, False]
     DTYPES = [torch.bfloat16, torch.float]
+    GROUP_SIZES = [[1, 64], [1, 128]]
 
-    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
     bench_params = list(
-        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
     )
     return bench_params
 
@@ -52,6 +58,7 @@ def unfused_int8_impl(
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     # Norm
     torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     # Norm
     torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
     torch_out, _ = ops.scaled_fp8_quant(torch_out)
 
 
+def unfused_groupwise_fp8_impl(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    # Norm
+    torch_out = None
+    if residual is None:
+        torch_out = rms_norm_layer.forward_cuda(x, residual)
+    else:
+        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+    # Quant
+    torch_out, _ = per_token_group_quant_fp8(
+        torch_out, group_size=group_size[1], use_ue8m0=False
+    )
+
+
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
         x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
     )
 
 
+def fused_groupwise_impl(
+    rms_norm_layer: RMSNorm,  # this stores the weights
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    out, _ = ops.rms_norm_per_block_quant(
+        x,
+        rms_norm_layer.weight,
+        1e-6,
+        quant_dtype,
+        group_size,
+        residual=residual,
+        is_scale_transposed=True,
+    )
+
+
 # Bench functions
 def bench_fn(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
     residual: torch.Tensor,
     quant_dtype: torch.dtype,
+    group_size: list[int],
     label: str,
     sub_label: str,
     fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
         "x": x,
         "residual": residual,
         "quant_dtype": quant_dtype,
+        "group_size": group_size,
         "fn": fn,
     }
     return TBenchmark.Timer(
-        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+        stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
         globals=globals,
         label=label,
         sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.int8,
+            params.group_size,
             label,
             sub_label,
             unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.float8_e4m3fn,
+            params.group_size,
             label,
             sub_label,
             unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.int8,
+            params.group_size,
             label,
             sub_label,
             fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.float8_e4m3fn,
+            params.group_size,
             label,
             sub_label,
             fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
         )
     )
 
+    # unfused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            unfused_groupwise_fp8_impl,
+            "unfused_groupwise_fp8_impl",
+        )
+    )
+
+    # fused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            fused_groupwise_impl,
+            "fused_groupwise_fp8_impl",
+        )
+    )
+
     print_timers(timers)
 
     return timers
diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h
index e1d131e4a..de0c505b7 100644
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -118,6 +118,24 @@
     }                                         \
   }
 
+#define VLLM_DISPATCH_BOOL(expr, const_expr, ...) \
+  if (expr) {                                     \
+    constexpr bool const_expr = true;             \
+    __VA_ARGS__();                                \
+  } else {                                        \
+    constexpr bool const_expr = false;            \
+    __VA_ARGS__();                                \
+  }
+
+#define VLLM_DISPATCH_GROUP_SIZE(group_size, const_group_size, ...) \
+  if (group_size == 128) {                                          \
+    constexpr int const_group_size = 128;                           \
+    __VA_ARGS__();                                                  \
+  } else if (group_size == 64) {                                    \
+    constexpr int const_group_size = 64;                            \
+    __VA_ARGS__();                                                  \
+  }
+
 #define VLLM_DISPATCH_RANK234(NUM_DIMS, ...)                                   \
   switch (NUM_DIMS) {                                                          \
     case 2: {                                                                  \
diff --git a/csrc/ops.h b/csrc/ops.h
index d302f0491..9617d6358 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -128,6 +128,13 @@ void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
                                       std::optional<torch::Tensor> scale_ub,
                                       std::optional<torch::Tensor> residual);
 
+void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor const& weight,
+                              torch::Tensor& scales, double const epsilon,
+                              std::optional<torch::Tensor> scale_ub,
+                              std::optional<torch::Tensor> residual,
+                              int64_t group_size, bool is_scale_transposed);
+
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                       std::optional<torch::Tensor> key, int64_t head_size,
                       torch::Tensor& cos_sin_cache, bool is_neox);
diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
index 92d6c2f40..2080ef3cd 100644
--- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
+++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
@@ -31,14 +31,15 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
 
   // RMS Norm + Quant
   if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
+    token_scale = 1.0f / token_scale;
     vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
                                      has_residual>(
-        out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   } else {
     // FP8 - Do not invert token_scale for exact match with FBGemm
     vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
                                      has_residual>(
-        out, input, weight, rms, token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   }
 }
 
@@ -75,14 +76,52 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
 
   // RMS Norm + Quant
   if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
+    token_scale = 1.0f / token_scale;
     vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
-        out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   } else {
     // FP8 - Do not invert s_token_scale for exact match with FBGemm
     vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
-        out, input, weight, rms, token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   }
 }
+
+// RMS norm + quant kernel
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false, int32_t group_size = 0>
+__global__ void rms_norm_per_block_quant_kernel(
+    scalar_out_t* __restrict__ out,  // [..., hidden_size]
+    float* __restrict__ scales,      // [num_tokens, hidden_size / group_size]
+                                     // or
+                                     // [hidden_size / group_size, num_tokens]
+    scalar_t const* __restrict__ input,   // [..., hidden_size]
+    scalar_t const* __restrict__ weight,  // [hidden_size]
+    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
+    scalar_t* __restrict__ residual = nullptr) {
+  float rms;
+  // Compute RMS
+  // Always able to vectorize due to constraints on hidden_size
+  vllm::vectorized::compute_rms<scalar_t, has_residual>(
+      &rms, input, hidden_size, var_epsilon, residual);
+
+  // Compute Scale
+  // Always able to vectorize due to constraints on hidden_size and group_size
+  vllm::vectorized::compute_dynamic_per_token_scales<
+      scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
+      nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual);
+
+  // RMS Norm + Quant
+  // Always able to vectorize due to constraints on hidden_size
+  // For int8, don't invert token_scale here: do it inside the norm_and_quant
+  // kernel. We do it because particular elements of token_scale can be shared
+  // between multiple threads, so this way, we avoid extra synchronization
+  // overhead.
+  vllm::vectorized::norm_and_quant<
+      scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
+      has_residual, is_scale_transposed, group_size>(
+      out, input, weight, rms, scales, hidden_size, residual);
+}
+
 }  // namespace vllm
 
 // Residual add + RMS norm + dynamic per token
@@ -103,30 +142,19 @@ void rms_norm_dynamic_per_token_quant_dispatch(
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  if (residual.has_value()) {
+  VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
     VLLM_DISPATCH_QUANT_TYPES(
         out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
           vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
-                                                        true>
+                                                        has_residual>
               <<<grid, block, 0, stream>>>(
                   out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
                   input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
                   scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
-                  var_epsilon, hidden_size, residual->data_ptr<scalar_in_t>());
+                  var_epsilon, hidden_size,
+                  has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
         });
-
-  } else {
-    VLLM_DISPATCH_QUANT_TYPES(
-        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
-          vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
-                                                        false>
-              <<<grid, block, 0, stream>>>(
-                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
-                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
-                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
-                  var_epsilon, hidden_size, nullptr);
-        });
-  }
+  });
 }
 
 void rms_norm_dynamic_per_token_quant(
@@ -157,3 +185,79 @@ void rms_norm_dynamic_per_token_quant(
             out, input, weight, scales, var_epsilon, scale_ub, residual);
       });
 }
+
+// Residual add + RMS norm + dynamic per token
+void rms_norm_per_block_quant_dispatch(
+    torch::Tensor& out,           // [..., hidden_size]
+    torch::Tensor const& input,   // [..., hidden_size]
+    torch::Tensor const& weight,  // [hidden_size]
+    torch::Tensor& scales,        // [num_tokens, hidden_size / group_size] or
+                                  // [hidden_size / group_size, num_tokens]
+    int32_t group_size,
+    double const var_epsilon,  // Variance epsilon used in norm calculation
+    std::optional<at::Tensor> const& scale_ub,
+    std::optional<at::Tensor>& residual, bool is_scale_transposed) {
+  int32_t hidden_size = input.size(-1);
+  auto num_tokens = input.numel() / hidden_size;
+
+  dim3 grid(num_tokens);
+  const int max_block_size = (num_tokens <= 256) ? 512 : 256;
+  dim3 block(std::min(hidden_size, max_block_size));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "rms_norm_per_block_quant_fp_dispatch", [&] {
+        using scalar_in_t = scalar_t;
+        VLLM_DISPATCH_GROUP_SIZE(group_size, gs, [&] {
+          VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
+            VLLM_DISPATCH_BOOL(is_scale_transposed, transpose_scale, [&] {
+              VLLM_DISPATCH_QUANT_TYPES(
+                  out.scalar_type(), "rms_norm_per_block_quant_kernel", [&] {
+                    vllm::rms_norm_per_block_quant_kernel<scalar_in_t, scalar_t,
+                                                          has_residual,
+                                                          transpose_scale, gs>
+                        <<<grid, block, 0, stream>>>(
+                            out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
+                            input.data_ptr<scalar_in_t>(),
+                            weight.data_ptr<scalar_in_t>(),
+                            scale_ub.has_value() ? scale_ub->data_ptr<float>()
+                                                 : nullptr,
+                            var_epsilon, hidden_size,
+                            has_residual ? residual->data_ptr<scalar_in_t>()
+                                         : nullptr);
+                  });
+            });
+          });
+        });
+      });
+}
+
+void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor const& weight,
+                              torch::Tensor& scales, double const var_epsilon,
+                              std::optional<torch::Tensor> scale_ub,
+                              std::optional<torch::Tensor> residual,
+                              int64_t group_size, bool is_scale_transposed) {
+  static c10::ScalarType kFp8Type = is_fp8_ocp()
+                                        ? c10::ScalarType::Float8_e4m3fn
+                                        : c10::ScalarType::Float8_e4m3fnuz;
+  TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
+  TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
+
+  if (scale_ub.has_value()) {
+    TORCH_CHECK(out.dtype() == kFp8Type);
+  }
+  TORCH_CHECK(weight.dtype() == input.dtype());
+  TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }
+
+  TORCH_CHECK(group_size == 128 || group_size == 64,
+              "Unsupported group size: ", group_size);
+
+  rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
+                                    var_epsilon, scale_ub, residual,
+                                    is_scale_transposed);
+}
\ No newline at end of file
diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh
index 2d2fd7712..cb7adc312 100644
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -9,6 +9,7 @@
 #include "quant_conversions.cuh"
 
 #include "../../cub_helpers.h"
+#include "../../cuda_compat.h"
 
 namespace vllm {
 
@@ -43,62 +44,150 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   *rms = s_rms;
 }
 
-template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
+__device__ float warpReduceMaxSpecialized(volatile float* val, int64_t tid,
+                                          int64_t thread_in_warp,
+                                          int64_t reduced_elems) {
+  static_assert(WARP_SIZE == 32 || WARP_SIZE == 64);
+  if constexpr (WARP_SIZE == 64) {
+    if (thread_in_warp + 64 < reduced_elems)
+      val[tid] = fmaxf(val[tid], val[tid + 64]);
+  }
+  if (thread_in_warp + 32 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 32]);
+  if (thread_in_warp + 16 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 16]);
+  if (thread_in_warp + 8 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 8]);
+  if (thread_in_warp + 4 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 4]);
+  if (thread_in_warp + 2 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 2]);
+  if (thread_in_warp + 1 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 1]);
+  return val[tid];
+}
+
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false>
 __device__ void compute_dynamic_per_token_scales(
     float* __restrict__ token_scale, float* __restrict__ all_token_scales,
     scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
     float const rms, float const* __restrict__ scale_ub,
-    int32_t const hidden_size,
-    scalar_t const* __restrict__ residual = nullptr) {
-  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;
-  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
-
+    int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
+    int32_t const group_size = 0) {
   float block_absmax_val_maybe = 0.0f;
-  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
-    float x = static_cast<float>(input[token_offset + i]);
-    if constexpr (has_residual) {
-      x += static_cast<float>(residual[token_offset + i]);
+  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
+  __syncthreads();
+  if (group_size > 0) {
+    __shared__ float s_max_vals[1024];
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+    int64_t num_groups = hidden_size / group_size;
+    int64_t const threads_per_group = blockDim.x / num_groups;
+    int64_t const thread_in_group = threadIdx.x % threads_per_group;
+    int64_t const group_offset = threadIdx.x / threads_per_group * group_size;
+    int64_t const thread_offset = group_offset + thread_in_group;
+    int64_t const thread_end =
+        min(group_offset + group_size, static_cast<int64_t>(hidden_size));
+    for (auto i = thread_offset; i < thread_end; i += threads_per_group) {
+      float x = static_cast<float>(input[token_offset + i]);
+      if constexpr (has_residual) {
+        x += static_cast<float>(residual[token_offset + i]);
+      }
+      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
+      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
     }
+    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
+    __syncthreads();
+
+    int64_t const warp_size = WARP_SIZE;
+    int64_t const num_warps = blockDim.x / warp_size;
+    int64_t const warp_id = threadIdx.x / warp_size;
+    int64_t const thread_in_warp = threadIdx.x % warp_size;
+    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
+    for (auto i = 0; i < groups_per_warp; ++i) {
+      int64_t const group_id = i * num_warps + warp_id;
+      if (group_id < num_groups) {
+        int64_t warp_start = group_id * threads_per_group;
+        int64_t const start = warp_start + thread_in_warp;
+        int64_t const warp_end = min(warp_start + threads_per_group,
+                                     static_cast<int64_t>(hidden_size));
+        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
+          s_max_vals[start] =
+              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
+        }
+        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
+                                 min(warp_end - warp_start, warp_size));
+      }
+    }
+    __syncthreads();
+
+    if (thread_in_group == 0 && thread_offset < thread_end) {
+      block_absmax_val_maybe = s_max_vals[threadIdx.x];
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      // Global output store
+      if constexpr (is_scale_transposed) {
+        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
+                         blockIdx.x] = scale;
+      } else {
+        all_token_scales[blockIdx.x * num_groups +
+                         threadIdx.x / threads_per_group] = scale;
+      }
+    }
+    __syncthreads();
+  } else {
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+
+    for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+      float x = static_cast<float>(input[token_offset + i]);
+      if constexpr (has_residual) {
+        x += static_cast<float>(residual[token_offset + i]);
+      }
 
-    x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
-    block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
-  }
-
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  block_absmax_val_maybe =
-      BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
-
-  __shared__ float s_token_scale;
-  if (threadIdx.x == 0) {
-    float scale = 0.0f;
-    if (scale_ub) {
-      scale = min(block_absmax_val_maybe, *scale_ub);
-    } else {
-      scale = block_absmax_val_maybe;
+      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
+      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
     }
-    // token scale computation
-    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
-    s_token_scale = scale;                 // Shared memory store
-    all_token_scales[blockIdx.x] = scale;  // Global output store
-  }
-  __syncthreads();
+    using BlockReduce = cub::BlockReduce<float, 1024>;
+    __shared__ typename BlockReduce::TempStorage reduceStore;
+    block_absmax_val_maybe =
+        BlockReduce(reduceStore)
+            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
+
+    __shared__ float s_token_scale;
+    if (threadIdx.x == 0) {
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      s_token_scale = scale;                 // Shared memory store
+      all_token_scales[blockIdx.x] = scale;  // Global output store
+    }
+    __syncthreads();
 
-  *token_scale = s_token_scale;
+    *token_scale = s_token_scale;
+  }
 }
 
 template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
-          bool has_residual = false>
+          bool has_residual = false, bool is_scale_transposed = false>
 __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
                                scalar_t const* __restrict__ input,
                                scalar_t const* __restrict__ weight,
-                               float const rms, float const scale,
+                               float const rms, float* const scale,
                                int32_t const hidden_size,
-                               scalar_t* __restrict__ residual = nullptr) {
+                               scalar_t* __restrict__ residual = nullptr,
+                               int32_t const group_size = 0) {
   int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;
 
   for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
     float x = static_cast<float>(input[token_offset + i]);
@@ -109,8 +198,21 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     // Norm
     x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
     // Quant
+    // If groupwise is_scale_inverted is true, so we invert the scale here.
+    int64_t scale_idx = 0;
+    if (group_size > 0) {
+      if constexpr (is_scale_transposed) {
+        scale_idx = (i / group_size) * gridDim.x + blockIdx.x;
+      } else {
+        scale_idx = blockIdx.x * (hidden_size / group_size) + i / group_size;
+      }
+    }
+    auto scale_val =
+        (group_size > 0
+             ? (is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx])
+             : *scale);
     output[token_offset + i] =
-        ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale);
+        ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale_val);
   }
 }
 
@@ -178,95 +280,191 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
 
 // Vectorized version of vllm::compute_dynamic_per_token_scales
 // hidden_size must be a multiple of 4
-template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false, int32_t group_size = 0>
 __device__ void compute_dynamic_per_token_scales(
     float* __restrict__ token_scale, float* __restrict__ all_token_scales,
     scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
     float const rms, float const* __restrict__ scale_ub,
     int32_t const hidden_size,
     scalar_t const* __restrict__ residual = nullptr) {
-  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;
-
-  // Vectorized input/weight/residual to better utilize memory bandwidth.
-  vec4_t<scalar_t> const* vec_input =
-      reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
-  vec4_t<scalar_t> const* vec_weight =
-      reinterpret_cast<vec4_t<scalar_t> const*>(weight);
-  vec4_t<scalar_t> const* vec_residual = nullptr;
-  if constexpr (has_residual) {
-    vec_residual =
-        reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
-  }
-
   constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
 
   const int VEC_SIZE = 4;
-  int32_t const num_vec_elems = hidden_size >> 2;
   float block_absmax_val_maybe = 0.0f;
 
+  // Vectorized input/weight/residual to better utilize memory bandwidth.
+  vec4_t<scalar_t> const* vec_input = nullptr;
+  vec4_t<scalar_t> const* vec_weight = nullptr;
+  vec4_t<scalar_t> const* vec_residual = nullptr;
+
+  if constexpr (group_size > 0) {
+    __shared__ float s_max_vals[1024];
+
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+    int64_t const num_groups = hidden_size / group_size;
+    int64_t const threads_per_group = blockDim.x / num_groups;
+    int64_t const thread_in_group = threadIdx.x % threads_per_group;
+    int64_t const group_offset =
+        threadIdx.x / threads_per_group * (group_size >> 2);
+    int64_t const thread_offset = group_offset + thread_in_group;
+    int64_t const thread_end = min(group_offset + (group_size >> 2),
+                                   static_cast<int64_t>(hidden_size >> 2));
+    vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
+    if constexpr (has_residual) {
+      vec_residual =
+          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
+    }
+    int32_t const num_vec_elems = thread_end;
+
 #pragma unroll 4
-  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
-    vec4_t<scalar_t> in = vec_input[i];
-    vec4_t<scalar_t> const w = vec_weight[i];
+    for (auto i = thread_offset; i < num_vec_elems; i += threads_per_group) {
+      vec4_t<scalar_t> in = vec_input[i];
+      vec4_t<scalar_t> const w = vec_weight[i];
 
-    vec4_t<float> x;
+      vec4_t<float> x;
 #pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      x.val[j] = static_cast<float>(in.val[j]);
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] = static_cast<float>(in.val[j]);
+      }
+
+      if constexpr (has_residual) {
+        vec4_t<scalar_t> r = vec_residual[i];
+#pragma unroll
+        for (int j = 0; j < VEC_SIZE; ++j) {
+          x.val[j] += static_cast<float>(r.val[j]);
+        }
+      }
+
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        block_absmax_val_maybe =
+            fmaxf(block_absmax_val_maybe,
+                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+      }
     }
 
+    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
+    __syncthreads();
+
+    int64_t const warp_size = WARP_SIZE;
+    int64_t const num_warps = blockDim.x / warp_size;
+    int64_t const warp_id = threadIdx.x / warp_size;
+    int64_t const thread_in_warp = threadIdx.x % warp_size;
+    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
+    for (auto i = 0; i < groups_per_warp; ++i) {
+      int64_t const group_id = i * num_warps + warp_id;
+      if (group_id < num_groups) {
+        int64_t warp_start = group_id * threads_per_group;
+        int64_t const start = warp_start + thread_in_warp;
+        int64_t const warp_end = min(warp_start + threads_per_group,
+                                     static_cast<int64_t>(hidden_size));
+        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
+          s_max_vals[start] =
+              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
+        }
+        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
+                                 min(warp_end - warp_start, warp_size));
+      }
+    }
+    __syncthreads();
+
+    if (thread_in_group == 0 && thread_offset < thread_end) {
+      block_absmax_val_maybe = s_max_vals[threadIdx.x];
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      // Global output store
+      if constexpr (is_scale_transposed) {
+        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
+                         blockIdx.x] = scale;
+      } else {
+        all_token_scales[blockIdx.x * num_groups +
+                         threadIdx.x / threads_per_group] = scale;
+      }
+    }
+    __syncthreads();
+
+  } else {
+    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
+    vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
     if constexpr (has_residual) {
-      vec4_t<scalar_t> r = vec_residual[i];
+      vec_residual =
+          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
+    }
+
+    int32_t const num_vec_elems = (hidden_size >> 2);
+
+#pragma unroll 4
+    for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+      vec4_t<scalar_t> in = vec_input[i];
+      vec4_t<scalar_t> const w = vec_weight[i];
+
+      vec4_t<float> x;
 #pragma unroll
       for (int j = 0; j < VEC_SIZE; ++j) {
-        x.val[j] += static_cast<float>(r.val[j]);
+        x.val[j] = static_cast<float>(in.val[j]);
       }
-    }
 
+      if constexpr (has_residual) {
+        vec4_t<scalar_t> r = vec_residual[i];
 #pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      block_absmax_val_maybe =
-          fmaxf(block_absmax_val_maybe,
-                fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
-    }
-  }
+        for (int j = 0; j < VEC_SIZE; ++j) {
+          x.val[j] += static_cast<float>(r.val[j]);
+        }
+      }
 
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  block_absmax_val_maybe =
-      BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        block_absmax_val_maybe =
+            fmaxf(block_absmax_val_maybe,
+                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+      }
+    }
 
-  __shared__ float s_token_scale;
-  if (threadIdx.x == 0) {
-    float scale = 0.0f;
-    if (scale_ub) {
-      scale = min(block_absmax_val_maybe, *scale_ub);
-    } else {
-      scale = block_absmax_val_maybe;
+    using BlockReduce = cub::BlockReduce<float, 1024>;
+    __shared__ typename BlockReduce::TempStorage reduceStore;
+    block_absmax_val_maybe =
+        BlockReduce(reduceStore)
+            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
+
+    __shared__ float s_token_scale;
+    if (threadIdx.x == 0) {
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      s_token_scale = scale;                 // shared memory store
+      all_token_scales[blockIdx.x] = scale;  // global output store
     }
-    // token scale computation
-    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
-    s_token_scale = scale;                 // shared memory store
-    all_token_scales[blockIdx.x] = scale;  // global output store
-  }
-  __syncthreads();
+    __syncthreads();
 
-  *token_scale = s_token_scale;
+    *token_scale = s_token_scale;
+  }
 }
 
 // hidden_size must be a multiple of 4
 template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
-          bool has_residual = false>
+          bool has_residual = false, bool is_scale_transposed = false,
+          int32_t group_size = 0>
 __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
                                scalar_t const* __restrict__ input,
                                scalar_t const* __restrict__ weight,
-                               float const rms, float const scale,
+                               float const rms, float* const scale,
                                int32_t const hidden_size,
                                scalar_t* __restrict__ residual = nullptr) {
   int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;
 
   // Vectorized input/output/weight/residual to better utilize memory bandwidth.
   vec4_t<scalar_t> const* vec_input =
@@ -311,10 +509,26 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     }
 
     q8x4_t<scalar_out_t> out;
+
+    float scale_val;
+
+    if constexpr (group_size > 0) {
+      int64_t const num_groups = hidden_size / group_size;
+      int64_t scale_idx = 0;
+      if constexpr (is_scale_transposed) {
+        scale_idx = (i * VEC_SIZE / group_size) * gridDim.x + blockIdx.x;
+      } else {
+        scale_idx = blockIdx.x * num_groups + i * VEC_SIZE / group_size;
+      }
+      scale_val =
+          is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx];
+    } else {
+      scale_val = *scale;
+    }
 #pragma unroll
     for (int j = 0; j < VEC_SIZE; ++j) {
       out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
+          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale_val);
     }
     vec_output[i] = out;
   }
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 23ac1d9ab..db37a9b9b 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -215,6 +215,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
            &rms_norm_dynamic_per_token_quant);
 
+  // Fused Layernorm + Block quant kernels
+  ops.def(
+      "rms_norm_per_block_quant(Tensor! result, Tensor input, "
+      "Tensor weight, Tensor! scale, float epsilon, "
+      "Tensor? scale_ub, Tensor!? residual, int group_size, "
+      "bool is_scale_transposed) -> ()");
+  ops.impl("rms_norm_per_block_quant", torch::kCUDA, &rms_norm_per_block_quant);
+
   // Rotary embedding
   // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
   ops.def(
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index d0ba8385f..2ad34a798 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -18,6 +18,9 @@ from vllm.config import (
     VllmConfig,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    W8A8BlockFp8LinearOp,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
     QuantKey,
@@ -25,10 +28,12 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     Fp8LinearOp,
+    cutlass_block_fp8_supported,
     cutlass_fp8_supported,
     maybe_create_device_identity,
 )
 from vllm.platforms import current_platform
+from vllm.utils.deep_gemm import is_deep_gemm_supported
 
 from ..utils import override_cutlass_fp8_supported
 from .backend import TestBackend
@@ -44,7 +49,7 @@ class TestModel(torch.nn.Module):
         self,
         hidden_size: int,
         eps: float,
-        static: bool,
+        group_shape: GroupShape,
         cuda_force_torch: bool,
         *args,
         **kwargs,
@@ -52,8 +57,17 @@ class TestModel(torch.nn.Module):
         super().__init__(*args, **kwargs)
         self.cuda_force_torch = cuda_force_torch
         self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)]
-        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
-        group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
+        if group_shape.is_per_group():
+            self.wscale = [
+                torch.rand(
+                    (hidden_size // group_shape[1], hidden_size // group_shape[1]),
+                    dtype=torch.float32,
+                )
+                for _ in range(3)
+            ]
+        else:
+            self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+        static = group_shape == GroupShape.PER_TENSOR
         quant_scale = ScaleDesc(torch.float32, static, group_shape)
         self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
         if static:
@@ -61,18 +75,29 @@ class TestModel(torch.nn.Module):
         else:
             self.scale = [None for _ in range(3)]
         self.w = [
-            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
-            for _ in range(3)
+            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3)
         ]
+        if not group_shape.is_per_group():
+            self.w = [self.w[0].t() for _ in range(3)]
 
-        with override_cutlass_fp8_supported(not cuda_force_torch):
-            self.fp8_linear = Fp8LinearOp(
-                act_quant_static=static,
+        if group_shape.is_per_group():
+            self.fp8_linear = W8A8BlockFp8LinearOp(
+                weight_group_shape=GroupShape(group_shape[1], group_shape[1]),
                 act_quant_group_shape=group_shape,
+                cutlass_block_fp8_supported=cutlass_block_fp8_supported(),
+                use_aiter_and_is_supported=False,
             )
+            self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled()
+        else:
+            with override_cutlass_fp8_supported(not cuda_force_torch):
+                self.fp8_linear = Fp8LinearOp(
+                    act_quant_static=static,
+                    act_quant_group_shape=group_shape,
+                )
+                self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
 
         self.enable_rms_norm_custom_op = self.norm[0].enabled()
-        self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
+        self.group_shape = group_shape
 
     def forward(self, x):
         # avoid having graph input be an arg to a pattern directly
@@ -119,11 +144,19 @@ class TestModel(torch.nn.Module):
         )
 
 
+GROUP_SHAPES = [
+    GroupShape.PER_TOKEN,
+    GroupShape.PER_TENSOR,
+    GroupShape(1, 128),
+    GroupShape(1, 64),
+]
+
+
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("hidden_size", [64])
+@pytest.mark.parametrize("hidden_size", [256])
 @pytest.mark.parametrize("num_tokens", [257])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
-@pytest.mark.parametrize("static", [True, False])
+@pytest.mark.parametrize("group_shape", GROUP_SHAPES)
 @pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
 @pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
 # cuda_force_torch used to test torch code path on platforms that
@@ -139,7 +172,7 @@ def test_fusion_rmsnorm_quant(
     hidden_size,
     num_tokens,
     eps,
-    static,
+    group_shape,
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
     cuda_force_torch,
@@ -149,6 +182,15 @@ def test_fusion_rmsnorm_quant(
     torch.manual_seed(1)
     maybe_create_device_identity()  # needed for certain non-cutlass fp8 paths
 
+    if not enable_quant_fp8_custom_op and group_shape.is_per_group():
+        pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization")
+
+    # Skip test for 64-bit group shape when running with cutlass or deepgemm
+    if group_shape == GroupShape(1, 64) and (
+        cutlass_block_fp8_supported() or is_deep_gemm_supported()
+    ):
+        pytest.skip("Unsupported group shape 64 for CUTLASS/DeepGemm")
+
     custom_ops = []
     if enable_rms_norm_custom_op:
         custom_ops.append("+rms_norm")
@@ -172,8 +214,7 @@ def test_fusion_rmsnorm_quant(
 
         backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
         backend2 = TestBackend(noop_pass, cleanup_pass)
-        model = TestModel(hidden_size, eps, static, cuda_force_torch)
-
+        model = TestModel(hidden_size, eps, group_shape, cuda_force_torch)
         # First dimension dynamic
         x = torch.rand(num_tokens, hidden_size)
         torch._dynamo.mark_dynamic(x, 0)
diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py
index b5fc653ca..094073f5d 100644
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -8,6 +8,12 @@ import torch
 import vllm._custom_ops as ops
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+)
+from vllm.model_executor.layers.quantization.utils.int8_utils import (
+    per_token_group_quant_int8,
+)
 
 DTYPES = [torch.bfloat16, torch.float]
 QUANT_DTYPES = [torch.int8, torch.float8_e4m3fn]
@@ -21,6 +27,7 @@ NUM_TOKENS_HIDDEN_SIZES = [
 
 ADD_RESIDUAL = [False, True]
 SCALE_UBS = [True, False]
+GROUP_SIZES = [None, [1, 64], [1, 128]]
 SEEDS = [0]
 CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 
@@ -45,12 +52,13 @@ def ref_rms_norm(
     return out, residual
 
 
-def ref_dynamic_per_token_quant(
+def ref_dynamic_per_token_or_block_quant(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
     quant_dtype: torch.dtype,
     residual: torch.Tensor | None,
     scale_ub: torch.Tensor | None,
+    group_size: list[int] | None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     if scale_ub is not None:
         assert quant_dtype == torch.float8_e4m3fn
@@ -59,13 +67,24 @@ def ref_dynamic_per_token_quant(
     torch_out, residual = ref_rms_norm(rms_norm_layer, x, residual)
 
     # Quant
-    if quant_dtype == torch.float8_e4m3fn:
-        torch_out, scales = ops.scaled_fp8_quant(
-            torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True
-        )
+    if group_size is not None:
+        if quant_dtype == torch.float8_e4m3fn:
+            torch_out, scales = per_token_group_quant_fp8(
+                torch_out, group_size=group_size[1], use_ue8m0=False
+            )
+        else:
+            assert quant_dtype == torch.int8
+            torch_out, scales = per_token_group_quant_int8(
+                torch_out, group_size=group_size[1]
+            )
     else:
-        assert quant_dtype == torch.int8
-        torch_out, scales, _ = ops.scaled_int8_quant(torch_out)
+        if quant_dtype == torch.float8_e4m3fn:
+            torch_out, scales = ops.scaled_fp8_quant(
+                torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True
+            )
+        else:
+            assert quant_dtype == torch.int8
+            torch_out, scales, _ = ops.scaled_int8_quant(torch_out)
 
     return torch_out, scales, residual
 
@@ -76,24 +95,32 @@ def ref_impl(
     quant_dtype: torch.dtype,
     residual: torch.Tensor | None,
     scale_ub: torch.Tensor | None,
+    group_size: list[int] | None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
-    return ref_dynamic_per_token_quant(
-        rms_norm_layer, x, quant_dtype, residual, scale_ub
+    return ref_dynamic_per_token_or_block_quant(
+        rms_norm_layer, x, quant_dtype, residual, scale_ub, group_size
     )
 
 
-def ops_dynamic_per_token_quant(
+def ops_dynamic_per_token_or_block_quant(
     weight: torch.Tensor,
     x: torch.Tensor,
     quant_dtype: torch.dtype,
     residual: torch.Tensor | None,
     scale_ub: torch.Tensor | None,
+    group_size: list[int] | None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     if residual is not None:
         residual = residual.clone()
-    out, scales = ops.rms_norm_dynamic_per_token_quant(
-        x, weight, EPS, quant_dtype, scale_ub, residual
-    )
+    if group_size is not None:
+        out, scales = ops.rms_norm_per_block_quant(
+            x, weight, EPS, quant_dtype, group_size, scale_ub, residual, True
+        )
+        scales = scales.contiguous()
+    else:
+        out, scales = ops.rms_norm_dynamic_per_token_quant(
+            x, weight, EPS, quant_dtype, scale_ub, residual
+        )
     return out, scales, residual
 
 
@@ -103,8 +130,11 @@ def ops_impl(
     quant_dtype: torch.dtype,
     residual: torch.Tensor | None,
     scale_ub: torch.Tensor | None,
+    group_size: list[int] | None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
-    return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
+    return ops_dynamic_per_token_or_block_quant(
+        weight, x, quant_dtype, residual, scale_ub, group_size
+    )
 
 
 @pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES)
@@ -112,6 +142,7 @@ def ops_impl(
 @pytest.mark.parametrize("has_scale_ub", SCALE_UBS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("quant_dtype", QUANT_DTYPES)
+@pytest.mark.parametrize("group_size", GROUP_SIZES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
@@ -122,6 +153,7 @@ def test_rms_norm(
     has_scale_ub: bool,
     dtype: torch.dtype,
     quant_dtype: torch.dtype,
+    group_size: list[int] | None,
     seed: int,
     device: str,
 ) -> None:
@@ -130,6 +162,14 @@ def test_rms_norm(
         torch.cuda.manual_seed(seed)
     torch.set_default_device(device)
 
+    if group_size is not None and hidden_size % group_size[1] != 0:
+        # skip
+        return
+
+    if group_size is not None and has_scale_ub:
+        # blockwise baseline doesn't support scale_ub
+        return
+
     if has_scale_ub and quant_dtype != torch.float8_e4m3fn:
         # skip
         return
@@ -150,10 +190,10 @@ def test_rms_norm(
         scale_ub = None
 
     ref_out, ref_scales, ref_residual = ref_impl(
-        layer, x, quant_dtype, residual, scale_ub
+        layer, x, quant_dtype, residual, scale_ub, group_size
     )
     ops_out, ops_scales, ops_residual = ops_impl(
-        layer.weight, x, quant_dtype, residual, scale_ub
+        layer.weight, x, quant_dtype, residual, scale_ub, group_size
     )
 
     assert ref_out.dtype == quant_dtype
@@ -166,11 +206,15 @@ def test_rms_norm(
         assert torch.allclose(ref_scales, ops_scales)
         a = ref_out.to(dtype=torch.float32)
         b = ops_out.to(dtype=torch.float32)
-        ok = torch.allclose(a, b)
+        ok = torch.allclose(a, b, atol=1e-6)
         if not ok:
             # fallback: compare dequantized values with relaxed tolerance
-            a_deq = a * ref_scales.view(-1, 1)
-            b_deq = b * ops_scales.view(-1, 1)
+            if group_size is None:
+                a_deq = a * ref_scales.view(-1, 1)
+                b_deq = b * ops_scales.view(-1, 1)
+            else:
+                a_deq = a * ref_scales.repeat_interleave(group_size[1], dim=1)
+                b_deq = b * ops_scales.repeat_interleave(group_size[1], dim=1)
             # NOTE: It is possible that some future test cases trigger this
             # max diff due to precision issues. If such an error is
             # encountered, it's recommended to inspect the differences between
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 94e275452..77d545329 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -436,6 +436,46 @@ def rms_norm_dynamic_per_token_quant(
     return output, scales
 
 
+# fused quant layer norm ops blocked
+def rms_norm_per_block_quant(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+    scale_ub: torch.Tensor | None = None,
+    residual: torch.Tensor | None = None,
+    is_scale_transposed: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    assert len(group_size) == 2
+    output = torch.empty_like(input, dtype=quant_dtype)
+    if is_scale_transposed:
+        scales = torch.empty(
+            (input.shape[-1] // group_size[1], input.numel() // input.shape[-1]),
+            device=input.device,
+            dtype=torch.float32,
+        ).transpose(0, 1)
+    else:
+        scales = torch.empty(
+            (input.numel() // input.shape[-1], input.shape[-1] // group_size[1]),
+            device=input.device,
+            dtype=torch.float32,
+        )
+
+    torch.ops._C.rms_norm_per_block_quant(
+        output,
+        input,
+        weight,
+        scales,
+        epsilon,
+        scale_ub,
+        residual,
+        group_size[1],
+        is_scale_transposed,
+    )
+    return output, scales
+
+
 # quantization ops
 # awq
 def awq_dequantize(
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index 1d6e297b4..de083a2e5 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -15,13 +15,22 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
     QuantKey,
     ScaleDesc,
+    kFp8Dynamic64Sym,
+    kFp8Dynamic128Sym,
     kFp8DynamicTensorSym,
     kFp8DynamicTokenSym,
     kFp8StaticTensorSym,
     kNvfp4Quant,
     kStaticTensorScale,
 )
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    cutlass_block_fp8_supported,
+)
 from vllm.platforms import current_platform
+from vllm.utils.deep_gemm import (
+    is_deep_gemm_e8m0_used,
+    should_use_deepgemm_for_fp8_linear_for_nk,
+)
 
 from .inductor_pass import enable_fake_mode
 from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm
@@ -58,6 +67,9 @@ QUANT_OPS: dict[QuantKey, OpOverload] = {
 }
 if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
     QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default
+if current_platform.is_cuda():
+    QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501
+    QUANT_OPS[kFp8Dynamic64Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501
 
 
 class FusedRMSQuantKey(NamedTuple):
@@ -90,6 +102,18 @@ FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = {
     FusedRMSQuantKey(
         kFp8DynamicTokenSym, True
     ): torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
+    FusedRMSQuantKey(
+        kFp8Dynamic128Sym, False
+    ): torch.ops._C.rms_norm_per_block_quant.default,  # noqa: E501
+    FusedRMSQuantKey(
+        kFp8Dynamic128Sym, True
+    ): torch.ops._C.rms_norm_per_block_quant.default,  # noqa: E501
+    FusedRMSQuantKey(
+        kFp8Dynamic64Sym, False
+    ): torch.ops._C.rms_norm_per_block_quant.default,  # noqa: E501
+    FusedRMSQuantKey(
+        kFp8Dynamic64Sym, True
+    ): torch.ops._C.rms_norm_per_block_quant.default,  # noqa: E501
 }
 
 
@@ -100,6 +124,15 @@ class RMSNormQuantPattern:
         config = get_current_vllm_config()
         self.model_dtype = config.model_config.dtype if config.model_config else None
 
+        # groupwise FP8 linear uses col major scales if deepgemm and cutlass
+        using_deepgemm = should_use_deepgemm_for_fp8_linear_for_nk(
+            self.model_dtype,
+            config.model_config.hf_config.intermediate_size,
+            config.model_config.hf_config.hidden_size,
+        )
+        use_col_major_scales = using_deepgemm or cutlass_block_fp8_supported()
+        use_e8m0 = is_deep_gemm_e8m0_used() if using_deepgemm else False
+
         assert key in FUSED_OPS, f"unsupported fused rmsnorm+quant op for {key}"
         self.FUSED_OP = FUSED_OPS[key]
 
@@ -108,7 +141,9 @@ class RMSNormQuantPattern:
             if not key.fused_add
             else MatcherFusedAddRMSNorm(epsilon)
         )
-        self.quant_matcher = MatcherQuantFP8(key.quant)
+        self.quant_matcher = MatcherQuantFP8(
+            key.quant, use_col_major_scales=use_col_major_scales, use_e8m0=use_e8m0
+        )
 
 
 class RMSNormStaticQuantPattern(RMSNormQuantPattern):
@@ -218,6 +253,120 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern):
         )
 
 
+class FusedAddRMSNormGroupQuantPattern(RMSNormQuantPattern):
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape,
+        symmetric=True,
+    ):
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=True,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
+        self.group_shape = group_shape
+        super().__init__(epsilon, key)
+
+    def register(self, pm_pass: PatternMatcherPass):
+        def pattern(input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor):
+            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result, scale = self.quant_matcher(result_rms)
+            return result, residual, scale
+
+        def replacement(
+            input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
+        ):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=self.model_dtype)
+
+            result = torch.empty_like(input, dtype=self.quant_dtype)
+            scale = self.quant_matcher.make_scale(
+                input, transposed=self.quant_matcher.use_col_major_scales
+            )
+            at = auto_functionalized(
+                self.FUSED_OP,
+                result=result,
+                input=input,
+                weight=weight,
+                scale=scale,
+                epsilon=self.epsilon,
+                scale_ub=None,
+                residual=residual,
+                group_size=self.group_shape[1],
+                is_scale_transposed=self.quant_matcher.use_col_major_scales,
+            )
+
+            # result, residual, scale
+            return at[1], at[3], at[2]
+
+        pm.register_replacement(
+            pattern,
+            replacement,
+            self.rmsnorm_matcher.inputs(),
+            pm.fwd_only,
+            pm_pass,
+        )
+
+
+class RMSNormGroupQuantPattern(RMSNormQuantPattern):
+    def __init__(
+        self,
+        epsilon: float,
+        quant_dtype: torch.dtype,
+        group_shape: GroupShape,
+        symmetric=True,
+    ):
+        scale = ScaleDesc(torch.float32, False, group_shape)
+        key = FusedRMSQuantKey(
+            fused_add=False,
+            quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
+        )
+        self.group_shape = group_shape
+        super().__init__(epsilon, key)
+
+    def register(self, pm_pass: PatternMatcherPass):
+        def pattern(input: torch.Tensor, weight: torch.Tensor):
+            result_rms = self.rmsnorm_matcher(input, weight)
+            result, scale = self.quant_matcher(result_rms)
+            return result, scale
+
+        def replacement(input: torch.Tensor, weight: torch.Tensor):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=self.model_dtype)
+
+            result = torch.empty_like(input, dtype=self.quant_dtype)
+            scale = self.quant_matcher.make_scale(
+                input, transposed=self.quant_matcher.use_col_major_scales
+            )
+            at = auto_functionalized(
+                self.FUSED_OP,
+                result=result,
+                input=input,
+                weight=weight,
+                scale=scale,
+                epsilon=self.epsilon,
+                scale_ub=None,
+                residual=None,
+                group_size=self.group_shape[1],
+                is_scale_transposed=self.quant_matcher.use_col_major_scales,
+            )
+
+            # result, scale
+            return at[1], at[2]
+
+        pm.register_replacement(
+            pattern,
+            replacement,
+            self.rmsnorm_matcher.inputs(),
+            pm.fwd_only,
+            pm_pass,
+        )
+
+
 class RMSNormDynamicQuantPattern(RMSNormQuantPattern):
     def __init__(
         self,
@@ -340,6 +489,25 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
         # Make sure fused add patterns are before simple rms norm,
         # as the latter is a subset of the former in torch ops
         for epsilon in [1e-5, 1e-6]:
+            # Fuse fused_add_rms_norm + fp8 group quant
+            FusedAddRMSNormGroupQuantPattern(
+                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128)
+            ).register(self.patterns)
+
+            # Fuse rms_norm + fp8 group quant
+            RMSNormGroupQuantPattern(
+                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128)
+            ).register(self.patterns)
+
+            FusedAddRMSNormGroupQuantPattern(
+                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64)
+            ).register(self.patterns)
+
+            # Fuse rms_norm + fp8 group quant
+            RMSNormGroupQuantPattern(
+                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64)
+            ).register(self.patterns)
+
             # Fuse fused_add_rms_norm + static fp8 quant
             FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
                 self.patterns
@@ -366,9 +534,11 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
     def uuid(self) -> Any:
         return self.hash_source(
             self,
+            RMSNormGroupQuantPattern,
             RMSNormQuantPattern,
             RMSNormStaticQuantPattern,
             RMSNormDynamicQuantPattern,
             FusedAddRMSNormStaticQuantPattern,
             FusedAddRMSNormDynamicQuantPattern,
+            FusedAddRMSNormGroupQuantPattern,
         )
diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py
index e4cd063d2..0c0bece9b 100644
--- a/vllm/compilation/matcher_utils.py
+++ b/vllm/compilation/matcher_utils.py
@@ -13,6 +13,8 @@ from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     _normalize_quant_group_shape,
+    kFp8Dynamic64Sym,
+    kFp8Dynamic128Sym,
     kFp8DynamicTensorSym,
     kFp8DynamicTokenSym,
     kFp8StaticTensorSym,
@@ -35,6 +37,10 @@ QUANT_OPS: dict[QuantKey, OpOverload] = {
 if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
     QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default  # noqa: E501
 
+if current_platform.is_cuda():
+    QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501
+    QUANT_OPS[kFp8Dynamic64Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501
+
 SILU_MUL_OP = torch.ops._C.silu_and_mul.default
 
 
@@ -224,12 +230,20 @@ class MatcherFusedAddRMSNorm(MatcherCustomOp):
 
 
 class MatcherQuantFP8(MatcherCustomOp):
-    def __init__(self, quant_key: QuantKey, enabled: bool | None = None):
+    def __init__(
+        self,
+        quant_key: QuantKey,
+        enabled: bool | None = None,
+        use_col_major_scales: bool = False,
+        use_e8m0: bool = False,
+    ):
         if enabled is None:
             enabled = QuantFP8.enabled()
 
         super().__init__(enabled)
         self.quant_key = quant_key
+        self.use_col_major_scales = use_col_major_scales
+        self.use_e8m0 = use_e8m0
         assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}"
         self.QUANT_OP = QUANT_OPS[quant_key]
 
@@ -248,6 +262,27 @@ class MatcherQuantFP8(MatcherCustomOp):
             input.shape, device=input.device, dtype=self.quant_key.dtype
         )
 
+        if self.quant_key.scale.group_shape.is_per_group():
+            assert scale is None
+            scale = self.make_scale(input, transposed=self.use_col_major_scales)
+
+            finfo = torch.finfo(self.quant_key.dtype)
+            fp8_min = finfo.min
+            fp8_max = finfo.max
+
+            _, result, scale = auto_functionalized(
+                self.QUANT_OP,
+                input=input,
+                output_q=result,
+                output_s=scale,
+                group_size=self.quant_key.scale.group_shape[1],
+                eps=1e-10,
+                fp8_min=fp8_min,
+                fp8_max=fp8_max,
+                scale_ue8m0=self.use_e8m0,
+            )
+            return result, scale
+
         if self.quant_key.scale.static:
             assert scale is not None
             _, result = auto_functionalized(
@@ -269,7 +304,7 @@ class MatcherQuantFP8(MatcherCustomOp):
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.quant_fp8(input, scale)
 
-    def make_scale(self, input: torch.Tensor):
+    def make_scale(self, input: torch.Tensor, transposed: bool = False):
         normalized_group_shape = _normalize_quant_group_shape(
             input, self.quant_key.scale.group_shape
         )
@@ -277,6 +312,11 @@ class MatcherQuantFP8(MatcherCustomOp):
             input.shape[0] // normalized_group_shape[0],
             input.shape[1] // normalized_group_shape[1],
         )
+        if transposed:
+            scale_shape = tuple(reversed(scale_shape))
+            return torch.empty(
+                scale_shape, device=input.device, dtype=torch.float32
+            ).permute(-1, -2)
 
         return torch.empty(scale_shape, device=input.device, dtype=torch.float32)
 
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 7e1bda863..ad92f4ec6 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -733,7 +733,7 @@ def per_token_group_quant_fp8(
     assert out_q is None or out_q.shape == x.shape
     x_q = out_q
     if x_q is None:
-        x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+        x_q = torch.empty(x.shape, device=x.device, dtype=dtype)
 
     # Allocate the scale tensor in either row- or column-major format.
     if column_major_scales:
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index d056d3404..92ee8c498 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -115,6 +115,12 @@ kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, kDynamicTokenScale, symmetric=True)
 kNvfp4GroupScale = ScaleDesc(FP8_DTYPE, False, GroupShape(1, 16))
 kNvfp4Quant = QuantKey(FP4_DTYPE, scale=kNvfp4GroupScale, scale2=kStaticTensorScale)
 
+kDynamic128Scale = ScaleDesc(torch.float32, False, GroupShape(1, 128))
+kFp8Dynamic128Sym = QuantKey(FP8_DTYPE, kDynamic128Scale, symmetric=True)
+
+kDynamic64Scale = ScaleDesc(torch.float32, False, GroupShape(1, 64))
+kFp8Dynamic64Sym = QuantKey(FP8_DTYPE, kDynamic64Scale, symmetric=True)
+
 
 # Normalize the group_shape to the full extent for any dims that are -1
 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape):
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index b25c1e3e1..8545108a0 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -381,6 +381,22 @@ def should_use_deepgemm_for_fp8_linear(
     )
 
 
+def should_use_deepgemm_for_fp8_linear_for_nk(
+    output_dtype: torch.dtype,
+    shape0: int,
+    shape1: int,
+    supports_deep_gemm: bool | None = None,
+):
+    if supports_deep_gemm is None:
+        supports_deep_gemm = is_deep_gemm_supported()
+    return (
+        supports_deep_gemm
+        and output_dtype == torch.bfloat16
+        and shape0 % 128 == 0
+        and shape1 % 128 == 0
+    )
+
+
 __all__ = [
     "calc_diff",
     "fp8_gemm_nt",
@@ -394,6 +410,7 @@ __all__ = [
     "is_deep_gemm_supported",
     "get_num_sms",
     "should_use_deepgemm_for_fp8_linear",
+    "should_use_deepgemm_for_fp8_linear_for_nk",
     "get_col_major_tma_aligned_tensor",
     "get_mk_alignment_for_contiguous_layout",
 ]
-- 
GitLab


From 444f0e3f339caba85f84c6628e1df50605b241a0 Mon Sep 17 00:00:00 2001
From: daniel-salib <danielsalib@meta.com>
Date: Sun, 7 Dec 2025 18:02:52 -0800
Subject: [PATCH 180/258] [Frontend] Add MCP type support infrastructure to
 Responses API (#30054)

Signed-off-by: Daniel Salib <danielsalib@meta.com>
---
 .../openai/parser/test_harmony_utils.py       | 185 +++++++++++++++++-
 .../openai/parser/harmony_utils.py            | 163 +++++++++++----
 vllm/entrypoints/openai/protocol.py           |   8 +
 3 files changed, 309 insertions(+), 47 deletions(-)

diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
index ae6f558f2..a3fd80938 100644
--- a/tests/entrypoints/openai/parser/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem
+from openai.types.responses.response_output_item import McpCall
 from openai_harmony import Author, Message, Role, TextContent
 
 from vllm.entrypoints.openai.parser.harmony_utils import (
@@ -400,17 +401,19 @@ class TestParseOutputMessage:
         assert output_items[0].arguments == '{"location": "San Francisco"}'
         assert output_items[1].arguments == '{"location": "New York"}'
 
-    def test_commentary_with_unknown_recipient_raises_error(self):
-        """Test that commentary with unknown recipient raises ValueError."""
-        message = Message.from_role_and_content(Role.ASSISTANT, "some content")
+    def test_commentary_with_unknown_recipient_creates_mcp_call(self):
+        """Test that commentary with unknown recipient creates MCP call."""
+        message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
         message = message.with_channel("commentary")
-        message = message.with_recipient("unknown_recipient")
+        message = message.with_recipient("custom_tool")
 
-        try:
-            parse_output_message(message)
-            raise AssertionError("Expected ValueError to be raised")
-        except ValueError as e:
-            assert "Unknown recipient: unknown_recipient" in str(e)
+        output_items = parse_output_message(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], McpCall)
+        assert output_items[0].type == "mcp_call"
+        assert output_items[0].name == "custom_tool"
+        assert output_items[0].server_label == "custom_tool"
 
     def test_analysis_channel_creates_reasoning(self):
         """Test that analysis channel creates reasoning items."""
@@ -451,3 +454,167 @@ def test_has_custom_tools() -> None:
     assert has_custom_tools(
         {"web_search_preview", "code_interpreter", "container", "others"}
     )
+
+
+def test_parse_mcp_call_basic() -> None:
+    """Test that MCP calls are parsed with correct type and server_label."""
+    message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}')
+    message = message.with_recipient("filesystem")
+    message = message.with_channel("commentary")
+
+    output_items = parse_output_message(message)
+
+    assert len(output_items) == 1
+    assert isinstance(output_items[0], McpCall)
+    assert output_items[0].type == "mcp_call"
+    assert output_items[0].name == "filesystem"
+    assert output_items[0].server_label == "filesystem"
+    assert output_items[0].arguments == '{"path": "/tmp"}'
+    assert output_items[0].status == "completed"
+
+
+def test_parse_mcp_call_dotted_recipient() -> None:
+    """Test that dotted recipients extract the tool name correctly."""
+    message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}')
+    message = message.with_recipient("repo_browser.list")
+    message = message.with_channel("commentary")
+
+    output_items = parse_output_message(message)
+
+    assert len(output_items) == 1
+    assert isinstance(output_items[0], McpCall)
+    assert output_items[0].name == "list"
+    assert output_items[0].server_label == "repo_browser"
+
+
+def test_mcp_vs_function_call() -> None:
+    """Test that function calls are not parsed as MCP calls."""
+    func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
+    func_message = func_message.with_recipient("functions.my_tool")
+    func_message = func_message.with_channel("commentary")
+
+    func_items = parse_output_message(func_message)
+
+    assert len(func_items) == 1
+    assert not isinstance(func_items[0], McpCall)
+    assert func_items[0].type == "function_call"
+
+
+def test_mcp_vs_builtin_tools() -> None:
+    """Test that built-in tools (python, container) are not parsed as MCP calls."""
+    # Test python (built-in tool) - should be reasoning, not MCP
+    python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')")
+    python_message = python_message.with_recipient("python")
+    python_message = python_message.with_channel("commentary")
+
+    python_items = parse_output_message(python_message)
+
+    assert len(python_items) == 1
+    assert not isinstance(python_items[0], McpCall)
+    assert python_items[0].type == "reasoning"
+
+
+def test_parse_remaining_state_commentary_channel() -> None:
+    """Test parse_remaining_state with commentary channel and various recipients."""
+    from unittest.mock import Mock
+
+    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state
+
+    # Test 1: functions.* recipient → should return function tool call
+    parser_func = Mock()
+    parser_func.current_content = '{"arg": "value"}'
+    parser_func.current_role = Role.ASSISTANT
+    parser_func.current_channel = "commentary"
+    parser_func.current_recipient = "functions.my_tool"
+
+    func_items = parse_remaining_state(parser_func)
+
+    assert len(func_items) == 1
+    assert not isinstance(func_items[0], McpCall)
+    assert func_items[0].type == "function_call"
+    assert func_items[0].name == "my_tool"
+    assert func_items[0].status == "in_progress"
+
+    # Test 2: MCP tool (not builtin) → should return MCP call
+    parser_mcp = Mock()
+    parser_mcp.current_content = '{"path": "/tmp"}'
+    parser_mcp.current_role = Role.ASSISTANT
+    parser_mcp.current_channel = "commentary"
+    parser_mcp.current_recipient = "filesystem"
+
+    mcp_items = parse_remaining_state(parser_mcp)
+
+    assert len(mcp_items) == 1
+    assert isinstance(mcp_items[0], McpCall)
+    assert mcp_items[0].type == "mcp_call"
+    assert mcp_items[0].name == "filesystem"
+    assert mcp_items[0].server_label == "filesystem"
+    assert mcp_items[0].status == "in_progress"
+
+    # Test 3: Built-in tool (python)
+    # should NOT return MCP call, falls through to reasoning
+    parser_builtin = Mock()
+    parser_builtin.current_content = "print('hello')"
+    parser_builtin.current_role = Role.ASSISTANT
+    parser_builtin.current_channel = "commentary"
+    parser_builtin.current_recipient = "python"
+
+    builtin_items = parse_remaining_state(parser_builtin)
+
+    # Should fall through to reasoning logic
+    assert len(builtin_items) == 1
+    assert not isinstance(builtin_items[0], McpCall)
+    assert builtin_items[0].type == "reasoning"
+
+
+def test_parse_remaining_state_analysis_channel() -> None:
+    """Test parse_remaining_state with analysis channel and various recipients."""
+    from unittest.mock import Mock
+
+    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state
+
+    # Test 1: functions.* recipient → should return function tool call
+    parser_func = Mock()
+    parser_func.current_content = '{"arg": "value"}'
+    parser_func.current_role = Role.ASSISTANT
+    parser_func.current_channel = "analysis"
+    parser_func.current_recipient = "functions.my_tool"
+
+    func_items = parse_remaining_state(parser_func)
+
+    assert len(func_items) == 1
+    assert not isinstance(func_items[0], McpCall)
+    assert func_items[0].type == "function_call"
+    assert func_items[0].name == "my_tool"
+    assert func_items[0].status == "in_progress"
+
+    # Test 2: MCP tool (not builtin) → should return MCP call
+    parser_mcp = Mock()
+    parser_mcp.current_content = '{"query": "test"}'
+    parser_mcp.current_role = Role.ASSISTANT
+    parser_mcp.current_channel = "analysis"
+    parser_mcp.current_recipient = "database"
+
+    mcp_items = parse_remaining_state(parser_mcp)
+
+    assert len(mcp_items) == 1
+    assert isinstance(mcp_items[0], McpCall)
+    assert mcp_items[0].type == "mcp_call"
+    assert mcp_items[0].name == "database"
+    assert mcp_items[0].server_label == "database"
+    assert mcp_items[0].status == "in_progress"
+
+    # Test 3: Built-in tool (container)
+    # should NOT return MCP call, falls through to reasoning
+    parser_builtin = Mock()
+    parser_builtin.current_content = "docker run"
+    parser_builtin.current_role = Role.ASSISTANT
+    parser_builtin.current_channel = "analysis"
+    parser_builtin.current_recipient = "container"
+
+    builtin_items = parse_remaining_state(parser_builtin)
+
+    # Should fall through to reasoning logic
+    assert len(builtin_items) == 1
+    assert not isinstance(builtin_items[0], McpCall)
+    assert builtin_items[0].type == "reasoning"
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 7da0914ce..2260e9604 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -19,6 +19,7 @@ from openai.types.responses.response_function_web_search import (
     ActionSearch,
     ResponseFunctionWebSearch,
 )
+from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent,
 )
@@ -155,11 +156,7 @@ def get_developer_message(
                 "web_search_preview",
                 "code_interpreter",
                 "container",
-                "mcp",
             ):
-                # These are built-in tools that are added to the system message.
-                # Adding in MCP for now until we support MCP tools executed
-                # server side
                 pass
 
             elif tool.type == "function":
@@ -427,6 +424,44 @@ def _parse_final_message(message: Message) -> ResponseOutputItem:
     )
 
 
+def _parse_mcp_recipient(recipient: str) -> tuple[str, str]:
+    """
+    Parse MCP recipient into (server_label, tool_name).
+
+    For dotted recipients like "repo_browser.list":
+        - server_label: "repo_browser" (namespace/server)
+        - tool_name: "list" (specific tool)
+
+    For simple recipients like "filesystem":
+        - server_label: "filesystem"
+        - tool_name: "filesystem"
+    """
+    if "." in recipient:
+        server_label = recipient.split(".")[0]
+        tool_name = recipient.split(".")[-1]
+    else:
+        server_label = recipient
+        tool_name = recipient
+    return server_label, tool_name
+
+
+def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
+    """Parse MCP calls into MCP call items."""
+    server_label, tool_name = _parse_mcp_recipient(recipient)
+    output_items = []
+    for content in message.content:
+        response_item = McpCall(
+            arguments=content.text,
+            type="mcp_call",
+            name=tool_name,
+            server_label=server_label,
+            id=f"mcp_{random_uuid()}",
+            status="completed",
+        )
+        output_items.append(response_item)
+    return output_items
+
+
 def parse_output_message(message: Message) -> list[ResponseOutputItem]:
     """
     Parse a Harmony message into a list of output response items.
@@ -440,33 +475,34 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
     output_items: list[ResponseOutputItem] = []
     recipient = message.recipient
 
-    # Browser tool calls
-    if recipient is not None and recipient.startswith("browser."):
-        output_items.append(_parse_browser_tool_call(message, recipient))
-
-    # Analysis channel (reasoning/chain-of-thought)
-    elif message.channel == "analysis":
-        output_items.extend(_parse_reasoning_content(message))
+    if recipient is not None:
+        # Browser tool calls
+        if recipient.startswith("browser."):
+            output_items.append(_parse_browser_tool_call(message, recipient))
 
-    # Commentary channel
-    elif message.channel == "commentary":
-        # Function calls
-        if recipient is not None and recipient.startswith("functions."):
+        # Function calls (should only happen on commentary channel)
+        elif message.channel == "commentary" and recipient.startswith("functions."):
             output_items.extend(_parse_function_call(message, recipient))
 
-        # Built-in tools on commentary channel are treated as reasoning for now
-        elif (
-            recipient is None  # Preambles: explanatory text before tool calls
-            or recipient.startswith(("python", "browser", "container"))
-        ):
-            # Per Harmony format, commentary channel can contain preambles to calling
-            # multiple functions - explanatory text with no recipient. Built-in tool
-            # recipients (python/browser/container) also generate reasoning output.
+        # Built-in tools are treated as reasoning
+        elif recipient.startswith(("python", "browser", "container")):
+            # Built-in tool recipients (python/browser/container)
+            # generate reasoning output
             output_items.extend(_parse_reasoning_content(message))
+
+        # All other recipients are MCP calls
         else:
-            raise ValueError(f"Unknown recipient: {recipient}")
+            output_items.extend(_parse_mcp_call(message, recipient))
+
+    # No recipient - handle based on channel for non-tool messages
+    elif message.channel == "analysis":
+        output_items.extend(_parse_reasoning_content(message))
+
+    elif message.channel == "commentary":
+        # Per Harmony format, commentary channel can contain preambles to calling
+        # multiple functions - explanatory text with no recipient
+        output_items.extend(_parse_reasoning_content(message))
 
-    # Final output message
     elif message.channel == "final":
         output_items.append(_parse_final_message(message))
 
@@ -485,20 +521,70 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]:
     if current_recipient is not None and current_recipient.startswith("browser."):
         return []
 
-    if parser.current_channel == "analysis":
-        reasoning_item = ResponseReasoningItem(
-            id=f"rs_{random_uuid()}",
-            summary=[],
-            type="reasoning",
-            content=[
-                ResponseReasoningTextContent(
-                    text=parser.current_content, type="reasoning_text"
+    if current_recipient and parser.current_channel in ("commentary", "analysis"):
+        if current_recipient.startswith("functions."):
+            rid = random_uuid()
+            return [
+                ResponseFunctionToolCall(
+                    arguments=parser.current_content,
+                    call_id=f"call_{rid}",
+                    type="function_call",
+                    name=current_recipient.split(".")[-1],
+                    id=f"fc_{rid}",
+                    status="in_progress",
                 )
-            ],
-            status=None,
-        )
-        return [reasoning_item]
-    elif parser.current_channel == "final":
+            ]
+        # Built-in tools (python, browser, container) should be treated as reasoning
+        elif not (
+            current_recipient.startswith("python")
+            or current_recipient.startswith("browser")
+            or current_recipient.startswith("container")
+        ):
+            # All other recipients are MCP calls
+            rid = random_uuid()
+            server_label, tool_name = _parse_mcp_recipient(current_recipient)
+            return [
+                McpCall(
+                    arguments=parser.current_content,
+                    type="mcp_call",
+                    name=tool_name,
+                    server_label=server_label,
+                    id=f"mcp_{rid}",
+                    status="in_progress",
+                )
+            ]
+
+    if parser.current_channel == "commentary":
+        return [
+            ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(
+                        text=parser.current_content, type="reasoning_text"
+                    )
+                ],
+                status=None,
+            )
+        ]
+
+    if parser.current_channel == "analysis":
+        return [
+            ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(
+                        text=parser.current_content, type="reasoning_text"
+                    )
+                ],
+                status=None,
+            )
+        ]
+
+    if parser.current_channel == "final":
         output_text = ResponseOutputText(
             text=parser.current_content,
             annotations=[],  # TODO
@@ -515,6 +601,7 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]:
             type="message",
         )
         return [text_item]
+
     return []
 
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 2d34a6a0c..aeff6bded 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -25,6 +25,10 @@ from openai.types.responses import (
     ResponseContentPartDoneEvent,
     ResponseFunctionToolCall,
     ResponseInputItemParam,
+    ResponseMcpCallArgumentsDeltaEvent,
+    ResponseMcpCallArgumentsDoneEvent,
+    ResponseMcpCallCompletedEvent,
+    ResponseMcpCallInProgressEvent,
     ResponseOutputItem,
     ResponseOutputItemAddedEvent,
     ResponseOutputItemDoneEvent,
@@ -1790,6 +1794,10 @@ StreamingResponsesResponse: TypeAlias = (
     | ResponseCodeInterpreterCallCodeDoneEvent
     | ResponseCodeInterpreterCallInterpretingEvent
     | ResponseCodeInterpreterCallCompletedEvent
+    | ResponseMcpCallArgumentsDeltaEvent
+    | ResponseMcpCallArgumentsDoneEvent
+    | ResponseMcpCallInProgressEvent
+    | ResponseMcpCallCompletedEvent
 )
 
 
-- 
GitLab


From 735284ed865e0c863cbd084527f0ae77a560fac1 Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Sun, 7 Dec 2025 18:04:03 -0800
Subject: [PATCH 181/258] [responsesAPI][7] Browser, Container MCP tools for
 non harmony models (#29989)

Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 vllm/entrypoints/context.py | 89 ++++++++++++++++++++++++++++++++++---
 1 file changed, 83 insertions(+), 6 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index a484a437c..01ddab473 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -278,12 +278,14 @@ class ParsableContext(ConversationContext):
     def need_builtin_tool_call(self) -> bool:
         """Return true if the last message is a MCP tool call"""
         last_message = self.parser.response_messages[-1]
-        # TODO: figure out which tools are MCP tools
-        if (  # noqa: SIM103
-            last_message.type == "function_call"
-            and last_message.name in ("code_interpreter", "python")
-        ):
-            return True
+        # TODO(qandrew): figure out which tools are MCP tools
+        if last_message.type == "function_call":  # noqa: SIM102
+            if last_message.name in (
+                "code_interpreter",
+                "python",
+                "web_search_preview",
+            ) or last_message.name.startswith("container"):
+                return True
 
         return False
 
@@ -310,12 +312,87 @@ class ParsableContext(ConversationContext):
 
         return [message]
 
+    async def call_search_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: FunctionCall
+    ) -> list[ResponseInputOutputItem]:
+        self.called_tools.add("browser")
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result_parsable_context(self)
+        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
+            try:
+                args = json.loads(last_msg.arguments)
+            except json.JSONDecodeError as e:
+                return _create_json_parse_error_messages(last_msg, e)
+        else:
+            args = json.loads(last_msg.arguments)
+        result = await tool_session.call_tool("search", args)
+        result_str = result.content[0].text
+
+        message = ResponseFunctionToolCallOutputItem(
+            id=f"fco_{random_uuid()}",
+            type="function_call_output",
+            call_id=f"call_{random_uuid()}",
+            output=result_str,
+            status="completed",
+        )
+
+        return [message]
+
+    async def call_container_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        """
+        Call container tool. Expect this to be run in a stateful docker
+        with command line terminal.
+        The official container tool would at least
+        expect the following format:
+        - for tool name: exec
+            - args:
+                {
+                    "cmd":List[str] "command to execute",
+                    "workdir":optional[str] "current working directory",
+                    "env":optional[object/dict] "environment variables",
+                    "session_name":optional[str] "session name",
+                    "timeout":optional[int] "timeout in seconds",
+                    "user":optional[str] "user name",
+                }
+        """
+        self.called_tools.add("container")
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result_parsable_context(self)
+        # tool_name = last_msg.recipient.split(".")[1].split(" ")[0]
+        if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY:
+            try:
+                args = json.loads(last_msg.arguments)
+            except json.JSONDecodeError as e:
+                return _create_json_parse_error_messages(last_msg, e)
+        else:
+            args = json.loads(last_msg.arguments)
+        result = await tool_session.call_tool("exec", args)
+        result_str = result.content[0].text
+
+        message = ResponseFunctionToolCallOutputItem(
+            id=f"fco_{random_uuid()}",
+            type="function_call_output",
+            call_id=f"call_{random_uuid()}",
+            output=result_str,
+            status="completed",
+        )
+
+        return [message]
+
     async def call_tool(self) -> list[ResponseInputOutputItem]:
         if not self.parser.response_messages:
             return []
         last_msg = self.parser.response_messages[-1]
         if last_msg.name == "code_interpreter":
             return await self.call_python_tool(self._tool_sessions["python"], last_msg)
+        elif last_msg.name == "web_search_preview":
+            return await self.call_search_tool(self._tool_sessions["browser"], last_msg)
+        elif last_msg.name.startswith("container"):
+            return await self.call_container_tool(
+                self._tool_sessions["container"], last_msg
+            )
         return []
 
     def render_for_completion(self):
-- 
GitLab


From 344b50d5258d7cf3f136416e1dbcd9b5ee99bb00 Mon Sep 17 00:00:00 2001
From: Zhijian Jiang <Zhijian.Jiang@outlook.com>
Date: Sun, 7 Dec 2025 19:26:25 -0800
Subject: [PATCH 182/258] Address comment to mergify.yml in #30117 (#30219)

Signed-off-by: Zhijian Jiang <Zhijian.Jiang@outlook.com>
---
 .github/mergify.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 5cb9fcdf9..58a5d7786 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -172,7 +172,7 @@ pull_request_rules:
       - files~=^tests/entrypoints/test_context.py
       - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
       - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/harmony_utils.py
+      - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
       - files~=^vllm/entrypoints/tool_server.py
       - files~=^vllm/entrypoints/tool.py
       - files~=^vllm/entrypoints/context.py
@@ -390,4 +390,4 @@ pull_request_rules:
   actions:
     label:
       add:
-        - kv-connector
\ No newline at end of file
+        - kv-connector
-- 
GitLab


From d726a7b0ed0b58b7703709dc3e486c5fe2f55db3 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Sun, 7 Dec 2025 20:21:05 -0800
Subject: [PATCH 183/258] [BugFix] Unblock use of LoRA with data parallel mode
 (#30220)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 vllm/v1/metrics/loggers.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 6961e15c2..882e0ce0b 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -952,7 +952,10 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
         self.gauge_lora_info: Gauge | None = None
         if vllm_config.lora_config is not None:
             if len(self.engine_indexes) > 1:
-                raise NotImplementedError("LoRA in DP mode is not supported yet.")
+                logger.warning(
+                    "vllm:lora_requests_info prometheus metrics may be "
+                    "incorrect/misleading with data parallel deployments."
+                )
             self.labelname_max_lora = "max_lora"
             self.labelname_waiting_lora_adapters = "waiting_lora_adapters"
             self.labelname_running_lora_adapters = "running_lora_adapters"
-- 
GitLab


From c6df05ebb49901510890aea7c7411a17a38c0a8f Mon Sep 17 00:00:00 2001
From: Zhiwei <532707544@qq.com>
Date: Mon, 8 Dec 2025 13:23:46 +0800
Subject: [PATCH 184/258] [ROCm] [Fused Moe EP] Use binary expert mask for
 aiter fused moe kernel (#29773)

Signed-off-by: ZhiweiYan-96 <zhiwei.yan@amd.com>
---
 vllm/model_executor/layers/fused_moe/layer.py              | 4 ++++
 vllm/model_executor/layers/quantization/quark/quark_moe.py | 1 +
 2 files changed, 5 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 6001b6d83..9b4d77a06 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -520,6 +520,10 @@ class FusedMoE(CustomOp):
         self._init_aiter_shared_experts_topK_buffer(
             vllm_config=vllm_config, dp_size=dp_size_
         )
+        if self.use_ep and self.rocm_aiter_fmoe_enabled:
+            assert self.expert_mask is None or torch.all(
+                (expert_mask == 0) | (expert_mask == 1)
+            ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s."
 
         assert intermediate_size % self.tp_size == 0
         self.hidden_size = hidden_size
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 8be0299ea..9e2b21343 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -633,6 +633,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
                 topk_ids=topk_ids,
                 activation=activation,
                 quant_config=self.moe_quant_config,
+                expert_map=expert_map,
             )
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
-- 
GitLab


From d143271234454026454c5ee6a55fc516dd298dac Mon Sep 17 00:00:00 2001
From: Jiangyun Zhu <riverclouds.zhu@qq.com>
Date: Mon, 8 Dec 2025 14:43:47 +0800
Subject: [PATCH 185/258] [Bugfix] fix fuse_allreduce_rms when tp =1 (#30178)

Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
---
 vllm/compilation/collective_fusion.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 2717738dd..57bd94c7e 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -1076,11 +1076,15 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         self.disabled = True
         self.tp_size = get_tensor_model_parallel_world_size()
         if self.tp_size <= 1:
+            logger.warning_once("AllReduce fusion pass is disabled for tp_size <= 1.")
             return
         self.patterns: PatternMatcherPass = PatternMatcherPass(
             pass_name="all_reduce_fusion_pass"
         )
         if config.model_config is None:
+            logger.warning_once(
+                "AllReduce fusion pass is disabled for missing model_config."
+            )
             return
         self.hidden_dim = config.model_config.get_hidden_size()
         self.group = get_tp_group().device_group
@@ -1188,6 +1192,9 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
         self.disabled = False
 
     def is_applicable_for_range(self, compile_range: Range) -> bool:
+        if self.disabled:
+            logger.warning_once("AllReduce fusion pass is disabled.")
+            return False
         return compile_range.end <= self.max_token_num
 
     @VllmInductorPass.time_and_log
-- 
GitLab


From cd00c443d2272e5325dc6d730616f0a68ad533d1 Mon Sep 17 00:00:00 2001
From: Zhiyu <zhiyuc@nvidia.com>
Date: Sun, 7 Dec 2025 23:05:27 -0800
Subject: [PATCH 186/258] [Misc] Rename TensorRT Model Optimizer to Model
 Optimizer (#30091)

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 docs/features/quantization/README.md   | 2 +-
 docs/features/quantization/modelopt.md | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md
index 7b5287bad..8b4dcf019 100644
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@@ -14,7 +14,7 @@ Contents:
 - [INT4 W4A16](int4.md)
 - [INT8 W8A8](int8.md)
 - [FP8 W8A8](fp8.md)
-- [NVIDIA TensorRT Model Optimizer](modelopt.md)
+- [NVIDIA Model Optimizer](modelopt.md)
 - [AMD Quark](quark.md)
 - [Quantized KV Cache](quantized_kvcache.md)
 - [TorchAO](torchao.md)
diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md
index c48ccb719..b02d5ba9e 100644
--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -1,6 +1,6 @@
-# NVIDIA TensorRT Model Optimizer
+# NVIDIA Model Optimizer
 
-The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models.
+The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models.
 
 We recommend installing the library with:
 
@@ -10,7 +10,7 @@ pip install nvidia-modelopt
 
 ## Quantizing HuggingFace Models with PTQ
 
-You can quantize HuggingFace models using the example scripts provided in the TensorRT Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
+You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
 
 Below is an example showing how to quantize a model using modelopt's PTQ API:
 
-- 
GitLab


From bcb6f5947f8acac22f5d1d5fc92a91b06fd57e77 Mon Sep 17 00:00:00 2001
From: Dazhi Jiang <dazhi_jiang@163.com>
Date: Mon, 8 Dec 2025 15:12:42 +0800
Subject: [PATCH 187/258] [Perf] Remove sync point in vit torch sdpa attn
 backend (#30232)

Signed-off-by: Dazhi Jiang <dazhi_jiang@163.com>
---
 vllm/attention/ops/vit_attn_wrappers.py  | 12 ++++++------
 vllm/model_executor/models/ernie45_vl.py | 12 ++++++------
 vllm/model_executor/models/glm4_1v.py    | 12 ++++++------
 vllm/model_executor/models/qwen2_vl.py   | 12 ++++++------
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
index d9f15f1e4..9036c2b80 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -93,12 +93,12 @@ def torch_sdpa_wrapper(
     cu_seqlens: torch.Tensor,
 ) -> torch.Tensor:
     outputs = []
-    for i in range(1, len(cu_seqlens)):
-        start_idx = cu_seqlens[i - 1]
-        end_idx = cu_seqlens[i]
-        q_i = q[:, start_idx:end_idx]
-        k_i = k[:, start_idx:end_idx]
-        v_i = v[:, start_idx:end_idx]
+
+    lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+    q_chunks = torch.split(q, lens, dim=1)
+    k_chunks = torch.split(k, lens, dim=1)
+    v_chunks = torch.split(v, lens, dim=1)
+    for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks):
         q_i, k_i, v_i = (
             einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
         )
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 3305b6a0e..053d260cc 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -289,12 +289,12 @@ class Ernie4_5_VisionAttention(nn.Module):
         elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
             outputs = []
-            for i in range(1, len(cu_seqlens)):
-                start_idx = cu_seqlens[i - 1]
-                end_idx = cu_seqlens[i]
-                q_i = q[:, start_idx:end_idx]
-                k_i = k[:, start_idx:end_idx]
-                v_i = v[:, start_idx:end_idx]
+
+            lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            q_chunks = torch.split(q, lens, dim=1)
+            k_chunks = torch.split(k, lens, dim=1)
+            v_chunks = torch.split(v, lens, dim=1)
+            for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks):
                 q_i, k_i, v_i = (
                     rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
                 )
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 39a837b78..741edfdda 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -377,12 +377,12 @@ class Glm4vVisionAttention(nn.Module):
         elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
             outputs = []
-            for i in range(1, len(cu_seqlens)):
-                start_idx = cu_seqlens[i - 1]
-                end_idx = cu_seqlens[i]
-                q_i = q[:, start_idx:end_idx]
-                k_i = k[:, start_idx:end_idx]
-                v_i = v[:, start_idx:end_idx]
+
+            lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            q_chunks = torch.split(q, lens, dim=1)
+            k_chunks = torch.split(k, lens, dim=1)
+            v_chunks = torch.split(v, lens, dim=1)
+            for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks):
                 q_i, k_i, v_i = (
                     rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
                 )
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 885e172d1..608e90337 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -424,12 +424,12 @@ class Qwen2VisionAttention(nn.Module):
                 k = k.contiguous()
                 v = v.contiguous()
             outputs = []
-            for i in range(1, len(cu_seqlens)):
-                start_idx = cu_seqlens[i - 1]
-                end_idx = cu_seqlens[i]
-                q_i = q[:, start_idx:end_idx]
-                k_i = k[:, start_idx:end_idx]
-                v_i = v[:, start_idx:end_idx]
+
+            lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            q_chunks = torch.split(q, lens, dim=1)
+            k_chunks = torch.split(k, lens, dim=1)
+            v_chunks = torch.split(v, lens, dim=1)
+            for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks):
                 q_i, k_i, v_i = (
                     rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
                 )
-- 
GitLab


From 9e77ffca3f41e0e73879098f1686a4c82b8619d9 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <yuqi.wang@daocloud.io>
Date: Mon, 8 Dec 2025 16:10:09 +0800
Subject: [PATCH 188/258] [Model][7/N] Improve all pooling task | Deprecation
 as_reward_model. Extract hidden states prefer using new multi-vector
 retrieval API (#26686)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
---
 docs/models/pooling_models.md                 | 10 ++-
 docs/models/supported_models.md               |  9 +--
 .../pooling/token_embed/jina_embeddings_v4.py | 71 +++++++++++++++++++
 tests/models/test_registry.py                 |  2 -
 tests/test_config.py                          |  2 +-
 vllm/config/model.py                          | 10 ++-
 vllm/model_executor/model_loader/utils.py     |  4 --
 vllm/model_executor/models/adapters.py        | 38 ----------
 8 files changed, 88 insertions(+), 58 deletions(-)
 create mode 100644 examples/pooling/token_embed/jina_embeddings_v4.py

diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index e2d427e8a..32ffcf96f 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -33,8 +33,8 @@ shown in the table below.
 | Architecture                                    | `--convert` | Supported pooling tasks               |
 |-------------------------------------------------|-------------|---------------------------------------|
 | `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed`     | `token_embed`, `embed`                |
+| `*ForRewardModeling`, `*RewardModel`            | `embed`     | `token_embed`, `embed`                |
 | `*For*Classification`, `*ClassificationModel`   | `classify`  | `token_classify`, `classify`, `score` |
-| `*ForRewardModeling`, `*RewardModel`            | `reward`    | `token_classify`                      |
 
 !!! tip
     You can explicitly set `--convert <type>` to specify how to convert the model.
@@ -70,7 +70,6 @@ the pooler assigned to each task has the following attributes by default:
 
 | Task       | Pooling Type | Normalization | Softmax |
 |------------|--------------|---------------|---------|
-| `reward`   | `ALL`        | ❌            | ❌     |
 | `embed`    | `LAST`       | ✅︎            | ❌      |
 | `classify` | `LAST`       | ❌            | ✅︎      |
 
@@ -318,3 +317,10 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
 ### Remove softmax from PoolingParams
 
 We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
+
+### as_reward_model
+
+Pooling models now default support all pooling, you can use it without any settings.
+
+- Extracting hidden states prefers using `token_embed` task.
+- Reward models prefers using `token_classify` task.
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index ec3ba4474..d0166060c 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -581,16 +581,9 @@ These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward)
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 |--------------|--------|-------------------|----------------------|---------------------------|
 | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
-| `LlamaForCausalLM`<sup>C</sup> | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
+| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
 | `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
 | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
-| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
-
-<sup>C</sup> Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion))  
-\* Feature support is the same as that of the original model.
-
-If your model is not in the above list, we will try to automatically convert the model using
-[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.
 
 !!! important
     For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
diff --git a/examples/pooling/token_embed/jina_embeddings_v4.py b/examples/pooling/token_embed/jina_embeddings_v4.py
new file mode 100644
index 000000000..83d4c446d
--- /dev/null
+++ b/examples/pooling/token_embed/jina_embeddings_v4.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm import LLM
+from vllm.inputs.data import TextPrompt
+from vllm.multimodal.utils import fetch_image
+
+# Initialize model
+model = LLM(
+    model="jinaai/jina-embeddings-v4-vllm-text-matching",
+    runner="pooling",
+    max_model_len=1024,
+    gpu_memory_utilization=0.8,
+)
+
+# Create text prompts
+text1 = "Ein wunderschöner Sonnenuntergang am Strand"
+text1_prompt = TextPrompt(prompt=f"Query: {text1}")
+
+text2 = "浜辺に沈む美しい夕日"
+text2_prompt = TextPrompt(prompt=f"Query: {text2}")
+
+# Create image prompt
+image = fetch_image(
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg"  # noqa: E501
+)
+image_prompt = TextPrompt(
+    prompt="<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n",  # noqa: E501
+    multi_modal_data={"image": image},
+)
+
+# Encode all prompts
+prompts = [text1_prompt, text2_prompt, image_prompt]
+outputs = model.encode(prompts, pooling_task="token_embed")
+
+
+def get_embeddings(outputs):
+    VISION_START_TOKEN_ID, VISION_END_TOKEN_ID = 151652, 151653
+
+    embeddings = []
+    for output in outputs:
+        if VISION_START_TOKEN_ID in output.prompt_token_ids:
+            # Gather only vision tokens
+            img_start_pos = torch.where(
+                torch.tensor(output.prompt_token_ids) == VISION_START_TOKEN_ID
+            )[0][0]
+            img_end_pos = torch.where(
+                torch.tensor(output.prompt_token_ids) == VISION_END_TOKEN_ID
+            )[0][0]
+            embeddings_tensor = output.outputs.data.detach().clone()[
+                img_start_pos : img_end_pos + 1
+            ]
+        else:
+            # Use all tokens for text-only prompts
+            embeddings_tensor = output.outputs.data.detach().clone()
+
+        # Pool and normalize embeddings
+        pooled_output = (
+            embeddings_tensor.sum(dim=0, dtype=torch.float32)
+            / embeddings_tensor.shape[0]
+        )
+        embeddings.append(torch.nn.functional.normalize(pooled_output, dim=-1))
+    return embeddings
+
+
+embeddings = get_embeddings(outputs)
+
+for embedding in embeddings:
+    print(embedding.shape)
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index 9017a0fd9..a089696e1 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -13,7 +13,6 @@ from vllm.model_executor.models import (
 )
 from vllm.model_executor.models.adapters import (
     as_embedding_model,
-    as_reward_model,
     as_seq_cls_model,
 )
 from vllm.model_executor.models.registry import (
@@ -46,7 +45,6 @@ def test_registry_imports(model_arch):
     # All vLLM models should be convertible to a pooling model
     assert is_pooling_model(as_seq_cls_model(model_cls))
     assert is_pooling_model(as_embedding_model(model_cls))
-    assert is_pooling_model(as_reward_model(model_cls))
 
     if model_arch in _MULTIMODAL_MODELS:
         assert supports_multimodal(model_cls)
diff --git a/tests/test_config.py b/tests/test_config.py
index 203447cd5..77d3a7115 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -97,7 +97,7 @@ def test_update_config():
         ("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
         ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
         ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"),
-        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "reward"),
+        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"),
         ("openai/whisper-small", "generate", "none", "transcription"),
     ],
 )
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 583904a94..764bdf700 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -516,7 +516,11 @@ class ModelConfig:
             if task == "classify":
                 return "classify"
             if task == "reward":
-                return "reward"
+                logger.warning(
+                    "Pooling models now default support all pooling; "
+                    "you can use it without any settings."
+                )
+                return "embed"
             if task == "score":
                 new_task = self._get_default_pooling_task(architectures)
                 return "classify" if new_task == "classify" else "embed"
@@ -1899,8 +1903,8 @@ _SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
     ("ForImageClassification", ("pooling", "classify")),
     ("ForVideoClassification", ("pooling", "classify")),
     ("ClassificationModel", ("pooling", "classify")),
-    ("ForRewardModeling", ("pooling", "reward")),
-    ("RewardModel", ("pooling", "reward")),
+    ("ForRewardModeling", ("pooling", "embed")),
+    ("RewardModel", ("pooling", "embed")),
     # Let other `*Model`s take priority
     ("Model", ("pooling", "embed")),
 ]
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index eeb244415..74b02e4c6 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -167,7 +167,6 @@ _MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]()
 def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
     from vllm.model_executor.models.adapters import (
         as_embedding_model,
-        as_reward_model,
         as_seq_cls_model,
         try_create_mm_pooling_model_cls,
     )
@@ -207,9 +206,6 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
     elif convert_type == "classify":
         logger.debug_once("Converting to sequence classification model.")
         model_cls = as_seq_cls_model(model_cls)
-    elif convert_type == "reward":
-        logger.debug_once("Converting to reward model.")
-        model_cls = as_reward_model(model_cls)
     else:
         assert_never(convert_type)
 
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 007d847ac..70f203b9f 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -346,44 +346,6 @@ def as_seq_cls_model(cls: _T) -> _T:
     return ModelForSequenceClassification  # type: ignore
 
 
-def as_reward_model(cls: _T) -> _T:
-    """
-    Subclass an existing vLLM model to support reward modeling.
-
-    By default, we return the hidden states of each token directly.
-
-    Note:
-        We assume that no extra layers are added to the original model;
-        please implement your own model if this is not the case.
-    """
-    # Avoid modifying existing reward models
-    if is_pooling_model(cls):
-        return cls
-
-    # Lazy import
-    from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
-
-    from .interfaces_base import default_pooling_type
-
-    @default_pooling_type("ALL")
-    class ModelForReward(_create_pooling_model_cls(cls)):
-        def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
-            pooler_config = vllm_config.model_config.pooler_config
-            assert pooler_config is not None
-
-            self.pooler = DispatchPooler(
-                {
-                    "token_classify": Pooler.for_token_classify(
-                        pooler_config=pooler_config
-                    )
-                }
-            )
-
-    ModelForReward.__name__ = _get_pooling_model_name(cls.__name__, "ForReward")
-
-    return ModelForReward  # type: ignore
-
-
 class SequenceClassificationConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-- 
GitLab


From 408cf42f67dbcd50027fcd0f6ba35df83ced9107 Mon Sep 17 00:00:00 2001
From: Shiming Zhang <wzshiming@hotmail.com>
Date: Mon, 8 Dec 2025 18:29:14 +0800
Subject: [PATCH 189/258] [CI] Prevents triggering of an inactive issue/PR
 check for forked repository. (#29654)

Signed-off-by: Shiming Zhang <wzshiming@hotmail.com>
---
 .github/workflows/stale.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index dca3089f4..c8a52f1a6 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,6 +7,8 @@ on:
 
 jobs:
   close-issues-and-pull-requests:
+    # Prevents triggering on forks or other repos
+    if: github.repository == 'vllm-project/vllm'
     permissions:
       issues: write
       pull-requests: write
-- 
GitLab


From 2e660c24349944ac78abcf2c35d17e3caf6cdd08 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <yuqi.wang@daocloud.io>
Date: Mon, 8 Dec 2025 20:01:21 +0800
Subject: [PATCH 190/258] [Frontend] Binary embedding response does not return
 metadata by setting encoding_format to bytes_only. (#30249)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../embed/embedding_requests_bytes_client.py  | 37 +++++++++++-
 .../entrypoints/pooling/embed/test_online.py  | 50 +++++++++++++++++
 .../pooling/pooling/test_online.py            | 56 +++++++++++++++++++
 vllm/entrypoints/pooling/embed/api_router.py  |  4 +-
 vllm/entrypoints/pooling/embed/protocol.py    |  4 +-
 vllm/entrypoints/pooling/embed/serving.py     | 34 ++++++-----
 .../entrypoints/pooling/pooling/api_router.py |  4 +-
 vllm/entrypoints/pooling/pooling/protocol.py  |  4 +-
 vllm/entrypoints/pooling/pooling/serving.py   | 35 +++++++-----
 vllm/utils/serial_utils.py                    | 43 ++++++++++++--
 10 files changed, 230 insertions(+), 41 deletions(-)

diff --git a/examples/pooling/embed/embedding_requests_bytes_client.py b/examples/pooling/embed/embedding_requests_bytes_client.py
index c2832f1b5..5ea452524 100644
--- a/examples/pooling/embed/embedding_requests_bytes_client.py
+++ b/examples/pooling/embed/embedding_requests_bytes_client.py
@@ -16,6 +16,7 @@ from vllm.utils.serial_utils import (
     EMBED_DTYPE_TO_TORCH_DTYPE,
     ENDIANNESS,
     MetadataItem,
+    build_metadata_items,
     decode_pooling_output,
 )
 
@@ -38,6 +39,11 @@ def parse_args():
 def main(args):
     api_url = f"http://{args.host}:{args.port}/v1/embeddings"
     model_name = args.model
+    embedding_size = 0
+
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ] * 2
 
     # The OpenAI client does not support the bytes encoding_format.
     # The OpenAI client does not support the embed_dtype and endianness parameters.
@@ -45,7 +51,7 @@ def main(args):
         for endianness in ENDIANNESS:
             prompt = {
                 "model": model_name,
-                "input": "vLLM is great!",
+                "input": input_texts,
                 "encoding_format": "bytes",
                 "embed_dtype": embed_dtype,
                 "endianness": endianness,
@@ -57,7 +63,34 @@ def main(args):
 
             embedding = decode_pooling_output(items=items, body=body)
             embedding = [x.to(torch.float32) for x in embedding]
-            embedding = torch.cat(embedding)
+            embedding = torch.stack(embedding)
+            embedding_size = embedding.shape[-1]
+            print(embed_dtype, endianness, embedding.shape)
+
+    # The vllm server always sorts the returned embeddings in the order of input. So
+    # returning metadata is not necessary. You can set encoding_format to bytes_only
+    # to let the server not return metadata.
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            prompt = {
+                "model": model_name,
+                "input": input_texts,
+                "encoding_format": "bytes_only",
+                "embed_dtype": embed_dtype,
+                "endianness": endianness,
+            }
+            response = post_http_request(prompt=prompt, api_url=api_url)
+            body = response.content
+
+            items = build_metadata_items(
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+                shape=(embedding_size,),
+                n_request=len(input_texts),
+            )
+            embedding = decode_pooling_output(items=items, body=body)
+            embedding = [x.to(torch.float32) for x in embedding]
+            embedding = torch.stack(embedding)
             print(embed_dtype, endianness, embedding.shape)
 
 
diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py
index ddba1c790..f96338c47 100644
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -24,6 +24,7 @@ from vllm.utils.serial_utils import (
     ENDIANNESS,
     MetadataItem,
     binary2tensor,
+    build_metadata_items,
     decode_pooling_output,
 )
 
@@ -344,6 +345,55 @@ async def test_bytes_embed_dtype_and_endianness(
             )
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_bytes_only_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ] * 2
+
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+    embedding_size = len(float_data[0])
+
+    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
+        for endianness in ENDIANNESS:
+            responses_bytes = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "bytes_only",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            assert "metadata" not in responses_bytes.headers
+            body = responses_bytes.content
+            items = build_metadata_items(
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+                shape=(embedding_size,),
+                n_request=len(input_texts),
+            )
+
+            bytes_data = decode_pooling_output(items=items, body=body)
+            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=bytes_data,
+                name_0="float_data",
+                name_1="bytes_data",
+                tol=1e-2,
+            )
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
diff --git a/tests/entrypoints/pooling/pooling/test_online.py b/tests/entrypoints/pooling/pooling/test_online.py
index cc5c2f26f..33add5bda 100644
--- a/tests/entrypoints/pooling/pooling/test_online.py
+++ b/tests/entrypoints/pooling/pooling/test_online.py
@@ -18,6 +18,7 @@ from vllm.utils.serial_utils import (
     ENDIANNESS,
     MetadataItem,
     binary2tensor,
+    build_metadata_items,
     decode_pooling_output,
 )
 
@@ -352,6 +353,61 @@ async def test_bytes_embed_dtype_and_endianness(
             )
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_bytes_only_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ] * 2
+
+    url = server.url_for("pooling")
+    float_response = requests.post(
+        url,
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "float",
+        },
+    )
+    responses_float = PoolingResponse.model_validate(float_response.json())
+    float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
+    n_tokens = responses_float.usage.prompt_tokens // len(input_texts)
+
+    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
+        for endianness in ENDIANNESS:
+            responses_bytes = requests.post(
+                url,
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "bytes_only",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            assert "metadata" not in responses_bytes.headers
+            body = responses_bytes.content
+            items = build_metadata_items(
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+                shape=(n_tokens, 1),
+                n_request=len(input_texts),
+            )
+            bytes_data = decode_pooling_output(items=items, body=body)
+            bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data]
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=bytes_data,
+                name_0="float_data",
+                name_1="bytes_data",
+                tol=1e-2,
+            )
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py
index 5b10a32e7..24b0c8c2b 100644
--- a/vllm/entrypoints/pooling/embed/api_router.py
+++ b/vllm/entrypoints/pooling/embed/api_router.py
@@ -59,8 +59,8 @@ async def create_embedding(
         return JSONResponse(content=generator.model_dump())
     elif isinstance(generator, EmbeddingBytesResponse):
         return StreamingResponse(
-            content=generator.body,
-            headers={"metadata": generator.metadata},
+            content=generator.content,
+            headers=generator.headers,
             media_type=generator.media_type,
         )
 
diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py
index 7eb53e14d..6a8f8c443 100644
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -203,6 +203,6 @@ class EmbeddingResponse(OpenAIBaseModel):
 
 
 class EmbeddingBytesResponse(OpenAIBaseModel):
-    body: list[bytes]
-    metadata: str
+    content: list[bytes]
+    headers: dict[str, str] | None = None
     media_type: str = "application/octet-stream"
diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py
index 868a3cb01..aafc35489 100644
--- a/vllm/entrypoints/pooling/embed/serving.py
+++ b/vllm/entrypoints/pooling/embed/serving.py
@@ -163,29 +163,35 @@ class EmbeddingMixin(OpenAIServing):
                 usage=usage,
             )
 
-        def encode_bytes():
-            body, items, usage = encode_pooling_bytes(
+        def encode_bytes(bytes_only: bool) -> EmbeddingBytesResponse:
+            content, items, usage = encode_pooling_bytes(
                 pooling_outputs=final_res_batch_checked,
                 embed_dtype=embed_dtype,
                 endianness=endianness,
             )
 
-            metadata = {
-                "id": ctx.request_id,
-                "created": ctx.created_time,
-                "model": ctx.model_name,
-                "data": items,
-                "usage": usage,
-            }
-            return EmbeddingBytesResponse(
-                body=body,
-                metadata=json.dumps(metadata),
+            headers = (
+                None
+                if bytes_only
+                else {
+                    "metadata": json.dumps(
+                        {
+                            "id": ctx.request_id,
+                            "created": ctx.created_time,
+                            "model": ctx.model_name,
+                            "data": items,
+                            "usage": usage,
+                        }
+                    )
+                }
             )
 
+            return EmbeddingBytesResponse(content=content, headers=headers)
+
         if encoding_format == "float" or encoding_format == "base64":
             return encode_float_base64()
-        elif encoding_format == "bytes":
-            return encode_bytes()
+        elif encoding_format == "bytes" or encoding_format == "bytes_only":
+            return encode_bytes(bytes_only=encoding_format == "bytes_only")
         else:
             assert_never(encoding_format)
 
diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py
index 674da94d1..4baaf8f30 100644
--- a/vllm/entrypoints/pooling/pooling/api_router.py
+++ b/vllm/entrypoints/pooling/pooling/api_router.py
@@ -55,8 +55,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
         return JSONResponse(content=generator.model_dump())
     elif isinstance(generator, PoolingBytesResponse):
         return StreamingResponse(
-            content=generator.body,
-            headers={"metadata": generator.metadata},
+            content=generator.content,
+            headers=generator.headers,
             media_type=generator.media_type,
         )
 
diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py
index 364cd9373..76b361b49 100644
--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -143,6 +143,6 @@ class PoolingResponse(OpenAIBaseModel):
 
 
 class PoolingBytesResponse(OpenAIBaseModel):
-    body: list[bytes]
-    metadata: str
+    content: list[bytes]
+    headers: dict[str, str] | None = None
     media_type: str = "application/octet-stream"
diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py
index 7fb767e26..57f1a6440 100644
--- a/vllm/entrypoints/pooling/pooling/serving.py
+++ b/vllm/entrypoints/pooling/pooling/serving.py
@@ -314,29 +314,38 @@ class OpenAIServingPooling(OpenAIServing):
                 usage=usage,
             )
 
-        def encode_bytes():
-            body, items, usage = encode_pooling_bytes(
+        def encode_bytes(bytes_only: bool) -> PoolingBytesResponse:
+            content, items, usage = encode_pooling_bytes(
                 pooling_outputs=final_res_batch,
                 embed_dtype=embed_dtype,
                 endianness=endianness,
             )
 
-            metadata = {
-                "id": request_id,
-                "created": created_time,
-                "model": model_name,
-                "data": items,
-                "usage": usage,
-            }
+            headers = (
+                None
+                if bytes_only
+                else {
+                    "metadata": json.dumps(
+                        {
+                            "id": request_id,
+                            "created": created_time,
+                            "model": model_name,
+                            "data": items,
+                            "usage": usage,
+                        }
+                    )
+                }
+            )
+
             return PoolingBytesResponse(
-                body=body,
-                metadata=json.dumps(metadata),
+                content=content,
+                headers=headers,
             )
 
         if encoding_format == "float" or encoding_format == "base64":
             return encode_float_base64()
-        elif encoding_format == "bytes":
-            return encode_bytes()
+        elif encoding_format == "bytes" or encoding_format == "bytes_only":
+            return encode_bytes(bytes_only=encoding_format == "bytes_only")
         else:
             assert_never(encoding_format)
 
diff --git a/vllm/utils/serial_utils.py b/vllm/utils/serial_utils.py
index a6d717e03..07db5eaf7 100644
--- a/vllm/utils/serial_utils.py
+++ b/vllm/utils/serial_utils.py
@@ -2,15 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
 import io
+import math
 import sys
 from dataclasses import dataclass
-from typing import Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import numpy as np
 import torch
 from typing_extensions import assert_never
 
-from vllm import PoolingRequestOutput
+if TYPE_CHECKING:
+    from vllm import PoolingRequestOutput
+else:
+    PoolingRequestOutput = Any
 
 sys_byteorder = sys.byteorder
 
@@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = {
     "fp8_e5m2": torch.float8_e5m2,
 }
 
+EMBED_DTYPE_TO_N_BYTES = {
+    "float32": 4,
+    "float16": 2,
+    "bfloat16": 2,
+    "fp8_e4m3": 1,
+    "fp8_e5m2": 1,
+}
+
 
 EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = {
     "float32": torch.float32,
@@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"]
 
 EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"]
 Endianness = Literal["native", "big", "little"]
-EncodingFormat = Literal["float", "base64", "bytes"]
+EncodingFormat = Literal["float", "base64", "bytes", "bytes_only"]
 
 
 def tensor2base64(x: torch.Tensor) -> str:
@@ -114,7 +126,7 @@ def encode_pooling_output(
     elif encoding_format == "base64":
         embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness)
         return base64.b64encode(embedding_bytes).decode("utf-8")
-    elif encoding_format == "bytes":
+    elif encoding_format == "bytes" or encoding_format == "bytes_only":
         return tensor2binary(output.outputs.data, embed_dtype, endianness)
     assert_never(encoding_format)
 
@@ -129,6 +141,29 @@ class MetadataItem:
     shape: tuple[int, ...]
 
 
+def build_metadata_items(
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+    shape: tuple[int, ...],
+    n_request: int,
+):
+    n_bytes = EMBED_DTYPE_TO_N_BYTES[embed_dtype]
+    size = math.prod(shape)
+    items = [
+        MetadataItem(
+            index=i,
+            embed_dtype=embed_dtype,
+            endianness=endianness,
+            start=i * size * n_bytes,
+            end=(i + 1) * size * n_bytes,
+            shape=shape,
+        )
+        for i in range(n_request)
+    ]
+
+    return items
+
+
 def encode_pooling_bytes(
     pooling_outputs: list[PoolingRequestOutput],
     embed_dtype: EmbedDType,
-- 
GitLab


From 77072e93b327fd391482f5facadf93a804ced0cc Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Mon, 8 Dec 2025 03:06:20 -0900
Subject: [PATCH 191/258] [docs] governance documents (#24801)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/.nav.yml                    |   1 +
 docs/governance/collaboration.md |  43 ++++++++
 docs/governance/committers.md    | 183 +++++++++++++++++++++++++++++++
 docs/governance/process.md       | 125 +++++++++++++++++++++
 4 files changed, 352 insertions(+)
 create mode 100644 docs/governance/collaboration.md
 create mode 100644 docs/governance/committers.md
 create mode 100644 docs/governance/process.md

diff --git a/docs/.nav.yml b/docs/.nav.yml
index aa98ad52b..835cc773e 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -59,6 +59,7 @@ nav:
   - CLI Reference: cli
   - Community:
     - community/*
+    - Governance: governance
     - Blog: https://blog.vllm.ai
     - Forum: https://discuss.vllm.ai
     - Slack: https://slack.vllm.ai
diff --git a/docs/governance/collaboration.md b/docs/governance/collaboration.md
new file mode 100644
index 000000000..5b3d2beff
--- /dev/null
+++ b/docs/governance/collaboration.md
@@ -0,0 +1,43 @@
+# Collaboration Policy
+
+This page outlines how vLLM collaborates with model providers, hardware vendors, and other stakeholders.
+
+## Adding New Major Features
+
+Anyone can contribute to vLLM. For major features, submit an RFC (request for comments) first. To submit an RFC, create an [issue](https://github.com/vllm-project/vllm/issues/new/choose) and select the `RFC` template.
+RFCs are similar to design docs that discuss the motivation, problem solved, alternatives considered, and proposed change.
+
+Once you submit the RFC, please post it in the #contributors channel in vLLM Slack, and loop in area owners and committers for feedback.
+For high-interest features, the committers nominate a person to help with the RFC process and PR review. This makes sure someone is guiding you through the process. It is reflected as the "assignee" field in the RFC issue.
+If the assignee and lead maintainers find the feature to be contentious, the maintainer team aims to make decisions quickly after learning the details from everyone. This involves assigning a committer as the DRI (Directly Responsible Individual) to make the decision and shepherd the code contribution process.
+
+For features that you intend to maintain, please feel free to add yourself in [`mergify.yml`](https://github.com/vllm-project/vllm/blob/main/.github/mergify.yml) to receive notifications and auto-assignment when the PRs touching the feature you are maintaining. Over time, the ownership will be evaluated and updated through the committers nomination and voting process.
+
+## Adding New Models
+
+If you use vLLM, we recommend you making the model work with vLLM by following the [model registration](../contributing/model/registration.md) process before you release it publicly.
+
+The vLLM team helps with new model architectures not supported by vLLM, especially models pushing architectural frontiers.
+Here's how the vLLM team works with model providers. The vLLM team includes all [committers](./committers.md) of the project. model providers can exclude certain members but shouldn't, as this may harm release timelines due to missing expertise. Contact [project leads](./process.md) if you want to collaborate.
+
+Once we establish the connection between the vLLM team and model provider:
+
+- The vLLM team learns the model architecture and relevant changes, then plans which area owners to involve and what features to include.
+- The vLLM team creates a private communication channel (currently a Slack channel in the vLLM workspace) and a private fork within the vllm-project organization. The model provider team can invite others to the channel and repo.
+- Third parties like compute providers, hosted inference providers, hardware vendors, and other organizations often work with both the model provider and vLLM on model releases. We establish direct communication (with permission) or three-way communication as needed.
+
+The vLLM team works with model providers on features, integrations, and release timelines. We work to meet release timelines, but engineering challenges like feature development, model accuracy alignment, and optimizations can cause delays.
+
+The vLLM maintainers will not publicly share details about model architecture, release timelines, or upcoming releases. We maintain model weights on secure servers with security measures (though we can work with security reviews and testing without certification). We delete pre-release weights or artifacts upon request.
+
+The vLLM team collaborates on marketing and promotional efforts for model releases. model providers can use vLLM's trademark and logo in publications and materials.
+
+## Adding New Hardware
+
+vLLM is designed as a platform for frontier model architectures and high-performance accelerators.
+For new hardware, follow the [hardware plugin](../design/plugin_system.md) system to add support.
+Use the platform plugin system to add hardware support.
+As hardware gains popularity, we help endorse it in our documentation and marketing materials.
+The vLLM GitHub organization can host hardware plugin repositories, especially for collaborative efforts among companies.
+
+We rarely add new hardware to vLLM directly. Instead, we make existing hardware platforms modular to keep the vLLM core hardware-agnostic.
diff --git a/docs/governance/committers.md b/docs/governance/committers.md
new file mode 100644
index 000000000..c9428027d
--- /dev/null
+++ b/docs/governance/committers.md
@@ -0,0 +1,183 @@
+# Committers
+
+This document lists the current committers of the vLLM project and the core areas they maintain.
+Committers have write access to the vLLM repository and are responsible for reviewing and merging PRs.
+You can also refer to the [CODEOWNERS](https://github.com/vllm-project/vllm/blob/main/.github/CODEOWNERS) file for concrete file-level ownership and reviewers. Both this documents and the CODEOWNERS file are living documents and they complement each other.
+
+## Active Committers
+
+We try to summarize each committer's role in vLLM in a few words. In general, vLLM committers cover a wide range of areas and help each other in the maintenance process.
+Please refer to the later section about Area Owners for exact component ownership details.
+Sorted alphabetically by GitHub handle:
+
+- [@22quinn](https://github.com/22quinn): RL API
+- [@aarnphm](https://github.com/aarnphm): Structured output
+- [@alexm-redhat](https://github.com/alexm-redhat): Performance
+- [@ApostaC](https://github.com/ApostaC): Connectors, offloading
+- [@benchislett](https://github.com/benchislett): Engine core and spec decode
+- [@bigPYJ1151](https://github.com/bigPYJ1151): Intel CPU/XPU integration
+- [@chaunceyjiang](https://github.com/chaunceyjiang): Tool use and reasoning parser
+- [@DarkLight1337](https://github.com/DarkLight1337): Multimodality, API server
+- [@esmeetu](https://github.com/esmeetu): developer marketing, community
+- [@gshtras](https://github.com/gshtras): AMD integration
+- [@heheda12345](https://github.com/heheda12345): Hybrid memory allocator
+- [@hmellor](https://github.com/hmellor): Hugging Face integration, documentation
+- [@houseroad](https://github.com/houseroad): Engine core and Llama models
+- [@Isotr0py](https://github.com/Isotr0py): Multimodality, new model support
+- [@jeejeelee](https://github.com/jeejeelee): LoRA, new model support
+- [@jikunshang](https://github.com/jikunshang): Intel CPU/XPU integration
+- [@khluu](https://github.com/khluu): CI infrastructure
+- [@KuntaiDu](https://github.com/KuntaiDu): KV Connector
+- [@LucasWilkinson](https://github.com/LucasWilkinson): Kernels and performance
+- [@luccafong](https://github.com/luccafong): Llama models, speculative decoding, distributed
+- [@markmc](https://github.com/markmc): Observability
+- [@mgoin](https://github.com/mgoin): Quantization and performance
+- [@NickLucche](https://github.com/NickLucche): KV connector
+- [@njhill](https://github.com/njhill): Distributed, API server, engine core
+- [@noooop](https://github.com/noooop): Pooling models
+- [@patrickvonplaten](https://github.com/patrickvonplaten): Mistral models, new model support
+- [@pavanimajety](https://github.com/pavanimajety): NVIDIA GPU integration
+- [@ProExpertProg](https://github.com/ProExpertProg): Compilation, startup UX
+- [@robertgshaw2-redhat](https://github.com/robertgshaw2-redhat): Core, distributed, disagg
+- [@ruisearch42](https://github.com/ruisearch42): Pipeline parallelism, Ray Support
+- [@russellb](https://github.com/russellb): Structured output, engine core, security
+- [@sighingnow](https://github.com/sighingnow): Qwen models, new model support
+- [@simon-mo](https://github.com/simon-mo): Project lead, API entrypoints, community
+- [@tdoublep](https://github.com/tdoublep): State space models
+- [@tjtanaa](https://github.com/tjtanaa): AMD GPU integration
+- [@tlrmchlsmth](https://github.com/tlrmchlsmth): Kernels and performance, distributed, disagg
+- [@WoosukKwon](https://github.com/WoosukKwon): Project lead, engine core
+- [@yaochengji](https://github.com/yaochengji): TPU integration
+- [@yeqcharlotte](https://github.com/yeqcharlotte): Benchmark, Llama models
+- [@yewentao256](https://github.com/yewentao256): Kernels and performance
+- [@Yikun](https://github.com/Yikun): Pluggable hardware interface
+- [@youkaichao](https://github.com/youkaichao): Project lead, distributed, compile, community
+- [@ywang96](https://github.com/ywang96): Multimodality, benchmarks
+- [@zhuohan123](https://github.com/zhuohan123): Project lead, RL integration, numerics
+- [@zou3519](https://github.com/zou3519): Compilation
+
+### Emeritus Committers
+
+Committers who have contributed to vLLM significantly in the past (thank you!) but no longer active:
+
+- [@andoorve](https://github.com/andoorve): Pipeline parallelism
+- [@cadedaniel](https://github.com/cadedaniel): Speculative decoding
+- [@comaniac](https://github.com/comaniac): KV cache management, pipeline parallelism
+- [@LiuXiaoxuanPKU](https://github.com/LiuXiaoxuanPKU): Speculative decoding
+- [@pcmoritz](https://github.com/pcmoritz): MoE
+- [@rkooo567](https://github.com/rkooo567): Chunked prefill
+- [@sroy745](https://github.com/sroy745): Speculative decoding
+- [@Yard1](https://github.com/Yard1): kernels and performance
+- [@zhisbug](https://github.com/zhisbug): Arctic models, distributed
+
+## Area Owners
+
+This section breaks down the active committers by vLLM components and lists the area owners.
+If you have PRs touching the area, please feel free to ping the area owner for review.
+
+### Engine Core
+
+- Scheduler: the core vLLM engine loop scheduling requests to next batch
+    - @WoosukKwon, @robertgshaw2-redhat, @njhill, @heheda12345
+- KV Cache Manager: memory management layer within scheduler maintaining KV cache logical block data
+    - @heheda12345, @WoosukKwon
+- AsyncLLM: the zmq based protocol hosting engine core and making it accessible for entrypoints
+    - @robertgshaw2-redhat, @njhill, @russellb
+- ModelRunner, Executor, Worker: the abstractions for engine wrapping model implementation
+    - @WoosukKwon, @tlrmchlsmth, @heheda12345, @LucasWilkinson, @ProExpertProg
+- KV Connector: Connector interface and implementation for KV cache offload and transfer
+    - @robertgshaw2-redhat, @njhill, @KuntaiDu, @NickLucche, @ApostaC
+- Distributed, Parallelism, Process Management: Process launchers managing each worker, and assign them to the right DP/TP/PP/EP ranks
+    - @youkaichao, @njhill, @WoosukKwon, @ruisearch42
+- Collectives: the usage of nccl and other communication libraries/kernels
+    - @tlrmchlsmth, @youkaichao
+- Multimodality engine and memory management: core scheduling and memory management concerning vision, audio, and video inputs.
+    - @ywang96, @DarkLight1337
+
+### Model Implementations
+
+- Model Interface: The `nn.Module` interface and implementation for various models
+    - @zhuohan123, @mgoin, @simon-mo, @houseroad, @ywang96 (multimodality), @jeejeelee (lora)
+- Logits Processors / Sampler: The provided sampler class and pluggable logits processors
+    - @njhill, @houseroad, @22quinn
+- Custom Layers: Utility layers in vLLM such as rotary embedding and rms norms
+    - @ProExpertProg
+- Attention: Attention interface for paged attention
+    - @WoosukKwon, @LucasWilkinson, @heheda12345
+- FusedMoE: FusedMoE kernel, Modular kernel framework, EPLB
+    - @tlrmchlsmth
+- Quantization: Various quantization config, weight loading, and kernel.
+    - @mgoin, @Isotr0py, @yewentao256
+- Custom quantized GEMM kernels (cutlass_scaled_mm, marlin, machete)
+    - @tlrmchlsmth, @LucasWilkinson
+- Multi-modal Input Processing: Components that load and process image/video/audio data into feature tensors
+    - @DarkLight1337, @ywang96, @Isotr0py
+- torch compile: The torch.compile integration in vLLM, custom passes & transformations
+    - @ProExpertProg, @zou3519, @youkaichao
+- State space models: The state space models implementation in vLLM
+    - @tdoublep, @tlrmchlsmth
+- Reasoning and tool calling parsers
+    - @chaunceyjiang, @aarnphm
+
+### Entrypoints
+
+- LLM Class: The LLM class for offline inference
+    - @DarkLight1337
+- API Server: The OpenAI-compatible API server
+    - @DarkLight1337, @njhill, @aarnphm, @simon-mo, @heheda12345 (Responses API)
+- Batch Runner: The OpenAI-compatible batch runner
+    - @simon-mo
+
+### Features
+
+- Spec Decode: Covers model definition, attention, sampler, and scheduler related to n-grams, EAGLE, and MTP.
+    - @WoosukKwon, @benchislett, @luccafong
+- Structured Output: The structured output implementation
+    - @russellb, @aarnphm
+- RL: The RL related features such as collective rpc, sleep mode, etc.
+    - @youkaichao, @zhuohan123, @22quinn
+- LoRA: @jeejeelee
+- Observability: Metrics and Logging
+    - @markmc, @robertgshaw2-redhat, @simon-mo
+
+### Code Base
+
+- Config: Configuration registration and parsing
+    - @hmellor
+- Documentation: @hmellor, @DarkLight1337, @simon-mo
+- Benchmarks: @ywang96, @simon-mo
+- CI, Build, Release Process: @khluu, @njhill, @simon-mo
+- Security: @russellb
+
+### External Kernels Integration
+
+- FlashAttention: @LucasWilkinson
+- FlashInfer: @LucasWilkinson, @mgoin, @WoosukKwon
+- Blackwell Kernels: @mgoin, @yewentao256
+- DeepEP/DeepGEMM/pplx: @mgoin, @yewentao256
+
+### Integrations
+
+- Hugging Face: @hmellor, @Isotr0py
+- Ray: @ruisearch42
+- NIXL: @robertgshaw2-redhat, @NickLucche
+
+### Collaboration with Model Vendors
+
+- gpt-oss: @heheda12345, @simon-mo, @zhuohan123
+- Llama: @luccafong
+- Qwen: @sighingnow
+- Mistral: @patrickvonplaten
+
+### Hardware
+
+- Plugin Interface: @youkaichao, @Yikun
+- NVIDIA GPU: @pavanimajety
+- AMD GPU: @gshtras, @tjtanaa
+- Intel CPU/GPU: @jikunshang, @bigPYJ1151
+- Google TPU: @yaochengji
+
+### Ecosystem Projects
+
+- Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers)
+- Intel Gaudi HPU [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel)
diff --git a/docs/governance/process.md b/docs/governance/process.md
new file mode 100644
index 000000000..1e088dd3c
--- /dev/null
+++ b/docs/governance/process.md
@@ -0,0 +1,125 @@
+# Governance Process
+
+vLLM's success comes from our strong open source community. We favor informal, meritocratic norms over formal policies. This document clarifies our governance philosophy and practices.
+
+## Values
+
+vLLM aims to be the fastest and easiest-to-use LLM inference and serving engine. We stay current with advances, enable innovation, and support diverse models, modalities, and hardware.
+
+### Design Values
+
+1. **Top performance**: System performance is our top priority. We monitor overheads, optimize kernels, and publish benchmarks. We never leave performance on the table.
+2. **Ease of use**: vLLM must be simple to install, configure, and operate. We provide clear documentation, fast startup, clean logs, helpful error messages, and monitoring guides. Many users fork our code or study it deeply, so we keep it readable and modular.
+3. **Wide coverage**: vLLM supports frontier models and high-performance accelerators. We make it easy to add new models and hardware. vLLM + PyTorch form a simple interface that avoids complexity.
+4. **Production ready**: vLLM runs 24/7 in production. It must be easy to operate and monitor for health issues.
+5. **Extensibility**: vLLM serves as fundamental LLM infrastructure. Our codebase cannot cover every use case, so we design for easy forking and customization.
+
+### Collaboration Values
+
+1. **Tightly Knit and Fast-Moving**: Our maintainer team is aligned on vision, philosophy, and roadmap. We work closely to unblock each other and move quickly.
+2. **Individual Merit**: No one buys their way into governance. Committer status belongs to individuals, not companies. We reward contribution, maintenance, and project stewardship.
+
+## Project Maintainers
+
+Maintainers form a hierarchy based on sustained, high-quality contributions and alignment with our design philosophy.
+
+### Core Maintainers
+
+Core Maintainers function like a project planning and decision making committee. In other convention, they might be called a Technical Steering Committee (TSC). In vLLM vocabulary, they are often known as "Project Leads". They meet weekly to coordinate roadmap priorities and allocate engineering resources. Current active leads: @WoosukKwon, @zhuohan123, @simon-mo, @youkaichao, @robertshaw2-redhat, @tlrmchlsmth, @mgoin, @njhill, @ywang96, @houseroad, @yeqcharlotte, @ApostaC
+
+The responsibilities of the core maintainers are:
+
+* Author quarterly roadmap and responsible for each development effort.
+* Making major changes to the technical direction or scope of vLLM and vLLM projects.
+* Defining the project's release strategy.
+* Work with model providers, hardware vendors, and key users of vLLM to ensure the project is on the right track.
+
+### Lead Maintainers
+
+While Core maintainers assume the day-to-day responsibilities of the project, Lead maintainers are responsible for the overall direction and strategy of the project. A committee of @WoosukKwon, @zhuohan123, @simon-mo, and @youkaichao currently shares this role with divided responsibilities.
+
+The responsibilities of the lead maintainers are:
+
+* Making decisions where consensus among core maintainers cannot be reached.
+* Adopting changes to the project's technical governance.
+* Organizing the voting process for new committers.
+
+### Committers and Area Owners
+
+Committers have write access and merge rights. They typically have deep expertise in specific areas and help the community.
+
+The responsibilities of the committers are:
+
+* Reviewing PRs and providing feedback.
+* Addressing issues and questions from the community.
+* Own specific areas of the codebase and development efforts: reviewing PRs, addressing issues, answering questions, improving documentation.
+
+Specially, committers are almost all area owners. They author subsystems, review PRs, refactor code, monitor tests, and ensure compatibility with other areas. All area owners are committers with deep expertise in that area, but not all committers own areas.
+
+For a full list of committers and their respective areas, see the [committers](./committers.md) page.
+
+#### Nomination Process
+
+Any committer can nominate candidates via our private mailing list:
+
+1. **Nominate**: Any committer may nominate a candidate by email to the private maintainers’ list, citing evidence mapped to the pre‑existing standards with links to PRs, reviews, RFCs, issues, benchmarks, and adoption evidence.
+2. **Vote**: The lead maintainers will group voices support or concerns. Shared concerns can stop the process. The vote typically last 3 working days. For concerns, committers group discuss the clear criteria for such person to be nominated again. The lead maintainers will make the final decision.
+3. **Confirm**: The lead maintainers send invitation, update CODEOWNERS, assign permissions, add to communications channels (mailing list and Slack).
+
+Committership is highly selective and merit based. The selection criteria requires:
+
+* **Area expertise**: leading design/implementation of core subsystems, material performance or reliability improvements adopted project‑wide, or accepted RFCs that shape technical direction.
+* **Sustained contributions**: high‑quality merged contributions and reviews across releases, responsiveness to feedback, and stewardship of code health.
+* **Community leadership**: mentoring contributors, triaging issues, improving docs, and elevating project standards.
+
+To further illustrate, a committer typically satisfies at least two of the following accomplishment patterns:
+
+* Author of an accepted RFC or design that materially shaped project direction
+* Measurable, widely adopted performance or reliability improvement in core paths
+* Long‑term ownership of a subsystem with demonstrable quality and stability gains
+* Significant cross‑project compatibility or ecosystem enablement work (models, hardware, tooling)
+
+While there isn't a quantitative bar, past committers have:
+
+* Submitted approximately 30+ PRs of substantial quality and scope
+* Provided high-quality reviews of approximately 10+ substantial external contributor PRs
+* Addressed multiple issues and questions from the community in issues/forums/Slack
+* Led concentrated efforts on RFCs and their implementation, or significant performance or reliability improvements adopted project‑wide
+
+### Working Groups
+
+vLLM runs informal working groups such as CI, CI infrastructure, torch compile, and startup UX. These can be loosely tracked via `#sig-` (or `#feat-`) channels in vLLM Slack. Some groups have regular sync meetings.
+
+### Advisory Board
+
+vLLM project leads consult with an informal advisory board that is composed of model providers, hardware vendors, and ecosystem partners. This manifests as a collaboration channel in Slack and frequent communications.
+
+## Process
+
+### Project Roadmap
+
+Project Leads publish quarterly roadmaps as GitHub issues. These clarify current priorities. Unlisted topics aren't excluded but may get less review attention. See [https://roadmap.vllm.ai/](https://roadmap.vllm.ai/).
+
+### Decision Making
+
+We make technical decisions in Slack and GitHub using RFCs and design docs. Discussion may happen elsewhere, but we maintain public records of significant changes: problem statements, rationale, and alternatives considered.
+
+### Merging Code
+
+Contributors and maintainers often collaborate closely on code changes, especially within organizations or specific areas. Maintainers should give others appropriate review opportunities based on change significance.
+
+PRs requires at least one committer review and approval. If the code is covered by CODEOWNERS, the PR should be reviewed by the CODEOWNERS. There are cases where the code is trivial or hotfix, the PR can be merged by the lead maintainers directly.
+
+In case where CI didn't pass due to the failure is not related to the PR, the PR can be merged by the lead maintainers using "force merge" option that overrides the CI checks.
+
+### Slack
+
+Contributors are encouraged to join `#pr-reviews` and `#contributors` channels.
+
+There are `#sig-` and `#feat-` channels for discussion and coordination around specific topics.
+
+The project maintainer group also uses a private channel for high-bandwidth collaboration.
+
+### Meetings
+
+We hold weekly contributor syncs with standup-style updates on progress, blockers, and plans. You can refer to the notes [standup.vllm.ai](https://standup.vllm.ai) for joining instructions.
-- 
GitLab


From 5c2433a6f35a84741f26aa546432649891b896bb Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Mon, 8 Dec 2025 13:11:51 +0000
Subject: [PATCH 192/258] Add tip for `mypy` and `markdownlint` to the
 pre-commit comment (#30259)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .github/mergify.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 58a5d7786..3ad79f93b 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -35,6 +35,20 @@ pull_request_rules:
 
         For future commits, `pre-commit` will run automatically on changed files before each commit.
 
+        > [!TIP]
+        > <details>
+        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+        > <br/>
+        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+        >
+        > ```bash
+        > # For mypy (substitute "3.10" with the failing version if needed)
+        > pre-commit run --hook-stage manual mypy-3.10
+        > # For markdownlint
+        > pre-commit run --hook-stage manual markdownlint
+        > ```
+        > </details>
+
 - name: comment-dco-failure
   description: Comment on PR when DCO check fails
   conditions:
-- 
GitLab


From 80433e225ee0f91a7f6a082bf4e136df75ab4746 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Mon, 8 Dec 2025 21:29:47 +0800
Subject: [PATCH 193/258] [LoRA]  Reduce the loading time of MoE LoRA (#30243)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/lora/models.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index ada30da60..567ffce4e 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -115,7 +115,7 @@ class LoRAModel:
         weights_mapper: WeightsMapper | None = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a dictionary of tensors."""
-        pin_memory = str(device) == "cpu" and is_pin_memory_available()
+
         loras: dict[str, LoRALayerWeights] = {}
         for tensor_name, tensor in tensors.items():
             if is_base_embeddding_weights(tensor_name):
@@ -139,14 +139,8 @@ class LoRAModel:
                         f" with the base model's vocabulary size({model_vocab_size})."
                     )
                 loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
-                if pin_memory:
-                    loras[module_name].lora_a = loras[module_name].lora_a.pin_memory()
             else:
                 loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
-
-                if pin_memory:
-                    loras[module_name].lora_b = loras[module_name].lora_b.pin_memory()
-
         return cls(lora_model_id, peft_helper.r, loras)
 
     @classmethod
@@ -742,6 +736,32 @@ class LoRAModelManager:
         for lora in lora_model.loras.values():
             lora.optimize()
 
+        first_lora: LoRALayerWeights = next(iter(lora_model.loras.values()))
+        assert first_lora.lora_a is not None
+        if isinstance(first_lora.lora_a, list):
+            lora_device = next(iter(first_lora.lora_a))
+        else:
+            lora_device = first_lora.lora_a.device
+        # Execute pin_memory after LoRA weight merging, mainly because:
+        # 1. Some MoE models have a large number of LoRA weights. If we
+        # perform # pin_memory immediately after loading weights, the
+        # overhead is significant.
+        # 2. The weight packing above (e.g., pack_moe) may invalidate the
+        # pin_memory allocation, so we execute it after packing.
+
+        pin_memory = str(lora_device) == "cpu" and is_pin_memory_available()
+        if pin_memory:
+            for lora in lora_model.loras.values():
+                if isinstance(lora.lora_a, list):
+                    for index in range(len(lora.lora_a)):
+                        if lora.lora_a[index] is None:
+                            continue
+                        lora.lora_a[index] = lora.lora_a[index].pin_memory()
+                        lora.lora_b[index] = lora.lora_b[index].pin_memory()
+                else:
+                    lora.lora_a = lora.lora_a.pin_memory()
+                    lora.lora_b = lora.lora_b.pin_memory()
+
     def _get_lora_layer_weights(
         self, lora_model: LoRAModel, module_name: str
     ) -> LoRALayerWeights | None:
-- 
GitLab


From eb1051fb95323493d6d950c03dabac8ee56cb33e Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Mon, 8 Dec 2025 06:44:48 -0800
Subject: [PATCH 194/258] [ROCm] Guard group quant RMS norm fusion patterns
 (#30239)

---
 vllm/compilation/fusion.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index de083a2e5..a7e6a69e6 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -490,23 +490,25 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
         # as the latter is a subset of the former in torch ops
         for epsilon in [1e-5, 1e-6]:
             # Fuse fused_add_rms_norm + fp8 group quant
-            FusedAddRMSNormGroupQuantPattern(
-                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128)
-            ).register(self.patterns)
-
-            # Fuse rms_norm + fp8 group quant
-            RMSNormGroupQuantPattern(
-                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128)
-            ).register(self.patterns)
-
-            FusedAddRMSNormGroupQuantPattern(
-                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64)
-            ).register(self.patterns)
-
-            # Fuse rms_norm + fp8 group quant
-            RMSNormGroupQuantPattern(
-                epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64)
-            ).register(self.patterns)
+            # Only register group quant patterns on CUDA where the C++ op exists
+            if current_platform.is_cuda():
+                FusedAddRMSNormGroupQuantPattern(
+                    epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128)
+                ).register(self.patterns)
+
+                # Fuse rms_norm + fp8 group quant
+                RMSNormGroupQuantPattern(
+                    epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128)
+                ).register(self.patterns)
+
+                FusedAddRMSNormGroupQuantPattern(
+                    epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64)
+                ).register(self.patterns)
+
+                # Fuse rms_norm + fp8 group quant
+                RMSNormGroupQuantPattern(
+                    epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64)
+                ).register(self.patterns)
 
             # Fuse fused_add_rms_norm + static fp8 quant
             FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
-- 
GitLab


From 184076c3fecf8c322648f36f69a4de836d7f519b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20C=C3=A1mpora?=
 <961215+dcampora@users.noreply.github.com>
Date: Mon, 8 Dec 2025 15:55:58 +0100
Subject: [PATCH 195/258] [DeepSeek v3.2] Make top-k work for any logit values.
 (#27568)

Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 csrc/ops.h                                |  13 +-
 csrc/sampler.cu                           | 715 ++++++++++++++++------
 csrc/torch_bindings.cpp                   |  10 +-
 tests/kernels/test_top_k_per_row.py       |  95 ++-
 vllm/model_executor/models/deepseek_v2.py |   6 +-
 5 files changed, 629 insertions(+), 210 deletions(-)

diff --git a/csrc/ops.h b/csrc/ops.h
index 9617d6358..5fce3a1a3 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -102,13 +102,16 @@ void apply_repetition_penalties_(torch::Tensor& logits,
                                  const torch::Tensor& output_mask,
                                  const torch::Tensor& repetition_penalties);
 
-void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
-                   const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   int64_t numRows, int64_t stride0, int64_t stride1);
+void top_k_per_row_prefill(const torch::Tensor& logits,
+                           const torch::Tensor& rowStarts,
+                           const torch::Tensor& rowEnds, torch::Tensor& indices,
+                           int64_t numRows, int64_t stride0, int64_t stride1,
+                           int64_t topK);
 
 void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
-                          const torch::Tensor& seq_lens, torch::Tensor& indices,
-                          int64_t numRows, int64_t stride0, int64_t stride1);
+                          const torch::Tensor& seqLens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1,
+                          int64_t topK);
 
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
diff --git a/csrc/sampler.cu b/csrc/sampler.cu
index 410b8988f..fc2154bef 100644
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@@ -44,41 +44,300 @@ __global__ void apply_repetition_penalties_kernel(
   }
 }
 
-static inline __device__ uint16_t extractBinIdx(float x) {
-  union {
-    __half h;
-    uint16_t u16;
-  } tmp;
-  tmp.h = __float2half_rn(x);
-  tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000);
-  return 511 - (tmp.u16 >> 7);
+__device__ __forceinline__ auto convert_to_uint32(float x) -> uint32_t {
+  uint32_t bits = __float_as_uint(x);
+  return (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
 }
 
-template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
-__device__ void topKPerRowJob(const float* logits, const int rowStart,
-                              const int rowEnd, const int rowIdx,
-                              int* outIndices, int stride0, int stride1) {
-  // The number of elements per thread for the final top-k sort.
-  static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
-  // The class to sort the elements during the final top-k sort.
-  using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
-                                       kNumTopKItemsPerThread, int>;
+template <int step>
+static inline __device__ uint32_t extractBinIdx(float x) {
+  if constexpr (step == 0) {
+    __half hx = __float2half(x);
+    uint16_t bits = __half_as_ushort(hx);
+    bits = (bits & 0x8000) ? bits : ~bits & 0x7fff;
+    return bits >> 5;
+  } else {
+    uint32_t bits = __float_as_uint(x);
+    bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
+
+    if constexpr (step == 1) {
+      return bits >> 21;
+    } else if constexpr (step == 2) {
+      return (bits >> 10) & 0x7ff;
+    } else if constexpr (step == 3) {
+      return bits & 0x3ff;
+    }
+  }
+}
+
+template <int shift>
+static inline __device__ bool isPartialMatch(float x, uint32_t pattern) {
+  if constexpr (shift == 0) {
+    return true;
+  }
+  uint32_t bits = __float_as_uint(x);
+  bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
+  return (bits ^ pattern) >> shift == 0;
+}
+
+/**
+ * Map a Func over the input data, using vectorized load instructions if
+ * possible.
+ *
+ * @tparam T element type
+ * @tparam IdxT indexing type
+ * @tparam Func void (T x, IdxT idx)
+ *
+ * @param thread_rank rank of the calling thread among all participating threads
+ * @param num_threads number of the threads that participate in processing
+ * @param in the input data
+ * @param len the number of elements to read
+ * @param f the lambda taking two arguments (T x, IdxT idx)
+ */
+template <typename T, typename idxT, typename Func>
+__device__ void vectorized_process(size_t thread_rank, size_t num_threads,
+                                   const T* in, idxT len, Func f) {
+  constexpr int WARP_SIZE = 32;
+  using WideT = float4;
+  if constexpr (sizeof(T) >= sizeof(WideT)) {
+    for (idxT i = thread_rank; i < len; i += num_threads) {
+      f(in[i], i);
+    }
+  } else {
+    static_assert(sizeof(WideT) % sizeof(T) == 0);
+    constexpr int items_per_scalar = sizeof(WideT) / sizeof(T);
+    // TODO: it's UB
+    union {
+      WideT scalar;
+      T array[items_per_scalar];
+    } wide;
+
+    int skip_cnt =
+        (reinterpret_cast<size_t>(in) % sizeof(WideT))
+            ? ((sizeof(WideT) - reinterpret_cast<size_t>(in) % sizeof(WideT)) /
+               sizeof(T))
+            : 0;
+    if (skip_cnt > len) {
+      skip_cnt = len;
+    }
+    const WideT* in_cast = reinterpret_cast<decltype(in_cast)>(in + skip_cnt);
+    const idxT len_cast = (len - skip_cnt) / items_per_scalar;
+
+    for (idxT i = thread_rank; i < len_cast; i += num_threads) {
+      wide.scalar = in_cast[i];
+      const idxT real_i = skip_cnt + i * items_per_scalar;
+#pragma unroll
+      for (int j = 0; j < items_per_scalar; ++j) {
+        f(wide.array[j], real_i + j);
+      }
+    }
+
+    static_assert(WARP_SIZE >= items_per_scalar);
+    // and because items_per_scalar > skip_cnt, WARP_SIZE > skip_cnt
+    // no need to use loop
+    if (thread_rank < skip_cnt) {
+      f(in[thread_rank], thread_rank);
+    }
+    // because len_cast = (len - skip_cnt) / items_per_scalar,
+    // len_cast * items_per_scalar + items_per_scalar > len - skip_cnt;
+    // and so
+    // len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <=
+    // WARP_SIZE no need to use loop
+    const idxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank;
+    if (remain_i < len) {
+      f(in[remain_i], remain_i);
+    }
+  }
+}
+
+template <int step, int kNumThreadsPerBlock, int kNumBins, int kNumFinalItems,
+          bool multipleBlocksPerRow, bool mergeBlocks, typename SmemFinalType,
+          typename SmemOutputType>
+__device__ bool processHistogramStep(
+    const int* indices, const float* logits, int rowEnd, uint32_t& logitPattern,
+    int& thresholdBinIdx, SmemOutputType& smemOutput, int* smemThresholdBinIdx,
+    int* smemFinalDstIdx, int* smemFinalBinSize, int* smemFoundTopKValues,
+    SmemFinalType& smemFinal, int stride1, int rowStart, int topK) {
+  // Clear the histogram.
+#pragma unroll
+  for (int idx = threadIdx.x; idx < kNumBins; idx += kNumThreadsPerBlock) {
+    smemFinal.histo.data[idx] = 0;
+  }
+
+  // Make sure the histogram is ready.
+  __syncthreads();
+
+  // Update pattern
+  constexpr auto patternShift = step < 2 ? 0 : step == 2 ? 21 : 10;
+  if constexpr (step == 2) {
+    logitPattern = static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
+                   << patternShift;
+  } else if constexpr (step == 3) {
+    logitPattern |= static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
+                    << patternShift;
+  }
+
+  auto distributeToBins = [&](float logit, int /* idx */ = 0) {
+    if (isPartialMatch<patternShift>(logit, logitPattern)) {
+      uint32_t binIdx = extractBinIdx<step>(logit);
+      atomicAdd(&smemFinal.histo.data[binIdx], 1);
+    }
+  };
+
+  // Distribute the elements to the histogram bins.
+  if (stride1 == 1) {
+    vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
+                       rowEnd - rowStart, distributeToBins);
+  } else {
+    for (int idx = rowStart + threadIdx.x; idx < rowEnd;
+         idx += kNumThreadsPerBlock) {
+      float logit = logits[idx * stride1];
+      distributeToBins(logit, idx);
+    }
+  }
+  // Make sure the histogram is ready.
+  __syncthreads();
+
+  // Reads the value of the starting position in the smemOutput array
+  int lastValue = smemFoundTopKValues[0];
+
+  for (int round = 0; round < kNumBins / kNumThreadsPerBlock; round++) {
+    // Read the values from SMEM.
+    int idx = threadIdx.x + kNumThreadsPerBlock * round;
+    int binCount{0};
+    binCount = smemFinal.histo.data[idx];
+
+    // Make sure each thread has read its value.
+    __syncthreads();
+
+    // Compute the prefix sum.
+    int prefixSum{0}, totalSum{0};
+    using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
+    Scan(smemFinal.histo.scan).ExclusiveSum(binCount, prefixSum, totalSum);
+
+    // Update the histogram with the prefix sums.
+    prefixSum += lastValue;
+    totalSum += lastValue;
+    smemFinal.histo.data[idx] = prefixSum;
+
+    // Make sure the data is in shared memory.
+    __syncthreads();
+
+    // Find the last valid bin.
+    bool foundThreshold = false;
+    if (prefixSum < topK) {
+      int nextPrefixSum = threadIdx.x == kNumThreadsPerBlock - 1
+                              ? totalSum
+                              : smemFinal.histo.data[idx + 1];
+
+      if (nextPrefixSum >= topK) {
+        smemThresholdBinIdx[0] = idx;
+        smemFinalBinSize[0] = nextPrefixSum - prefixSum;
+        foundThreshold = true;
+      }
+    }
+
+    // Early exit: if any thread found the threshold, we can skip remaining
+    // rounds
+    if (__syncthreads_or(foundThreshold)) {
+      break;
+    }
+
+    lastValue = totalSum;
+  }
+
+  // Make sure the data is in shared memory.
+  __syncthreads();
+
+  // The threshold bin.
+  thresholdBinIdx = smemThresholdBinIdx[0];
+
+  auto processBins = [&](float logit, int idx) {
+    if (isPartialMatch<patternShift>(logit, logitPattern)) {
+      uint32_t binIdx = extractBinIdx<step>(logit);
+      if (binIdx < thresholdBinIdx) {
+        // The element is part of the top-k selection
+        int dstIdx = atomicAdd(&smemFoundTopKValues[0], 1);
+
+        if constexpr (mergeBlocks) {
+          smemOutput[dstIdx] = indices[idx];
+        } else if constexpr (multipleBlocksPerRow) {
+          smemOutput[dstIdx] = idx + rowStart;
+          reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
+        } else {
+          smemOutput[dstIdx] = idx;
+        }
+      }
+      if constexpr (step < 3) {
+        // Only fill the final items for sorting if the threshold bin fits
+        if (binIdx == thresholdBinIdx &&
+            smemFinalBinSize[0] <= kNumFinalItems) {
+          int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
+          smemFinal.items.logits[dstIdx] = logit;
+          if constexpr (mergeBlocks) {
+            smemFinal.items.indices[dstIdx] = indices[idx];
+          } else if constexpr (multipleBlocksPerRow) {
+            smemFinal.items.indices[dstIdx] = idx + rowStart;
+          } else {
+            smemFinal.items.indices[dstIdx] = idx;
+          }
+        }
+      } else {
+        if (binIdx == thresholdBinIdx) {
+          // The elements in the threshold bin share the same 32 bits at step 3
+          int dstIdx = atomicAdd(&smemFinal.histo.data[binIdx], 1);
+          if (dstIdx < topK) {
+            if constexpr (mergeBlocks) {
+              smemOutput[dstIdx] = indices[idx];
+            } else if constexpr (multipleBlocksPerRow) {
+              smemOutput[dstIdx] = idx + rowStart;
+              reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
+            } else {
+              smemOutput[dstIdx] = idx;
+            }
+          }
+        }
+      }
+    }
+  };
+
+  if (stride1 == 1) {
+    vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
+                       rowEnd - rowStart, processBins);
+  } else {
+    for (int idx = rowStart + threadIdx.x; idx < rowEnd;
+         idx += kNumThreadsPerBlock) {
+      float logit = logits[idx * stride1];
+      processBins(logit, idx);
+    }
+  }
+
+  // Make sure the elements are in shared memory.
+  __syncthreads();
+
+  // Check if we should continue to next step
+  return smemFinalBinSize[0] > kNumFinalItems;
+}
 
+// Follows half - 11 - 11 - 10 bit iterations
+template <int kNumThreadsPerBlock, int kNumBins, bool useRadixSort,
+          bool multipleBlocksPerRow = false, bool mergeBlocks = false>
+static __device__ void topKPerRowJob(const int* indices, const float* logits,
+                                     int rowStart, int rowEnd, int* outIndices,
+                                     float* outLogits, int stride1, int topK) {
   // The number of slots for the final pass.
-  static constexpr int kNumFinalItems = 3072;
+  static constexpr int kNumFinalItems = 2048;
   // The number of elements per thread for the final sort.
   static constexpr int kNumFinalItemsPerThread =
       kNumFinalItems / kNumThreadsPerBlock;
   // The class to sort the elements during the final pass.
   using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
                                         kNumFinalItemsPerThread, int>;
-
+  using FinalSortTempStorage =
+      std::conditional_t<useRadixSort, typename FinalSort::TempStorage, int>;
   // The class to compute the inclusive prefix-sum over the histogram.
   using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
 
-  // Shared memory to compute the block scan.
-  __shared__ typename Scan::TempStorage smemScan;
-
   // The structure to store the final items (for the final pass).
   struct FinalItems {
     // Shared memory to store the indices for the final pass.
@@ -87,200 +346,225 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
     float logits[kNumFinalItems];
   };
 
+  struct Histogram {
+    typename Scan::TempStorage scan;
+    int data[kNumBins];
+  };
+
   // Shared memory to compute the block sort.
   __shared__ union {
     FinalItems items;
-    typename FinalSort::TempStorage finalSort;
-    typename TopKSort::TempStorage topKSort;
+    FinalSortTempStorage finalSort;
+    Histogram histo;
   } smemFinal;
 
-  // Shared memory to store the histogram.
-  __shared__ int smemHistogram[kNumBins];
   // Shared memory to store the selected indices.
-  __shared__ int smemIndices[kTopK];
+  // If we are processing using multiple blocks, we need to store the logits and
+  // indices.
+  extern __shared__ int32_t smemOutput[];
+
   // Shared memory to store the threshold bin.
   __shared__ int smemThresholdBinIdx[1];
   // Shared memory counter to register the candidates for the final phase.
   __shared__ int smemFinalDstIdx[1];
+  // Shared memory to determine if the threshold bin fits in the final items.
+  __shared__ int smemFinalBinSize[1];
+  // Shared memory to keep track of the top-k values found so far by the
+  // previous iterations
+  __shared__ int smemFoundTopKValues[1];
 
   // The length of the row.
   int rowLen = rowEnd - rowStart;
 
   // Shortcut if the length of the row is smaller than Top-K. Indices are not
   // sorted by their corresponding logit.
-  if (rowLen <= kTopK) {
+  if (rowLen <= topK) {
     for (int rowIt = threadIdx.x; rowIt < rowLen;
          rowIt += kNumThreadsPerBlock) {
-      int idx = rowStart + rowIt;
-      outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
+      if constexpr (multipleBlocksPerRow) {
+        outIndices[rowIt] = rowIt + rowStart;
+        outLogits[rowIt] = logits[rowIt + rowStart];
+      } else {
+        outIndices[rowIt] = rowIt;
+      }
     }
-    for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
+    for (int rowIt = rowLen + threadIdx.x; rowIt < topK;
          rowIt += kNumThreadsPerBlock) {
-      outIndices[rowIdx * kTopK + rowIt] = -1;
+      outIndices[rowIt] = -1;
+      if constexpr (multipleBlocksPerRow) {
+        outLogits[rowIt] = -FLT_MAX;
+      }
     }
-    return;
-  }
-
-  // Clear the histogram.
-  if (threadIdx.x < kNumBins) {
-    smemHistogram[threadIdx.x] = 0;
-  }
-
-  // Make sure the histogram is ready.
-  __syncthreads();
-
-  // Fetch elements one-by-one.
-  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
-       rowIt += kNumThreadsPerBlock) {
-    uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
-    atomicAdd(&smemHistogram[idx], 1);
-  }
-
-  // Make sure the histogram is ready.
-  __syncthreads();
-
-  // Read the values from SMEM.
-  int binCount{0};
-  if (threadIdx.x < kNumBins) {
-    binCount = smemHistogram[threadIdx.x];
-  }
-
-  // Make sure each thread has read its value.
-  __syncthreads();
-
-  // Compute the prefix sum.
-  int prefixSum{0}, totalSum{0};
-  Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);
-
-  // Update the histogram with the prefix sums.
-  if (threadIdx.x < kNumBins) {
-    smemHistogram[threadIdx.x] = prefixSum;
-  }
 
-  // Make sure the data is in shared memory.
-  __syncthreads();
-
-  // Find the last valid bin.
-  if (threadIdx.x < kNumBins) {
-    int nextPrefixSum =
-        threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
-    if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
-      smemThresholdBinIdx[0] = threadIdx.x;
-    }
+    return;
   }
-
-  // Clear the counter to store the items for the final phase.
+  // Initialize values
   if (threadIdx.x == 0) {
     smemFinalDstIdx[0] = 0;
+    smemFoundTopKValues[0] = 0;
   }
-
-  // Make sure the data is in shared memory.
   __syncthreads();
+  int thresholdBinIdx = -1;
+  uint32_t logitPattern = 0;
+
+  // Step 0: Process first 11 bits of half representation
+  bool continueToNextStep =
+      processHistogramStep<0, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                           multipleBlocksPerRow, mergeBlocks>(
+          indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+          smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+          smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
+
+  if (continueToNextStep) {
+    // Step 1: Process next 11 bits
+    continueToNextStep =
+        processHistogramStep<1, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                             multipleBlocksPerRow, mergeBlocks>(
+            indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+            smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+            smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
+  }
 
-  // The threshold bin.
-  int thresholdBinIdx = smemThresholdBinIdx[0];
-
-  // Fetch elements one-by-one and populate the shared memory buffers.
-  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
-       rowIt += kNumThreadsPerBlock) {
-    float logit = logits[rowIdx * stride0 + rowIt * stride1];
-    uint16_t idx = extractBinIdx(logit);
-    if (idx < thresholdBinIdx) {
-      int dstIdx = atomicAdd(&smemHistogram[idx], 1);
-      smemIndices[dstIdx] = rowIt;
-    } else if (idx == thresholdBinIdx) {
-      int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
-      if (dstIdx < kNumFinalItems) {
-        smemFinal.items.logits[dstIdx] = logit;
-        smemFinal.items.indices[dstIdx] = rowIt;
-      }
-    }
+  if (continueToNextStep) {
+    // Step 2: Process next 11 bits
+    continueToNextStep =
+        processHistogramStep<2, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                             multipleBlocksPerRow, mergeBlocks>(
+            indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+            smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+            smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
   }
 
-  // Make sure the elements are in shared memory.
-  __syncthreads();
+  if (continueToNextStep) {
+    // Step 3: Process last 10 bits
+    processHistogramStep<3, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                         multipleBlocksPerRow, mergeBlocks>(
+        indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+        smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+        smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
+  }
 
-  // The logits of the elements to be sorted in the final pass.
-  float finalLogits[kNumFinalItemsPerThread];
-  // The indices of the elements to be sorted in the final pass.
-  int finalIndices[kNumFinalItemsPerThread];
+  if (!continueToNextStep) {
+    // The histogram did not proceed to the final 10 bits, therefore we need to
+    // sort the final items The logits of the elements to be sorted in the final
+    // pass.
+    if constexpr (useRadixSort) {
+      // Sorting with radix sort
+      float finalLogits[kNumFinalItemsPerThread];
+      // The indices of the elements to be sorted in the final pass.
+      int finalIndices[kNumFinalItemsPerThread];
 
-// Init.
 #pragma unroll
-  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
-    finalLogits[ii] = -FLT_MAX;
-  }
+      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+        finalLogits[ii] = -FLT_MAX;
+      }
 
-// Read the elements from SMEM.
+      // Read the elements from SMEM.
 #pragma unroll
-  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
-    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
-    if (srcIdx < smemFinalDstIdx[0]) {
-      finalLogits[ii] = smemFinal.items.logits[srcIdx];
-      finalIndices[ii] = smemFinal.items.indices[srcIdx];
-    }
-  }
+      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+        int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
+        if (srcIdx < smemFinalDstIdx[0]) {
+          finalLogits[ii] = smemFinal.items.logits[srcIdx];
+          finalIndices[ii] = smemFinal.items.indices[srcIdx];
+        }
+      }
+      // Make sure the shared memory has been read.
+      __syncthreads();
 
-  // Make sure the shared memory has been read.
-  __syncthreads();
+      // Sort the elements.
+      FinalSort(smemFinal.finalSort)
+          .SortDescendingBlockedToStriped(finalLogits, finalIndices);
 
-  // Sort the elements.
-  FinalSort(smemFinal.finalSort)
-      .SortDescendingBlockedToStriped(finalLogits, finalIndices);
+      // Copy the data back to the shared memory storage.
+      int baseIdx = smemFoundTopKValues[0];
 
-  // Copy the data back to the shared memory storage.
-  int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
 #pragma unroll
-  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
-    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
-    int dstIdx = baseIdx + srcIdx;
-    if (dstIdx < kTopK) {
-      smemIndices[dstIdx] = finalIndices[ii];
+      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+        int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
+        int dstIdx = baseIdx + srcIdx;
+
+        if (dstIdx < topK) {
+          smemOutput[dstIdx] = finalIndices[ii];
+          if constexpr (multipleBlocksPerRow) {
+            reinterpret_cast<float*>(smemOutput + topK)[dstIdx] =
+                finalLogits[ii];
+          }
+        }
+      }
+    } else {
+      // Sorting with insertion sort
+      auto baseIdx = smemFoundTopKValues[0];
+      for (int i = threadIdx.x; i < smemFinalDstIdx[0];
+           i += kNumThreadsPerBlock) {
+        int outIndex = 0;
+        auto logit = smemFinal.items.logits[i];
+        for (int j = 0; j < smemFinalDstIdx[0]; j++) {
+          auto otherLogit = smemFinal.items.logits[j];
+          if (logit < otherLogit || (logit == otherLogit && i < j)) {
+            outIndex++;
+          }
+        }
+        // Store if outIndex is in bounds
+        if (outIndex + baseIdx < topK) {
+          smemOutput[outIndex + baseIdx] = smemFinal.items.indices[i];
+          if constexpr (multipleBlocksPerRow) {
+            reinterpret_cast<float*>(smemOutput + topK)[outIndex + baseIdx] =
+                smemFinal.items.logits[i];
+          }
+        }
+      }
     }
+    __syncthreads();
   }
 
-  // Make sure the data is in shared memory.
-  __syncthreads();
-
-// Store to global memory.
-#pragma unroll
-  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
-    int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
-    outIndices[offset] =
-        smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
+  // Store to global memory.
+  for (int i = threadIdx.x; i < topK; i += kNumThreadsPerBlock) {
+    if constexpr (multipleBlocksPerRow) {
+      outIndices[i] = smemOutput[i];
+      outLogits[i] = reinterpret_cast<float*>(smemOutput + topK)[i];
+    } else {
+      if (stride1 == 1) {
+        // stride1 == 1 will use vectorized_process, which indexes already skip
+        // the rowStart.
+        outIndices[i] = smemOutput[i];
+      } else {
+        outIndices[i] = smemOutput[i] - rowStart;
+      }
+    }
   }
 }
 
-template <int kNumThreadsPerBlock = 512>
-static __global__ void topKPerRow(const float* logits, const int* rowStarts,
-                                  const int* rowEnds, int* outIndices,
-                                  int stride0, int stride1) {
+template <int kNumThreadsPerBlock, bool useRadixSort>
+static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
+    const float* logits, const int* rowStarts, const int* rowEnds,
+    int* outIndices, int stride0, int stride1, const int topK,
+    const int offsetIndex) {
   // The number of bins in the histogram.
-  static constexpr int kNumBins = 512;
-
-  // The top-k width.
-  static constexpr int kTopK = 2048;
+  static constexpr int kNumBins = 2048;
 
   // The row computed by this block.
-  int rowIdx = blockIdx.x;
+  int rowIdx = blockIdx.x + offsetIndex;
 
   // The range of logits within the row.
   int rowStart = rowStarts[rowIdx];
   int rowEnd = rowEnds[rowIdx];
 
-  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
-      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+  // Local pointers to this block
+  outIndices += rowIdx * topK;
+  logits += rowIdx * stride0;
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
+      nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
 }
 
-template <int kNumThreadsPerBlock = 512>
-static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
-                                        int* outIndices, int stride0,
-                                        int stride1, int next_n) {
+template <int kNumThreadsPerBlock, bool useRadixSort,
+          bool multipleBlocksPerRow = false, bool mergeBlocks = false>
+static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
+    const float* logits, const int* seqLens, int* outIndices, int stride0,
+    int stride1, const int topK, int next_n, float* outLogits = nullptr,
+    const int numBlocksToMerge = 0, const int* indices = nullptr) {
   // The number of bins in the histogram.
-  static constexpr int kNumBins = 512;
-
-  // The top-k width.
-  static constexpr int kTopK = 2048;
+  static constexpr int kNumBins = 2048;
 
   // The row computed by this block.
   int rowIdx = blockIdx.x;
@@ -290,8 +574,25 @@ static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
   int seq_len = seqLens[rowIdx / next_n];
   int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
 
-  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
-      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+  // Local pointers to this block
+  if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
+    outIndices += rowIdx * topK;
+  } else if constexpr (multipleBlocksPerRow) {
+    const auto blockSize = rowEnd / gridDim.y;  // 16384 / 2 = 8192
+    rowStart = blockSize * blockIdx.y;          // 8192 * 1 = 8192
+    rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
+    outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
+    outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
+  } else if constexpr (mergeBlocks) {
+    rowEnd = numBlocksToMerge * topK;
+    indices += rowIdx * numBlocksToMerge * topK;
+    outIndices += rowIdx * topK;
+  }
+  logits += rowIdx * stride0;
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort,
+                multipleBlocksPerRow, mergeBlocks>(
+      indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK);
 }
 
 }  // namespace vllm
@@ -339,28 +640,84 @@ void apply_repetition_penalties_(
 
 void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
                           const torch::Tensor& seqLens, torch::Tensor& indices,
-                          int64_t numRows, int64_t stride0, int64_t stride1) {
-  // Compute the results on the device.
+                          int64_t numRows, int64_t stride0, int64_t stride1,
+                          int64_t topK) {
+  constexpr int kSortingAlgorithmThreshold = 12288;
+  constexpr int kSplitWorkThreshold = 200 * 1000;
   constexpr int kNumThreadsPerBlock = 512;
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  vllm::topKPerRowDecode<kNumThreadsPerBlock>
-      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
-          logits.data_ptr<float>(), seqLens.data_ptr<int>(),
-          indices.data_ptr<int>(), static_cast<int>(stride0),
-          static_cast<int>(stride1), static_cast<int>(next_n));
+  const auto numColumns = logits.size(1);
+
+  if (numColumns < kSortingAlgorithmThreshold) {
+    // Use insertion sort
+    vllm::topKPerRowDecode<kNumThreadsPerBlock, false>
+        <<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
+            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+            indices.data_ptr<int>(), static_cast<int>(stride0),
+            static_cast<int>(stride1), static_cast<int>(topK),
+            static_cast<int>(next_n));
+  } else if (numColumns < kSplitWorkThreshold) {
+    // From this threshold, use radix sort instead
+    vllm::topKPerRowDecode<kNumThreadsPerBlock, true>
+        <<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
+            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+            indices.data_ptr<int>(), static_cast<int>(stride0),
+            static_cast<int>(stride1), static_cast<int>(topK),
+            static_cast<int>(next_n));
+  } else {
+    // Long sequences are run in two steps
+    constexpr auto multipleBlocksPerRowConfig = 10;
+
+    const auto outIndicesAux =
+        torch::empty({numRows, multipleBlocksPerRowConfig, topK},
+                     torch::dtype(torch::kInt32).device(logits.device()));
+    const auto outLogitsAux =
+        torch::empty({numRows, multipleBlocksPerRowConfig, topK},
+                     torch::dtype(torch::kFloat).device(logits.device()));
+
+    vllm::topKPerRowDecode<kNumThreadsPerBlock, true, true>
+        <<<dim3(numRows, multipleBlocksPerRowConfig), kNumThreadsPerBlock,
+           2 * topK * sizeof(int32_t), stream>>>(
+            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+            outIndicesAux.data_ptr<int>(), static_cast<int>(stride0),
+            static_cast<int>(stride1), static_cast<int>(topK),
+            static_cast<int>(next_n), outLogitsAux.data_ptr<float>());
+
+    constexpr int kNumThreadsPerBlockMerge = 1024;
+    vllm::topKPerRowDecode<kNumThreadsPerBlockMerge, true, false, true>
+        <<<numRows, kNumThreadsPerBlockMerge, topK * sizeof(int32_t), stream>>>(
+            outLogitsAux.data_ptr<float>(), seqLens.data_ptr<int>(),
+            indices.data_ptr<int>(), multipleBlocksPerRowConfig * topK, 1,
+            static_cast<int>(topK), static_cast<int>(next_n), nullptr,
+            multipleBlocksPerRowConfig, outIndicesAux.data_ptr<int>());
+  }
 }
 
-void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
-                   const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   int64_t numRows, int64_t stride0, int64_t stride1) {
-  // Compute the results on the device.
+void top_k_per_row_prefill(const torch::Tensor& logits,
+                           const torch::Tensor& rowStarts,
+                           const torch::Tensor& rowEnds, torch::Tensor& indices,
+                           int64_t numRows, int64_t stride0, int64_t stride1,
+                           int64_t topK) {
+  constexpr int kSortingAlgorithmThreshold = 12288;
   constexpr int kNumThreadsPerBlock = 512;
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  vllm::topKPerRow<kNumThreadsPerBlock>
-      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
-          logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
-          rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
-          static_cast<int>(stride0), static_cast<int>(stride1));
+  int numInsertionBlocks =
+      std::min(static_cast<int>(numRows), kSortingAlgorithmThreshold);
+  vllm::topKPerRowPrefill<kNumThreadsPerBlock, false>
+      <<<numInsertionBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
+         stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
+                   rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
+                   static_cast<int>(stride0), static_cast<int>(stride1),
+                   static_cast<int>(topK), 0);
+
+  if (numRows > kSortingAlgorithmThreshold) {
+    int numRadixBlocks = numRows - kSortingAlgorithmThreshold;
+    vllm::topKPerRowPrefill<kNumThreadsPerBlock, true>
+        <<<numRadixBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
+           stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
+                     rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
+                     static_cast<int>(stride0), static_cast<int>(stride1),
+                     static_cast<int>(topK), kSortingAlgorithmThreshold);
+  }
 }
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index db37a9b9b..62212f98b 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -179,15 +179,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   // Optimized top-k per row operation
   ops.def(
-      "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
+      "top_k_per_row_prefill(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
       "Tensor! indices, int numRows, int stride0, "
-      "int stride1) -> ()");
-  ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
+      "int stride1, int topK) -> ()");
+  ops.impl("top_k_per_row_prefill", torch::kCUDA, &top_k_per_row_prefill);
 
   ops.def(
       "top_k_per_row_decode(Tensor logits, int next_n, "
-      "Tensor seq_lens, Tensor! indices, int numRows, "
-      "int stride0, int stride1) -> ()");
+      "Tensor seq_lens, Tensor! indices, "
+      "int numRows, int stride0, int stride1, int topK) -> ()");
   ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
 
   // Layernorm-quant
diff --git a/tests/kernels/test_top_k_per_row.py b/tests/kernels/test_top_k_per_row.py
index cadda27b4..3bf693897 100644
--- a/tests/kernels/test_top_k_per_row.py
+++ b/tests/kernels/test_top_k_per_row.py
@@ -9,23 +9,45 @@ from vllm.platforms import current_platform
 
 # Test parameters
 NUM_ROWS = [1, 32, 2050]
-TOP_K_VALUES = [2048]
-BATCH_SIZE = [1, 2, 4, 2048, 4096]
-NEXT_N = [1, 2, 4, 8]
+TOP_K_VALUES = [2048, 3000]
+BATCH_SIZE = [1, 2, 2048]
+NEXT_N = [1, 8]
+DATA_GENERATION = ["random", "10LSBits"]
 
 
 def create_random_logits(
     row_starts: torch.Tensor,
     row_ends: torch.Tensor,
-    vocab_size: int,
     dtype: torch.dtype,
     seed: int,
+    data_generation: str,
 ) -> torch.Tensor:
     """Create random logits tensor for testing."""
     torch.manual_seed(seed)
     np.random.seed(seed)
     # Generate logits with some structure to make testing more meaningful
-    logits = torch.randn(row_starts.shape[0], max(row_ends), dtype=dtype, device="cuda")
+    if data_generation == "random":
+        logits = torch.randn(
+            row_starts.shape[0], max(row_ends), dtype=dtype, device="cuda"
+        )
+    elif data_generation == "10LSBits":
+        top_22_bits_mask = 0xFFFFFC00
+        last_10_bits_mask = 0x000003FF
+        fixed_top_22_bits = 0x3F900000
+        # Generate random bits for the last 10 bits
+        random_bottom_bits = torch.randint(
+            0,
+            2**10,
+            (row_starts.shape[0], max(row_ends)),
+            dtype=torch.int32,
+            device="cuda",
+        )
+        # Combine: fixed top 22 bits with random last 10 bits
+        logits_bits = (fixed_top_22_bits & top_22_bits_mask) | (
+            random_bottom_bits & last_10_bits_mask
+        )
+        logits = logits_bits.view(dtype)
+
     for i, end in enumerate(row_ends):
         logits[i, end:] = float("-inf")
     return logits
@@ -113,13 +135,13 @@ def test_top_k_per_row(
     # Create test data
     vocab_size = 20000
     row_starts, row_ends = create_row_boundaries(num_rows, vocab_size)
-    logits = create_random_logits(row_starts, row_ends, vocab_size, torch.float32, 42)
+    logits = create_random_logits(row_starts, row_ends, torch.float32, 42, "random")
 
     # Create output tensors
     indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda")
 
     # Run CUDA implementation
-    torch.ops._C.top_k_per_row(
+    torch.ops._C.top_k_per_row_prefill(
         logits,
         row_starts,
         row_ends,
@@ -127,6 +149,7 @@ def test_top_k_per_row(
         num_rows,
         logits.stride(0),
         logits.stride(1),
+        top_k,
     )
 
     # Run reference implementation
@@ -139,27 +162,23 @@ def test_top_k_per_row(
     # Compare results
     assert compare_top_k_results(
         logits, indices, torch_indices, row_starts, row_ends, top_k
-    ), "CUDA top_k_per_row results don't match torch.topk"
+    ), "CUDA top_k_per_row_prefill results don't match torch.topk"
 
 
-@pytest.mark.parametrize("top_k", TOP_K_VALUES)
-@pytest.mark.parametrize("batch_size", BATCH_SIZE)
-@pytest.mark.parametrize("next_n", NEXT_N)
-@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA")
-@torch.inference_mode()
-def test_top_k_per_row_decode(
+def _run_top_k_per_row_decode_test(
     top_k: int,
     batch_size: int,
     next_n: int,
+    vocab_size: int,
+    data_generation: str,
 ) -> None:
     """
-    Test top_k_per_row with seq_lens tensor.
+    Helper function to run top_k_per_row_decode test with given parameters.
     """
     torch.set_default_device("cuda:0")
 
     # Create test data
     num_rows = batch_size * next_n
-    vocab_size = 20000
     seq_lens = torch.randint(
         vocab_size, (batch_size,), dtype=torch.int32, device="cuda"
     )
@@ -167,7 +186,9 @@ def test_top_k_per_row_decode(
     row_indices = torch.arange(num_rows, device="cuda") // next_n
     next_n_offset = torch.arange(num_rows, device="cuda") % next_n
     row_ends = seq_lens[row_indices] - next_n + next_n_offset + 1
-    logits = create_random_logits(row_starts, row_ends, vocab_size, torch.float32, 42)
+    logits = create_random_logits(
+        row_starts, row_ends, torch.float32, 42, data_generation
+    )
 
     # Create output tensors
     indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda")
@@ -181,6 +202,7 @@ def test_top_k_per_row_decode(
         num_rows,
         logits.stride(0),
         logits.stride(1),
+        top_k,
     )
 
     torch.cuda.synchronize()
@@ -195,4 +217,41 @@ def test_top_k_per_row_decode(
     # Compare results
     assert compare_top_k_results(
         logits, indices, torch_indices, row_starts, row_ends, top_k
-    ), "CUDA top_k_per_row results don't match torch.topk"
+    ), "CUDA top_k_per_row_decode results don't match torch.topk"
+
+
+@pytest.mark.parametrize("top_k", TOP_K_VALUES)
+@pytest.mark.parametrize("batch_size", BATCH_SIZE)
+@pytest.mark.parametrize("next_n", NEXT_N)
+@pytest.mark.parametrize("data_generation", DATA_GENERATION)
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA")
+@torch.inference_mode()
+def test_top_k_per_row_decode(
+    top_k: int,
+    batch_size: int,
+    next_n: int,
+    data_generation: str,
+) -> None:
+    """
+    Test top_k_per_row with seq_lens tensor.
+    """
+    vocab_size = 20000
+    _run_top_k_per_row_decode_test(
+        top_k, batch_size, next_n, vocab_size, data_generation
+    )
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA")
+@torch.inference_mode()
+def test_top_k_per_row_decode_large_vocab_size() -> None:
+    """
+    Test top_k_per_row_decode with large vocabulary size.
+    """
+    top_k = 2048
+    batch_size = 2
+    next_n = 2
+    vocab_size = 300000
+    data_generation = "random"
+    _run_top_k_per_row_decode_test(
+        top_k, batch_size, next_n, vocab_size, data_generation
+    )
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index a8eb4a69b..0b6513789 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -684,11 +684,10 @@ def sparse_attn_indexer(
                 chunk.cu_seqlen_ke,
             )
             num_rows = logits.shape[0]
-            assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
             topk_indices = topk_indices_buffer[
                 chunk.token_start : chunk.token_end, :topk_tokens
             ]
-            torch.ops._C.top_k_per_row(
+            torch.ops._C.top_k_per_row_prefill(
                 logits,
                 chunk.cu_seqlen_ks,
                 chunk.cu_seqlen_ke,
@@ -696,6 +695,7 @@ def sparse_attn_indexer(
                 num_rows,
                 logits.stride(0),
                 logits.stride(1),
+                topk_tokens,
             )
 
     if has_decode:
@@ -738,7 +738,6 @@ def sparse_attn_indexer(
             max_model_len=max_model_len,
         )
         num_rows = logits.shape[0]
-        assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
         topk_indices = topk_indices_buffer[:num_decode_tokens, :topk_tokens]
 
         torch.ops._C.top_k_per_row_decode(
@@ -749,6 +748,7 @@ def sparse_attn_indexer(
             num_rows,
             logits.stride(0),
             logits.stride(1),
+            topk_tokens,
         )
         if decode_metadata.requires_padding:
             # if padded, we need to unpack
-- 
GitLab


From 87aee9ed2b65b9ef7cdb83cc69dc1fc15f39ff36 Mon Sep 17 00:00:00 2001
From: Laith Sakka <lsakka@meta.com>
Date: Mon, 8 Dec 2025 07:46:15 -0800
Subject: [PATCH 196/258] Add evaluate_guards option to DynamicShapesConfig
 (#27432)

Signed-off-by: Laith Sakka <lsakka@meta.com>
---
 .../test_dynamic_shapes_compilation.py        | 139 +++++++++++++++++-
 vllm/compilation/backends.py                  |  26 +++-
 vllm/compilation/decorators.py                |   4 +-
 vllm/compilation/wrapper.py                   |  59 ++++++--
 vllm/config/compilation.py                    |  17 ++-
 vllm/config/vllm.py                           |   2 +-
 6 files changed, 219 insertions(+), 28 deletions(-)

diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
index 1966b03cd..bc3dbf553 100644
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -2,12 +2,21 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import gc
+import tempfile
+from contextlib import contextmanager
 
 import pytest
 import torch
 
 from vllm import LLM, SamplingParams
-from vllm.config.compilation import CompilationMode, DynamicShapesType
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
+from vllm.config.compilation import (
+    CompilationMode,
+    DynamicShapesConfig,
+    DynamicShapesType,
+)
+from vllm.forward_context import set_forward_context
 from vllm.tokenizers import get_tokenizer
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
@@ -29,18 +38,19 @@ def get_test_models():
 )
 @pytest.mark.parametrize("use_aot_compile", ["0"])
 @pytest.mark.parametrize("use_bytecode_hook", [True, False])
+@pytest.mark.parametrize("evaluate_guards", [False, True])
 @pytest.mark.skipif(
     not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
 )
 def test_dynamic_shapes_compilation(
-    monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook
+    monkeypatch,
+    model_name,
+    shapes_type,
+    use_aot_compile,
+    use_bytecode_hook,
+    evaluate_guards,
 ):
     """Test that all dynamic shapes types compile successfully"""
-    print(
-        f"\nTesting model: {model_name} with {shapes_type.name}, "
-        f"AOT compile: {use_aot_compile}, "
-        f"Bytecode hook: {use_bytecode_hook}"
-    )
     if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
         pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
 
@@ -58,6 +68,7 @@ def test_dynamic_shapes_compilation(
             "mode": CompilationMode.VLLM_COMPILE,
             "dynamic_shapes_config": {
                 "type": shapes_type.value,
+                "evaluate_guards": evaluate_guards,
             },
         },
     )
@@ -86,3 +97,117 @@ def test_dynamic_shapes_compilation(
     torch.cuda.empty_cache()
     torch.cuda.synchronize()
     print("GPU memory cleared")
+
+
+@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
+@pytest.mark.parametrize(
+    "dynamic_shapes_type",
+    [
+        DynamicShapesType.BACKED,
+        DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
+    ],
+)
+@pytest.mark.parametrize("evaluate_guards", [False, True])
+def test_model_specialization_with_evaluate_guards(
+    monkeypatch, use_aot_compile, dynamic_shapes_type, evaluate_guards
+):
+    """Test that evaluate_guards correctly detects shape specialization
+    violations.
+    """
+
+    if (
+        use_aot_compile == "1"
+        and dynamic_shapes_type == DynamicShapesType.BACKED
+        and evaluate_guards
+    ):
+        pytest.skip("evaluate_guards for backed does not work with aot_compile =1")
+
+    @support_torch_compile
+    class ModelWithSizeCheck(torch.nn.Module):
+        def __init__(self, **kwargs):
+            super().__init__()
+
+        def forward(self, x: torch.Tensor):
+            # This will cause specialization - torch.compile will guard on
+            # sx.shape[0]
+            if x.shape[0] >= 10:
+                return x * 10
+            else:
+                return x * 10
+
+    @support_torch_compile
+    class ModelWithOneSizeCheck(torch.nn.Module):
+        def __init__(self, **kwargs):
+            super().__init__()
+
+        def forward(self, x: torch.Tensor):
+            # This will cause 0/1 specializations.
+            if x.shape[0] == 0:
+                return x * 10
+            if x.shape[0] == 1:
+                return x * 10
+            else:
+                return x * 10
+
+    @contextmanager
+    def use_vllm_config(vllm_config: VllmConfig):
+        with set_forward_context({}, vllm_config), set_current_vllm_config(vllm_config):
+            yield
+
+    monkeypatch.setenv("TOKENIZERS_PARALLELISM", "true")
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
+    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "0")
+
+    # Create vllm config with the desired settings
+    from vllm.config import CompilationMode
+
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            dynamic_shapes_config=DynamicShapesConfig(
+                type=dynamic_shapes_type,
+                evaluate_guards=evaluate_guards,
+            ),
+        )
+    )
+
+    def test(model_class, input1, input2, is_01_specialization=False):
+        with (
+            torch.no_grad(),
+            use_vllm_config(vllm_config),
+            tempfile.TemporaryDirectory() as tmpdirname,
+        ):
+            monkeypatch.setenv("VLLM_CACHE_ROOT", tmpdirname)
+
+            model = model_class(vllm_config=vllm_config).cuda()
+
+            model(input1)
+
+            if evaluate_guards and (
+                not (
+                    is_01_specialization
+                    and dynamic_shapes_type == DynamicShapesType.BACKED
+                )
+            ):
+                # This should fail because guards were added.
+                with pytest.raises(RuntimeError) as excinfo:
+                    model(input2)
+
+                # Expected failure - guard was violated
+                error_msg = str(excinfo.value)
+                assert (
+                    "GuardManager check failed" in error_msg
+                    or "Detected recompile when torch.compile stance" in error_msg
+                ), error_msg
+
+            else:
+                model(input2)
+
+    test(ModelWithSizeCheck, torch.randn(20, 10).cuda(), torch.randn(5, 10).cuda())
+    test(ModelWithSizeCheck, torch.randn(5, 10).cuda(), torch.randn(20, 10).cuda())
+    test(
+        ModelWithOneSizeCheck,
+        torch.randn(20, 10).cuda(),
+        torch.randn(1, 10).cuda(),
+        is_01_specialization=True,
+    )
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 26f4f16a8..dd2233522 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -26,6 +26,7 @@ from vllm.compilation.partition_rules import (
     should_split,
 )
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
+from vllm.config.compilation import DynamicShapesType
 from vllm.config.utils import Range, hash_factors
 from vllm.logger import init_logger
 from vllm.logging_utils import lazy
@@ -722,6 +723,29 @@ class VllmBackend:
             self.split_gm, submod_names_to_compile, self.vllm_config, self
         ).run(*fake_args)
 
+        from torch._guards import detect_fake_mode
+
+        fake_mode = detect_fake_mode()
+
+        if (
+            self.compilation_config.dynamic_shapes_config.evaluate_guards
+            and self.compilation_config.dynamic_shapes_config.type
+            == DynamicShapesType.BACKED
+        ):
+            from torch.utils._sympy.value_ranges import ValueRanges
+
+            # Drop counter-0/1 specializations guards; for backed dynamic shapes,
+            # torch.compile will specialize for 0/1 inputs or otherwise guards that
+            # shape is >= 2. This is because it's really hard not to hit a check
+            # against 0/1. When we evaluate shape guards, we exclude checking those
+            # guards (We would fail always otherwise).
+
+            # We avoid that by updating the ranges of backed sizes when the min is
+            # 2 for any, we assume it's 0.
+            for s, r in fake_mode.shape_env.var_to_range.items():
+                if r.lower == 2:
+                    fake_mode.shape_env.var_to_range[s] = ValueRanges(0, r.upper)
+
         graph_path = os.path.join(local_cache_dir, "computation_graph.py")
         if not os.path.exists(graph_path):
             # code adapted from
@@ -749,8 +773,6 @@ class VllmBackend:
                 graph, example_inputs, self.prefix, self.split_gm
             )
 
-        # if we need to copy input buffers for cudagraph
-        #
         # index of tensors that have symbolic shapes (batch size)
         # for weights and static buffers, they will have concrete shapes.
         # symbolic shape only happens for input tensors.
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 6bb66ce3e..31f5e7840 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -392,7 +392,6 @@ def _support_torch_compile(
 
             factors.append(_model_hash_key(self.forward))
             hash_key = hashlib.sha256(str(factors).encode()).hexdigest()
-
             cache_dir = os.path.join(
                 envs.VLLM_CACHE_ROOT,
                 "torch_aot_compile",
@@ -413,7 +412,8 @@ def _support_torch_compile(
                         f, f_globals=self.forward.__globals__
                     )
                 _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config)
-                loaded_fn.disable_guard_check()
+                if not self.compilation_config.dynamic_shapes_config.evaluate_guards:
+                    loaded_fn.disable_guard_check()
                 self.aot_compiled_fn = loaded_fn
             except Exception as e:
                 if os.path.exists(aot_compilation_path):
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 69e1ed37a..b59a4a9dd 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -4,7 +4,7 @@
 import os
 import sys
 from abc import abstractmethod
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from types import CodeType
 from typing import Any
 
@@ -13,6 +13,7 @@ import torch._C._dynamo.guards
 
 import vllm.envs as envs
 from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config
+from vllm.config.compilation import DynamicShapesType
 from vllm.logger import init_logger
 from vllm.utils.nvtx_pytorch_hooks import layerwise_nvtx_marker_context
 
@@ -125,23 +126,49 @@ class TorchCompileWithNoGuardsWrapper:
         if isinstance(backend, str) and backend == "inductor":
             options = vllm_config.compilation_config.inductor_compile_config
 
+        self.first_compile = True
+        self.evaluate_guards = (
+            vllm_config.compilation_config.dynamic_shapes_config.evaluate_guards
+        )
+
+        ds_type = vllm_config.compilation_config.dynamic_shapes_config.type
+
         if mode != CompilationMode.STOCK_TORCH_COMPILE:
             # Drop all the guards.
-            options["guard_filter_fn"] = lambda x: [False for _ in x]
+            if self.evaluate_guards:
+                assert not envs.VLLM_USE_BYTECODE_HOOK, (
+                    "compilation_config.dynamic_shapes_config.evaluate_guards "
+                    "requires VLLM_USE_BYTECODE_HOOK=0. "
+                )
 
-        # Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False
-        from vllm.compilation.decorators import DynamicShapesType
+                if envs.VLLM_USE_AOT_COMPILE:
+                    # disabled until https://github.com/pytorch/pytorch/pull/169239
+                    # is picked up.
+                    assert ds_type != DynamicShapesType.BACKED, (
+                        "evaluate_guards for backed shapes requires "
+                        "VLLM_USE_AOT_COMPILE=False. "
+                    )
+
+                options["guard_filter_fn"] = lambda x: [
+                    entry.guard_type == "SHAPE_ENV" for entry in x
+                ]
+            else:
+                options["guard_filter_fn"] = lambda x: [False for _ in x]
 
-        ds_type = vllm_config.compilation_config.dynamic_shapes_config.type
         compiled_ptr: Any = self.forward
+        # Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False
+
         if ds_type == DynamicShapesType.UNBACKED:
-            if envs.VLLM_USE_BYTECODE_HOOK:
-                # reason is that bytecode does this hack torch._dynamo.eval_frame.
-                # remove_from_cache(self.original_code_object()) to force a new
-                # re-compilation.
-                raise ValueError(
-                    "UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0. "
-                )
+            # reason is that bytecode does torch._dynamo.eval_frame.
+            # remove_from_cache(self.original_code_object()) to force a new
+            # re-compilation. And if we use
+            # compiled_ptr = self.check_invariants_and_forward
+            # it will reset all entries.
+            assert not envs.VLLM_USE_BYTECODE_HOOK, (
+                "UNBACKED dynamic shapes requires VLLM_USE_BYTECODE_HOOK=0. "
+            )
+            assert not self.evaluate_guards, "UNBACKED dynamic shapes do not add guards"
+
             compiled_ptr = self.check_invariants_and_forward
 
         if envs.VLLM_USE_AOT_COMPILE:
@@ -195,7 +222,13 @@ class TorchCompileWithNoGuardsWrapper:
                         self.forward, *args, **kwargs
                     )
         else:
-            with _compilation_context():
+            ctx = (
+                nullcontext()
+                if self.first_compile or not self.evaluate_guards
+                else torch.compiler.set_stance("fail_on_recompile")
+            )
+            self.first_compile = False
+            with _compilation_context(), ctx:
                 return self._call_with_optional_nvtx_range(
                     self._compiled_callable, *args, **kwargs
                 )
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index b79200f0e..51e4912aa 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -344,7 +344,18 @@ class DynamicShapesConfig:
       backed/unbacked.
     """
 
-    # TODO add a debug mode to fail
+    evaluate_guards: bool = False
+    """
+    A debug mode to detect and fail if Dynamo ever specializes a dynamic shape by
+    guarding on it. When True, dynamic shape guards are not dropped from dynamo.
+    And a failure will be triggered if a recompilation ever happens due to that.
+    This mode requires VLLM_USE_BYTECODE_HOOK to be 0.
+    Enabling this allow observing the dynamic shapes guards in the tlparse
+    artifacts also.
+    When type is backed, aot_compile must be disabled for this mode to work.
+    until this change picked up https://github.com/pytorch/pytorch/pull/169239.
+
+    """
 
     def compute_hash(self) -> str:
         """
@@ -455,8 +466,8 @@ class CompilationConfig:
     We use string to avoid serialization issues when using compilation in a
     distributed setting. When the compilation mode is 1 or 2, the backend is
     used for the compilation directly (it sees the whole graph). When the
-    compilation mode is 3, the backend supports both whole graph and piecewise 
-    compilation, available backends include eager, inductor, and custom backends, 
+    compilation mode is 3, the backend supports both whole graph and piecewise
+    compilation, available backends include eager, inductor, and custom backends,
     the latter of which can be defined via `get_compile_backend`. Furthermore,
     compilation is only piecewise if splitting ops is set accordingly and
     use_inductor_graph_partition is off. Note that the default options for
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 36e4bd159..a74413536 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -66,7 +66,7 @@ class OptimizationLevel(IntEnum):
     """O0 : No optimization. no compilation, no cudagraphs, no other
     optimization, just starting up immediately"""
     O1 = 1
-    """O1: Quick optimizations. Dynamo+Inductor compilation and Piecewise 
+    """O1: Quick optimizations. Dynamo+Inductor compilation and Piecewise
     cudagraphs"""
     O2 = 2
     """O2: Full optimizations. -O1 as well as Full and Piecewise cudagraphs."""
-- 
GitLab


From 67312cad11835bd75ca55fda83708d4806b82436 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 9 Dec 2025 00:59:31 +0800
Subject: [PATCH 197/258] [Misc] Split the LoRA code (#30253)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_layers.py                 |   2 +-
 tests/lora/test_lora_checkpoints.py       |   2 +-
 tests/lora/test_lora_huggingface.py       |   2 +-
 tests/lora/test_lora_manager.py           |   4 +-
 tests/lora/test_worker.py                 |   2 +-
 vllm/lora/lora_model.py                   | 246 ++++++++++++++++++++++
 vllm/lora/{models.py => model_manager.py} | 237 +--------------------
 vllm/lora/utils.py                        |   9 +
 vllm/lora/worker_manager.py               |   4 +-
 9 files changed, 265 insertions(+), 243 deletions(-)
 create mode 100644 vllm/lora/lora_model.py
 rename vllm/lora/{models.py => model_manager.py} (74%)

diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 9df3a07a9..47d1fcfe9 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -28,7 +28,7 @@ from vllm.lora.layers import (
     RowParallelLinearWithShardedLoRA,
     VocabParallelEmbeddingWithLoRA,
 )
-from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
+from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index e9653a2fe..e6816e83d 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.lora.models import LoRAModel
+from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 3348d2f8c..7c7f4eb4b 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.lora.models import LoRAModel
+from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 081f14d6f..50f17ced5 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -15,10 +15,10 @@ from vllm.lora.layers import (
     MergedColumnParallelLinearWithLoRA,
     RowParallelLinearWithLoRA,
 )
+from vllm.lora.lora_model import LoRAModel
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.models import (
+from vllm.lora.model_manager import (
     LoRAMapping,
-    LoRAModel,
     LoRAModelManager,
     LRUCacheLoRAModelManager,
 )
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 54059ec56..445aaf9cb 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -16,7 +16,7 @@ from vllm.config import (
 )
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
-from vllm.lora.models import LoRAMapping
+from vllm.lora.model_manager import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker
 
diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py
new file mode 100644
index 000000000..db170f13a
--- /dev/null
+++ b/vllm/lora/lora_model.py
@@ -0,0 +1,246 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+import safetensors.torch
+import torch
+
+from vllm.logger import init_logger
+from vllm.lora.lora_weights import LoRALayerWeights
+from vllm.lora.peft_helper import PEFTHelper
+from vllm.lora.utils import (
+    get_lora_id,
+    is_base_embeddding_weights,
+    is_regex_target_modules,
+    parse_fine_tuned_lora_name,
+)
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+from vllm.model_executor.models.utils import WeightsMapper
+from vllm.utils.platform_utils import is_pin_memory_available
+
+logger = init_logger(__name__)
+
+
+class LoRAModel:
+    """A LoRA fine-tuned model."""
+
+    def __init__(
+        self,
+        lora_model_id: int,
+        rank: int,
+        loras: dict[str, LoRALayerWeights],
+    ) -> None:
+        """
+        Args:
+            lora_model_id: The integer id for the lora model.
+            rank: lora rank.
+            loras: module name -> weights for lora-replaced layers.
+
+        """
+        self.id = lora_model_id
+
+        assert lora_model_id > 0, (
+            f"a valid lora id should be greater than 0, got {self.id}"
+        )
+        self.rank = rank
+        self.loras: dict[str, LoRALayerWeights] = loras
+
+    def clone(self, lora_model_id: int) -> "LoRAModel":
+        """Return a copy of the object with different ids.
+
+        Will share the underlying tensors."""
+        return self.__class__(
+            lora_model_id,
+            rank=self.rank,
+            loras=self.loras.copy(),
+        )
+
+    def get_lora(self, module_name: str) -> LoRALayerWeights | None:
+        """Get LoRA for a given module by name"""
+        return self.loras.get(module_name, None)
+
+    def check_lora_name(self, lora_name: str) -> bool:
+        return lora_name in self.loras
+
+    @classmethod
+    def from_lora_tensors(
+        cls,
+        lora_model_id: int,
+        tensors: dict[str, torch.Tensor],
+        peft_helper: PEFTHelper,
+        device: str = "cuda",
+        dtype: torch.dtype | None = None,
+        model_vocab_size: int | None = None,
+        weights_mapper: WeightsMapper | None = None,
+    ) -> "LoRAModel":
+        """Create a LoRAModel from a dictionary of tensors."""
+        pin_memory = str(device) == "cpu" and is_pin_memory_available()
+        loras: dict[str, LoRALayerWeights] = {}
+        for tensor_name, tensor in tensors.items():
+            if is_base_embeddding_weights(tensor_name):
+                continue
+            module_name, is_lora_a = parse_fine_tuned_lora_name(
+                tensor_name, weights_mapper
+            )
+            if module_name not in loras:
+                loras[module_name] = LoRALayerWeights.from_config(
+                    module_name, peft_helper
+                )
+
+            if is_lora_a:
+                if (
+                    "lora_embedding_A" in tensor_name
+                    and model_vocab_size is not None
+                    and model_vocab_size != tensor.shape[1]
+                ):
+                    raise RuntimeError(
+                        f"The embedding LoRA size({tensor.shape[1]}) must be consistent"
+                        f" with the base model's vocabulary size({model_vocab_size})."
+                    )
+                loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
+                if pin_memory:
+                    loras[module_name].lora_a = loras[module_name].lora_a.pin_memory()
+            else:
+                loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
+
+                if pin_memory:
+                    loras[module_name].lora_b = loras[module_name].lora_b.pin_memory()
+
+        return cls(lora_model_id, peft_helper.r, loras)
+
+    @classmethod
+    def from_local_checkpoint(
+        cls,
+        lora_dir: str,
+        expected_lora_modules: set[str],
+        peft_helper: PEFTHelper,
+        *,
+        lora_model_id: int | None = None,
+        device: str = "cuda",
+        dtype: torch.dtype | None = None,
+        model_vocab_size: int | None = None,
+        weights_mapper: WeightsMapper | None = None,
+        tensorizer_config_dict: dict | None = None,
+    ) -> "LoRAModel":
+        """Create a LoRAModel from a local checkpoint.
+
+        Args:
+            lora_dir: The local path that has lora data.
+            expected_lora_modules: Name of modules that are expected to be
+                replaced by lora.
+            peft_helper: Loaded lora configuration information.
+            lora_model_id: LoRA model id. If not given, automatically set by
+                a global counter.
+            device: Device where the lora model is loaded.
+            dtype: dtype of the lora model weights.
+
+        Returns:
+            Loaded LoRA Model.
+        """
+        lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors")
+        lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin")
+        lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt")
+
+        tensors: dict[str, torch.Tensor] = {}
+        unexpected_modules: list[list[str] | str] = []
+
+        def check_unexpected_modules(modules: dict):
+            for lora_module in modules.keys():  # noqa
+                if is_base_embeddding_weights(lora_module):
+                    continue
+                # Handle PEFT file format where experts.base_layer is the
+                # gate_up_proj and experts is the down_proj
+                if "base_layer" in lora_module:
+                    continue
+                module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper)
+                # Case for expert lora weights
+                if ".experts" in module_name:
+                    expert_idx = module_name.find(".experts")
+                    expert_suffix = module_name[expert_idx + 1 :]
+                    if expert_suffix not in expected_lora_modules:
+                        unexpected_modules.append(module_name)
+
+                elif module_name.rsplit(".", 1)[-1] not in expected_lora_modules:
+                    unexpected_modules.append(module_name)
+
+            if unexpected_modules:
+                raise ValueError(
+                    f"While loading {lora_dir}, expected"
+                    f" target modules in {expected_lora_modules}"
+                    f" but received {unexpected_modules}."
+                    f" Please verify that the loaded LoRA module is correct"
+                )
+
+        if tensorizer_config_dict:
+            from tensorizer import TensorDeserializer
+
+            tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
+            lora_tensor_path = os.path.join(
+                tensorizer_config.tensorizer_dir, "adapter_model.tensors"
+            )
+            tensorizer_args = tensorizer_config._construct_tensorizer_args()
+            tensors = TensorDeserializer(
+                lora_tensor_path,
+                dtype=tensorizer_config.dtype,
+                **tensorizer_args.deserialization_kwargs,
+            )
+            check_unexpected_modules(tensors)
+
+        elif os.path.isfile(lora_tensor_path):
+            # Find unexpected modules.
+            # Use safetensor key as a source of truth to find expected modules.
+            # in peft if you have target_modules A, B, C and C does not exist
+            # in the model it won’t error and model will be trained with A, B
+            # loraified. C won’t exist in the safetensor but it will exist in
+            # the target_modules of the adapter_config.json.
+            unexpected_modules = []
+            with safetensors.safe_open(lora_tensor_path, framework="pt") as f:  # type: ignore
+                # Load tensors if there are only expected modules.
+                check_unexpected_modules(f)
+                for module in f.keys():  # noqa
+                    tensors[module] = f.get_tensor(module)
+        elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path):
+            # When a bin/pt file is provided, we rely on config to find
+            # unexpected modules.
+            unexpected_modules = []
+            target_modules = peft_helper.target_modules
+            if not isinstance(target_modules, list):
+                target_modules = [target_modules]
+            for module in target_modules:
+                # Compatible with more modules,
+                # such as:layers.11.self_attn.k_proj
+                part_name = module.split(".")[-1]
+                if part_name not in expected_lora_modules:
+                    unexpected_modules.append(module)
+            # loaded lora's target modules must be a subset of
+            # expected_lora_modules. It is not reliable. See
+            # https://github.com/vllm-project/vllm/pull/5909. But there's no
+            # other better mechanism.
+            if unexpected_modules and not is_regex_target_modules(
+                peft_helper.target_modules, expected_lora_modules
+            ):
+                raise ValueError(
+                    f"While loading {lora_dir}, expected"
+                    f" target modules in {expected_lora_modules}"
+                    f" but received {unexpected_modules}."
+                    f" Please verify that the loaded LoRA module is correct"
+                )
+            lora_file_path = (
+                lora_bin_file_path
+                if os.path.isfile(lora_bin_file_path)
+                else lora_pt_file_path
+            )
+            tensors = torch.load(lora_file_path, map_location=device, weights_only=True)
+        else:
+            raise ValueError(f"{lora_dir} doesn't contain tensors")
+
+        return cls.from_lora_tensors(
+            lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id,
+            tensors=tensors,
+            peft_helper=peft_helper,
+            device=device,
+            dtype=dtype,
+            model_vocab_size=model_vocab_size,
+            weights_mapper=weights_mapper,
+        )
diff --git a/vllm/lora/models.py b/vllm/lora/model_manager.py
similarity index 74%
rename from vllm/lora/models.py
rename to vllm/lora/model_manager.py
index 567ffce4e..44e0448d9 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/model_manager.py
@@ -2,38 +2,32 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-import os
 from collections.abc import Callable
 from typing import TypeVar
 
 import regex as re
-import safetensors.torch
 import torch
 from torch import nn
 
 from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA, FusedMoE3DWithLoRA, LoRAMapping
+from vllm.lora.lora_model import LoRAModel
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.lora.utils import (
     from_layer,
     from_layer_logits_processor,
     get_supported_lora_modules,
-    is_base_embeddding_weights,
     is_moe_model,
-    is_regex_target_modules,
-    parse_fine_tuned_lora_name,
     process_packed_modules_mapping,
     replace_submodule,
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
+from vllm.model_executor.models.utils import PPMissingLayer
 from vllm.utils.cache import LRUCache
 from vllm.utils.platform_utils import is_pin_memory_available
 
@@ -53,233 +47,6 @@ class AdapterLRUCache(LRUCache[int, T]):
         return super()._on_remove(key, value)
 
 
-_GLOBAL_LORA_ID = 0
-
-
-def get_lora_id():
-    global _GLOBAL_LORA_ID
-    _GLOBAL_LORA_ID += 1
-    return _GLOBAL_LORA_ID
-
-
-class LoRAModel:
-    """A LoRA fine-tuned model."""
-
-    def __init__(
-        self,
-        lora_model_id: int,
-        rank: int,
-        loras: dict[str, LoRALayerWeights],
-    ) -> None:
-        """
-        Args:
-            lora_model_id: The integer id for the lora model.
-            rank: lora rank.
-            loras: module name -> weights for lora-replaced layers.
-
-        """
-        self.id = lora_model_id
-
-        assert lora_model_id > 0, (
-            f"a valid lora id should be greater than 0, got {self.id}"
-        )
-        self.rank = rank
-        self.loras: dict[str, LoRALayerWeights] = loras
-
-    def clone(self, lora_model_id: int) -> "LoRAModel":
-        """Return a copy of the object with different ids.
-
-        Will share the underlying tensors."""
-        return self.__class__(
-            lora_model_id,
-            rank=self.rank,
-            loras=self.loras.copy(),
-        )
-
-    def get_lora(self, module_name: str) -> LoRALayerWeights | None:
-        """Get LoRA for a given module by name"""
-        return self.loras.get(module_name, None)
-
-    def check_lora_name(self, lora_name: str) -> bool:
-        return lora_name in self.loras
-
-    @classmethod
-    def from_lora_tensors(
-        cls,
-        lora_model_id: int,
-        tensors: dict[str, torch.Tensor],
-        peft_helper: PEFTHelper,
-        device: str = "cuda",
-        dtype: torch.dtype | None = None,
-        model_vocab_size: int | None = None,
-        weights_mapper: WeightsMapper | None = None,
-    ) -> "LoRAModel":
-        """Create a LoRAModel from a dictionary of tensors."""
-
-        loras: dict[str, LoRALayerWeights] = {}
-        for tensor_name, tensor in tensors.items():
-            if is_base_embeddding_weights(tensor_name):
-                continue
-            module_name, is_lora_a = parse_fine_tuned_lora_name(
-                tensor_name, weights_mapper
-            )
-            if module_name not in loras:
-                loras[module_name] = LoRALayerWeights.from_config(
-                    module_name, peft_helper
-                )
-
-            if is_lora_a:
-                if (
-                    "lora_embedding_A" in tensor_name
-                    and model_vocab_size is not None
-                    and model_vocab_size != tensor.shape[1]
-                ):
-                    raise RuntimeError(
-                        f"The embedding LoRA size({tensor.shape[1]}) must be consistent"
-                        f" with the base model's vocabulary size({model_vocab_size})."
-                    )
-                loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
-            else:
-                loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
-        return cls(lora_model_id, peft_helper.r, loras)
-
-    @classmethod
-    def from_local_checkpoint(
-        cls,
-        lora_dir: str,
-        expected_lora_modules: set[str],
-        peft_helper: PEFTHelper,
-        *,
-        lora_model_id: int | None = None,
-        device: str = "cuda",
-        dtype: torch.dtype | None = None,
-        model_vocab_size: int | None = None,
-        weights_mapper: WeightsMapper | None = None,
-        tensorizer_config_dict: dict | None = None,
-    ) -> "LoRAModel":
-        """Create a LoRAModel from a local checkpoint.
-
-        Args:
-            lora_dir: The local path that has lora data.
-            expected_lora_modules: Name of modules that are expected to be
-                replaced by lora.
-            peft_helper: Loaded lora configuration information.
-            lora_model_id: LoRA model id. If not given, automatically set by
-                a global counter.
-            device: Device where the lora model is loaded.
-            dtype: dtype of the lora model weights.
-
-        Returns:
-            Loaded LoRA Model.
-        """
-        lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors")
-        lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin")
-        lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt")
-
-        tensors: dict[str, torch.Tensor] = {}
-        unexpected_modules: list[list[str] | str] = []
-
-        def check_unexpected_modules(modules: dict):
-            for lora_module in modules.keys():  # noqa
-                if is_base_embeddding_weights(lora_module):
-                    continue
-                # Handle PEFT file format where experts.base_layer is the
-                # gate_up_proj and experts is the down_proj
-                if "base_layer" in lora_module:
-                    continue
-                module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper)
-                # Case for expert lora weights
-                if ".experts" in module_name:
-                    expert_idx = module_name.find(".experts")
-                    expert_suffix = module_name[expert_idx + 1 :]
-                    if expert_suffix not in expected_lora_modules:
-                        unexpected_modules.append(module_name)
-
-                elif module_name.rsplit(".", 1)[-1] not in expected_lora_modules:
-                    unexpected_modules.append(module_name)
-
-            if unexpected_modules:
-                raise ValueError(
-                    f"While loading {lora_dir}, expected"
-                    f" target modules in {expected_lora_modules}"
-                    f" but received {unexpected_modules}."
-                    f" Please verify that the loaded LoRA module is correct"
-                )
-
-        if tensorizer_config_dict:
-            from tensorizer import TensorDeserializer
-
-            tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
-            lora_tensor_path = os.path.join(
-                tensorizer_config.tensorizer_dir, "adapter_model.tensors"
-            )
-            tensorizer_args = tensorizer_config._construct_tensorizer_args()
-            tensors = TensorDeserializer(
-                lora_tensor_path,
-                dtype=tensorizer_config.dtype,
-                **tensorizer_args.deserialization_kwargs,
-            )
-            check_unexpected_modules(tensors)
-
-        elif os.path.isfile(lora_tensor_path):
-            # Find unexpected modules.
-            # Use safetensor key as a source of truth to find expected modules.
-            # in peft if you have target_modules A, B, C and C does not exist
-            # in the model it won’t error and model will be trained with A, B
-            # loraified. C won’t exist in the safetensor but it will exist in
-            # the target_modules of the adapter_config.json.
-            unexpected_modules = []
-            with safetensors.safe_open(lora_tensor_path, framework="pt") as f:  # type: ignore
-                # Load tensors if there are only expected modules.
-                check_unexpected_modules(f)
-                for module in f.keys():  # noqa
-                    tensors[module] = f.get_tensor(module)
-        elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path):
-            # When a bin/pt file is provided, we rely on config to find
-            # unexpected modules.
-            unexpected_modules = []
-            target_modules = peft_helper.target_modules
-            if not isinstance(target_modules, list):
-                target_modules = [target_modules]
-            for module in target_modules:
-                # Compatible with more modules,
-                # such as:layers.11.self_attn.k_proj
-                part_name = module.split(".")[-1]
-                if part_name not in expected_lora_modules:
-                    unexpected_modules.append(module)
-            # loaded lora's target modules must be a subset of
-            # expected_lora_modules. It is not reliable. See
-            # https://github.com/vllm-project/vllm/pull/5909. But there's no
-            # other better mechanism.
-            if unexpected_modules and not is_regex_target_modules(
-                peft_helper.target_modules, expected_lora_modules
-            ):
-                raise ValueError(
-                    f"While loading {lora_dir}, expected"
-                    f" target modules in {expected_lora_modules}"
-                    f" but received {unexpected_modules}."
-                    f" Please verify that the loaded LoRA module is correct"
-                )
-            lora_file_path = (
-                lora_bin_file_path
-                if os.path.isfile(lora_bin_file_path)
-                else lora_pt_file_path
-            )
-            tensors = torch.load(lora_file_path, map_location=device, weights_only=True)
-        else:
-            raise ValueError(f"{lora_dir} doesn't contain tensors")
-
-        return cls.from_lora_tensors(
-            lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id,
-            tensors=tensors,
-            peft_helper=peft_helper,
-            device=device,
-            dtype=dtype,
-            model_vocab_size=model_vocab_size,
-            weights_mapper=weights_mapper,
-        )
-
-
 class LoRAModelManager:
     """A manager that manages multiple LoRA-fine-tuned models."""
 
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 47484b2b9..4d264c068 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -48,6 +48,15 @@ if TYPE_CHECKING:
 
 logger = init_logger(__name__)
 
+_GLOBAL_LORA_ID = 0
+
+
+def get_lora_id():
+    global _GLOBAL_LORA_ID
+    _GLOBAL_LORA_ID += 1
+    return _GLOBAL_LORA_ID
+
+
 _all_lora_classes: set[type[BaseLayerWithLoRA]] = {
     VocabParallelEmbeddingWithLoRA,
     ColumnParallelLinearWithLoRA,
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 7d77ba724..28c2a53d8 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -8,8 +8,8 @@ import torch
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.lora.models import (
-    LoRAModel,
+from vllm.lora.lora_model import LoRAModel
+from vllm.lora.model_manager import (
     LoRAModelManager,
     LRUCacheLoRAModelManager,
     create_lora_manager,
-- 
GitLab


From 398a596ed249c14b76806900894304a62d653603 Mon Sep 17 00:00:00 2001
From: weiguihua2 <weiguihua2@huawei.com>
Date: Tue, 9 Dec 2025 01:33:48 +0800
Subject: [PATCH 198/258] [MP executor] fix get device count for multi node of
 mp executor feature (#30042)

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
---
 vllm/distributed/device_communicators/shm_broadcast.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 052df19e3..114516ff0 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -27,6 +27,7 @@ from zmq import (  # type: ignore
 import vllm.envs as envs
 from vllm.distributed.utils import StatelessProcessGroup, sched_yield
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils.network_utils import (
     get_ip,
     get_open_port,
@@ -632,7 +633,7 @@ class MessageQueue:
             The MessageQueue instance for the calling process,
             and a list of handles (only non-empty for the reader process).
         """
-        local_size = torch.cuda.device_count()
+        local_size = current_platform.device_count()
         rank = dist.get_rank()
         same_node = rank // local_size == reader_rank // local_size
         buffer_io = MessageQueue(
-- 
GitLab


From fcd5306f65876f72122d7e5852b6000738498d7e Mon Sep 17 00:00:00 2001
From: shaharmor98 <17088876+shaharmor98@users.noreply.github.com>
Date: Mon, 8 Dec 2025 19:35:01 +0200
Subject: [PATCH 199/258] Add latent MoE support (#30203)

Signed-off-by: Shahar Mor <smor@nvidia.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 vllm/model_executor/models/nemotron_h.py      | 62 +++++++++++++++++--
 vllm/transformers_utils/configs/nemotron_h.py |  2 +
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index baeb901bb..2d9dfbd3e 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -83,6 +83,7 @@ class NemotronHMLP(nn.Module):
     def __init__(
         self,
         config: NemotronHConfig,
+        hidden_size: int,
         intermediate_size: int,
         quant_config: QuantizationConfig | None = None,
         bias: bool = False,
@@ -93,7 +94,7 @@ class NemotronHMLP(nn.Module):
         super().__init__()
 
         self.up_proj = ColumnParallelLinear(
-            input_size=config.hidden_size,
+            input_size=hidden_size,
             output_size=intermediate_size,
             bias=bias,
             quant_config=quant_config,
@@ -102,7 +103,7 @@ class NemotronHMLP(nn.Module):
         )
         self.down_proj = RowParallelLinear(
             input_size=intermediate_size,
-            output_size=config.hidden_size,
+            output_size=hidden_size,
             bias=bias,
             quant_config=quant_config,
             reduce_results=reduce_results,
@@ -135,6 +136,10 @@ class NemotronHMoE(nn.Module):
         self.ep_size = self.ep_group.size()
         self.n_routed_experts: int = config.n_routed_experts
         self.n_shared_experts: int = config.n_shared_experts
+        self.use_latent_moe: bool = getattr(config, "moe_latent_size", None) is not None
+        self.moe_hidden_size: int = (
+            config.moe_latent_size if self.use_latent_moe else config.hidden_size
+        )
 
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
@@ -172,6 +177,7 @@ class NemotronHMoE(nn.Module):
 
             self.shared_experts = NemotronHMLP(
                 config=config,
+                hidden_size=config.hidden_size,
                 intermediate_size=intermediate_size,
                 quant_config=quant_config,
                 reduce_results=False,
@@ -180,10 +186,12 @@ class NemotronHMoE(nn.Module):
             )
 
         self.experts = SharedFusedMoE(
-            shared_experts=self.shared_experts,
+            # TODO: make it possible for shared experts to have
+            # different input in SharedFusedMoE
+            shared_experts=self.shared_experts if not self.use_latent_moe else None,
             num_experts=config.n_routed_experts,
             top_k=config.num_experts_per_tok,
-            hidden_size=config.hidden_size,
+            hidden_size=self.moe_hidden_size,
             intermediate_size=config.moe_intermediate_size,
             reduce_results=False,
             renormalize=config.norm_topk_prob,
@@ -201,6 +209,32 @@ class NemotronHMoE(nn.Module):
             is_sequence_parallel=self.is_sequence_parallel,
         )
 
+        if self.use_latent_moe:
+            # TODO: check if using ReplicatedLinear is better than
+            # ColumnParallelLinear + all_gather
+            self.fc1_latent_proj = ColumnParallelLinear(
+                input_size=config.hidden_size,
+                output_size=self.moe_hidden_size,
+                bias=config.mlp_bias,
+                quant_config=quant_config,
+                disable_tp=self.is_sequence_parallel,
+                # We need to gather the output to prepare input for moe
+                gather_output=True,
+                prefix=f"{prefix}.fc1_latent_proj",
+            )
+            self.fc2_latent_proj = ReplicatedLinear(
+                input_size=self.moe_hidden_size,
+                output_size=config.hidden_size,
+                bias=config.mlp_bias,
+                quant_config=quant_config,
+                disable_tp=self.is_sequence_parallel,
+                prefix=f"{prefix}.fc2_latent_proj",
+            )
+
+        else:
+            self.fc1_latent_proj = None
+            self.fc2_latent_proj = None
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
@@ -210,12 +244,20 @@ class NemotronHMoE(nn.Module):
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
+        shared_output = None
+        if self.use_latent_moe:
+            if self.shared_experts is not None:
+                shared_output = self.shared_experts(hidden_states)
+            hidden_states, _ = self.fc1_latent_proj(hidden_states)
 
         fused_moe_out = self.experts(
             hidden_states=hidden_states, router_logits=router_logits
         )
 
-        shared_output, final_hidden_states = fused_moe_out
+        if self.use_latent_moe:
+            _, final_hidden_states = fused_moe_out
+        else:
+            shared_output, final_hidden_states = fused_moe_out
 
         # Fix FP16 overflow
         # See DeepseekV2DecoderLayer for more details.
@@ -225,6 +267,13 @@ class NemotronHMoE(nn.Module):
             assert shared_output is not None
             shared_output *= 1.0 / self.routed_scaling_factor
 
+        # TODO: currently latent up_proj is done before all-reduce for simplicity.
+        #  if and when shared experts will be part of SharedFusedMoE,
+        #  we should do the up_proj after all-reduce,
+        #  to have the all-reduce in the smaller latent dimension.
+        if self.use_latent_moe:
+            final_hidden_states, _ = self.fc2_latent_proj(final_hidden_states)
+
         if self.shared_experts is not None:
             assert shared_output is not None
             final_hidden_states += shared_output
@@ -268,6 +317,7 @@ class NemotronHMLPDecoderLayer(nn.Module):
 
         self.mixer = NemotronHMLP(
             config,
+            hidden_size=config.hidden_size,
             intermediate_size=intermediate_size,
             quant_config=quant_config,
             bias=config.mlp_bias,
@@ -846,5 +896,5 @@ class NemotronHForCausalLM(
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
+        loader = AutoWeightsLoader(self, skip_prefixes=["mtp"])
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py
index 68c400020..86c117fd9 100644
--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -189,6 +189,7 @@ class NemotronHConfig(PretrainedConfig):
         n_shared_experts=1,
         moe_intermediate_size=7688,
         moe_shared_expert_intermediate_size=7688,
+        moe_latent_size=None,
         num_experts_per_tok=2,
         routed_scaling_factor=1.0,
         n_group=1,
@@ -254,6 +255,7 @@ class NemotronHConfig(PretrainedConfig):
         self.n_shared_experts = n_shared_experts
         self.moe_intermediate_size = moe_intermediate_size
         self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size  # noqa: E501
+        self.moe_latent_size = moe_latent_size
         self.num_experts_per_tok = num_experts_per_tok
         self.routed_scaling_factor = routed_scaling_factor
         self.n_group = n_group
-- 
GitLab


From d1b5e7afbf8a4ff45904c191748594167dd34e78 Mon Sep 17 00:00:00 2001
From: Johnny Yang <24908445+jcyang43@users.noreply.github.com>
Date: Mon, 8 Dec 2025 12:10:10 -0800
Subject: [PATCH 200/258] [TPU] Bump tpu-inference to 0.12.0 (#30221)

Signed-off-by: Johnny Yang <johnnyyang@google.com>
---
 requirements/tpu.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index e6fff58f7..7695b4ba2 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -11,5 +11,4 @@ ray[default]
 ray[data]
 setuptools==78.1.0
 nixl==0.3.0
-tpu_info==0.4.0
-tpu-inference==0.11.1
+tpu-inference==0.12.0
-- 
GitLab


From 0d402d2600490bac17bc5d079e89b1136fe37eda Mon Sep 17 00:00:00 2001
From: Vasiliy Kuznetsov <vkuzo@users.noreply.github.com>
Date: Mon, 8 Dec 2025 15:15:10 -0500
Subject: [PATCH 201/258] online fp8 quant with streaming weight
 post-processing (#29196)

Signed-off-by: vasiliy <vasiliy@fb.com>
---
 .../model_executor/layers/quantization/fp8.py | 67 ++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 0e3e13f59..419ddd91b 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -465,6 +465,30 @@ class Fp8LinearMethod(LinearMethodBase):
                 output_size_per_partition, input_size_per_partition, weight_loader
             )
         else:
+
+            def patched_weight_loader(param, loaded_weight, *args, **kwargs):
+                # load the current weight chunk
+                res = weight_loader(param, loaded_weight, *args, **kwargs)  # type: ignore[misc]
+
+                # track how many elements we have updated
+                if not hasattr(layer, "_loaded_numel"):
+                    layer._loaded_numel = 0
+                layer._loaded_numel += loaded_weight.numel()
+
+                # if we have loaded all of the elements, call
+                # process_weights_after_loading
+                target_loaded_numel = layer.weight.numel()
+                if layer._loaded_numel == target_loaded_numel:
+                    self.process_weights_after_loading(layer)
+
+                    # Delete the bookkeeping
+                    del layer._loaded_numel
+                    # Prevent the usual `process_weights_after_loading` call from doing
+                    # anything
+                    layer._already_called_process_weights_after_loading = True
+
+                return res
+
             # For non-serialized checkpoints, use original dtype
             weight = ModelWeightParameter(
                 data=torch.empty(
@@ -474,7 +498,7 @@ class Fp8LinearMethod(LinearMethodBase):
                 ),
                 input_dim=1,
                 output_dim=0,
-                weight_loader=weight_loader,
+                weight_loader=patched_weight_loader,
             )
         layer.register_parameter("weight", weight)
 
@@ -515,6 +539,9 @@ class Fp8LinearMethod(LinearMethodBase):
                 layer.register_parameter("input_scale", None)
 
     def process_weights_after_loading(self, layer: Module) -> None:
+        if getattr(layer, "_already_called_process_weights_after_loading", False):
+            return
+
         size_k_first = True
         input_scale = None
         # TODO(rob): refactor block quant into separate class.
@@ -738,6 +765,41 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     f"weight quantization block_k = {block_k}."
                 )
 
+        # if we are doing online quantization, patch the weight
+        # loaded to call `process_weights_after_loading` in a streaming fashion
+        # as soon as the last weight chunk is loaded
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            weight_loader = extra_weight_attrs["weight_loader"]
+            # create a new holder to prevent modifying behavior of any other
+            # objects which might depend on the old one
+            new_extra_weight_attrs = extra_weight_attrs
+
+            def patched_weight_loader(param, loaded_weight, *args, **kwargs):
+                # load the current weight chunk
+                res = weight_loader(param, loaded_weight, *args, **kwargs)  # type: ignore[misc]
+
+                # add a counter to track how many elements we have updated
+                if not hasattr(layer, "_loaded_numel"):
+                    layer._loaded_numel = 0
+                layer._loaded_numel += loaded_weight.numel()
+
+                # if we have loaded all of the elements, call
+                # process_weights_after_loading
+                target_loaded_numel = layer.w13_weight.numel() + layer.w2_weight.numel()
+                if layer._loaded_numel == target_loaded_numel:
+                    self.process_weights_after_loading(layer)
+
+                    # Delete the bookkeeping
+                    del layer._loaded_numel
+                    # Prevent the usual `process_weights_after_loading` call
+                    # from doing anything
+                    layer._already_called_process_weights_after_loading = True
+
+                return res
+
+            new_extra_weight_attrs["weight_loader"] = patched_weight_loader
+            extra_weight_attrs = new_extra_weight_attrs
+
         # WEIGHTS
         w13_weight = torch.nn.Parameter(
             torch.empty(
@@ -839,6 +901,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         self.rocm_aiter_moe_enabled = False
 
     def process_weights_after_loading(self, layer: Module) -> None:
+        if getattr(layer, "_already_called_process_weights_after_loading", False):
+            return
+
         # Lazy import to avoid importing triton too early.
 
         self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
-- 
GitLab


From 799804d140fc99ce3964648ba91aaa810cf28fef Mon Sep 17 00:00:00 2001
From: Dmitry Tokarev <dtokarev@nvidia.com>
Date: Mon, 8 Dec 2025 15:24:34 -0500
Subject: [PATCH 202/258] Bump nvshmem to 3.3.24 and fix CUDA 13 installation
 (#30149)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 tools/ep_kernels/install_python_libraries.sh | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index 88be5cd77..1bb7fd834 100755
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -10,9 +10,10 @@ set -ex
 CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
 PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"}
 DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"}
-NVSHMEM_VER=3.3.9
+NVSHMEM_VER=3.3.24  # Suppports both CUDA 12 and 13
 WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace}
 MODE=${MODE:-install}
+CUDA_VERSION_MAJOR=$(${CUDA_HOME}/bin/nvcc --version | egrep -o "release [0-9]+" | cut -d ' ' -f 2)
 
 # Parse arguments
 while [[ $# -gt 0 ]]; do
@@ -75,11 +76,9 @@ ARCH=$(uname -m)
 case "${ARCH,,}" in
   x86_64|amd64)
     NVSHMEM_SUBDIR="linux-x86_64"
-    NVSHMEM_FILE="libnvshmem-linux-x86_64-${NVSHMEM_VER}_cuda12-archive.tar.xz"
     ;;
   aarch64|arm64)
     NVSHMEM_SUBDIR="linux-sbsa"
-    NVSHMEM_FILE="libnvshmem-linux-sbsa-${NVSHMEM_VER}_cuda12-archive.tar.xz"
     ;;
   *)
     echo "Unsupported architecture: ${ARCH}" >&2
@@ -87,6 +86,7 @@ case "${ARCH,,}" in
     ;;
 esac
 
+NVSHMEM_FILE="libnvshmem-${NVSHMEM_SUBDIR}-${NVSHMEM_VER}_cuda${CUDA_VERSION_MAJOR}-archive.tar.xz"
 NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}"
 
 pushd "$WORKSPACE"
@@ -142,13 +142,6 @@ clone_repo() {
     fi
 }
 
-deepep_cuda13_patch() {
-    cuda_version_major=$(${CUDA_HOME}/bin/nvcc --version | egrep -o "release [0-9]+" | cut -d ' ' -f 2)
-    if [ ${cuda_version_major} -ge 13 ]; then
-        sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py"
-    fi
-}
-
 do_build() {
     local repo=$1
     local name=$2
@@ -160,8 +153,9 @@ do_build() {
     clone_repo "$repo" "$name" "$key" "$commit"
     cd "$name"
 
-    if [ "$name" == "DeepEP" ]; then
-        deepep_cuda13_patch
+    # DeepEP CUDA 13 patch
+    if [[ "$name" == "DeepEP" && "${CUDA_VERSION_MAJOR}" -ge 13 ]]; then
+        sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py"
     fi
 
     if [ "$MODE" = "install" ]; then
-- 
GitLab


From ae0f69b16aaa54197257267a8262555beb7d5ae3 Mon Sep 17 00:00:00 2001
From: roikoren755 <26850796+roikoren755@users.noreply.github.com>
Date: Mon, 8 Dec 2025 23:45:18 +0200
Subject: [PATCH 203/258] Add SpecDec support to `selective_state_update`
 (#29488)

Signed-off-by: Roi Koren <roik@nvidia.com>
---
 tests/kernels/mamba/test_mamba_ssm.py         | 325 ++++++++++++++++++
 .../layers/mamba/ops/mamba_ssm.py             | 256 ++++++++++----
 2 files changed, 507 insertions(+), 74 deletions(-)

diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py
index 98edc9599..50e48aad6 100644
--- a/tests/kernels/mamba/test_mamba_ssm.py
+++ b/tests/kernels/mamba/test_mamba_ssm.py
@@ -425,6 +425,80 @@ def test_selective_state_update(dim, dstate, has_z, itype):
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
 
 
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("has_z", [False, True])
+@pytest.mark.parametrize("dstate", [16, 64])
+@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
+@pytest.mark.parametrize("max_seq_len", [1, 2, 4])
+def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len):
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
+    if itype == torch.bfloat16:
+        rtol, atol = 5e-2, 1.5e-1
+        if torch.version.hip:
+            atol *= 2
+    # set seed
+    current_platform.seed_everything(0)
+    batch_size = 4
+    token_counts = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
+    total_tokens = int(token_counts.sum().item())
+    cu_seqlens = torch.tensor(
+        [0] + torch.cumsum(token_counts, dim=0).tolist(),
+        dtype=torch.int32,
+        device=device,
+    )
+    state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device)
+    x = torch.randn(total_tokens, dim, device=device, dtype=itype)
+    out = torch.empty_like(x)
+    dt = torch.randn(total_tokens, dim, device=device, dtype=itype)
+    dt_bias = torch.rand(dim, device=device) - 4.0
+    A = -torch.rand(dim, dstate, device=device) - 1.0
+    B = torch.randn(total_tokens, dstate, device=device)
+    C = torch.randn(total_tokens, dstate, device=device)
+    D = torch.randn(dim, device=device)
+    z = torch.randn_like(x) if has_z else None
+    state_ref = state.detach().clone()
+    selective_state_update(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=True,
+        out=out,
+        cu_seqlens=cu_seqlens,
+    )
+
+    out_ref_list = []
+    for seq_idx in range(batch_size):
+        start_idx = cu_seqlens[seq_idx].item()
+        end_idx = cu_seqlens[seq_idx + 1].item()
+        num_tokens = end_idx - start_idx
+        for token_idx in range(num_tokens):
+            idx = start_idx + token_idx
+            out_ref_list.append(
+                selective_state_update_ref(
+                    state_ref[seq_idx : seq_idx + 1],
+                    x[idx : idx + 1],
+                    dt[idx : idx + 1],
+                    A,
+                    B[idx : idx + 1],
+                    C[idx : idx + 1],
+                    D=D,
+                    z=z[idx : idx + 1] if has_z else None,
+                    dt_bias=dt_bias,
+                    dt_softplus=True,
+                )
+            )
+    out_ref = torch.cat(out_ref_list, dim=0)
+    assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
+
+
 @pytest.mark.parametrize("wtype", [torch.float32])
 @pytest.mark.parametrize("itype", [torch.float32])
 @pytest.mark.parametrize("seqlen", [1, 256, 1024, 4096])
@@ -766,3 +840,254 @@ def test_selective_state_update_with_heads_with_batch_indices(
     print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
     assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol)
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("has_z", [False, True])
+@pytest.mark.parametrize("dstate", [16, 64])
+@pytest.mark.parametrize("dim", [2048, 4096])
+@pytest.mark.parametrize("max_seq_len", [2, 4])
+def test_selective_state_update_with_num_accepted_tokens(
+    dim, dstate, has_z, itype, max_seq_len
+):
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
+    if itype == torch.bfloat16:
+        rtol, atol = 5e-2, 1.5e-1
+        if torch.version.hip:
+            atol *= 2
+
+    current_platform.seed_everything(0)
+    batch_size = 4
+
+    tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
+    total_tokens = int(tokens_per_seq.sum().item())
+
+    num_accepted_tokens = torch.randint(0, max_seq_len, (batch_size,), device=device)
+    num_accepted_tokens[0] = 0  # Add edge-case of no accepted tokens
+    num_accepted_tokens[1] = max_seq_len  # Add edge-case of all tokens accepted
+
+    cu_seqlens = torch.tensor(
+        [0] + torch.cumsum(tokens_per_seq, dim=0).tolist(),
+        dtype=torch.int32,
+        device=device,
+    )
+
+    total_state_slots = 50
+    state = torch.randn(total_state_slots, dim, dstate, dtype=itype, device=device)
+
+    state_batch_indices = torch.full(
+        (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device
+    )
+    initial_state_slots = torch.randint(
+        0, 15, (batch_size,), device=device, dtype=torch.int32
+    )
+    for seq_idx in range(batch_size):
+        token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0)
+        state_batch_indices[seq_idx, token_pos] = initial_state_slots[seq_idx]
+
+    dst_state_batch_indices = torch.full(
+        (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device
+    )
+    slot_offset = 15
+    dst_slots_map = {}
+    for seq_idx in range(batch_size):
+        for token_idx in range(tokens_per_seq[seq_idx].item()):
+            dst_state_batch_indices[seq_idx, token_idx] = slot_offset
+            dst_slots_map[(seq_idx, token_idx)] = slot_offset
+            slot_offset += 1
+
+    x = torch.randn(total_tokens, dim, device=device, dtype=itype)
+    out = torch.empty_like(x)
+    dt = torch.randn(total_tokens, dim, device=device, dtype=itype)
+    dt_bias = torch.rand(dim, device=device) - 4.0
+    A = -torch.rand(dim, dstate, device=device) - 1.0
+    B = torch.randn(total_tokens, dstate, device=device)
+    C = torch.randn(total_tokens, dstate, device=device)
+    D = torch.randn(dim, device=device)
+    z = torch.randn_like(x) if has_z else None
+
+    state_ref_intermediate = {}
+    out_ref_list = []
+
+    for seq_idx in range(batch_size):
+        seq_start = cu_seqlens[seq_idx].item()
+        seq_end = cu_seqlens[seq_idx + 1].item()
+        num_tokens = seq_end - seq_start
+
+        token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0)
+        initial_slot = state_batch_indices[seq_idx, token_pos].item()
+        state_seq = state[initial_slot : initial_slot + 1].clone()
+
+        for token_idx in range(num_tokens):
+            global_idx = seq_start + token_idx
+
+            out_token = selective_state_update_ref(
+                state_seq,
+                x[global_idx : global_idx + 1],
+                dt[global_idx : global_idx + 1],
+                A,
+                B[global_idx : global_idx + 1],
+                C[global_idx : global_idx + 1],
+                D=D,
+                z=z[global_idx : global_idx + 1] if has_z else None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            out_ref_list.append(out_token)
+            state_ref_intermediate[(seq_idx, token_idx)] = state_seq.clone()
+
+    out_ref = torch.cat(out_ref_list, dim=0)
+
+    selective_state_update(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=True,
+        out=out,
+        cu_seqlens=cu_seqlens,
+        state_batch_indices=state_batch_indices,
+        dst_state_batch_indices=dst_state_batch_indices,
+        num_accepted_tokens=num_accepted_tokens,
+        pad_slot_id=PAD_SLOT_ID,
+    )
+
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
+
+    for seq_idx in range(batch_size):
+        num_tokens = tokens_per_seq[seq_idx].item()
+        for token_idx in range(num_tokens):
+            dst_slot = dst_slots_map[(seq_idx, token_idx)]
+            state_ref = state_ref_intermediate[(seq_idx, token_idx)].squeeze(0)
+            assert torch.allclose(state[dst_slot], state_ref, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("has_z", [False, True])
+@pytest.mark.parametrize("dstate", [16, 64])
+@pytest.mark.parametrize("dim", [2048, 4096])
+@pytest.mark.parametrize("max_seq_len", [2, 4])
+def test_selective_state_update_varlen_with_num_accepted(
+    dim, dstate, has_z, itype, max_seq_len
+):
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
+    if itype == torch.bfloat16:
+        rtol, atol = 5e-2, 1.5e-1
+        if torch.version.hip:
+            atol *= 2
+
+    current_platform.seed_everything(0)
+    batch_size = 4
+
+    tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device)
+    total_tokens = int(tokens_per_seq.sum().item())
+
+    num_accepted_tokens = torch.randint(0, max_seq_len, (batch_size,), device=device)
+    num_accepted_tokens[0] = 0  # Add edge-case of no accepted tokens
+    num_accepted_tokens[1] = max_seq_len  # Add edge-case of all tokens accepted
+
+    cu_seqlens = torch.tensor(
+        [0] + torch.cumsum(tokens_per_seq, dim=0).tolist(),
+        dtype=torch.int32,
+        device=device,
+    )
+
+    total_state_slots = 50
+    state = torch.randn(total_state_slots, dim, dstate, dtype=itype, device=device)
+
+    state_batch_indices = torch.full(
+        (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device
+    )
+
+    initial_state_slots = torch.randint(
+        0, 15, (batch_size,), device=device, dtype=torch.int32
+    )
+    for seq_idx in range(batch_size):
+        token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0)
+        state_batch_indices[seq_idx, token_pos] = initial_state_slots[seq_idx]
+
+    dst_state_batch_indices = torch.full(
+        (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device
+    )
+
+    slot_offset = 15
+    dst_slots_map = {}
+    for seq_idx in range(batch_size):
+        for token_idx in range(tokens_per_seq[seq_idx].item()):
+            dst_state_batch_indices[seq_idx, token_idx] = slot_offset
+            dst_slots_map[(seq_idx, token_idx)] = slot_offset
+            slot_offset += 1
+
+    x = torch.randn(total_tokens, dim, device=device, dtype=itype)
+    out = torch.empty_like(x)
+    dt = torch.randn(total_tokens, dim, device=device, dtype=itype)
+    dt_bias = torch.rand(dim, device=device) - 4.0
+    A = -torch.rand(dim, dstate, device=device) - 1.0
+    B = torch.randn(total_tokens, dstate, device=device)
+    C = torch.randn(total_tokens, dstate, device=device)
+    D = torch.randn(dim, device=device)
+    z = torch.randn_like(x) if has_z else None
+
+    state_ref_intermediate = {}
+
+    for seq_idx in range(batch_size):
+        seq_start = cu_seqlens[seq_idx].item()
+        seq_end = cu_seqlens[seq_idx + 1].item()
+        num_tokens = seq_end - seq_start
+
+        token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0)
+        initial_slot = state_batch_indices[seq_idx, token_pos].item()
+        state_seq = state[initial_slot : initial_slot + 1].clone()
+
+        for token_idx in range(num_tokens):
+            global_idx = seq_start + token_idx
+
+            selective_state_update_ref(
+                state_seq,
+                x[global_idx : global_idx + 1],
+                dt[global_idx : global_idx + 1],
+                A,
+                B[global_idx : global_idx + 1],
+                C[global_idx : global_idx + 1],
+                D=D,
+                z=z[global_idx : global_idx + 1] if has_z else None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+
+            state_ref_intermediate[(seq_idx, token_idx)] = state_seq.clone()
+
+    selective_state_update(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=True,
+        out=out,
+        cu_seqlens=cu_seqlens,
+        state_batch_indices=state_batch_indices,
+        dst_state_batch_indices=dst_state_batch_indices,
+        num_accepted_tokens=num_accepted_tokens,
+        pad_slot_id=PAD_SLOT_ID,
+    )
+
+    for seq_idx in range(batch_size):
+        num_tokens = tokens_per_seq[seq_idx].item()
+
+        for token_idx in range(num_tokens):
+            dst_slot = dst_slots_map[(seq_idx, token_idx)]
+            state_ref = state_ref_intermediate[(seq_idx, token_idx)].squeeze(0)
+
+            assert torch.allclose(state[dst_slot], state_ref, rtol=rtol, atol=atol)
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index 53fd5d545..800f8bd84 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -36,10 +36,14 @@ else:
         is not None
     }
 )
+@triton.heuristics(
+    {"IS_SPEC_DECODING": lambda args: args["num_accepted_tokens_ptr"] is not None}
+)
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens_ptr"] is not None})
 @triton.heuristics(
     {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
 )
-@triton.jit
+@triton.jit(do_not_specialize=["N"])
 def _selective_scan_update_kernel(
     # Pointers to matrices
     state_ptr,
@@ -55,8 +59,10 @@ def _selective_scan_update_kernel(
     state_batch_indices_ptr,
     dst_state_batch_indices_ptr,
     pad_slot_id,
+    num_accepted_tokens_ptr,
+    cu_seqlens_ptr,
     # Matrix dimensions
-    batch,
+    N,
     nheads,
     dim,
     dstate,
@@ -91,6 +97,10 @@ def _selective_scan_update_kernel(
     stride_out_batch,
     stride_out_head,
     stride_out_dim,
+    stride_state_indices_batch,
+    stride_state_indices_T,
+    stride_dst_state_indices_batch,
+    stride_dst_state_indices_T,
     # Meta-parameters
     DT_SOFTPLUS: tl.constexpr,
     TIE_HDIM: tl.constexpr,
@@ -99,22 +109,50 @@ def _selective_scan_update_kernel(
     HAS_D: tl.constexpr,
     HAS_Z: tl.constexpr,
     HAS_STATE_BATCH_INDICES: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
     BLOCK_SIZE_DSTATE: tl.constexpr,
 ):
     pid_m = tl.program_id(axis=0)
     pid_b = tl.program_id(axis=1)
     pid_h = tl.program_id(axis=2)
 
+    if IS_VARLEN:
+        bos = tl.load(cu_seqlens_ptr + pid_b).to(tl.int64)
+        eos = tl.load(cu_seqlens_ptr + pid_b + 1).to(tl.int64)
+        seq_len = eos - bos
+
+        if seq_len == 0:
+            return
+    else:
+        bos = pid_b
+        seq_len = 1
+
+    state_ptr_base = state_ptr
+
     # If HAS_STATE_BATCH_INDICES is true, then the ssm state's batch coordinate
     # is taken from the state_batch_indices_ptr Otherwise, the state coordinate
     # is the same as the batch id.
     if HAS_STATE_BATCH_INDICES:
-        dst_state_batch_indices_ptr += pid_b
-        dst_state_batch_idx = tl.load(dst_state_batch_indices_ptr).to(tl.int64)
-        dst_state_ptr = state_ptr + (
-            dst_state_batch_idx * stride_state_batch + pid_h * stride_state_head
+        if IS_SPEC_DECODING:
+            num_accepted = tl.load(num_accepted_tokens_ptr + pid_b).to(tl.int64)
+            init_token_idx = tl.maximum(num_accepted - 1, 0)
+        else:
+            init_token_idx = 0
+
+        dst_state_batch_indices_ptr += pid_b * stride_dst_state_indices_batch
+        if not IS_SPEC_DECODING:
+            dst_state_batch_idx = tl.load(
+                dst_state_batch_indices_ptr
+                + init_token_idx * stride_dst_state_indices_T
+            ).to(tl.int64)
+            dst_state_ptr = state_ptr + (
+                dst_state_batch_idx * stride_state_batch + pid_h * stride_state_head
+            )
+
+        state_batch_indices_ptr += (
+            pid_b * stride_state_indices_batch + init_token_idx * stride_state_indices_T
         )
-        state_batch_indices_ptr += pid_b
         state_batch_idx = tl.load(state_batch_indices_ptr).to(tl.int64)
         state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
     else:
@@ -123,86 +161,112 @@ def _selective_scan_update_kernel(
         )
         state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
 
-    x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
-    dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
+    x_ptr += bos * stride_x_batch + pid_h * stride_x_head
+    dt_ptr += bos * stride_dt_batch + pid_h * stride_dt_head
     if HAS_DT_BIAS:
         dt_bias_ptr += pid_h * stride_dt_bias_head
     A_ptr += pid_h * stride_A_head
-    B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
-    C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
+    B_ptr += bos * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
+    C_ptr += bos * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
     if HAS_Z:
-        z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
-    out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
+        z_ptr += bos * stride_z_batch + pid_h * stride_z_head
+    out_ptr += bos * stride_out_batch + pid_h * stride_out_head
 
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
     state_ptrs = state_ptr + (
         offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
     )
-    dst_state_ptrs = dst_state_ptr + (
-        offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
-    )
-    x_ptrs = x_ptr + offs_m * stride_x_dim
-    dt_ptrs = dt_ptr + offs_m * stride_dt_dim
+    if not IS_SPEC_DECODING:
+        dst_state_ptrs = dst_state_ptr + (
+            offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
+        )
+
+    mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
+    if HAS_STATE_BATCH_INDICES:
+        mask &= state_batch_idx != pad_slot_id
+    state = tl.load(state_ptrs, mask=mask, other=0.0).to(tl.float32)
+
     if HAS_DT_BIAS:
         dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
     if HAS_D:
         D_ptr += pid_h * stride_D_head
-    A_ptrs = A_ptr + (
-        offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
-    )
-    B_ptrs = B_ptr + offs_n * stride_B_dstate
-    C_ptrs = C_ptr + offs_n * stride_C_dstate
-    if HAS_D:
         D_ptrs = D_ptr + offs_m * stride_D_dim
-    if HAS_Z:
-        z_ptrs = z_ptr + offs_m * stride_z_dim
-    out_ptrs = out_ptr + offs_m * stride_out_dim
-    mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
-    if HAS_STATE_BATCH_INDICES:
-        mask &= state_batch_idx != pad_slot_id
-    state = tl.load(state_ptrs, mask=mask, other=0.0)
-
-    x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
-    if not TIE_HDIM:
-        dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
-        if HAS_DT_BIAS:
-            dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
-        if DT_SOFTPLUS:
-            dt = softplus(dt)
-        A = tl.load(
-            A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
-        ).to(tl.float32)
-        dA = tl.exp(A * dt[:, None])
-    else:
-        dt = tl.load(dt_ptr).to(tl.float32)
-        if HAS_DT_BIAS:
-            dt += tl.load(dt_bias_ptr).to(tl.float32)
-        if DT_SOFTPLUS:
-            dt = softplus(dt)
-        A = tl.load(A_ptr).to(tl.float32)
-        dA = tl.exp(A * dt)  # scalar, not a matrix
-
-    B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
-    C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
-    if HAS_D:
-        D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
-    if HAS_Z:
-        z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+    A_ptrs = A_ptr + offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
 
-    dB = B[None, :] * dt[:, None] if not TIE_HDIM else B * dt
-    state = state * dA + dB * x[:, None]
+    for i_t in range(seq_len):
+        x_ptrs = x_ptr + offs_m * stride_x_dim
+        dt_ptrs = dt_ptr + offs_m * stride_dt_dim
+        B_ptrs = B_ptr + offs_n * stride_B_dstate
+        C_ptrs = C_ptr + offs_n * stride_C_dstate
+        if HAS_Z:
+            z_ptrs = z_ptr + offs_m * stride_z_dim
+        out_ptrs = out_ptr + offs_m * stride_out_dim
 
-    mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
-    if HAS_STATE_BATCH_INDICES:
-        mask &= state_batch_idx != pad_slot_id
-    tl.store(dst_state_ptrs, state, mask=mask)
-    out = tl.sum(state * C[None, :], axis=1)
-    if HAS_D:
-        out += x * D
-    if HAS_Z:
-        out *= z * tl.sigmoid(z)
-    tl.store(out_ptrs, out, mask=offs_m < dim)
+        x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+        if not TIE_HDIM:
+            dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+            if HAS_DT_BIAS:
+                dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+            if DT_SOFTPLUS:
+                dt = softplus(dt)
+            A = tl.load(
+                A_ptrs,
+                mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate),
+                other=0.0,
+            ).to(tl.float32)
+            dA = tl.exp(A * dt[:, None])
+        else:
+            dt = tl.load(dt_ptr).to(tl.float32)
+            if HAS_DT_BIAS:
+                dt += tl.load(dt_bias_ptr).to(tl.float32)
+            if DT_SOFTPLUS:
+                dt = softplus(dt)
+            A = tl.load(A_ptr).to(tl.float32)
+            dA = tl.exp(A * dt)  # scalar, not a matrix
+
+        B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
+        C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
+        if HAS_D:
+            D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+        if HAS_Z:
+            z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+
+        dB = B[None, :] * dt[:, None] if not TIE_HDIM else B * dt
+        state = state * dA + dB * x[:, None]
+
+        if IS_SPEC_DECODING:
+            dst_idx_ptr = dst_state_batch_indices_ptr + i_t * stride_dst_state_indices_T
+            token_dst_idx = tl.load(dst_idx_ptr).to(tl.int64)
+            if token_dst_idx != pad_slot_id:
+                token_dst_ptrs = (
+                    state_ptr_base
+                    + token_dst_idx * stride_state_batch
+                    + pid_h * stride_state_head
+                    + offs_m[:, None] * stride_state_dim
+                    + offs_n[None, :] * stride_state_dstate
+                )
+                tl.store(
+                    token_dst_ptrs, state.to(token_dst_ptrs.dtype.element_ty), mask=mask
+                )
+
+        out = tl.sum(state * C[None, :], axis=1)
+        if HAS_D:
+            out += x * D
+        if HAS_Z:
+            out *= z * tl.sigmoid(z)
+        tl.store(out_ptrs, out, mask=offs_m < dim)
+
+        x_ptr += stride_x_batch
+        dt_ptr += stride_dt_batch
+        B_ptr += stride_B_batch
+        C_ptr += stride_C_batch
+        out_ptr += stride_out_batch
+        if HAS_Z:
+            z_ptr += stride_z_batch
+
+    if not IS_SPEC_DECODING:
+        tl.store(dst_state_ptrs, state.to(dst_state_ptrs.dtype.element_ty), mask=mask)
 
 
 def selective_state_update(
@@ -220,6 +284,8 @@ def selective_state_update(
     dst_state_batch_indices=None,
     pad_slot_id=PAD_SLOT_ID,
     out=None,
+    num_accepted_tokens=None,
+    cu_seqlens=None,
 ):
     """
     Argument:
@@ -240,6 +306,11 @@ def selective_state_update(
             indices 0 and 3
         out: Preallocated ssm output tensor. Assume same shape as x.
              In-place updated.
+        num_accepted_tokens: (batch,)
+            number of accepted tokens from previous verification step,
+            tells the kernel which initial state to use
+        cu_seqlens: (batch,)
+            length per sequence, for variable length in speculative decoding cases
     """
     if state.dim() == 3:
         state = state.unsqueeze(1)
@@ -261,9 +332,26 @@ def selective_state_update(
         dt_bias = dt_bias.unsqueeze(0)
     if out.dim() == 2:
         out = out.unsqueeze(1)
+    if num_accepted_tokens is not None:
+        assert state_batch_indices is not None and state_batch_indices.dim() == 2
+        assert dst_state_batch_indices is None or dst_state_batch_indices.dim() == 2
+    if state_batch_indices is not None and state_batch_indices.dim() == 1:
+        state_batch_indices = state_batch_indices.unsqueeze(1)
+    if dst_state_batch_indices is not None and dst_state_batch_indices.dim() == 1:
+        dst_state_batch_indices = dst_state_batch_indices.unsqueeze(1)
 
     _, nheads, dim, dstate = state.shape
     batch = x.shape[0]
+    if cu_seqlens is not None:
+        N = len(cu_seqlens) - 1
+        # Only used to verify the shape of
+        # state_batch_indices and dst_state_batch_indices
+        max_seqlen = (
+            state_batch_indices.size(-1) if state_batch_indices is not None else 1
+        )
+    else:
+        N = batch
+        max_seqlen = 1
 
     assert x.shape == (batch, nheads, dim)
     assert dt.shape == x.shape
@@ -279,16 +367,30 @@ def selective_state_update(
     if dt_bias is not None:
         assert dt_bias.shape == (nheads, dim)
     if state_batch_indices is not None:
-        assert state_batch_indices.shape == (batch,)
+        assert state_batch_indices.shape[0] >= N
+        assert state_batch_indices.shape[1] >= max_seqlen
     if dst_state_batch_indices is not None:
-        assert dst_state_batch_indices.shape == (batch,)
+        assert dst_state_batch_indices.shape[0] >= N
+        assert dst_state_batch_indices.shape[1] >= max_seqlen
     else:
         # revert to the default behavior of in-place state updates
         dst_state_batch_indices = state_batch_indices
     assert out.shape == x.shape
+    if num_accepted_tokens is not None:
+        assert num_accepted_tokens.shape == (N,)
 
-    grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)
+    grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), N, nheads)
     z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
+    state_batch_indices_strides = (
+        (state_batch_indices.stride(0), state_batch_indices.stride(1))
+        if state_batch_indices is not None
+        else (0, 0)
+    )
+    dst_state_batch_indices_strides = (
+        (dst_state_batch_indices.stride(0), dst_state_batch_indices.stride(1))
+        if dst_state_batch_indices is not None
+        else (0, 0)
+    )
     # We don't want autotune since it will overwrite the state
     # We instead tune by hand.
     BLOCK_SIZE_M, num_warps = (
@@ -321,7 +423,9 @@ def selective_state_update(
             state_batch_indices,
             dst_state_batch_indices,
             pad_slot_id,
-            batch,
+            num_accepted_tokens,
+            cu_seqlens,
+            N,
             nheads,
             dim,
             dstate,
@@ -353,6 +457,10 @@ def selective_state_update(
             out.stride(0),
             out.stride(1),
             out.stride(2),
+            state_batch_indices_strides[0],
+            state_batch_indices_strides[1],
+            dst_state_batch_indices_strides[0],
+            dst_state_batch_indices_strides[1],
             dt_softplus,
             tie_hdim,
             BLOCK_SIZE_M,
-- 
GitLab


From 6af70e11a0a3cb7109ce904e23ff90c73f573ef0 Mon Sep 17 00:00:00 2001
From: Charlie Fu <charlifu@amd.com>
Date: Mon, 8 Dec 2025 15:58:30 -0600
Subject: [PATCH 204/258] [ROCm][CI] Fix test_max_len.py for Rocm (#29916)

Signed-off-by: charlifu <charlifu@amd.com>
Signed-off-by: Charlie Fu <Charlie.Fu@amd.com>
---
 tests/basic_correctness/test_basic_correctness.py | 5 ++++-
 tests/utils.py                                    | 4 ++--
 tests/v1/e2e/test_spec_decode.py                  | 4 ++--
 tests/v1/spec_decode/test_eagle.py                | 8 ++++++--
 tests/v1/spec_decode/test_max_len.py              | 2 +-
 5 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 521d6c33d..9e1cc309e 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -13,12 +13,15 @@ import pytest
 import torch
 
 from vllm import LLM
+from vllm.platforms import current_platform
 from vllm.v1.engine.llm_engine import LLMEngine
 
 from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 
+ATTN_BACKEND = ["ROCM_ATTN"] if current_platform.is_rocm() else ["FLASH_ATTN"]
+
 MODELS = [
     "hmellor/tiny-random-Gemma2ForCausalLM",
     "meta-llama/Llama-3.2-1B-Instruct",
@@ -57,7 +60,7 @@ def _fix_prompt_embed_outputs(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
+@pytest.mark.parametrize("backend", ATTN_BACKEND)
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("async_scheduling", [True, False])
diff --git a/tests/utils.py b/tests/utils.py
index 539f67c47..ea3675b14 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1225,9 +1225,9 @@ def get_attn_backend_list_based_on_platform() -> list[str]:
         try:
             import aiter  # noqa: F401
 
-            attn_backend_list.append("FLASH_ATTN")
+            attn_backend_list.append("ROCM_AITER_FA")
         except Exception:
-            print("Skip FLASH_ATTN on ROCm as aiter is not installed")
+            print("Skip ROCM_AITER_FA on ROCm as aiter is not installed")
 
         return attn_backend_list
     elif current_platform.is_xpu():
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 575a6a151..416b582df 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -417,9 +417,9 @@ def test_eagle_correctness(
                 "multi-token eagle spec decode on current platform"
             )
 
-        if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
+        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
             if "deepseek" in model_setup[1].lower():
-                pytest.skip("FLASH_ATTN for deepseek not supported on ROCm platform")
+                pytest.skip("ROCM_AITER_FA for deepseek not supported on ROCm platform")
             else:
                 m.setenv("VLLM_ROCM_USE_AITER", "1")
 
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 616e57de3..55e9b4d06 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -339,7 +339,7 @@ def test_load_model(
             "multi-token eagle spec decode on current platform"
         )
 
-    if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
+    if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
     # Setup draft model mock
@@ -434,7 +434,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
             "because it requires special input mocking."
         )
 
-    if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
+    if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
     # Use GPU device
@@ -541,6 +541,10 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
         attn_metadata_builder_cls, _ = try_get_attention_backend(
             AttentionBackendEnum.TREE_ATTN
         )
+    elif attn_backend == "ROCM_AITER_FA":
+        attn_metadata_builder_cls, _ = try_get_attention_backend(
+            AttentionBackendEnum.ROCM_AITER_FA
+        )
     else:
         raise ValueError(f"Unsupported attention backend: {attn_backend}")
 
diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index fa1d0437f..81da8609a 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -47,7 +47,7 @@ def test_eagle_max_len(
                 "multi-token eagle spec decode on current platform"
             )
 
-        if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
+        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
             m.setenv("VLLM_ROCM_USE_AITER", "1")
 
         llm = LLM(
-- 
GitLab


From 1fb632fdb6b6391cd1fe3146d05cc2858c2446ab Mon Sep 17 00:00:00 2001
From: Lain <siyuanf@nvidia.com>
Date: Mon, 8 Dec 2025 15:02:34 -0800
Subject: [PATCH 205/258] [Perf] Improve fp8 quant in mla; replace ReduceSum
 with ReduceScatterSum (#29795)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 .../device_communicators/cuda_communicator.py |  2 +-
 vllm/v1/attention/backends/mla/common.py      | 33 ++++++++++++-------
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 2e878eef9..cd9c267be 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -225,7 +225,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
             output_shape, dtype=input_tensor.dtype, device=input_tensor.device
         )
 
-        if sizes is not None:
+        if sizes is not None and sizes.count(sizes[0]) != len(sizes):
             pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes)
         else:
             pynccl_comm.reduce_scatter(output, input_tensor)
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 309ddee4f..0a5257a1d 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -2037,21 +2037,30 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
 
             if fp8_attention:
                 ql_nope_shape = decode_ql_nope.shape
-                decode_ql_nope, _ = ops.scaled_fp8_quant(
-                    decode_ql_nope.reshape(
-                        [ql_nope_shape[0], ql_nope_shape[1] * ql_nope_shape[2]]
-                    ),
-                    layer._q_scale,
-                )
-                decode_ql_nope = decode_ql_nope.reshape(ql_nope_shape)
                 q_pe_shape = decode_q_pe.shape
-                decode_q_pe, _ = ops.scaled_fp8_quant(
-                    decode_q_pe.reshape([q_pe_shape[0], q_pe_shape[1] * q_pe_shape[2]]),
-                    layer._q_scale,
+                assert decode_ql_nope.shape[0] == decode_q_pe.shape[0]
+                assert decode_ql_nope.shape[1] == decode_q_pe.shape[1]
+                decode_q_shape = (
+                    ql_nope_shape[0],
+                    ql_nope_shape[1],
+                    ql_nope_shape[2] + q_pe_shape[2],
+                )
+                # Using empty and copy since torch.cat introduces significant overhead.
+                decode_q0 = torch.empty(
+                    decode_q_shape,
+                    device=decode_ql_nope.device,
+                    dtype=decode_ql_nope.dtype,
                 )
-                decode_q_pe = decode_q_pe.reshape(q_pe_shape)
+                decode_q0[..., : ql_nope_shape[2]].copy_(decode_ql_nope)
+                decode_q0[..., ql_nope_shape[2] :].copy_(decode_q_pe)
 
-            decode_q = (decode_ql_nope, decode_q_pe)
+                decode_q, _ = ops.scaled_fp8_quant(
+                    decode_q0.view(decode_q_shape[0], -1),
+                    layer._q_scale,
+                )
+                decode_q = decode_q.view(decode_q_shape)
+            else:
+                decode_q = (decode_ql_nope, decode_q_pe)
             if self.dcp_world_size > 1:
                 assert not fp8_attention, "DCP not support fp8 kvcache now."
                 # concatenate decode_ql_nope and decode_q_pe -> (B, N, L + P)
-- 
GitLab


From 60d17251c920ae3c9d02e4b4101b738e4905aee4 Mon Sep 17 00:00:00 2001
From: Ming Yang <minos.future@gmail.com>
Date: Mon, 8 Dec 2025 16:01:08 -0800
Subject: [PATCH 206/258] [Disagg] Support large batch size in proxy server and
 update NixlConnector doc for DP (#28782)

Signed-off-by: Ming Yang <minos.future@gmail.com>
---
 docs/features/nixl_connector_usage.md         |  2 ++
 .../disagg_proxy_server.py                    | 21 +++++++++++++++--
 .../nixl_integration/toy_proxy_server.py      | 23 +++++++++++++++++--
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
index f0e25e31a..84c8f9e77 100644
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -146,6 +146,8 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
   --decoder-ports 8000 8000
 ```
 
+For multi-host DP deployment, only need to provide the host/port of the head instances.
+
 ### KV Role Options
 
 - **kv_producer**: For prefiller instances that generate KV caches
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
index 5d8e38c73..c8965e050 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
@@ -26,9 +26,21 @@ async def lifespan(app: FastAPI):
     )
 
     app.state.prefill_client = httpx.AsyncClient(
-        timeout=None, base_url=prefiller_base_url
+        timeout=None,
+        base_url=prefiller_base_url,
+        limits=httpx.Limits(
+            max_connections=None,
+            max_keepalive_connections=None,
+        ),
+    )
+    app.state.decode_client = httpx.AsyncClient(
+        timeout=None,
+        base_url=decoder_base_url,
+        limits=httpx.Limits(
+            max_connections=None,
+            max_keepalive_connections=None,
+        ),
     )
-    app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url)
 
     yield
 
@@ -105,6 +117,11 @@ async def send_request_to_service(
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
     response = await client.post(endpoint, json=req_data, headers=headers)
     response.raise_for_status()
+
+    # read/consume the response body to release the connection
+    # otherwise, it would http.ReadError
+    await response.aread()
+
     return response
 
 
diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
index 5768fcdb5..b92d3fcd6 100644
--- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
+++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
@@ -30,7 +30,14 @@ async def lifespan(app: FastAPI):
         prefiller_base_url = f"http://{host}:{port}/v1"
         app.state.prefill_clients.append(
             {
-                "client": httpx.AsyncClient(timeout=None, base_url=prefiller_base_url),
+                "client": httpx.AsyncClient(
+                    timeout=None,
+                    base_url=prefiller_base_url,
+                    limits=httpx.Limits(
+                        max_connections=None,
+                        max_keepalive_connections=None,
+                    ),
+                ),
                 "host": host,
                 "port": port,
                 "id": i,
@@ -42,7 +49,14 @@ async def lifespan(app: FastAPI):
         decoder_base_url = f"http://{host}:{port}/v1"
         app.state.decode_clients.append(
             {
-                "client": httpx.AsyncClient(timeout=None, base_url=decoder_base_url),
+                "client": httpx.AsyncClient(
+                    timeout=None,
+                    base_url=decoder_base_url,
+                    limits=httpx.Limits(
+                        max_connections=None,
+                        max_keepalive_connections=None,
+                    ),
+                ),
                 "host": host,
                 "port": port,
                 "id": i,
@@ -169,6 +183,10 @@ async def send_request_to_service(
     )
     response.raise_for_status()
 
+    # read/consume the response body to release the connection
+    # otherwise, it would http.ReadError
+    await response.aread()
+
     return response
 
 
@@ -206,6 +224,7 @@ async def _handle_completions(api: str, request: Request):
 
         # Extract the needed fields
         response_json = response.json()
+        await response.aclose()  # CRITICAL: Release connection back to pool
         kv_transfer_params = response_json.get("kv_transfer_params", {})
         if kv_transfer_params:
             req_data["kv_transfer_params"] = kv_transfer_params
-- 
GitLab


From f1599ca55d79cb686cb94dc3ff2f65d82db94940 Mon Sep 17 00:00:00 2001
From: Victor Ziliang Peng <ziliang@character.ai>
Date: Mon, 8 Dec 2025 16:08:48 -0800
Subject: [PATCH 207/258] feat(metrics): Add prefill KV compute metric
 excluding cached tokens (#30189)

Signed-off-by: Ziliang Peng <ziliang@character.ai>
---
 tests/v1/metrics/test_stats.py     | 103 ++++++++++++++++++++++++++++-
 vllm/v1/engine/output_processor.py |   1 +
 vllm/v1/metrics/loggers.py         |  20 ++++++
 vllm/v1/metrics/stats.py           |   3 +
 4 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py
index 48067def8..7d902bbc6 100644
--- a/tests/v1/metrics/test_stats.py
+++ b/tests/v1/metrics/test_stats.py
@@ -1,8 +1,109 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.engine import FinishReason
+from vllm.v1.metrics.stats import IterationStats, RequestStateStats
 
 
 def test_iteration_stats_repr():
     iteration_stats = IterationStats()
     assert repr(iteration_stats).startswith("IterationStats(")
+
+
+def test_prefill_kv_computed_with_cache():
+    """Test that prefill KV compute correctly excludes cached tokens."""
+    iteration_stats = IterationStats()
+    req_stats = RequestStateStats(arrival_time=0.0)
+    req_stats.scheduled_ts = 0.1
+    req_stats.first_token_ts = 0.5
+    req_stats.last_token_ts = 5.0
+    req_stats.num_generation_tokens = 50
+
+    # Case 1: With prefix cache (1200 tokens cached)
+    iteration_stats.update_from_finished_request(
+        finish_reason=FinishReason.STOP,
+        num_prompt_tokens=10000,
+        max_tokens_param=100,
+        req_stats=req_stats,
+        num_cached_tokens=1200,
+    )
+
+    finished_req = iteration_stats.finished_requests[0]
+    assert finished_req.num_prompt_tokens == 10000
+    assert finished_req.num_cached_tokens == 1200
+
+    # Verify calculation: prefill KV = prompt tokens - cached tokens
+    prefill_kv_computed = finished_req.num_prompt_tokens - max(
+        finished_req.num_cached_tokens, 0
+    )
+    assert prefill_kv_computed == 8800  # 10000 - 1200
+
+
+def test_prefill_kv_computed_no_cache():
+    """Test prefill KV compute without prefix caching."""
+    iteration_stats = IterationStats()
+    req_stats = RequestStateStats(arrival_time=0.0)
+    req_stats.scheduled_ts = 0.1
+    req_stats.first_token_ts = 0.5
+    req_stats.last_token_ts = 2.0
+    req_stats.num_generation_tokens = 10
+
+    # Case 2: No prefix cache
+    iteration_stats.update_from_finished_request(
+        finish_reason=FinishReason.STOP,
+        num_prompt_tokens=2000,
+        max_tokens_param=100,
+        req_stats=req_stats,
+        num_cached_tokens=0,
+    )
+
+    finished_req = iteration_stats.finished_requests[0]
+    assert finished_req.num_prompt_tokens == 2000
+    assert finished_req.num_cached_tokens == 0
+
+    # Verify calculation: prefill KV = full prompt when no cache
+    prefill_kv_computed = finished_req.num_prompt_tokens - max(
+        finished_req.num_cached_tokens, 0
+    )
+    assert prefill_kv_computed == 2000
+
+
+def test_prefill_kv_computed_edge_cases():
+    """Test edge cases for prefill KV compute calculation."""
+    iteration_stats = IterationStats()
+    req_stats = RequestStateStats(arrival_time=0.0)
+    req_stats.scheduled_ts = 0.1
+    req_stats.first_token_ts = 0.5
+    req_stats.last_token_ts = 1.0
+    req_stats.num_generation_tokens = 1
+
+    # Case 3: Negative num_cached_tokens (shouldn't happen, but handle gracefully)
+    iteration_stats.update_from_finished_request(
+        finish_reason=FinishReason.STOP,
+        num_prompt_tokens=100,
+        max_tokens_param=10,
+        req_stats=req_stats,
+        num_cached_tokens=-1,
+    )
+
+    finished_req = iteration_stats.finished_requests[0]
+    # max() should handle negative values
+    prefill_kv_computed = finished_req.num_prompt_tokens - max(
+        finished_req.num_cached_tokens, 0
+    )
+    assert prefill_kv_computed == 100  # Should treat negative as 0
+
+    # Case 4: All tokens cached (shouldn't happen in practice)
+    iteration_stats2 = IterationStats()
+    iteration_stats2.update_from_finished_request(
+        finish_reason=FinishReason.STOP,
+        num_prompt_tokens=100,
+        max_tokens_param=10,
+        req_stats=req_stats,
+        num_cached_tokens=100,
+    )
+
+    finished_req2 = iteration_stats2.finished_requests[0]
+    prefill_kv_computed2 = finished_req2.num_prompt_tokens - max(
+        finished_req2.num_cached_tokens, 0
+    )
+    assert prefill_kv_computed2 == 0  # All cached, nothing computed
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index e85fbb4ee..9be3f4da7 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -650,6 +650,7 @@ class OutputProcessor:
             ),
             max_tokens_param=req_state.max_tokens_param,
             req_stats=req_state.stats,
+            num_cached_tokens=req_state.num_cached_tokens,
         )
         self.lora_states.request_finished(req_state.request_id, req_state.lora_name)
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 882e0ce0b..9eaee1bb9 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -870,6 +870,19 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
             histogram_decode_time_request, engine_indexes, model_name
         )
 
+        histogram_prefill_kv_computed_request = self._histogram_cls(
+            name="vllm:request_prefill_kv_computed_tokens",
+            documentation=(
+                "Histogram of new KV tokens computed during prefill "
+                "(excluding cached tokens)."
+            ),
+            buckets=build_1_2_5_buckets(max_model_len),
+            labelnames=labelnames,
+        )
+        self.histogram_prefill_kv_computed_request = make_per_engine(
+            histogram_prefill_kv_computed_request, engine_indexes, model_name
+        )
+
         #
         # KV Cache residency metrics
         #
@@ -1118,6 +1131,13 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
             self.histogram_decode_time_request[engine_idx].observe(
                 finished_request.decode_time
             )
+            # Calculate prefill KV compute (excludes cached tokens)
+            prefill_kv_computed = finished_request.num_prompt_tokens - max(
+                finished_request.num_cached_tokens, 0
+            )
+            self.histogram_prefill_kv_computed_request[engine_idx].observe(
+                prefill_kv_computed
+            )
             self.histogram_num_prompt_tokens_request[engine_idx].observe(
                 finished_request.num_prompt_tokens
             )
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 733d3ae12..a0cc58d0a 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -224,6 +224,7 @@ class FinishedRequestStats:
     decode_time: float = 0.0
     mean_time_per_output_token: float = 0.0
     is_corrupted: bool = False
+    num_cached_tokens: int = 0
 
 
 class IterationStats:
@@ -330,6 +331,7 @@ class IterationStats:
         num_prompt_tokens: int,
         max_tokens_param: int | None,
         req_stats: RequestStateStats,
+        num_cached_tokens: int = 0,
     ):
         e2e_latency = self._time_since(req_stats.arrival_time)
 
@@ -367,6 +369,7 @@ class IterationStats:
             decode_time=decode_time,
             mean_time_per_output_token=mean_time_per_output_token,
             is_corrupted=req_stats.is_corrupted,
+            num_cached_tokens=num_cached_tokens,
         )
         self.finished_requests.append(finished_req)
 
-- 
GitLab


From 9d6235ca9a36e76911045999ed72e3c8aad66b8a Mon Sep 17 00:00:00 2001
From: Ming Yang <minos.future@gmail.com>
Date: Mon, 8 Dec 2025 16:29:36 -0800
Subject: [PATCH 208/258] [moe] Allow disabling DP chunking (#29936)

Signed-off-by: Ming Yang <minos.future@gmail.com>
---
 vllm/envs.py                                  | 4 ++++
 vllm/model_executor/layers/fused_moe/layer.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 37711dece..91d1b0107 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -144,6 +144,7 @@ if TYPE_CHECKING:
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
     VLLM_MOE_DP_CHUNK_SIZE: int = 256
+    VLLM_ENABLE_MOE_DP_CHUNK: bool = True
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
     VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
@@ -1101,6 +1102,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE
     # units.
     "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
+    "VLLM_ENABLE_MOE_DP_CHUNK": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_MOE_DP_CHUNK", "1"))
+    ),
     # Randomize inputs during dummy runs when using Data Parallel
     "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get(
         "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 9b4d77a06..5df348609 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -753,7 +753,7 @@ class FusedMoE(CustomOp):
             self.moe_parallel_config.use_pplx_kernels
             or self.moe_parallel_config.use_deepep_ll_kernels
             or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels)
-        )
+        ) and envs.VLLM_ENABLE_MOE_DP_CHUNK
 
     @property
     def is_internal_router(self) -> bool:
-- 
GitLab


From d9417096d1347ead26b7c0cc1afb22fc0028e6e8 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Mon, 8 Dec 2025 19:31:57 -0500
Subject: [PATCH 209/258] [Feature] Batch invariant: Enable `TRITON_MLA`
 without prefix-caching (#29125)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tests/v1/determinism/test_batch_invariance.py |  6 +---
 .../test_online_batch_invariance.py           |  5 ++-
 tests/v1/determinism/utils.py                 |  1 +
 vllm/attention/layer.py                       | 36 +++++++++++++++++++
 vllm/model_executor/layers/batch_invariant.py |  2 +-
 5 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py
index 4311547ba..fc953a66f 100644
--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -185,7 +185,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     llm = LLM(
         model=model_name,
         tensor_parallel_size=tp_size,
-        enable_prefix_caching=False,
+        # enable_prefix_caching=False,
         max_num_seqs=32,
         max_model_len=8192,
         dtype="bfloat16",  # not everything is supported
@@ -393,7 +393,6 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
         gpu_memory_utilization=0.9,
         max_model_len=2048,
         dtype="bfloat16",
-        enable_prefix_caching=False,
     )
 
     prompt = "the capital of france is"
@@ -457,7 +456,6 @@ def test_logprobs_without_batch_invariance_should_fail(
     llm = LLM(
         model=model_name,
         tensor_parallel_size=tp_size,
-        enable_prefix_caching=False,
         max_num_seqs=32,
         max_model_len=8192,
         dtype="bfloat16",
@@ -681,7 +679,6 @@ def test_decode_logprobs_match_prefill_logprobs(
     llm = LLM(
         model=model_name,
         tensor_parallel_size=tp_size,
-        enable_prefix_caching=False,
         max_num_seqs=32,
         max_model_len=8192,
         dtype="bfloat16",
@@ -928,7 +925,6 @@ def LLM_with_max_seqs(
         max_model_len=max_model_len,
         dtype="bfloat16",
         tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
-        enable_prefix_caching=False,
         # Enable for MOE models
         # enable_expert_parallel=True,
     )
diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py
index d74b43579..5e3b99736 100644
--- a/tests/v1/determinism/test_online_batch_invariance.py
+++ b/tests/v1/determinism/test_online_batch_invariance.py
@@ -153,7 +153,10 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     }
 
     tp_size = os.getenv("VLLM_TP_SIZE", "1")
-    server_args: list[str] = []
+    server_args: list[str] = [
+        "--max-model-len=8192",
+        "--max-num-seqs=32",
+    ]
     if tp_size:
         server_args += ["-tp", tp_size]
 
diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py
index 0d7da1077..6aab50cf8 100644
--- a/tests/v1/determinism/utils.py
+++ b/tests/v1/determinism/utils.py
@@ -17,6 +17,7 @@ skip_unsupported = pytest.mark.skipif(
 
 BACKENDS: list[str] = [
     "FLASH_ATTN",
+    "TRITON_MLA",
 ]
 
 if has_flashinfer():
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 340b161ea..7e5adfe07 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -25,6 +25,7 @@ from vllm.config.vllm import VllmConfig
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     UnquantizedLinearMethod,
@@ -251,6 +252,24 @@ class Attention(nn.Module, AttentionLayerBase):
         else:
             self.attn_backend = attn_backend
 
+        # prefix caching + batch invariance is currently not supported for
+        # FLASHINFER and TRITON_MLA.
+        if (
+            cache_config is not None
+            and cache_config.enable_prefix_caching
+            and vllm_is_batch_invariant()
+            and (
+                self.attn_backend.get_name() == "FLASHINFER"
+                or self.attn_backend.get_name() == "TRITON_MLA"
+            )
+        ):
+            logger.warning_once(
+                "Disabling prefix caching for FLASHINFER/TRITON_MLA "
+                "with batch invariance, as it is not yet supported.",
+                scope="local",
+            )
+            cache_config.enable_prefix_caching = False
+
         impl_cls = self.attn_backend.get_impl_cls()
         self.impl = impl_cls(
             num_heads,
@@ -628,6 +647,23 @@ class MLAAttention(nn.Module, AttentionLayerBase):
             use_mla=True,
             use_sparse=use_sparse,
         )
+
+        if (
+            cache_config is not None
+            and cache_config.enable_prefix_caching
+            and vllm_is_batch_invariant()
+            and (
+                self.attn_backend.get_name() == "TRITON_MLA"
+                or self.attn_backend.get_name() == "FLASHINFER"
+            )
+        ):
+            logger.warning_once(
+                "Disabling prefix caching for TRITON_MLA / FLASHINFER "
+                "with batch invariance, as it is not yet supported.",
+                scope="local",
+            )
+            cache_config.enable_prefix_caching = False
+
         impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls())
         self.impl = impl_cls(
             num_heads=self.num_heads,
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 415412263..4cab47f41 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -1006,11 +1006,11 @@ def override_envs_for_invariance():
         "FLASH_ATTN",  # best supported backend
         "FLASHINFER",
         "FLASH_ATTN_MLA",
+        "TRITON_MLA",
         # Not yet supported MLA backends
         # "FLASHMLA",
         # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance
         # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967
-        # "TRITON_MLA",
     ]
     if curr_attn_backend not in supported_backends:
         error = (
-- 
GitLab


From 0ee6416f678de761f827f4bd387b36f0cbc43a0a Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Mon, 8 Dec 2025 19:44:01 -0500
Subject: [PATCH 210/258] [Perf] Optimize `group_topk` kernel, 1.9% Throughput
 improvement, 2.1% TPOT improvemnt (#30159)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 csrc/moe/grouped_topk_kernels.cu | 175 ++++++++++++++++++++++---------
 1 file changed, 128 insertions(+), 47 deletions(-)

diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index 69b4c1fb1..47ee5f021 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) {
   return cuda_cast<T, float>(sigmoid_accurate(f));
 }
 
-template <typename T>
+template <ScoringFunc SF, typename T>
+__device__ inline T apply_scoring(T val) {
+  if constexpr (SF == SCORING_SIGMOID) {
+    return apply_sigmoid(val);
+  } else {
+    return val;
+  }
+}
+
+template <typename T, ScoringFunc SF>
 __device__ void topk_with_k2(T* output, T const* input, T const* bias,
                              cg::thread_block_tile<32> const& tile,
                              int32_t const lane_id,
-                             int const num_experts_per_group,
-                             int const scoring_func) {
+                             int const num_experts_per_group) {
   // Get the top2 per thread
   T largest = neg_inf<T>();
   T second_largest = neg_inf<T>();
 
   if (num_experts_per_group > WARP_SIZE) {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      T value = input[i];
-      // Apply scoring function if needed
-      if (scoring_func == SCORING_SIGMOID) {
-        value = apply_sigmoid(value);
-      }
+      T value = apply_scoring<SF>(input[i]);
       value = value + bias[i];
 
       if (value > largest) {
@@ -472,11 +476,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
     }
   } else {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      T value = input[i];
-      // Apply scoring function if needed
-      if (scoring_func == SCORING_SIGMOID) {
-        value = apply_sigmoid(value);
-      }
+      T value = apply_scoring<SF>(input[i]);
       value = value + bias[i];
       largest = value;
     }
@@ -501,13 +501,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
   }
 }
 
-template <typename T>
+template <typename T, ScoringFunc SF>
 __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
                                     int64_t const num_tokens,
                                     int64_t const num_cases,
                                     int64_t const n_group,
-                                    int64_t const num_experts_per_group,
-                                    int const scoring_func) {
+                                    int64_t const num_experts_per_group) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
 
@@ -525,21 +524,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     asm volatile("griddepcontrol.wait;");
 #endif
-    topk_with_k2(output, input, group_bias, tile, lane_id,
-                 num_experts_per_group, scoring_func);
+    topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
+                        num_experts_per_group);
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
 __global__ void group_idx_and_topk_idx_kernel(
     T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
     T const* bias, int64_t const num_tokens, int64_t const n_group,
     int64_t const topk_group, int64_t const topk, int64_t const num_experts,
     int64_t const num_experts_per_group, bool renormalize,
-    double routed_scaling_factor, int scoring_func) {
+    double routed_scaling_factor) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
   int32_t case_id =
@@ -549,6 +548,11 @@ __global__ void group_idx_and_topk_idx_kernel(
   topk_values += case_id * topk;
   topk_indices += case_id * topk;
 
+  constexpr bool kUseStaticNGroup = (NGroup > 0);
+  // use int32 to avoid implicit conversion
+  int32_t const n_group_i32 =
+      kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
+
   int32_t align_num_experts_per_group =
       warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
 
@@ -574,13 +578,14 @@ __global__ void group_idx_and_topk_idx_kernel(
 
   if (case_id < num_tokens) {
     // calculate group_idx
-    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
+    int32_t target_num_min =
+        WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
     // The check is necessary to avoid abnormal input
-    if (lane_id < n_group && is_finite(group_scores[lane_id])) {
+    if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
       value = group_scores[lane_id];
     }
 
-    int count_equal_to_top_value = WARP_SIZE - n_group;
+    int count_equal_to_top_value = WARP_SIZE - n_group_i32;
     int pre_count_equal_to_top_value = 0;
     // Use loop to find the largset top_group
     while (count_equal_to_top_value < target_num_min) {
@@ -604,7 +609,7 @@ __global__ void group_idx_and_topk_idx_kernel(
   int count_equalto_topkth_group = 0;
   bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
   if (case_id < num_tokens && if_proceed_next_topk) {
-    for (int i_group = 0; i_group < n_group; i_group++) {
+    auto process_group = [&](int i_group) {
       if ((group_scores[i_group] > topk_group_value) ||
           ((group_scores[i_group] == topk_group_value) &&
            (count_equalto_topkth_group < num_equalto_topkth_group))) {
@@ -613,11 +618,10 @@ __global__ void group_idx_and_topk_idx_kernel(
              i += WARP_SIZE) {
           T candidates = neg_inf<T>();
           if (i < num_experts_per_group) {
-            // Apply scoring function (if any) and add bias
+            // apply scoring function (if any) and add bias
             T input = scores[offset + i];
             if (is_finite(input)) {
-              T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
-                                                          : input;
+              T score = apply_scoring<SF>(input);
               candidates = score + bias[offset + i];
             }
           }
@@ -627,6 +631,17 @@ __global__ void group_idx_and_topk_idx_kernel(
           count_equalto_topkth_group++;
         }
       }
+    };
+
+    if constexpr (kUseStaticNGroup) {
+#pragma unroll
+      for (int i_group = 0; i_group < NGroup; ++i_group) {
+        process_group(i_group);
+      }
+    } else {
+      for (int i_group = 0; i_group < n_group_i32; ++i_group) {
+        process_group(i_group);
+      }
     }
     queue.done();
     __syncwarp();
@@ -646,12 +661,13 @@ __global__ void group_idx_and_topk_idx_kernel(
       if (i < topk) {
         // Load the score value (without bias) for normalization
         T input = scores[s_topk_idx[i]];
-        value =
-            (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
+        value = apply_scoring<SF>(input);
         s_topk_value[i] = value;
       }
-      topk_sum +=
-          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      if (renormalize) {
+        topk_sum +=
+            cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      }
     }
   }
 
@@ -660,13 +676,9 @@ __global__ void group_idx_and_topk_idx_kernel(
   if (case_id < num_tokens) {
     if (if_proceed_next_topk) {
       for (int i = lane_id; i < topk; i += WARP_SIZE) {
-        float value;
-        if (renormalize) {
-          value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
-                  routed_scaling_factor;
-        } else {
-          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
-        }
+        float base = cuda_cast<float, T>(s_topk_value[i]);
+        float value = renormalize ? (base / topk_sum * routed_scaling_factor)
+                                  : (base * routed_scaling_factor);
         topk_indices[i] = s_topk_idx[i];
         topk_values[i] = value;
       }
@@ -684,6 +696,45 @@ __global__ void group_idx_and_topk_idx_kernel(
 #endif
 }
 
+template <typename T, typename IdxT, ScoringFunc SF>
+inline void launch_group_idx_and_topk_kernel(
+    cudaLaunchConfig_t const& config, T* scores, T* group_scores,
+    float* topk_values, IdxT* topk_indices, T const* bias,
+    int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
+    int64_t const topk, int64_t const num_experts,
+    int64_t const num_experts_per_group, bool const renormalize,
+    double const routed_scaling_factor) {
+  auto launch = [&](auto* kernel_instance2) {
+    cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
+                       topk_values, topk_indices, bias, num_tokens, n_group,
+                       topk_group, topk, num_experts, num_experts_per_group,
+                       renormalize, routed_scaling_factor);
+  };
+
+  switch (n_group) {
+    case 4: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
+      break;
+    }
+    case 8: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
+      break;
+    }
+    case 16: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
+      break;
+    }
+    case 32: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
+      break;
+    }
+    default: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
+      break;
+    }
+  }
+}
+
 template <typename T, typename IdxT>
 void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                    IdxT* topk_indices, T const* bias, int64_t const num_tokens,
@@ -694,7 +745,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                    cudaStream_t const stream = 0) {
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
   cudaLaunchConfig_t config;
   config.gridDim = topk_with_k2_num_blocks;
   config.blockDim = BLOCK_SIZE;
@@ -705,16 +755,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
   attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
   config.numAttrs = 1;
   config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
-                     num_tokens, num_cases, n_group, num_experts / n_group,
-                     scoring_func);
+  auto const sf = static_cast<ScoringFunc>(scoring_func);
+  int64_t const num_experts_per_group = num_experts / n_group;
+  auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
+    cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
+                       num_tokens, num_cases, n_group, num_experts_per_group);
+  };
+  switch (sf) {
+    case SCORING_NONE: {
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
+      launch_topk_with_k2(kernel_instance1);
+      break;
+    }
+    case SCORING_SIGMOID: {
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
+      launch_topk_with_k2(kernel_instance1);
+      break;
+    }
+    default:
+      // should be guarded by higher level checks.
+      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+  }
 
   int64_t topk_with_k_group_num_blocks =
       (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
   size_t dynamic_smem_in_bytes =
       warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                            topk);
-  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
   config.gridDim = topk_with_k_group_num_blocks;
   config.blockDim = BLOCK_SIZE;
   config.dynamicSmemBytes = dynamic_smem_in_bytes;
@@ -723,10 +790,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
   attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
   config.numAttrs = 1;
   config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                     topk_values, topk_indices, bias, num_tokens, n_group,
-                     topk_group, topk, num_experts, num_experts / n_group,
-                     renormalize, routed_scaling_factor, scoring_func);
+  switch (sf) {
+    case SCORING_NONE: {
+      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
+          config, scores, group_scores, topk_values, topk_indices, bias,
+          num_tokens, n_group, topk_group, topk, num_experts,
+          num_experts_per_group, renormalize, routed_scaling_factor);
+      break;
+    }
+    case SCORING_SIGMOID: {
+      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
+          config, scores, group_scores, topk_values, topk_indices, bias,
+          num_tokens, n_group, topk_group, topk, num_experts,
+          num_experts_per_group, renormalize, routed_scaling_factor);
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+  }
 }
 
 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
-- 
GitLab


From ae339b1a67ac6651dae926aae2afa0c168c1ddb5 Mon Sep 17 00:00:00 2001
From: Zhewen Li <zhewenli@meta.com>
Date: Mon, 8 Dec 2025 17:05:27 -0800
Subject: [PATCH 211/258] [Bugfix] Fix DeepGEMM after #29546  (#30267)

Signed-off-by: zhewenli <zhewenli@meta.com>
Signed-off-by: Zhewen Li <zhewenli@meta.com>
---
 .../layers/quantization/utils/fp8_utils.py       | 16 ++++++++++------
 vllm/utils/deep_gemm.py                          |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index ad92f4ec6..366c5778f 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -30,6 +30,7 @@ from vllm.model_executor.parameter import (
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import (
+    DeepGemmQuantScaleFMT,
     fp8_gemm_nt,
     is_deep_gemm_e8m0_used,
     is_deep_gemm_supported,
@@ -268,12 +269,15 @@ class W8A8BlockFp8LinearOp:
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
     ) -> torch.Tensor:
-        assert self.deepgemm_input_quant_op is not None
-        q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
-            input_2d,
-            group_size=self.act_quant_group_shape.col,
-            use_ue8m0=True,
-        )
+        if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
+            q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
+                input_2d,
+                group_size=self.act_quant_group_shape.col,
+                use_ue8m0=True,
+            )
+        else:
+            assert self.deepgemm_input_quant_op is not None
+            q_input, input_scale = self.deepgemm_input_quant_op(input_2d)
         output = torch.empty(
             (q_input.shape[0], weight.shape[0]),
             dtype=torch.bfloat16,
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 8545108a0..a099fde1b 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -399,6 +399,7 @@ def should_use_deepgemm_for_fp8_linear_for_nk(
 
 __all__ = [
     "calc_diff",
+    "DeepGemmQuantScaleFMT",
     "fp8_gemm_nt",
     "m_grouped_fp8_gemm_nt_contiguous",
     "fp8_m_grouped_gemm_nt_masked",
-- 
GitLab


From 7b35011ad136748b3d18d069103feda9a239459f Mon Sep 17 00:00:00 2001
From: Yanan Cao <gmagogsfm@users.noreply.github.com>
Date: Mon, 8 Dec 2025 17:14:10 -0800
Subject: [PATCH 212/258] Mark qwen2_5_vl as xfail (#30283)

Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
---
 tests/compile/fullgraph/test_multimodal_compile.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/compile/fullgraph/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py
index 621f6a51a..e2897b227 100644
--- a/tests/compile/fullgraph/test_multimodal_compile.py
+++ b/tests/compile/fullgraph/test_multimodal_compile.py
@@ -17,6 +17,7 @@ def test_compile():
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
+@pytest.mark.xfail
 def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
     """Test that Qwen2.5-VL vision submodules are compiled.
 
-- 
GitLab


From e41312a2f5086cc9199e024c5d451e65a303a4d1 Mon Sep 17 00:00:00 2001
From: Christina Norman <truffle@gmail.com>
Date: Mon, 8 Dec 2025 19:52:43 -0600
Subject: [PATCH 213/258] [Bugfix] Skip generation config fallback for GGUF to
 prevent multi-process hang (#30209)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 vllm/transformers_utils/config.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 773fc05a5..d761802da 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -954,6 +954,13 @@ def try_get_generation_config(
     revision: str | None = None,
     config_format: str | ConfigFormat = "auto",
 ) -> GenerationConfig | None:
+    # GGUF files don't have generation_config.json - their config is embedded
+    # in the file header. Skip all filesystem lookups to avoid re-reading the
+    # memory-mapped file, which can hang in multi-process scenarios when the
+    # EngineCore process already has the file mapped.
+    if is_gguf(model):
+        return None
+
     try:
         return GenerationConfig.from_pretrained(
             model,
-- 
GitLab


From 78c75033642b8c2832fcfb8d483871b65673326b Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Mon, 8 Dec 2025 20:14:02 -0600
Subject: [PATCH 214/258] [ROCm][CI] Skip NVIDIA-Only Prime-RL Test in AMD CI
 (#29420)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 .buildkite/scripts/run-prime-rl-test.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
index 5b25c358f..3fb7c82c8 100755
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
 
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+    exit 0
+fi
+
 echo "Setting up Prime-RL integration test environment..."
 
 # Clean up any existing Prime-RL directory
-- 
GitLab


From db14f61f2d6d33e38d382f2e3e7de514dac8e218 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <khluu000@gmail.com>
Date: Mon, 8 Dec 2025 18:25:43 -0800
Subject: [PATCH 215/258] [ci] Refactor CI file structure (#29343)

---
 .buildkite/ci_config.yaml                     |  24 +++
 .buildkite/image_build/image_build.sh         |  56 +++++
 .buildkite/image_build/image_build.yaml       |  57 +++++
 .buildkite/image_build/image_build_cpu.sh     |  36 ++++
 .../image_build/image_build_cpu_arm64.sh      |  33 +++
 .buildkite/image_build/image_build_hpu.sh     |  34 +++
 .buildkite/test_areas/attention.yaml          |  21 ++
 .buildkite/test_areas/basic_correctness.yaml  |  16 ++
 .buildkite/test_areas/benchmarks.yaml         |  19 ++
 .buildkite/test_areas/compile.yaml            |  57 +++++
 .buildkite/test_areas/cuda.yaml               |  22 ++
 .buildkite/test_areas/distributed.yaml        | 199 ++++++++++++++++++
 .buildkite/test_areas/e2e_integration.yaml    |  59 ++++++
 .buildkite/test_areas/engine.yaml             |  26 +++
 .buildkite/test_areas/entrypoints.yaml        |  68 ++++++
 .buildkite/test_areas/expert_parallelism.yaml |  23 ++
 .buildkite/test_areas/kernels.yaml            | 117 ++++++++++
 .buildkite/test_areas/lm_eval.yaml            |  46 ++++
 .buildkite/test_areas/lora.yaml               |  31 +++
 .buildkite/test_areas/misc.yaml               | 163 ++++++++++++++
 .buildkite/test_areas/model_executor.yaml     |  17 ++
 .buildkite/test_areas/models_basic.yaml       |  62 ++++++
 .buildkite/test_areas/models_distributed.yaml |  22 ++
 .buildkite/test_areas/models_language.yaml    |  91 ++++++++
 .buildkite/test_areas/models_multimodal.yaml  |  79 +++++++
 .buildkite/test_areas/plugins.yaml            |  34 +++
 .buildkite/test_areas/pytorch.yaml            |  50 +++++
 .buildkite/test_areas/quantization.yaml       |  46 ++++
 .buildkite/test_areas/samplers.yaml           |  14 ++
 .buildkite/test_areas/tool_use.yaml           |  23 ++
 .buildkite/test_areas/weight_loading.yaml     |  25 +++
 31 files changed, 1570 insertions(+)
 create mode 100644 .buildkite/ci_config.yaml
 create mode 100755 .buildkite/image_build/image_build.sh
 create mode 100644 .buildkite/image_build/image_build.yaml
 create mode 100755 .buildkite/image_build/image_build_cpu.sh
 create mode 100755 .buildkite/image_build/image_build_cpu_arm64.sh
 create mode 100755 .buildkite/image_build/image_build_hpu.sh
 create mode 100644 .buildkite/test_areas/attention.yaml
 create mode 100644 .buildkite/test_areas/basic_correctness.yaml
 create mode 100644 .buildkite/test_areas/benchmarks.yaml
 create mode 100644 .buildkite/test_areas/compile.yaml
 create mode 100644 .buildkite/test_areas/cuda.yaml
 create mode 100644 .buildkite/test_areas/distributed.yaml
 create mode 100644 .buildkite/test_areas/e2e_integration.yaml
 create mode 100644 .buildkite/test_areas/engine.yaml
 create mode 100644 .buildkite/test_areas/entrypoints.yaml
 create mode 100644 .buildkite/test_areas/expert_parallelism.yaml
 create mode 100644 .buildkite/test_areas/kernels.yaml
 create mode 100644 .buildkite/test_areas/lm_eval.yaml
 create mode 100644 .buildkite/test_areas/lora.yaml
 create mode 100644 .buildkite/test_areas/misc.yaml
 create mode 100644 .buildkite/test_areas/model_executor.yaml
 create mode 100644 .buildkite/test_areas/models_basic.yaml
 create mode 100644 .buildkite/test_areas/models_distributed.yaml
 create mode 100644 .buildkite/test_areas/models_language.yaml
 create mode 100644 .buildkite/test_areas/models_multimodal.yaml
 create mode 100644 .buildkite/test_areas/plugins.yaml
 create mode 100644 .buildkite/test_areas/pytorch.yaml
 create mode 100644 .buildkite/test_areas/quantization.yaml
 create mode 100644 .buildkite/test_areas/samplers.yaml
 create mode 100644 .buildkite/test_areas/tool_use.yaml
 create mode 100644 .buildkite/test_areas/weight_loading.yaml

diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 000000000..199c33159
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:
+  - ".buildkite/test_areas"
+  - ".buildkite/image_build"
+run_all_patterns:
+  - "docker/Dockerfile"
+  - "CMakeLists.txt"
+  - "requirements/common.txt"
+  - "requirements/cuda.txt"
+  - "requirements/build.txt"
+  - "requirements/test.txt"
+  - "setup.py"
+  - "csrc/"
+  - "cmake/"
+run_all_exclude_patterns:
+  - "docker/Dockerfile."
+  - "csrc/cpu/"
+  - "csrc/rocm/"
+  - "cmake/hipify.py"
+  - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+  main: "vllm-ci-postmerge-repo"
+  premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
new file mode 100755
index 000000000..9a2384e52
--- /dev/null
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx 
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+  merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  ${merge_base_commit_build_args} \
+  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+  --cache-to type=registry,ref=${CACHE_TO},mode=max \
+  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+  --push \
+  --target test \
+  --progress plain .
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
new file mode 100644
index 000000000..d01c71dd9
--- /dev/null
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
+group: Abuild
+steps:
+  - label: ":docker: Build image"
+    key: image-build
+    depends_on: []
+    commands:
+    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU image"
+    key: image-build-cpu
+    depends_on: []
+    commands:
+    - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build HPU image"
+    soft_fail: true
+    depends_on: []
+    key: image-build-hpu
+    commands:
+    - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+  
+  - label: ":docker: Build CPU arm64 image"
+    key: cpu-arm64-image-build
+    depends_on: []
+    optional: true
+    commands:
+    - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
new file mode 100755
index 000000000..a69732f43
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --build-arg VLLM_CPU_AMXBF16=true \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
new file mode 100755
index 000000000..615298b65
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
new file mode 100755
index 000000000..192447ef4
--- /dev/null
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build \
+  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+  --progress plain \
+  https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml
new file mode 100644
index 000000000..6e444eae1
--- /dev/null
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,21 @@
+group: Attention
+depends_on: 
+  - image-build
+steps:
+- label: V1 attention (H100)
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
new file mode 100644
index 000000000..759d2b535
--- /dev/null
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on: 
+  - image-build
+steps:
+- label: Basic Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml
new file mode 100644
index 000000000..574b642d4
--- /dev/null
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on: 
+  - image-build
+steps:
+- label: Benchmarks
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/.buildkite"
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
new file mode 100644
index 000000000..0ba00925a
--- /dev/null
+++ b/.buildkite/test_areas/compile.yaml
@@ -0,0 +1,57 @@
+group: Compile
+depends_on: 
+  - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml
new file mode 100644
index 000000000..50c0c338c
--- /dev/null
+++ b/.buildkite/test_areas/cuda.yaml
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on: 
+  - image-build
+steps:
+- label: Platform Tests (CUDA)
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+    - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
new file mode 100644
index 000000000..2cc90698d
--- /dev/null
+++ b/.buildkite/test_areas/distributed.yaml
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on: 
+  - image-build
+steps:
+- label: Distributed Comm Ops
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+  timeout_in_minutes: 90
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and external_dp=2
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - cd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs))
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
new file mode 100644
index 000000000..93d389815
--- /dev/null
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on: 
+  - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
new file mode 100644
index 000000000..a028e0e4a
--- /dev/null
+++ b/.buildkite/test_areas/engine.yaml
@@ -0,0 +1,26 @@
+group: Engine
+depends_on: 
+  - image-build
+steps:
+- label: Engine
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  - tests/test_vllm_port
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+  timeout_in_minutes: 45
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # TODO: accuracy does not match, whether setting
+    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - pytest -v -s v1/e2e
+    - pytest -v -s v1/engine
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
new file mode 100644
index 000000000..0a789be94
--- /dev/null
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on: 
+  - image-build
+steps:
+- label: Entrypoints Unit Tests  
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (Pooling)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+  timeout_in_minutes: 50
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
new file mode 100644
index 000000000..feb825214
--- /dev/null
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on: 
+  - image-build
+steps:
+- label: EPLB Algorithm
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
new file mode 100644
index 000000000..7ca099516
--- /dev/null
+++ b/.buildkite/test_areas/kernels.yaml
@@ -0,0 +1,117 @@
+group: Kernels
+depends_on: 
+  - image-build
+steps:
+- label: Kernels Core Operation Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
+  commands:
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+  timeout_in_minutes: 35
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test %N
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - csrc/quantization/cutlass_w8a8/moe/
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
+  commands:
+    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Mamba Test
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
+  commands:
+    - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
+  commands:
+    - nvidia-smi
+    - python3 examples/offline_inference/basic/chat.py
+    # Attention
+    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+    # Quantization
+    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
new file mode 100644
index 000000000..9af43e0c3
--- /dev/null
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -0,0 +1,46 @@
+group: LM Eval
+depends_on: 
+  - image-build
+steps:
+- label: LM Eval Small Models
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
new file mode 100644
index 000000000..809b4138f
--- /dev/null
+++ b/.buildkite/test_areas/lora.yaml
@@ -0,0 +1,31 @@
+group: LoRA
+depends_on: 
+  - image-build
+steps:
+- label: LoRA %N
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+  parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+  timeout_in_minutes: 30
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    # FIXIT: find out which code initialize cuda before running the test
+    # before the fix, we need to use spawn to test it
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # There is some Tensor Parallelism related processing logic in LoRA that
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
new file mode 100644
index 000000000..072bccadb
--- /dev/null
+++ b/.buildkite/test_areas/misc.yaml
@@ -0,0 +1,163 @@
+group: Miscellaneous
+depends_on: 
+  - image-build
+steps:
+- label: V1 Others
+  timeout_in_minutes: 60
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    # split the test to avoid interference
+    - pytest -v -s -m 'not cpu_test' v1/core
+    - pytest -v -s v1/executor
+    - pytest -v -s v1/kv_offload
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/logits_processors
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
+    - pytest -v -s v1/test_oracle.py
+    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
+    # Integration test for streaming correctness (requires special branch).
+    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
+  depends_on: ~
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - vllm/multimodal
+  - examples/
+  commands:
+    - pip install tensorizer # for tensorizer test
+    - python3 offline_inference/basic/chat.py # for basic
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+     # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+  timeout_in_minutes: 20
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/tracing
+  commands:
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0' \
+      'opentelemetry-api>=1.26.0' \
+      'opentelemetry-exporter-otlp>=1.26.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1'"
+  - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+  depends_on: ~
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+  timeout_in_minutes: 50
+  source_file_dependencies:
+  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+  depends_on: ~
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/test_inputs.py
+  - tests/test_outputs.py
+  - tests/multimodal
+  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/transformers_utils
+  - tests/config
+  no_gpu: true
+  commands:
+  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
+  - pytest -v -s transformers_utils
+  - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - vllm/model_executor/layers
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
new file mode 100644
index 000000000..996c8bb8b
--- /dev/null
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -0,0 +1,17 @@
+group: Model Executor
+depends_on: 
+  - image-build
+steps:
+- label: Model Executor
+  timeout_in_minutes: 35
+  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
+  - vllm/model_executor
+  - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
new file mode 100644
index 000000000..39a5d51c4
--- /dev/null
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -0,0 +1,62 @@
+group: Models - Basic
+depends_on: 
+  - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_initialization.py
+  commands:
+    # Run a subset of model initialization tests
+    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+    # Only when vLLM model source is modified - test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test.) Also run if model initialization test file is modified
+    - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  no_gpu: true
+  commands:
+    - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+  working_dir: "/vllm-workspace/"
+  optional: true
+  soft_fail: true
+  commands:
+    - pip install --upgrade git+https://github.com/huggingface/transformers
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml
new file mode 100644
index 000000000..b6bfbf2dd
--- /dev/null
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on: 
+  - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
new file mode 100644
index 000000000..f70192c4e
--- /dev/null
+++ b/.buildkite/test_areas/models_language.yaml
@@ -0,0 +1,91 @@
+group: Models - Language
+depends_on: 
+  - image-build
+steps:
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language
+  commands:
+    # Test standard language models, excluding a subset of slow tests
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+    # Shard slow subset of standard language models tests. Only run when model
+    # source is modified, or when specified test files are modified
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    # Shard hybrid language model tests
+    - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling)  # 36min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling
+  commands:
+    - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
new file mode 100644
index 000000000..fc24068c2
--- /dev/null
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on: 
+  - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+  timeout_in_minutes: 80
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
new file mode 100644
index 000000000..60c179aa0
--- /dev/null
+++ b/.buildkite/test_areas/plugins.yaml
@@ -0,0 +1,34 @@
+group: Plugins
+depends_on: 
+  - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
new file mode 100644
index 000000000..703c82eb1
--- /dev/null
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -0,0 +1,50 @@
+group: PyTorch
+depends_on: 
+  - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/
+    - tests/compile
+  commands:
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Smoke Test
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+  timeout_in_minutes: 40
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+    # fp8 kv scales not supported on sm89, tested on Blackwell instead
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to whitelist
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
\ No newline at end of file
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
new file mode 100644
index 000000000..6e89d6af3
--- /dev/null
+++ b/.buildkite/test_areas/quantization.yaml
@@ -0,0 +1,46 @@
+group: Quantization
+depends_on: 
+  - image-build
+steps:
+- label: Quantization
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+    - pytest -v -s models/quantization
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
new file mode 100644
index 000000000..ad377148f
--- /dev/null
+++ b/.buildkite/test_areas/samplers.yaml
@@ -0,0 +1,14 @@
+group: Samplers
+depends_on: 
+  - image-build
+steps:
+- label: Samplers Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml
new file mode 100644
index 000000000..7040cd1d2
--- /dev/null
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -0,0 +1,23 @@
+group: Tool use
+depends_on: 
+  - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental]
+  fast_check: false
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU)
+  depends_on: ~
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml
new file mode 100644
index 000000000..cfc5bb20f
--- /dev/null
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on: 
+  - image-build
+steps:
+- label: Weight Loading Multiple GPU  # 33min
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-- 
GitLab


From ea657f2078072541f68fbc11ab83f81a585a9dc4 Mon Sep 17 00:00:00 2001
From: gnovack <gnovack@amazon.com>
Date: Mon, 8 Dec 2025 18:35:16 -0800
Subject: [PATCH 216/258] Lora MoE Align Improvements (#29257)

Signed-off-by: gnovack <gnovack@amazon.com>
---
 CMakeLists.txt                         |   1 -
 csrc/moe/moe_align_sum_kernels.cu      | 425 ++++++++++++++++++++-----
 csrc/moe/moe_lora_align_sum_kernels.cu | 174 ----------
 csrc/moe/moe_ops.h                     |   2 +-
 csrc/moe/torch_bindings.cpp            |   3 +-
 tests/lora/test_moe_lora_align_sum.py  |   2 +-
 vllm/_custom_ops.py                    |   2 +
 7 files changed, 360 insertions(+), 249 deletions(-)
 delete mode 100644 csrc/moe/moe_lora_align_sum_kernels.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e09972fe7..69a538b06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -944,7 +944,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index ddcdcc38b..5c9e47402 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -14,7 +14,6 @@
 
 namespace vllm {
 namespace moe {
-
 namespace batched_moe_align_block_size {
 
 // Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
@@ -80,23 +79,30 @@ __global__ void batched_moe_align_block_size_kernel(
 }  // namespace batched_moe_align_block_size
 
 template <typename scalar_t>
-__global__ void moe_align_block_size_kernel(
+__device__ void _moe_align_block_size(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
     int32_t* __restrict__ total_tokens_post_pad,
     int32_t* __restrict__ expert_map, int32_t num_experts,
     int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
     size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
-    bool has_expert_map) {
+    int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
+    int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
   extern __shared__ int32_t shared_counts[];
 
-  // Use a separate threadblock to fill sorted_token_ids.
+  // Compute input buffer offsets. Typically these will all be 0, except when
+  // using Multi LoRA.
+  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+  int expert_ids_offset = max_num_m_blocks * model_offset;
+  int cumsum_offset = (num_experts + 1) * model_offset;
+
+  // Use separate threadblocks to fill sorted_token_ids.
   // This is safe since the current kernel does not use sorted_token_ids.
-  if (blockIdx.x == 1) {
+  if (blockIdx.x % 2) {
     // Initialize sorted_token_ids with numel
     for (size_t it = threadIdx.x; it < max_num_tokens_padded;
          it += blockDim.x) {
-      sorted_token_ids[it] = numel;
+      sorted_token_ids[sorted_token_ids_offset + it] = numel;
     }
     return;
   }
@@ -127,7 +133,9 @@ __global__ void moe_align_block_size_kernel(
     }
     int warp_idx = expert_id / experts_per_warp;
     int expert_offset = expert_id % experts_per_warp;
-    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
+    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
+              mask);
   }
 
   __syncthreads();
@@ -148,77 +156,44 @@ __global__ void moe_align_block_size_kernel(
   int cumsum_val;
   BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
   if (expert_id <= num_experts) {
-    cumsum[expert_id] = cumsum_val;
+    cumsum[cumsum_offset + expert_id] = cumsum_val;
   }
 
   if (expert_id == num_experts) {
-    *total_tokens_post_pad = cumsum_val;
+    total_tokens_post_pad[model_offset] = cumsum_val;
   }
 
   __syncthreads();
 
   if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
+    for (int i = cumsum[cumsum_offset + threadIdx.x];
+         i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
+      expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
     }
   }
 
   // Fill remaining expert_ids with 0
-  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
-    expert_ids[i] = 0;
-  }
-}
-
-template <typename scalar_t>
-__global__ void count_and_sort_expert_tokens_kernel(
-    const scalar_t* __restrict__ topk_ids,
-    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
-    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
-    bool has_expert_map) {
-  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const size_t stride = blockDim.x * gridDim.x;
-
-  for (size_t i = tid; i < numel; i += stride) {
-    int32_t expert_id = topk_ids[i];
-    if (expert_id >= num_experts) {
-      continue;
-    }
-    if (has_expert_map) {
-      expert_id = expert_map[expert_id];
-      // filter invalid experts
-      if (expert_id == -1) continue;
-    }
-    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
-    sorted_token_ids[rank_post_pad] = i;
-  }
-}
-
-template <typename scalar_t, int TOPK>
-__global__ void moe_sum_kernel(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., topk, d]
-    const int d) {
-  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    scalar_t x = 0.0;
-#pragma unroll
-    for (int k = 0; k < TOPK; ++k) {
-      x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
-    }
-    out[token_idx * d + idx] = x;
+  const size_t fill_start_idx =
+      cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
+  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
+    expert_ids[expert_ids_offset + i] = inactive_expert_id;
   }
 }
 
 template <typename scalar_t, int32_t fill_threads>
-__global__ void moe_align_block_size_small_batch_expert_kernel(
+__device__ void _moe_align_block_size_small_batch_expert(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
     int32_t* __restrict__ total_tokens_post_pad,
     int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
-    size_t numel, int32_t max_num_tokens_padded, bool has_expert_map) {
+    size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
+    int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
+    int32_t* token_mask, bool has_expert_map) {
+  // Compute input buffer offsets. Typically these will all be 0, except when
+  // using Multi LoRA.
+  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+  int expert_ids_offset = max_num_m_blocks * model_offset;
+
   // Use an additional group of threads to fill sorted_token_ids.
   // Since the current kernel will use sorted_token_ids afterward,
   // we fill sorted_token_ids within the same threadblock to make
@@ -227,7 +202,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
     // Initialize sorted_token_ids with numel
     for (size_t it = threadIdx.x; it < max_num_tokens_padded;
          it += fill_threads) {
-      sorted_token_ids[it] = numel;
+      sorted_token_ids[sorted_token_ids_offset + it] = numel;
     }
     // Three __syncthreads() corresponding to the other threads
     __syncthreads();
@@ -254,7 +229,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
       // filter invalid expert
       if (expert_id == -1) continue;
     }
-    ++tokens_cnts[(tid + 1) * num_experts + expert_id];
+    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+    tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
   }
 
   __syncthreads();
@@ -277,22 +253,22 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
           CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
               block_size;
     }
-    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
+    total_tokens_post_pad[model_offset] =
+        static_cast<int32_t>(cumsum[num_experts]);
   }
 
   __syncthreads();
 
   if (tid < num_experts) {
     for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
-      expert_ids[i / block_size] = tid;
+      expert_ids[expert_ids_offset + i / block_size] = tid;
     }
   }
 
   // Fill remaining expert_ids with 0
   const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += stride) {
-    expert_ids[i] = 0;
+  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
+    expert_ids[expert_ids_offset + i] = inactive_expert_id;
   }
 
   for (size_t i = tid; i < numel; i += stride) {
@@ -304,11 +280,195 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
     }
     int32_t rank_post_pad =
         tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
-    sorted_token_ids[rank_post_pad] = i;
-    ++tokens_cnts[tid * num_experts + expert_id];
+
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
+      ++tokens_cnts[tid * num_experts + expert_id];
+    }
+  }
+}
+
+template <typename scalar_t>
+__device__ void _count_and_sort_expert_tokens(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
+    int32_t model_offset, int32_t topk_num, bool has_expert_map) {
+  const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const size_t stride = blockDim.x * gridDim.y;
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i];
+    if (expert_id >= num_experts) {
+      continue;
+    }
+
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid experts
+      if (expert_id == -1) continue;
+    }
+
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      int32_t rank_post_pad = atomicAdd(
+          &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
+      sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
+          i;
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void moe_align_block_size_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts,
+    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
+    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
+    int32_t topk_num, bool has_expert_map) {
+  _moe_align_block_size(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+      cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size),
+      0, 0, topk_num, nullptr, has_expert_map);
+}
+
+template <typename scalar_t>
+__global__ void count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
+  _count_and_sort_expert_tokens(
+      topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+      max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map);
+}
+
+template <typename scalar_t, int TOPK>
+__global__ void moe_sum_kernel(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., topk, d]
+    const int d) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    scalar_t x = 0.0;
+#pragma unroll
+    for (int k = 0; k < TOPK; ++k) {
+      x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
+    }
+    out[token_idx * d + idx] = x;
   }
 }
 
+template <typename scalar_t, int32_t fill_threads>
+__global__ void moe_align_block_size_small_batch_expert_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
+    size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
+    bool has_expert_map) {
+  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, block_size, numel, max_num_tokens_padded,
+      CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
+      has_expert_map);
+}
+
+template <typename scalar_t>
+__global__ void moe_lora_align_block_size_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
+    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+    int max_loras, size_t numel, int max_num_tokens_padded,
+    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids, int32_t topk_num,
+    int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+    int32_t* __restrict__ cumsum, int32_t experts_per_warp,
+    int32_t padded_num_experts, int32_t* lora_ids,
+    int32_t* __restrict__ token_mask, bool has_expert_map) {
+  int lora_idx = blockIdx.x / 2;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
+
+  // Populate the token_mask based on the token-LoRA mapping
+  int num_tokens = numel / topk_num;
+  if (threadIdx.x == 0) {
+    total_tokens_post_pad[lora_id] = 0;
+
+    for (int i = 0; i < num_tokens; i++) {
+      token_mask[(lora_id * num_tokens) + i] =
+          (int)token_lora_mapping[i] == lora_id;
+    }
+  }
+
+  __syncthreads();
+
+  _moe_align_block_size(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+      cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
+      &token_mask[(lora_id * num_tokens)], has_expert_map);
+}
+
+template <typename scalar_t>
+__global__ void lora_count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
+    int32_t* lora_ids, bool has_expert_map) {
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1) {
+    return;
+  }
+
+  int num_tokens = numel / topk_num;
+
+  _count_and_sort_expert_tokens(
+      topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+      max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
+      topk_num, has_expert_map);
+}
+
+template <typename scalar_t, int32_t fill_threads>
+__global__ void moe_lora_align_block_size_small_batch_expert_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
+    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+    int max_loras, size_t numel, int max_num_tokens_padded,
+    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids, int topk_num,
+    int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
+    int32_t* token_mask, bool has_expert_map) {
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
+
+  int num_tokens = numel / topk_num;
+  if (threadIdx.x == 0) {
+    total_tokens_post_pad[lora_id] = 0;
+
+    for (int i = 0; i < num_tokens; i++) {
+      token_mask[(lora_id * num_tokens) + i] =
+          (int)token_lora_mapping[i] == lora_id;
+    }
+  }
+
+  __syncthreads();
+
+  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
+      -1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
+      has_expert_map);
+}
+
 }  // namespace moe
 }  // namespace vllm
 
@@ -365,7 +525,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               experts_ids.data_ptr<int32_t>(),
               num_tokens_post_pad.data_ptr<int32_t>(),
               expert_map.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), sorted_token_ids.size(0), has_expert_map);
+              topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
+              has_expert_map);
         } else {
           torch::Tensor cumsum_buffer =
               torch::empty({num_experts + 1}, options_int);
@@ -386,21 +547,23 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
               experts_per_warp, block_size, topk_ids.numel(),
               cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
-              has_expert_map);
+              topk_ids.size(1), has_expert_map);
 
           const int block_threads = std::min(256, (int)threads);
           const int num_blocks =
               (topk_ids.numel() + block_threads - 1) / block_threads;
           const int max_blocks = 65535;
           const int actual_blocks = std::min(num_blocks, max_blocks);
+          dim3 gridDims(1, actual_blocks);
 
           auto sort_kernel =
               vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
-          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
+          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
               topk_ids.data_ptr<scalar_t>(),
               sorted_token_ids.data_ptr<int32_t>(),
               cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
-              topk_ids.numel(), num_experts, has_expert_map);
+              topk_ids.numel(), num_experts, sorted_token_ids.size(0),
+              topk_ids.size(1), has_expert_map);
         }
       });
 }
@@ -474,3 +637,123 @@ void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
       break;
   }
 }
+
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map) {
+  const int topk_num = topk_ids.size(1);
+
+  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
+
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t padded_num_experts =
+      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+
+  // BlockScan uses 1024 threads and assigns one thread per expert.
+  TORCH_CHECK(padded_num_experts < 1024,
+              "padded_num_experts must be less than 1024");
+
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+  torch::Tensor token_mask =
+      torch::empty({max_loras * topk_ids.size(0)}, options_int);
+  bool has_expert_map = maybe_expert_map.has_value();
+  torch::Tensor expert_map;
+  if (has_expert_map) {
+    expert_map = maybe_expert_map.value();
+  } else {
+    expert_map = torch::empty({0}, options_int);
+  }
+
+  VLLM_DISPATCH_INTEGRAL_TYPES(
+      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
+        bool small_batch_expert_mode =
+            (topk_ids.numel() < 1024) && (num_experts <= 64);
+
+        if (small_batch_expert_mode) {
+          const int32_t num_thread = max((int32_t)num_experts, 128);
+          const int32_t shared_mem =
+              (num_thread + 1) * num_experts * sizeof(int32_t) +
+              (num_experts + 1) * sizeof(int32_t);
+          if (shared_mem > device_max_shared_mem) {
+            TORCH_CHECK(false, "Shared memory usage exceeds device limit.");
+          }
+
+          // threadIdx.x >= fill_threads: counting experts and aligning
+          // threadIdx.x < fill_threads: filling sorted_token_ids
+          constexpr int32_t fill_threads = 256;
+
+          dim3 blockDim(num_thread + fill_threads);
+          auto kernel =
+              vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
+                  scalar_t, fill_threads>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+              (void*)kernel, shared_mem));
+          kernel<<<max_loras, blockDim, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              token_lora_mapping.data_ptr<int32_t>(), block_size,
+              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
+              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+              sorted_token_ids.data_ptr<int32_t>(),
+              expert_ids.data_ptr<int32_t>(), topk_num,
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
+              token_mask.data_ptr<int32_t>(), has_expert_map);
+        } else {
+          int num_thread = 1024;
+          dim3 blockDim(num_thread);
+          size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);
+
+          size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);
+
+          // cumsum buffer
+          torch::Tensor cumsum =
+              torch::zeros({max_loras * (num_experts + 1)}, options_int);
+
+          auto align_kernel =
+              vllm::moe::moe_lora_align_block_size_kernel<scalar_t>;
+
+          // launch two threadblocks for each lora
+          // blockIdx.x % 2 == 0: counting experts and aligning
+          // blockIdx.x % 2 == 1: filling sorted_token_ids
+          align_kernel<<<max_loras * 2, blockDim, shared_mem_size, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              token_lora_mapping.data_ptr<int32_t>(), block_size,
+              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
+              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+              sorted_token_ids.data_ptr<int32_t>(),
+              expert_ids.data_ptr<int32_t>(), topk_num,
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              adapter_enabled.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
+              WARP_SIZE, padded_num_experts, lora_ids.data_ptr<int32_t>(),
+              token_mask.data_ptr<int32_t>(), has_expert_map);
+
+          const int block_threads = std::min(256, (int)num_thread);
+          const int num_blocks =
+              (topk_ids.numel() + block_threads - 1) / block_threads;
+
+          const int max_blocks = 65535;
+          const int actual_blocks = std::min(num_blocks, max_blocks);
+
+          dim3 gridDims(max_loras, actual_blocks);
+          auto sort_kernel =
+              vllm::moe::lora_count_and_sort_expert_tokens_kernel<scalar_t>;
+
+          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
+              expert_map.data_ptr<int32_t>(), topk_ids.numel(), num_experts,
+              max_num_tokens_padded, topk_num, token_mask.data_ptr<int32_t>(),
+              lora_ids.data_ptr<int32_t>(), has_expert_map);
+        }
+      });
+}
\ No newline at end of file
diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu
deleted file mode 100644
index 360f1312c..000000000
--- a/csrc/moe/moe_lora_align_sum_kernels.cu
+++ /dev/null
@@ -1,174 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <ATen/ATen.h>
-#include <ATen/cuda/Atomic.cuh>
-
-#include "../cuda_compat.h"
-#include "../dispatch_utils.h"
-#include "core/math.hpp"
-
-namespace {
-
-__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
-                                         int32_t col) {
-  return row * total_col + col;
-}
-
-}  // namespace
-
-// TODO: Refactor common parts with moe_align_sum_kernels
-template <typename scalar_t, typename token_cnts_t>
-__global__ void moe_lora_align_sum_kernel(
-    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
-    int64_t block_size, int num_experts, int max_loras, size_t numel,
-    int max_num_tokens_padded, int max_num_m_blocks,
-    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
-    int32_t* lora_ids) {
-  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
-  const size_t start_idx = threadIdx.x * tokens_per_thread;
-
-  int lora_idx = blockIdx.x;
-  int lora_id = lora_ids[lora_idx];
-  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
-    return;
-  }
-  extern __shared__ int32_t shared_mem[];
-  int32_t* cumsum = shared_mem;
-  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
-
-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
-  }
-
-  // Initialize expert_ids with -1
-  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
-    expert_ids[lora_id * max_num_m_blocks + it] = -1;
-  }
-
-  // Initialize total_tokens_post_pad with 0
-  if (threadIdx.x == 0) {
-    total_tokens_post_pad[lora_id] = 0;
-  }
-
-  for (int i = 0; i < num_experts; ++i) {
-    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
-  }
-
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int mask = token_lora_mapping[i / topk_num] == lora_id;
-    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
-    tokens_cnts[idx] += mask;
-  }
-
-  __syncthreads();
-
-  // For each expert we accumulate the token counts from the different threads.
-  if (threadIdx.x < num_experts) {
-    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
-    for (int i = 1; i <= blockDim.x; ++i) {
-      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
-          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
-    }
-  }
-
-  __syncthreads();
-
-  // We accumulate the token counts of all experts in thread 0.
-  if (threadIdx.x == 0) {
-    cumsum[0] = 0;
-    for (int i = 1; i <= num_experts; ++i) {
-      cumsum[i] = cumsum[i - 1] +
-                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
-                           block_size) *
-                      block_size;
-    }
-    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
-  }
-
-  __syncthreads();
-
-  /**
-   * For each expert, each thread processes the tokens of the corresponding
-   * blocks and stores the corresponding expert_id for each block.
-   */
-  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
-          threadIdx.x;
-    }
-  }
-
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int32_t expert_id = topk_ids[i];
-    /** The cumsum[expert_id] stores the starting index of the tokens that the
-     * expert with expert_id needs to process, and
-     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
-     * processed by the expert with expert_id within the current thread's token
-     * shard.
-     */
-    int32_t rank_post_pad =
-        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
-        cumsum[expert_id];
-
-    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
-    atomicAdd(
-        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
-        (i - numel) * mask);
-    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
-  }
-}
-
-void moe_lora_align_block_size(
-    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
-    int64_t num_experts, int64_t block_size, int64_t max_loras,
-    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
-    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
-    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
-    torch::Tensor lora_ids) {
-  const int topk_num = topk_ids.size(1);
-
-  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
-
-  int device_max_shared_mem;
-  auto dev = topk_ids.get_device();
-  cudaDeviceGetAttribute(&device_max_shared_mem,
-                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
-  TORCH_CHECK(num_thread <= 1024,
-              "num_thread must be less than 1024, "
-              "and fallback is not implemented yet.");
-  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
-                             (num_experts + 1) * sizeof(int32_t);
-
-  if (shared_mem > device_max_shared_mem) {
-    TORCH_CHECK(false,
-                "Shared memory usage exceeds device limit, and global memory "
-                "fallback is not implemented yet.");
-  }
-
-  VLLM_DISPATCH_INTEGRAL_TYPES(
-      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
-        dim3 blockDim(num_thread);
-        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
-        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-            (void*)kernel, shared_mem));
-        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
-            topk_ids.data_ptr<scalar_t>(),
-            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
-            max_loras, topk_ids.numel(), max_num_tokens_padded,
-            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
-            expert_ids.data_ptr<int32_t>(), topk_num,
-            num_tokens_post_pad.data_ptr<int32_t>(),
-            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
-      });
-}
\ No newline at end of file
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 4c7accf03..337dcc50b 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -27,7 +27,7 @@ void moe_lora_align_block_size(
     int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
     torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
     torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
-    torch::Tensor lora_ids);
+    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor b_qweight, torch::Tensor b_scales,
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index fca57c31c..779ad70ad 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -47,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "                     Tensor !experts_ids,"
       "                     Tensor !num_tokens_post_pad,"
       "                     Tensor !adapter_enabled,"
-      "                     Tensor !lora_ids) -> () ");
+      "                     Tensor !lora_ids,"
+      "                     Tensor? maybe_expert_map) -> () ");
   m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
 
 #ifndef USE_ROCM
diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py
index 72f1d759f..3a17f3eba 100644
--- a/tests/lora/test_moe_lora_align_sum.py
+++ b/tests/lora/test_moe_lora_align_sum.py
@@ -32,7 +32,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num):
 
 @pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096])  # 81920
 @pytest.mark.parametrize("topk_num", [6])
-@pytest.mark.parametrize("num_experts", [64, 128])
+@pytest.mark.parametrize("num_experts", [64, 128, 256, 512])
 @pytest.mark.parametrize("max_loras", [2, 32])
 @pytest.mark.parametrize("block_size", [16])
 def test_moe_lora_align_block_size(
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 77d545329..56c780ceb 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1961,6 +1961,7 @@ def moe_lora_align_block_size(
     num_tokens_post_pad: torch.Tensor,
     adapter_enabled: torch.Tensor,
     lora_ids: torch.Tensor,
+    expert_map: torch.Tensor | None = None,
 ) -> None:
     torch.ops._moe_C.moe_lora_align_block_size(
         topk_ids,
@@ -1975,6 +1976,7 @@ def moe_lora_align_block_size(
         num_tokens_post_pad,
         adapter_enabled,
         lora_ids,
+        expert_map,
     )
 
 
-- 
GitLab


From f6227c22ab8976a24913122874c24624102da1b4 Mon Sep 17 00:00:00 2001
From: czhu-cohere <conway.zhu@cohere.com>
Date: Mon, 8 Dec 2025 22:29:06 -0500
Subject: [PATCH 217/258] [Kernel]Support W4A8 Grouped GEMM on Hopper (#29691)

Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
---
 CMakeLists.txt                                |   5 +-
 csrc/ops.h                                    |   3 +-
 .../cutlass_w4a8/get_group_starts.cuh         | 104 ++++
 .../cutlass_w4a8/w4a8_grouped_mm_entry.cu     | 483 ++++++++++++++++++
 .../cutlass_w4a8/w4a8_mm_entry.cu             |  70 +--
 csrc/quantization/cutlass_w4a8/w4a8_utils.cu  |  90 ++++
 csrc/quantization/cutlass_w4a8/w4a8_utils.cuh |  11 +
 .../quantization/w8a8/cutlass/moe/moe_data.cu |   8 +-
 .../w8a8/cutlass/scaled_mm_entry.cu           |   8 +-
 csrc/torch_bindings.cpp                       |  26 +-
 .../kernels/quantization/test_cutlass_w4a8.py |  46 +-
 .../quantization/test_cutlass_w4a8_moe.py     | 340 ++++++++++++
 vllm/_custom_ops.py                           |  90 +++-
 .../layers/fused_moe/__init__.py              |   4 +
 .../model_executor/layers/fused_moe/config.py |  29 ++
 .../layers/fused_moe/cutlass_moe.py           | 401 +++++++++++++++
 .../layers/fused_moe/modular_kernel.py        |   2 +-
 .../compressed_tensors/compressed_tensors.py  |   2 +-
 .../compressed_tensors_moe.py                 | 340 +++++++++++-
 .../schemes/compressed_tensors_w4a8_fp8.py    |  15 +-
 .../kernels/mixed_precision/cutlass.py        |  19 +-
 .../layers/quantization/utils/quant_utils.py  |  50 +-
 22 files changed, 2045 insertions(+), 101 deletions(-)
 create mode 100644 csrc/quantization/cutlass_w4a8/get_group_starts.cuh
 create mode 100644 csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
 create mode 100644 csrc/quantization/cutlass_w4a8/w4a8_utils.cu
 create mode 100644 csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
 create mode 100644 tests/kernels/quantization/test_cutlass_w4a8_moe.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69a538b06..6b93e3fe9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
     set(SRCS
-       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+       )
 
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
diff --git a/csrc/ops.h b/csrc/ops.h
index 5fce3a1a3..37e3aaf74 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -262,7 +262,8 @@ void get_cutlass_moe_mm_data(
 void get_cutlass_moe_mm_problem_sizes(
     const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
     torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);
 
 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                   torch::Tensor& problem_sizes1,
diff --git a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
new file mode 100644
index 000000000..fec142d0d
--- /dev/null
+++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
@@ -0,0 +1,104 @@
+// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
+#pragma once
+
+#include <cuda.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include "core/scalar_type.hpp"
+#include "cutlass/bfloat16.h"
+#include "cutlass/float8.h"
+
+// ElementB is int32 (packed int4)
+// ElementGroupScale is cutlass::Array<cutlass::float_e4m3_t, 8> (packed fp8)
+template <typename ElementA, typename ElementB, typename ElementC,
+          typename ElementAccumulator, typename ElementGroupScale>
+__global__ void get_group_gemm_starts(
+    int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
+    ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
+    ElementAccumulator** b_scales_offsets,
+    ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
+    ElementB* b_base_as_int, ElementC* out_base_as_int,
+    ElementAccumulator* a_scales_base_as_int,
+    ElementAccumulator* b_scales_base_as_int,
+    ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
+    int64_t scale_k) {
+  int expert_id = threadIdx.x;
+
+  int64_t expert_offset = expert_offsets[expert_id];
+
+  // same as w8a8
+  a_offsets[expert_id] = a_base_as_int + expert_offset * k;
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset;
+  b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id);
+
+  // w4a8 specific
+  constexpr int pack_factor = 8;  // pack 8 int4 into int32
+  b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor);
+  b_group_scales_offsets[expert_id] =
+      b_group_scales_base_as_int + (expert_id * scale_k * n);
+}
+
+#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                  \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                       \
+    get_group_gemm_starts<cutlass::float_e4m3_t, int32_t, C_TYPE, float, \
+                          cutlass::Array<cutlass::float_e4m3_t, 8>>      \
+        <<<1, num_experts, 0, stream>>>(                                 \
+            static_cast<int64_t*>(expert_offsets.data_ptr()),            \
+            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),     \
+            static_cast<int32_t**>(b_ptrs.data_ptr()),                   \
+            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                  \
+            static_cast<float**>(a_scales_ptrs.data_ptr()),              \
+            static_cast<float**>(b_scales_ptrs.data_ptr()),              \
+            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>**>(     \
+                b_group_scales_ptrs.data_ptr()),                         \
+            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),   \
+            static_cast<int32_t*>(b_tensors.data_ptr()),                 \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                \
+            static_cast<float*>(a_scales.data_ptr()),                    \
+            static_cast<float*>(b_scales.data_ptr()),                    \
+            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>*>(      \
+                b_group_scales.data_ptr()),                              \
+            n, k, scale_k);                                              \
+  }
+
+namespace {
+
+void run_get_group_gemm_starts(
+    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
+    torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
+    torch::Tensor const& a_scales, torch::Tensor const& b_scales,
+    torch::Tensor const& b_group_scales, const int64_t b_group_size) {
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);  // int4 8x packed into int32
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_group_scales.dtype() ==
+              torch::kFloat8_e4m3fn);  // the underlying torch type is e4m3
+  TORCH_CHECK(out_tensors.dtype() ==
+              torch::kBFloat16);  // only support bf16 for now
+  // expect int64_t to avoid overflow during offset calculations
+  TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
+
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  // logical k, n
+  int64_t n = out_tensors.size(1);
+  int64_t k = a_tensors.size(1);
+  int64_t scale_k = cutlass::ceil_div(k, b_group_size);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  if (false) {
+  }
+  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
+  __CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+
+}  // namespace
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
new file mode 100644
index 000000000..4b425790d
--- /dev/null
+++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
@@ -0,0 +1,483 @@
+#include <vector>
+#include <tuple>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/mixed_dtype_utils.hpp"
+
+// vllm includes
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+#include "cutlass_extensions/torch_utils.hpp"
+#include "cutlass_extensions/common.hpp"
+
+#include "core/registration.h"
+#include "get_group_starts.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+#include "w4a8_utils.cuh"
+
+namespace vllm::cutlass_w4a8_moe {
+
+using namespace cute;
+
+// -------------------------------------------------------------------------------------
+// Static configuration shared across all instantiations
+// -------------------------------------------------------------------------------------
+using ProblemShape =
+    cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;  // <M,N,K> per
+                                                             // group
+using MmaType = cutlass::float_e4m3_t;
+using QuantType = cutlass::int4b_t;
+
+constexpr int TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
+static int constexpr PackFactor = 8;  // 8 int4 packed into int32
+
+// A matrix configuration
+using ElementA = MmaType;
+using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
+constexpr int AlignmentA =
+    128 /
+    cutlass::sizeof_bits<ElementA>::value;  // Alignment of A matrix in units of
+                                            // elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = QuantType;  // Element type for B matrix operand
+using LayoutB =
+    cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
+constexpr int AlignmentB =
+    128 / cutlass::sizeof_bits<
+              ElementB>::value;  // Memory access granularity/alignment of B
+                                 // matrix in units of elements (up to 16 bytes)
+
+// This example manually swaps and transposes, so keep transpose of input
+// layouts
+using LayoutA_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+using LayoutB_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+// Need to pass a pointer type to make the 3rd dimension of Stride be _0
+using StrideA =
+    cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
+using StrideB =
+    cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
+
+// Define the CuTe layout for reoredered quantized tensor B
+// LayoutAtomQuant places values that will be read by the same thread in
+// contiguous locations in global memory. It specifies the reordering within a
+// single warp's fragment
+using LayoutAtomQuant =
+    decltype(cutlass::compute_memory_reordering_atom<MmaType>());
+using LayoutB_Reordered = decltype(cute::tile_to_shape(
+    LayoutAtomQuant{}, Layout<Shape<int, int, Int<1>>, StrideB>{}));
+
+using ElementScale = cutlass::float_e4m3_t;
+using LayoutScale = cutlass::layout::RowMajor;
+
+// C/D matrix configuration
+using ElementC =
+    cutlass::bfloat16_t;  // Element type for C and D matrix operands
+using LayoutC =
+    cutlass::layout::RowMajor;  // Layout type for C and D matrix operands
+constexpr int AlignmentC =
+    128 / cutlass::sizeof_bits<
+              ElementC>::value;  // Memory access granularity/alignment of C
+                                 // matrix in units of elements (up to 16 bytes)
+
+// D matrix configuration
+using ElementD = ElementC;
+using LayoutD = LayoutC;
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+// Core kernel configurations
+using ElementAccumulator = float;     // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm90;  // Tag indicating the minimum SM that
+                                      // supports the intended feature
+using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
+using StageCountType =
+    cutlass::gemm::collective::StageCountAuto;  // Stage count maximized based
+                                                // on the tile size
+
+// per-channel and per-token scales for epilogue
+using ElementSChannel = float;
+
+template <class TileShape_MN, class ClusterShape_MNK, class KernelSchedule,
+          class EpilogueSchedule>
+struct W4A8GroupedGemmKernel {
+  using TileShape =
+      decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
+  using ClusterShape = ClusterShape_MNK;
+
+  // per-channel, per-token scales epilogue
+  using ChTokScalesEpilogue =
+      typename vllm::c3x::ScaledEpilogueArray<ElementAccumulator, ElementD,
+                                              TileShape>;
+  using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, TileShape, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementSChannel, ElementC,
+          typename cutlass::layout::LayoutTranspose<LayoutC>::type*, AlignmentC,
+          ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*,
+          AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  // =========================================================== MIXED INPUT
+  // WITH SCALES
+  // ===========================================================================
+  // The Scale information must get paired with the operand that will be scaled.
+  // In this example, B is scaled so we make a tuple of B's information and the
+  // scale information.
+  using CollectiveMainloopShuffled =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass,
+          cute::tuple<ElementB, cutlass::Array<ElementScale, 8>>,
+          LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
+          AlignmentA, ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
+      ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;
+
+  using GemmShuffled =
+      cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
+
+  using StrideC = typename GemmKernelShuffled::InternalStrideC;
+  using StrideD = typename GemmKernelShuffled::InternalStrideD;
+
+  using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
+  using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
+  using StrideS = typename CollectiveMainloopShuffled::StrideScale;
+  using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
+
+  // static asserts for passing in strides/layouts
+  // pack to 2x int64
+  static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
+  // pack to 3xint32,
+  static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
+                "LayoutB_Reordered size must be divisible by 4 bytes");
+
+  static void grouped_mm(
+      torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+      const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+      const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+      const int64_t b_group_size, const torch::Tensor& expert_offsets,
+      const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
+      const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+      const torch::Tensor& group_scale_strides) {
+    auto device = a_tensors.device();
+    auto device_id = device.index();
+    const at::cuda::OptionalCUDAGuard device_guard(device);
+    auto stream = at::cuda::getCurrentCUDAStream(device_id);
+
+    int num_experts = static_cast<int>(expert_offsets.size(0));
+    int n = static_cast<int>(b_tensors.size(1));
+    int k = static_cast<int>(b_tensors.size(2)) * PackFactor;
+
+    auto options_int =
+        torch::TensorOptions().dtype(torch::kInt64).device(device);
+    torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);
+
+    // get the correct offsets to pass to gemm
+    run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
+                              a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
+                              a_tensors, b_tensors, out_tensors, a_scales,
+                              b_scales, b_group_scales, b_group_size);
+
+    // construct args
+    using Args = typename GemmShuffled::Arguments;
+    using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
+    using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
+    Args arguments;
+
+    ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
+        static_cast<ProblemShape::UnderlyingProblemShape*>(
+            problem_sizes_torch.data_ptr());
+    ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
+
+    // SwapAB so B operands come first
+    MainloopArguments mainloop_arguments{
+        static_cast<const QuantType**>(b_ptrs.data_ptr()),
+        static_cast<LayoutB_Reordered*>(b_strides.data_ptr()),
+        static_cast<const MmaType**>(a_ptrs.data_ptr()),
+        static_cast<StrideA*>(a_strides.data_ptr()),
+        static_cast<const cutlass::Array<ElementScale, 8>**>(
+            b_group_scales_ptrs.data_ptr()),
+        static_cast<StrideS*>(group_scale_strides.data_ptr()),
+        static_cast<int>(b_group_size)};
+
+    EpilogueArguments epilogue_arguments{
+        // since we are doing SwapAB the channel scales comes first, then token
+        // scales
+        ChTokScalesEpilogue::prepare_args(  // see ScaledEpilogueArray
+            static_cast<const ElementAccumulator**>(
+                b_scales_ptrs.data_ptr()),  // per-channel
+            static_cast<const ElementAccumulator**>(
+                a_scales_ptrs.data_ptr()),  // per-token
+            true, true),
+        nullptr,                                       // C
+        static_cast<StrideC*>(c_strides.data_ptr()),   // C
+        static_cast<ElementD**>(out_ptrs.data_ptr()),  // D
+        static_cast<StrideC*>(c_strides.data_ptr())    // D
+    };
+
+    static const cutlass::KernelHardwareInfo hw_info{
+        device_id,
+        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+            device_id)};
+
+    arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
+                     mainloop_arguments, epilogue_arguments, hw_info};
+
+    // Allocate workspace
+    size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
+    torch::Tensor workspace =
+        torch::empty(workspace_size,
+                     torch::TensorOptions().dtype(torch::kU8).device(device));
+
+    // Run GEMM
+    GemmShuffled gemm;
+    CUTLASS_CHECK(gemm.can_implement(arguments));
+    CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+    CUTLASS_CHECK(gemm.run(stream));
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Kernel instantiations and dispatch logic
+// ----------------------------------------------------------------------------
+using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
+using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
+
+// Kernel_TileShape_ClusterShape_Schedule
+using Kernel_128x16_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_128x16_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x16_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x16_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x32_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x32_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x64_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x64_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x128_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x128_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_128x256_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+void mm_dispatch(
+    torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+    const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+    const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+    const int64_t b_group_size, const torch::Tensor& expert_offsets,
+    const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+    const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+    const torch::Tensor& group_scale_strides, const std::string& schedule) {
+  if (schedule == "Kernel_128x16_1x1x1_Coop") {
+    Kernel_128x16_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_128x16_2x1x1_Coop") {
+    Kernel_128x16_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x16_1x1x1_Coop") {
+    Kernel_256x16_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x16_2x1x1_Coop") {
+    Kernel_256x16_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x32_1x1x1_Coop") {
+    Kernel_256x32_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x32_2x1x1_Coop") {
+    Kernel_256x32_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x64_1x1x1_Coop") {
+    Kernel_256x64_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x64_2x1x1_Coop") {
+    Kernel_256x64_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x128_1x1x1_Coop") {
+    Kernel_256x128_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x128_2x1x1_Coop") {
+    Kernel_256x128_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_128x256_2x1x1_Coop") {
+    Kernel_128x256_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else {
+    TORCH_CHECK(false,
+                "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
+  }
+}
+
+void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+        const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+        const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+        const int64_t b_group_size, const torch::Tensor& expert_offsets,
+        const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+        const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+        const torch::Tensor& group_scale_strides,
+        std::optional<std::string> maybe_schedule) {
+  // user has specified a schedule
+  if (maybe_schedule) {
+    mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+                b_group_scales, b_group_size, expert_offsets, problem_sizes,
+                a_strides, b_strides, c_strides, group_scale_strides,
+                *maybe_schedule);
+    return;
+  }
+
+  // use heuristic
+  int m_full = a_tensors.size(0);
+  int n = b_tensors.size(1);
+  int k = b_tensors.size(2) * PackFactor;  // logical k
+  int num_experts = b_tensors.size(0);
+  // per-expert batch size assuming uniform distribution
+  int m_expert = m_full / num_experts;
+
+  std::string schedule;
+  if (m_expert <= 16) {
+    schedule = "Kernel_128x16_2x1x1_Coop";
+  } else if (m_expert <= 32) {
+    schedule = "Kernel_256x32_1x1x1_Coop";
+  } else if (m_expert <= 64) {
+    schedule = "Kernel_256x64_1x1x1_Coop";
+  } else if (m_expert <= 128) {
+    schedule = "Kernel_256x128_2x1x1_Coop";
+  } else {  // m_expert > 128
+    schedule = "Kernel_128x256_2x1x1_Coop";
+  }
+
+  mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+              b_group_scales, b_group_size, expert_offsets, problem_sizes,
+              a_strides, b_strides, c_strides, group_scale_strides, schedule);
+}
+
+std::tuple<torch::Tensor, torch::Tensor> encode_and_reorder_int4b(
+    torch::Tensor const& b_tensors) {
+  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);
+  TORCH_CHECK(b_tensors.dim() == 3);  // (experts, n, k)
+  TORCH_CHECK(b_tensors.is_contiguous());
+  TORCH_CHECK(b_tensors.is_cuda());
+
+  int n = static_cast<int>(b_tensors.size(1));
+  int k = static_cast<int>(b_tensors.size(2)) * PackFactor;  // logical k
+
+  // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
+  // These misalignments cause silent OOB unless run under Compute Sanitizer.
+  TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
+  TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");
+
+  // we will store the layout to an int32 tensor;
+  // this is the number of elements we need per layout
+  constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);
+
+  torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
+  int num_experts = static_cast<int>(b_tensors.size(0));
+
+  auto b_ptr = static_cast<QuantType const*>(b_tensors.const_data_ptr());
+  auto b_packed_ptr = static_cast<QuantType*>(b_tensors_packed.data_ptr());
+
+  // multiply by ull so result does not overflow int32
+  size_t num_int4_elems = 1ull * num_experts * n * k;
+  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
+                                                           num_int4_elems);
+  TORCH_CHECK(ok, "unified_encode_int4b failed");
+
+  // construct the layout once; assumes each expert has the same layout
+  using LayoutType = LayoutB_Reordered;
+  std::vector<LayoutType> layout_B_reordered_host(num_experts);
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
+  auto shape_B = cute::make_shape(n, k, Int<1>{});
+  auto layout_B = make_layout(shape_B, stride_B);
+  LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);
+
+  // reorder weights for each expert
+  for (int i = 0; i < num_experts; i++) {
+    // since the storage type of int4b is 1 byte but one element is 4 bits
+    // we need to adjust the offset
+    int64_t offset =
+        1ull * i * n * k * cutlass::sizeof_bits<QuantType>::value / 8;
+    cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
+                            layout_B_reordered);
+  }
+
+  // save the packed layout to torch tensor so we can re-use it
+  auto cpu_opts =
+      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
+  torch::Tensor layout_cpu =
+      torch::empty({num_experts, layout_width}, cpu_opts);
+
+  int32_t* layout_data = layout_cpu.data_ptr<int32_t>();
+  for (int i = 0; i < num_experts; ++i) {
+    std::memcpy(layout_data + i * layout_width,  // dst (int32*)
+                &layout_B_reordered,             // src (LayoutType*)
+                sizeof(LayoutType));             // number of bytes
+  }
+
+  torch::Tensor packed_layout =
+      layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);
+
+  return {b_tensors_packed, packed_layout};
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_w4a8_moe_mm", &mm);
+  m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
+}
+
+}  // namespace vllm::cutlass_w4a8_moe
+/////////////////////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
index 2d1568b08..f77af06cd 100644
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@@ -7,6 +7,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include "cutlass_extensions/torch_utils.hpp"
+#include "w4a8_utils.cuh"
 
 #include "core/registration.h"
 
@@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
   return packed_scales;
 }
 
-/*
-  GPU-accelerated implementation of cutlass::unified_encode_int4b.
-  Constructs a lookup table in constant memory to map 8 bits
-  (two 4-bit values) at a time. Assumes memory is contiguous
-  and pointers are 16-byte aligned.
-*/
-__constant__ uint8_t kNibbleLUT[256];
-
-__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
-                                            size_t nbytes) {
-  constexpr size_t V = sizeof(uint4);  // 16 bytes
-  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
-  const size_t nvec = nbytes / V;
-
-  // 1-D grid-stride loop over 16-byte chunks
-  for (size_t vec = tid; vec < nvec; vec += nthreads) {
-    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
-    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
-#pragma unroll
-    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
-    reinterpret_cast<uint4*>(out)[vec] = v;
-  }
-}
-
-static bool upload_lut() {
-  std::array<uint8_t, 256> lut{};
-  auto map_nib = [](uint8_t v) -> uint8_t {
-    // 1..7 -> (8 - v); keep 0 and 8..15
-    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
-  };
-  for (int b = 0; b < 256; ++b) {
-    uint8_t lo = b & 0xF;
-    uint8_t hi = (b >> 4) & 0xF;
-    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
-  }
-  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
-                                     /*offset=*/0, cudaMemcpyHostToDevice);
-
-  return (e == cudaSuccess);
-}
-
-static bool unified_encode_int4b(cutlass::int4b_t const* in,
-                                 cutlass::int4b_t* out, size_t num_int4_elems) {
-  // Build/upload LUT
-  if (!upload_lut()) return false;
-
-  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
-                "int4 storage must be 1 byte");
-  const size_t nbytes = num_int4_elems >> 1;
-
-  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
-  auto* out_bytes = reinterpret_cast<uint8_t*>(out);
-
-  // kernel launch params
-  constexpr int block = 256;
-  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
-  int grid = int((nvec + block - 1) / block);
-  if (grid == 0) grid = 1;  // ensure we still cover the tail in the kernel
-
-  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
-  cudaError_t err = cudaGetLastError();
-  return (err == cudaSuccess);
-}
-
 torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
   TORCH_CHECK(B.dtype() == torch::kInt32);
   TORCH_CHECK(B.dim() == 2);
@@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
   LayoutB_Reordered layout_B_reordered =
       cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
 
-  bool ok =
-      vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
+  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr,
+                                                           n * k);
   TORCH_CHECK(ok, "unified_encode_int4b failed");
   cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);
 
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_utils.cu b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
new file mode 100644
index 000000000..f238d0a5b
--- /dev/null
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
@@ -0,0 +1,90 @@
+#include "w4a8_utils.cuh"
+
+#include <array>
+#include <cuda_runtime.h>
+#include <cstdio>
+
+namespace vllm::cutlass_w4a8_utils {
+
+/*
+  GPU-accelerated implementation of cutlass::unified_encode_int4b.
+  Constructs a lookup table in constant memory to map 8 bits
+  (two 4-bit values) at a time. Assumes memory is contiguous
+  and pointers are 16-byte aligned.
+*/
+__constant__ uint8_t kNibbleLUT[256];
+
+__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
+                                            size_t nbytes) {
+  constexpr size_t V = sizeof(uint4);  // 16 bytes
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
+  const size_t nvec = nbytes / V;
+
+  // 1-D grid-stride loop over 16-byte chunks
+  for (size_t vec = tid; vec < nvec; vec += nthreads) {
+    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
+    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
+#pragma unroll
+    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
+    reinterpret_cast<uint4*>(out)[vec] = v;
+  }
+}
+
+static bool upload_lut() {
+  std::array<uint8_t, 256> lut{};
+  auto map_nib = [](uint8_t v) -> uint8_t {
+    // 1..7 -> (8 - v); keep 0 and 8..15
+    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
+  };
+  for (int b = 0; b < 256; ++b) {
+    uint8_t lo = b & 0xF;
+    uint8_t hi = (b >> 4) & 0xF;
+    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
+  }
+  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
+                                     /*offset=*/0, cudaMemcpyHostToDevice);
+
+  return (e == cudaSuccess);
+}
+
+bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
+                          size_t num_int4_elems) {
+  // Build/upload LUT
+  if (!upload_lut()) return false;
+
+  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
+                "int4 storage must be 1 byte");
+  const size_t nbytes = num_int4_elems >> 1;
+
+  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
+  auto* out_bytes = reinterpret_cast<uint8_t*>(out);
+
+  // kernel launch params
+  constexpr int block = 256;
+  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
+  int grid = int((nvec + block - 1) / block);
+  if (grid == 0) grid = 1;  // ensure we still cover the tail in the kernel
+
+  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
+
+  // launch errors
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("unified_encode_int4b_device launch error: %s (%d)\n",
+           cudaGetErrorString(err), err);
+    return false;
+  }
+
+  // runtime errors
+  err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    printf("unified_encode_int4b_device runtime error: %s (%d)\n",
+           cudaGetErrorString(err), err);
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace vllm::cutlass_w4a8_utils
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
new file mode 100644
index 000000000..25090091a
--- /dev/null
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstddef>
+#include "cutlass/numeric_types.h"
+
+namespace vllm::cutlass_w4a8_utils {
+
+bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
+                          size_t num_int4_elems);
+
+}  // namespace vllm::cutlass_w4a8_utils
\ No newline at end of file
diff --git a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
index 49cafcc32..99fec8fd6 100644
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
 void get_cutlass_moe_mm_problem_sizes_caller(
     const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
     torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt) {
   auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
   auto options_int32 =
       torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
   torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
 
   // Swap-AB should be disabled for FP4 path
-  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
-                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
+  bool may_swap_ab =
+      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
+                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));
 
   launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
                                atomic_buffer, num_experts, n, k, stream,
diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
index c5012a866..5de21cfbb 100644
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller(
 void get_cutlass_moe_mm_problem_sizes_caller(
     const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
     torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);
 
 void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
                                          torch::Tensor& problem_sizes1,
@@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data(
 void get_cutlass_moe_mm_problem_sizes(
     const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
     torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt) {
   int32_t version_num = get_sm_version_num();
 #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
     (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
     (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
   get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
                                           problem_sizes2, num_experts, n, k,
-                                          blockscale_offsets);
+                                          blockscale_offsets, force_swap_ab);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 62212f98b..d4c6f8c67 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -350,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor");
   // conditionally compiled so impl registration is in source file
 
+  // CUTLASS w4a8 grouped GEMM
+  ops.def(
+      "cutlass_w4a8_moe_mm("
+      "   Tensor! out_tensors,"
+      "   Tensor a_tensors,"
+      "   Tensor b_tensors,"
+      "   Tensor a_scales,"
+      "   Tensor b_scales,"
+      "   Tensor b_group_scales,"
+      "   int b_group_size,"
+      "   Tensor expert_offsets,"
+      "   Tensor problem_sizes,"
+      "   Tensor a_strides,"
+      "   Tensor b_strides,"
+      "   Tensor c_strides,"
+      "   Tensor group_scale_strides,"
+      "   str? maybe_schedule"
+      ") -> ()");
+  ops.def(
+      "cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, "
+      "Tensor)");
+  // conditionally compiled so impl registration is in source file
+
 #endif
 
   // Dequantization for GGML.
@@ -466,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "                                 Tensor! problem_sizes1, "
       "                                 Tensor! problem_sizes2, "
       "                                 int num_experts, int n, int k, "
-      "                                 Tensor? blockscale_offsets) -> ()");
+      "                                 Tensor? blockscale_offsets, "
+      "                                 bool? force_swap_ab) -> ()");
   ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
            &get_cutlass_moe_mm_problem_sizes);
 
diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py
index 465e24fd7..cccef28f5 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
@@ -12,8 +12,11 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    convert_packed_uint4b8_to_signed_int4_inplace,
+    pack_cols,
     pack_rows,
     quantize_weights,
+    unpack_quantized_values_into_int32,
 )
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
@@ -167,8 +170,7 @@ def create_test_tensors(
 
     # for the practical use case we need per-tok scales for fp8 activations
     w_tok_s = torch.randn((m,), device="cuda", dtype=types.token_scale_type)
-    # weights are already per-group quantized, use placeholder here
-    w_ch_s = torch.ones((n,), device="cuda", dtype=types.channel_scale_type)
+    w_ch_s = torch.randn((n,), device="cuda", dtype=types.channel_scale_type)
 
     return Tensors(
         w_ref=w_ref,
@@ -211,7 +213,7 @@ def mm_test_helper(
     print(output_ref)
 
     torch.testing.assert_close(
-        output, output_ref.to(output.dtype), rtol=1e-3, atol=1e-3
+        output, output_ref.to(output.dtype), rtol=1e-2, atol=1e-2
     )
 
 
@@ -257,7 +259,7 @@ def test_w4a8_cuda_graph():
     )
 
     w_tok_s = torch.randn((m,), device="cuda", dtype=torch.float32)
-    w_ch_s = torch.ones((n,), device="cuda", dtype=torch.float32)
+    w_ch_s = torch.randn((n,), device="cuda", dtype=torch.float32)
 
     # Construct a trivial model with a single layer that calls the kernel
     model = W4A8Layer(
@@ -287,4 +289,38 @@ def test_w4a8_cuda_graph():
     output.zero_()
     g.replay()
 
-    torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3)
+    torch.testing.assert_close(output, output_ref, rtol=1e-2, atol=1e-2)
+
+
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU, reason="CUTLASS W4A8 is not supported on this GPU type."
+)
+@pytest.mark.parametrize("shape", MNK_SHAPES)
+def test_convert_packed_uint4b8_to_signed_int4_inplace(shape):
+    """
+    The W4A16 checkpoints encode the weights as int4b8 packed to int32.
+    The CUTLASS kernels expect signed int4 packed to int32.
+    This tests checks that the runtime int4b8 -> signed int4 conversion
+    matches the offline conversion step exactly.
+    """
+    _, N, K = shape
+    # random weights packed to int32
+    t = torch.randint(
+        low=torch.iinfo(torch.int32).min,
+        high=torch.iinfo(torch.int32).max + 1,
+        size=(N, K // 8),
+        dtype=torch.int32,
+        device="cuda",
+    )
+
+    # compute reference
+    unpacked = unpack_quantized_values_into_int32(
+        t.clone(), scalar_types.uint4b8, packed_dim=1
+    )
+    unpacked = unpacked - 8  # int4b8 -> signed int4
+    ref = pack_cols(unpacked & 0x0F, 4, *unpacked.shape)
+
+    out = convert_packed_uint4b8_to_signed_int4_inplace(t.clone())
+
+    assert torch.equal(ref, out)
+    assert not torch.equal(ref, t)
diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
new file mode 100644
index 000000000..3560402a2
--- /dev/null
+++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
@@ -0,0 +1,340 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for the CUTLASS-based W4A8 grouped GEMM kernel and the full MoE layer.
+"""
+
+import random
+from dataclasses import dataclass
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_rows,
+    quantize_weights,
+)
+from vllm.platforms import current_platform
+from vllm.scalar_type import ScalarType, scalar_types
+
+IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return tensor.clamp(min=finfo.min, max=finfo.max).to(dtype=torch.float8_e4m3fn)
+
+
+def cutlass_quantize(
+    atype: torch.dtype,
+    w: torch.Tensor,
+    wtype: ScalarType,
+    stype: torch.dtype | None,
+    group_size: int | None,
+    zero_points: bool = False,
+):
+    """
+    Quantize weights into W4 and compute reference dequantized weights.
+
+    Encoding/reordering of weights and packing of scales is deferred
+    until after all experts are combined.
+    """
+    assert wtype.is_integer(), "TODO: support floating point weights"
+
+    w_ref, w_q, w_s, w_zp = quantize_weights(
+        w, wtype, group_size=group_size, zero_points=zero_points
+    )
+
+    # Since scales are later cast to fp8, recompute w_ref in atype here.
+    w_ref = (
+        w_q.to(torch.float32)
+        * w_s.to(atype).to(torch.float32).repeat_interleave(group_size, dim=0)
+    ).to(atype)
+
+    # Bit mask prevents sign extension of int4 when packing.
+    w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape)
+    # Make weights row-major (N, K).
+    w_q = w_q.t().contiguous()
+
+    return w_ref, w_q, w_s.to(atype), w_zp
+
+
+def cutlass_preprocess(
+    w_q_experts: list[torch.Tensor], w_s_experts: list[torch.Tensor]
+):
+    """
+    Reorder/encode expert weights and pack scales.
+
+    Returns:
+        w_q_packed: Packed/encoded int4 weights for all experts.
+        w_s_packed: Packed fp8 scales for all experts.
+        packed_layout: Layout/stride metadata for grouped GEMM.
+    """
+    w_s_packed = ops.cutlass_pack_scale_fp8(torch.stack(w_s_experts))
+    w_q_packed, packed_layout = ops.cutlass_encode_and_reorder_int4b_grouped(
+        torch.stack(w_q_experts)
+    )  # expects dim 3
+    return w_q_packed, w_s_packed, packed_layout
+
+
+GROUP_SIZE = 128
+# (num_experts, N, K)
+TEST_SHAPES = [
+    (8, 512, 2048),
+    (8, 2048, 2048),
+    (64, 512, 1024),
+    (64, 2048, 2048),
+    (4, 2048, 768),
+    (8, 768, 2048),
+    (64, 1536, 2048),
+    (128, 8192, 4096),  # test overflow int32
+]
+ALIGNMENT = 16  # torch._scaled_mm alignment for M, needed for reference check
+
+
+@dataclass
+class MoETestSetup:
+    num_experts: int
+    K: int
+    N: int
+    Ms: list[int]
+    M_full: int
+    a: torch.Tensor
+    a_ref: torch.Tensor
+    a_strides: torch.Tensor
+    out: torch.Tensor
+    c_strides: torch.Tensor
+    per_tok_scales: torch.Tensor
+    per_chan_scales: torch.Tensor
+    w_refs: list[torch.Tensor]
+    w_q_packed: torch.Tensor
+    w_s_packed: torch.Tensor
+    problem_sizes: torch.Tensor
+    expert_offsets: torch.Tensor
+    b_strides: torch.Tensor
+    group_scale_strides: torch.Tensor
+
+
+def make_moe_test_setup(
+    num_experts: int,
+    K: int,
+    N: int,
+    *,
+    alignment: int = ALIGNMENT,
+    max_blocks: int = 64,
+    device: str = "cuda",
+    random_zero: bool = False,
+) -> MoETestSetup:
+    """Create a full set of tensors for testing cutlass_w4a8_moe_mm."""
+
+    assert K % GROUP_SIZE == 0
+    # Token counts per expert (multiples of `alignment`).
+    Ms = [alignment * random.randint(1, max_blocks) for _ in range(num_experts)]
+
+    # set random experts to 0 tokens
+    if random_zero and num_experts > 1:
+        num_zero = max(1, num_experts // 8)
+        zero_indices = random.sample(range(num_experts), k=num_zero)
+        for idx in zero_indices:
+            Ms[idx] = 0
+
+    M_full = sum(Ms)
+    assert M_full > 0
+
+    # Activations.
+    a = to_fp8(torch.randn((M_full, K), device=device))
+    a_ref = a.to(torch.float32)
+    a_strides = torch.full((num_experts,), K, dtype=torch.int64, device=device)
+
+    # Output buffer.
+    out = torch.empty((M_full, N), dtype=torch.bfloat16, device=device)
+    c_strides = torch.full((num_experts,), N, dtype=torch.int64, device=device)
+
+    # Channel/token scales.
+    per_tok_scales = torch.randn((M_full, 1), dtype=torch.float32, device=device)
+    per_chan_scales = torch.randn(
+        (num_experts, N, 1), dtype=torch.float32, device=device
+    )
+
+    # Expert weights and scales.
+    wtype = scalar_types.int4
+    atype = stype = torch.float8_e4m3fn
+    w_refs, w_qs, w_ss = [], [], []
+    for _ in range(num_experts):
+        b = to_fp8(torch.randn((K, N), device=device))
+        w_ref, w_q, w_s, _ = cutlass_quantize(
+            atype, b.to(torch.float16), wtype, stype, GROUP_SIZE, zero_points=False
+        )
+        w_refs.append(w_ref)
+        w_qs.append(w_q)
+        w_ss.append(w_s)
+
+    w_q_packed, w_s_packed, packed_layout = cutlass_preprocess(w_qs, w_ss)
+
+    problem_sizes = torch.tensor(
+        [[N, M, K] for M in Ms], dtype=torch.int32, device=device
+    )
+
+    expert_offsets = torch.cat(
+        [
+            torch.tensor([0], dtype=torch.int64),
+            torch.cumsum(torch.tensor(Ms, dtype=torch.int64), dim=0)[:-1],
+        ]
+    ).to(device=device)
+
+    # B strides and group scale strides.
+    b_strides = packed_layout
+    group_scale_strides = torch.zeros(
+        (num_experts, 2), dtype=torch.int64, device=device
+    )
+    group_scale_strides[:, 0] = N
+
+    return MoETestSetup(
+        num_experts=num_experts,
+        K=K,
+        N=N,
+        Ms=Ms,
+        M_full=M_full,
+        a=a,
+        a_ref=a_ref,
+        a_strides=a_strides,
+        out=out,
+        c_strides=c_strides,
+        per_tok_scales=per_tok_scales,
+        per_chan_scales=per_chan_scales,
+        w_refs=w_refs,
+        w_q_packed=w_q_packed,
+        w_s_packed=w_s_packed,
+        problem_sizes=problem_sizes,
+        expert_offsets=expert_offsets,
+        b_strides=b_strides,
+        group_scale_strides=group_scale_strides,
+    )
+
+
+def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
+    """Compute reference output using torch._scaled_mm per expert."""
+    out_ref = torch.empty_like(setup.out)
+
+    ends = torch.cumsum(torch.tensor(setup.Ms), 0).tolist()
+    starts = setup.expert_offsets.cpu().tolist()
+
+    for i in range(setup.num_experts):
+        start, end = starts[i], ends[i]
+        if start == end:
+            continue
+
+        out_ref_i = torch._scaled_mm(
+            setup.a_ref[start:end].to(torch.float8_e4m3fn),
+            setup.w_refs[i].to(torch.float8_e4m3fn).t().contiguous().t(),
+            setup.per_tok_scales[start:end],  # (M, 1)
+            setup.per_chan_scales[i].reshape(1, -1),  # (1, N)
+            out_dtype=torch.bfloat16,
+            use_fast_accum=True,
+        )
+        out_ref[start:end] = out_ref_i
+
+    return out_ref
+
+
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU,
+    reason="W4A8 Grouped GEMM is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("shape", TEST_SHAPES)
+@pytest.mark.parametrize("random_zero", [True, False])
+def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
+    num_experts, N, K = shape
+    current_platform.seed_everything(42)
+    setup = make_moe_test_setup(
+        num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
+    )
+
+    ops.cutlass_w4a8_moe_mm(
+        setup.out,
+        setup.a,
+        setup.w_q_packed,
+        setup.per_tok_scales,
+        setup.per_chan_scales,
+        setup.w_s_packed,
+        GROUP_SIZE,
+        setup.expert_offsets,
+        setup.problem_sizes,
+        setup.a_strides,
+        setup.b_strides,
+        setup.c_strides,
+        setup.group_scale_strides,
+    )
+    torch.cuda.synchronize()
+
+    out_ref = compute_moe_reference_output(setup)
+    torch.testing.assert_close(setup.out, out_ref, rtol=1e-2, atol=1e-2)
+
+
+class W4A8MoELayer(torch.nn.Module):
+    """
+    Minimal wrapper module to test cuda graphs
+    """
+
+    def __init__(self, setup: MoETestSetup):
+        super().__init__()
+        self.setup = setup
+
+    def forward(self, a: torch.Tensor) -> torch.Tensor:
+        s = self.setup
+        ops.cutlass_w4a8_moe_mm(
+            s.out,
+            a,
+            s.w_q_packed,
+            s.per_tok_scales,
+            s.per_chan_scales,
+            s.w_s_packed,
+            GROUP_SIZE,
+            s.expert_offsets,
+            s.problem_sizes,
+            s.a_strides,
+            s.b_strides,
+            s.c_strides,
+            s.group_scale_strides,
+        )
+        return s.out
+
+
+@pytest.mark.skipif(
+    not IS_SUPPORTED_BY_GPU,
+    reason="W4A8 Grouped GEMM is not supported on this GPU type.",
+)
+def test_cutlass_w4a8_moe_mm_cuda_graph():
+    current_platform.seed_everything(42)
+    # Fixed config for CUDA graph test (single parameter point).
+    num_experts = 8
+    K = 512
+    N = 2048
+
+    setup = make_moe_test_setup(
+        num_experts=num_experts,
+        K=K,
+        N=N,
+        max_blocks=32,
+    )
+
+    # Construct model that calls the grouped GEMM kernel.
+    model = W4A8MoELayer(setup)
+
+    # Build reference output once.
+    out_ref = compute_moe_reference_output(setup)
+
+    # Capture and run the model in a CUDA graph.
+    a_static = setup.a.clone()  # static input tensor for graph replay
+
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            out_static = model(a_static)
+
+    out_static.zero_()
+    g.replay()
+
+    torch.testing.assert_close(out_static, out_ref, rtol=1e-2, atol=1e-2)
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 56c780ceb..6bbfe11b6 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -695,6 +695,10 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
     def cutlass_encode_and_reorder_int4b_fake(b: torch.Tensor) -> torch.Tensor:
         return torch.empty_like(b, memory_format=torch.contiguous_format)
 
+    @register_fake("_C::cutlass_encode_and_reorder_int4b_grouped")
+    def cutlass_encode_and_reorder_int4b_grouped_fake(b: torch.Tensor) -> torch.Tensor:
+        return torch.empty_like(b, memory_format=torch.contiguous_format)
+
 
 if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
 
@@ -1058,6 +1062,7 @@ def get_cutlass_moe_mm_problem_sizes(
     n: int,
     k: int,
     blockscale_offsets: torch.Tensor | None = None,
+    force_swap_ab: bool | None = None,
 ):
     """
     Compute only the per-expert problem sizes needed by the two grouped matrix
@@ -1067,9 +1072,20 @@ def get_cutlass_moe_mm_problem_sizes(
     - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's
                                     multiplication for the two grouped MMs
                                     used in the fused MoE operation.
+    Optional:
+    - force_swap_ab: If set to True or False, explicitly enable or disable the
+                     A/B input swap optimization. If None (default), the swap
+                     is selected automatically based on tensor sizes.
     """
     return torch.ops._C.get_cutlass_moe_mm_problem_sizes(
-        topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, blockscale_offsets
+        topk_ids,
+        problem_sizes1,
+        problem_sizes2,
+        num_experts,
+        n,
+        k,
+        blockscale_offsets,
+        force_swap_ab,
     )
 
 
@@ -1457,6 +1473,78 @@ def cutlass_encode_and_reorder_int4b(b: torch.Tensor) -> torch.Tensor:
     return torch.ops._C.cutlass_encode_and_reorder_int4b(b)
 
 
+def cutlass_w4a8_moe_mm(
+    out_tensors: torch.Tensor,
+    a_tensors: torch.Tensor,
+    b_tensors: torch.Tensor,
+    a_scales: torch.Tensor,
+    b_scales: torch.Tensor,
+    b_group_scales: torch.Tensor,
+    b_group_size: int,
+    expert_offsets: torch.Tensor,
+    problem_sizes: torch.Tensor,
+    a_strides: torch.Tensor,
+    b_strides: torch.Tensor,
+    c_strides: torch.Tensor,
+    group_scale_strides: torch.Tensor,
+    maybe_schedule: str | None = None,
+):
+    """
+    Executes the CUTLASS-based fused-MoE grouped matrix multiplication for the
+    W4A8 quantization scheme. Uses group-wise quantization (INT4 -> FP8)
+    and both per-channel + per-token scaling in the epilogue.
+
+    Args:
+        out_tensors:
+            Output buffer for all experts (updated in-place).
+        a_tensors:
+            FP8 (E4M3FN) activations for all experts.
+        b_tensors:
+            INT4-packed weight matrix for all experts, packed to INT32
+        a_scales:
+            Per-token FP8 activation scales, applied in the epilogue.
+        b_scales:
+            Per-channel FP8 weight scales for each expert, applied in the epilogue.
+        b_group_scales:
+            FP8 scale values for group-wise INT4 weight blocks.
+        b_group_size:
+            Number of elements grouped under each entry of b_group_scales.
+        expert_offsets:
+            Cumulative token offsets
+        problem_sizes:
+            Per-expert (M, N, K) GEMM sizes used by the grouped GEMM launcher.
+        a/b/c/group_scale_strides:
+            Strides describing the memory layout of the input tensors.
+        maybe_schedule:
+            Optional override to choose a specific kernel or epilogue schedule.
+
+    Returns:
+        out_tensors updated in-place with the dequantized INT4xFP8 grouped GEMM result.
+    """
+    return torch.ops._C.cutlass_w4a8_moe_mm(
+        out_tensors,
+        a_tensors,
+        b_tensors,
+        a_scales,
+        b_scales,
+        b_group_scales,
+        b_group_size,
+        expert_offsets,
+        problem_sizes,
+        a_strides,
+        b_strides,
+        c_strides,
+        group_scale_strides,
+        maybe_schedule,
+    )
+
+
+def cutlass_encode_and_reorder_int4b_grouped(
+    b_tensors: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.ops._C.cutlass_encode_and_reorder_int4b_grouped(b_tensors)
+
+
 if hasattr(torch.ops._C, "permute_cols"):
 
     @register_fake("_C::permute_cols")
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 9103e84aa..1e145a8fc 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -63,8 +63,10 @@ if HAS_TRITON:
     from vllm.model_executor.layers.fused_moe.cutlass_moe import (
         CutlassBatchedExpertsFp8,
         CutlassExpertsFp8,
+        CutlassExpertsW4A8Fp8,
         cutlass_moe_fp4,
         cutlass_moe_fp8,
+        cutlass_moe_w4a8_fp8,
     )
     from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
     from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
@@ -88,8 +90,10 @@ if HAS_TRITON:
         "grouped_topk",
         "cutlass_moe_fp8",
         "cutlass_moe_fp4",
+        "cutlass_moe_w4a8_fp8",
         "CutlassExpertsFp8",
         "CutlassBatchedExpertsFp8",
+        "CutlassExpertsW4A8Fp8",
         "TritonExperts",
         "BatchedTritonExperts",
         "DeepGemmExperts",
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index e52845dfa..f35cafa0f 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -143,6 +143,7 @@ class FusedMoEQuantDesc:
     scale: Union[torch.Tensor, "PrecisionConfig", None] = None
 
     # Quantization alphas or gscales, used for nvfp4 types.
+    # W4A8 FP8: used for per-channel scales
     # TODO(bnell): put some of these in subclasses
     alpha_or_gscale: torch.Tensor | None = None
 
@@ -442,7 +443,9 @@ class FusedMoEQuantConfig:
         - a1_scale: Optional scale to be used for a1.
         - a2_scale: Optional scale to be used for a2.
         - g1_alphas: Optional global quantization scales for w1 (for nvfp4).
+            per-channel scales for w1 (for W4A8 FP8).
         - g2_alphas: Optional global quantization scales for w2 (for nvfp4).
+            per-channel scales for w2 (for W4A8 FP8).
         - a1_gscale: Optional global quantization scales for a1 (for nvfp4).
         - a2_gscale: Optional global quantization scales for a2 (for nvfp4).
         - w1_bias: Optional biases for w1 (GPT OSS Triton).
@@ -461,6 +464,7 @@ class FusedMoEQuantConfig:
             "mxfp4",
             "mxfp6_e3m2",
             "mxfp6_e2m3",
+            "int4",
         }
 
         if weight_dtype is None:
@@ -671,6 +675,31 @@ def int8_w8a16_moe_quant_config(
     )
 
 
+def int4_w4afp8_moe_quant_config(
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    g1_alphas: torch.Tensor,
+    g2_alphas: torch.Tensor,
+    per_act_token_quant: bool = False,
+    per_out_ch_quant: bool = False,
+    block_shape: list[int] | None = None,
+) -> FusedMoEQuantConfig:
+    """
+    Construct a quant config for fp8 activations and int4 weights.
+    """
+    return FusedMoEQuantConfig.make(
+        torch.float8_e4m3fn,  # quant dtype for activations
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        g1_alphas=g1_alphas,
+        g2_alphas=g2_alphas,
+        per_act_token_quant=per_act_token_quant,
+        per_out_ch_quant=per_out_ch_quant,
+        block_shape=block_shape,
+        weight_dtype="int4",  # weight dtype for weights
+    )
+
+
 def biased_moe_quant_config(
     w1_bias: torch.Tensor | None,
     w2_bias: torch.Tensor | None,
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index 30144ca54..552e38a71 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1052,3 +1052,404 @@ def run_cutlass_block_scaled_fused_experts(
     return (
         c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)
     ).sum(dim=1)
+
+
+# W4A8
+def run_cutlass_moe_w4a8_fp8(
+    output: torch.Tensor,
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation_callable: Callable,
+    global_num_experts: int,
+    expert_map: torch.Tensor | None,
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
+    a1q_scale: torch.Tensor | None,
+    a2_scale: torch.Tensor | None,
+    w1_chan_scale: torch.Tensor,
+    w2_chan_scale: torch.Tensor,
+    a_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides1: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides1: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides1: torch.Tensor,
+    s_strides2: torch.Tensor,
+    workspace13: torch.Tensor,
+    workspace2: torch.Tensor,
+    expert_num_tokens: torch.Tensor | None,
+    out_dtype: torch.dtype,
+    per_act_token: bool,
+    per_out_ch: bool,
+    use_batched_format: bool,
+    topk_weights: torch.Tensor | None,
+    group_size: int,
+):
+    a1q = hidden_states
+    M = a1q.size(0)
+    local_E = w1.size(0)
+    device = a1q.device
+    _, K, N_packed = w2.shape
+    N = N_packed * 8  # logical N, pack 8 int4 into 1 int32
+
+    assert per_act_token, "W4A8 must use per-token scales"
+    assert per_out_ch, "W4A8 must use per-channel scales"
+    assert w1_scale is not None
+    assert w2_scale is not None
+    assert w1_scale.dtype == torch.float8_e4m3fn
+    assert w2_scale.dtype == torch.float8_e4m3fn
+    assert w1.dtype == torch.int32
+    assert w2.dtype == torch.int32
+    assert w1_chan_scale.dtype == torch.float32
+    assert w2_chan_scale.dtype == torch.float32
+    assert w1.size(0) == w2.size(0), "Weights expert number mismatch"
+    assert a1q_scale is not None
+    assert a2_scale is None
+    assert out_dtype in [torch.bfloat16], f"Invalid output dtype: {out_dtype}"
+    if expert_map is not None:
+        assert expert_num_tokens is None
+    assert not use_batched_format, "batched format not supported yet"
+    assert group_size == 128, f"Only group size 128 supported but got {group_size=}"
+
+    assert global_num_experts != -1
+    assert w1.size(2) * 8 == K, (
+        f"w1 hidden size mismatch: got {w1.size(2) * 8}, expected {K=}"
+    )
+
+    # Translate info from expert_map to topk_ids
+    if expert_map is not None:
+        local_topk_ids = torch.where(
+            expert_map[topk_ids] != -1, expert_map[topk_ids], -1
+        )
+    else:
+        local_topk_ids = topk_ids
+
+    topk = local_topk_ids.size(1)
+    a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), (M * topk, K))
+    mm1_out = _resize_cache(workspace13, (M * topk, N * 2))
+    act_out = _resize_cache(workspace2, (M * topk, N))
+    # original workspace are based on input hidden_states dtype (bf16)
+    quant_out = _resize_cache(
+        workspace13.view(dtype=torch.float8_e4m3fn), (M * topk, N)
+    )
+    mm2_out = _resize_cache(workspace2, (M * topk, K))
+
+    problem_sizes1 = torch.empty(
+        (global_num_experts, 3), dtype=torch.int32, device=device
+    )
+    problem_sizes2 = torch.empty(
+        (global_num_experts, 3), dtype=torch.int32, device=device
+    )
+
+    num_expert = global_num_experts if expert_map is None else expert_map.size(0)
+    # permuted a1q reuses workspace2
+    a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute(
+        a1q,
+        a1q_scale,
+        topk_ids,
+        num_expert,
+        local_E,
+        expert_map,
+        permuted_hidden_states=a1q_perm,
+    )
+    expert_offsets = expert_offsets[:-1]
+
+    # For RS gemm SwapAB is always enabled (swap logical M, N in the problem shape)
+    ops.get_cutlass_moe_mm_problem_sizes(
+        local_topk_ids,
+        problem_sizes1,
+        problem_sizes2,
+        global_num_experts,
+        N,
+        K,
+        force_swap_ab=True,
+    )
+
+    ops.cutlass_w4a8_moe_mm(
+        mm1_out,
+        a1q,
+        w1,
+        a1q_scale,
+        w1_chan_scale,
+        w1_scale,
+        group_size,
+        expert_offsets,
+        problem_sizes1,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        s_strides1,
+    )
+
+    activation_callable(act_out, mm1_out)
+
+    a2q, a2q_scale = ops.scaled_fp8_quant(
+        act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out
+    )
+
+    if expert_map is not None:
+        mm2_out.fill_(0)
+
+    ops.cutlass_w4a8_moe_mm(
+        mm2_out,
+        a2q,
+        w2,
+        a2q_scale,
+        w2_chan_scale,
+        w2_scale,
+        group_size,
+        expert_offsets,
+        problem_sizes2,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides2,
+    )
+
+    # for non-chunking mode the output is resized from workspace13
+    # so we need to make sure mm2_out uses workspace2.
+    moe_unpermute(
+        out=output,
+        permuted_hidden_states=mm2_out,
+        topk_weights=topk_weights,
+        inv_permuted_idx=inv_perm,
+    )
+
+
+class CutlassExpertsW4A8Fp8(mk.FusedMoEPermuteExpertsUnpermute):
+    def __init__(
+        self,
+        out_dtype: torch.dtype | None,
+        a_strides1: torch.Tensor,
+        a_strides2: torch.Tensor,
+        b_strides1: torch.Tensor,
+        b_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
+        s_strides1: torch.Tensor,
+        s_strides2: torch.Tensor,
+        quant_config: FusedMoEQuantConfig,
+        group_size: int,
+    ):
+        super().__init__(quant_config)
+        self.out_dtype = out_dtype
+        self.a_strides1 = a_strides1
+        self.a_strides2 = a_strides2
+        self.b_strides1 = b_strides1
+        self.b_strides2 = b_strides2
+        self.c_strides1 = c_strides1
+        self.c_strides2 = c_strides2
+        self.s_strides1 = s_strides1
+        self.s_strides2 = s_strides2
+        self.group_size = group_size
+
+    @property
+    def activation_formats(
+        self,
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (
+            mk.FusedMoEActivationFormat.Standard,
+            mk.FusedMoEActivationFormat.Standard,
+        )
+
+    def supports_chunking(self) -> bool:
+        return True
+
+    def supports_expert_map(self) -> bool:
+        return True
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        # topk weights and reduction are fused in moe_unpermute cuda kernel
+        return TopKWeightAndReduceNoOP()
+
+    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
+        return self.out_dtype if self.out_dtype is not None else act_dtype
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        workspace1 = (M * topk, max(N, K))
+        workspace2 = (M * topk, max(N // 2, K))
+        output = (M, K)
+        return (workspace1, workspace2, output)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor | None,
+        workspace2: torch.Tensor | None,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        assert self.w1_zp is None, "w1_zp is not supported in CUTLASS MoE"
+        assert self.w2_zp is None, "w2_zp is not supported in CUTLASS MoE"
+
+        expert_num_tokens = None
+        activation_callable = lambda o, i: self.activation(activation, o, i)
+
+        use_batched_format = (
+            self.activation_formats[0] == mk.FusedMoEActivationFormat.BatchedExperts
+        )
+        assert not use_batched_format, "batched format not supported"
+
+        in_dtype = hidden_states.dtype
+
+        run_cutlass_moe_w4a8_fp8(
+            output,
+            hidden_states,
+            w1,
+            w2,
+            topk_ids,
+            activation_callable,
+            global_num_experts,
+            expert_map,
+            self.w1_scale,
+            self.w2_scale,
+            a1q_scale,
+            a2_scale,
+            self.g1_alphas,  # per-channel scales
+            self.g2_alphas,  # per-channel scales
+            self.a_strides1,
+            self.a_strides2,
+            self.b_strides1,
+            self.b_strides2,
+            self.c_strides1,
+            self.c_strides2,
+            self.s_strides1,
+            self.s_strides2,
+            workspace13,
+            workspace2,
+            expert_num_tokens,
+            self.out_dtype if self.out_dtype is not None else in_dtype,
+            self.per_act_token_quant,
+            self.per_out_ch_quant,
+            use_batched_format,
+            topk_weights,
+            self.group_size,
+        )
+
+
+def cutlass_moe_w4a8_fp8(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides1: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides1: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides1: torch.Tensor,
+    s_strides2: torch.Tensor,
+    quant_config: FusedMoEQuantConfig,
+    activation: str = "silu",
+    expert_map: torch.Tensor | None = None,
+    apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
+    group_size: int = 128,
+) -> torch.Tensor:
+    """
+    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
+    using two sets of quantized weights, w1_q and w2_q, and top-k gating
+    mechanism. The matrix multiplications are implemented with CUTLASS
+    mixed-dtype grouped gemm.
+
+    Parameters:
+    - a (torch.Tensor): The input tensor to the MoE layer.
+        Shape: [M, K]
+    - w1_q (torch.Tensor): The first set of fp8-quantized expert weights.
+        Shape: [num_experts, 2*N, K // packed_factor]
+    - w2_q (torch.Tensor): The second set of fp8-quantized expert weights.
+        Shape: [num_experts, K, N // packed_factor]
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - topk_ids (torch.Tensor): The token->expert mappings.
+    - a_strides1 (torch.Tensor): The input strides for the first gemm.
+        Shape: [num_experts]
+    - a_strides2 (torch.Tensor): The input strides for the second gemm.
+        Shape: [num_experts]
+    - b_strides1 (torch.Tensor): The packed layout for the first gemm weights.
+        Shape: [num_experts, 3]
+        dtype: torch.int32
+    - b_strides2 (torch.Tensor): The packed layout for the second gemm weights.
+        Shape: [num_experts, 3]
+        dtype: torch.int32
+    - c_strides1 (torch.Tensor): The output strides for the first gemm.
+        Shape: [num_experts]
+    - c_strides2 (torch.Tensor): The output strides for the second gemm.
+        Shape: [num_experts]
+    - s_strides1 (torch.Tensor): strides for the group-wise scales for the first gemm.
+        Shape: [num_experts, 2]
+        dtype: torch.int64
+    - s_strides2 (torch.Tensor): strides for the group-wise scales for the second gemm.
+        Shape: [num_experts, 2]
+        dtype: torch.int64
+    - per_act_token (Optional[bool]): Whether the scale is per-token or
+                                      per-tensor.
+    - activation (str): The activation function to use.
+    - expert_map (Optional[torch.Tensor]): In the case of Expert parallel,
+        every Rank is responsible for a subset of experts. expert_map is a
+        mapping from global expert-id to local expert-id. When expert_map[i]
+        is -1, it means that this Rank is not responsible for global
+        expert-id i.
+    - apply_router_weight_on_input (bool): When true, the topk weights are
+        applied directly on the inputs. This is only applicable when topk is 1.
+    - global_num_experts (int): The total number of experts.
+    - group_size (int): The number of weights per scale factor
+
+    Returns:
+    - torch.Tensor: The bf16 output tensor after applying the MoE layer.
+    """
+    assert quant_config is not None
+
+    num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(0)
+
+    fn = mk.FusedMoEModularKernel(
+        MoEPrepareAndFinalizeNoEP(),
+        CutlassExpertsW4A8Fp8(
+            out_dtype=a.dtype,
+            a_strides1=a_strides1,
+            a_strides2=a_strides2,
+            b_strides1=b_strides1,
+            b_strides2=b_strides2,
+            c_strides1=c_strides1,
+            c_strides2=c_strides2,
+            s_strides1=s_strides1,
+            s_strides2=s_strides2,
+            quant_config=quant_config,
+            group_size=group_size,
+        ),
+    )
+
+    return fn(
+        a,
+        w1_q,
+        w2_q,
+        topk_weights,
+        topk_ids,
+        activation=activation,
+        global_num_experts=num_experts,
+        expert_map=expert_map,
+        apply_router_weight_on_input=apply_router_weight_on_input,
+    )
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 51d3299e7..075610ec5 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -367,7 +367,7 @@ class FusedMoEPrepareAndFinalize(ABC):
 class FusedMoEPermuteExpertsUnpermute(ABC):
     """
     An abstract base class for the [Permute-Experts-Unpermute] step described
-    above.
+        above.
     """
 
     def __init__(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index b91ecb59f..21f4cfe51 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -256,7 +256,7 @@ class CompressedTensorsConfig(QuantizationConfig):
                     if format is not None
                     else is_activation_quantization_format(quant_format)
                 )
-                # TODO(czhu): w4a8fp8 is in packed-quantized format
+                # w4a8fp8 is in packed-quantized format
                 # but needs input activation quantization
                 input_activations = quant_config.get("input_activations")
                 if act_quant_format or input_activations:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 8013b29f7..619162272 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -33,6 +33,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     fp8_w8a8_moe_quant_config,
     int4_w4a16_moe_quant_config,
+    int4_w4afp8_moe_quant_config,
     int8_w8a8_moe_quant_config,
     int8_w8a16_moe_quant_config,
     nvfp4_moe_quant_config,
@@ -79,7 +80,11 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     prepare_moe_fp8_layer_for_marlin,
 )
-from vllm.model_executor.layers.quantization.utils.quant_utils import swizzle_blockscale
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    convert_bf16_scales_to_fp8,
+    convert_packed_uint4b8_to_signed_int4_inplace,
+    swizzle_blockscale,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d,
     normalize_e4m3fn_to_e4m3fnuz,
@@ -204,6 +209,11 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
             return CompressedTensorsW8A8Int8MoEMethod(
                 weight_quant, input_quant, layer.moe_config
             )
+        elif quant_config._is_fp8_w4a8_sm90(weight_quant, input_quant):
+            logger.info_once("Using CompressedTensorsW4A8Fp8MoEMethod")
+            return CompressedTensorsW4A8Fp8MoEMethod(
+                weight_quant, input_quant, layer.moe_config
+            )
         elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
             return CompressedTensorsW4A8Int8MoEMethod(
                 weight_quant, input_quant, layer.moe_config
@@ -2428,3 +2438,331 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
             apply_router_weight_on_input,
             int(_act_kind(activation)),
         )
+
+
+class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
+    def __init__(
+        self,
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs,
+        moe: FusedMoEConfig,
+        layer_name: str | None = None,
+    ):
+        super().__init__(moe)
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
+
+        self.group_size = self.weight_quant.group_size
+        self.num_bits = self.weight_quant.num_bits
+        self.packed_factor = 32 // self.num_bits
+
+        assert self.weight_quant.symmetric, (
+            "Only symmetric quantization is supported for W4A8 MoE"
+        )
+        assert self.weight_quant.actorder != "group"
+        assert self.group_size == 128, "Only group size 128 supported for W4A8 MoE"
+
+        self.disable_expert_map = False
+        self.layer_name = layer_name
+
+        from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+        from vllm.model_executor.layers.quantization.utils.quant_utils import (
+            GroupShape,
+        )
+
+        self.quant_fp8 = QuantFP8(static=False, group_shape=GroupShape.PER_TOKEN)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        layer.intermediate_size_per_partition = intermediate_size_per_partition
+        layer.hidden_size = hidden_size
+        layer.num_experts = num_experts
+        layer.orig_dtype = params_dtype
+        layer.weight_block_size = None
+
+        # requirement for CUTLASS reorder_tensor
+        assert hidden_size % 256 == 0, f"{hidden_size=} must be divisible by 256"
+        assert intermediate_size_per_partition % 256 == 0, (
+            f"{intermediate_size_per_partition=} must be divisible by 256"
+        )
+        # storage type, pack 8xint4 into int32
+        params_dtype = torch.int32
+
+        # WEIGHTS
+        w13_weight_packed = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.packed_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_packed", w13_weight_packed)
+        set_weight_attrs(w13_weight_packed, extra_weight_attrs)
+
+        w2_weight_packed = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.packed_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_packed", w2_weight_packed)
+        set_weight_attrs(w2_weight_packed, extra_weight_attrs)
+
+        # SCALES
+        # weight_scale refers to the group-wise scales
+        # they are initially loaded as bf16, we will convert to fp8
+        # after loading
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.group_size,
+                dtype=layer.orig_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.group_size,
+                dtype=layer.orig_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        # Add PER-GROUP quantization for FusedMoE.weight_loader.
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
+        )
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # weight shapes
+        w2_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
+        layer.register_parameter("w2_weight_shape", w2_weight_shape)
+        set_weight_attrs(w2_weight_shape, extra_weight_attrs)
+        w13_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
+        layer.register_parameter("w13_weight_shape", w13_weight_shape)
+        set_weight_attrs(w13_weight_shape, extra_weight_attrs)
+
+        # don't use input scales
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer):
+        device = layer.w13_weight_packed.device
+
+        # STRIDES
+        # A, C
+        self.a_strides1_c_strides2 = torch.full(
+            (layer.local_num_experts,),
+            layer.hidden_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.a_strides2 = torch.full(
+            (layer.local_num_experts,),
+            layer.intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides1 = torch.full(
+            (layer.local_num_experts,),
+            2 * layer.intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+
+        # S (group-wise scales)
+        # sizeof(StrideS) = 16 bytes, so we need to use 2xint64 to encode it
+        self.s_strides1 = torch.zeros(
+            (layer.local_num_experts, 2), device=device, dtype=torch.int64
+        )
+        self.s_strides1[:, 0] = 2 * layer.intermediate_size_per_partition
+
+        self.s_strides2 = torch.zeros(
+            (layer.local_num_experts, 2), device=device, dtype=torch.int64
+        )
+        self.s_strides2[:, 0] = layer.hidden_size
+
+        # encode and reorder weight tensors, and get the layout to pass to
+        # the grouped gemm kernel. `b_strides1/2` specifies the entire layout
+        convert_packed_uint4b8_to_signed_int4_inplace(layer.w13_weight_packed)
+        w13_weight_shuffled, self.b_strides1 = (
+            ops.cutlass_encode_and_reorder_int4b_grouped(layer.w13_weight_packed)
+        )
+        replace_parameter(layer, "w13_weight_packed", w13_weight_shuffled)
+        convert_packed_uint4b8_to_signed_int4_inplace(layer.w2_weight_packed)
+        w2_weight_shuffled, self.b_strides2 = (
+            ops.cutlass_encode_and_reorder_int4b_grouped(layer.w2_weight_packed)
+        )
+        replace_parameter(layer, "w2_weight_packed", w2_weight_shuffled)
+
+        # convert bf16 scales to (fp8_scales, channel_scales)
+        w13_weight_scale, w13_weight_chan_scale = convert_bf16_scales_to_fp8(
+            self.quant_fp8, layer.w13_weight_scale
+        )
+        w2_weight_scale, w2_weight_chan_scale = convert_bf16_scales_to_fp8(
+            self.quant_fp8, layer.w2_weight_scale
+        )
+
+        # register channel scales
+        layer.register_parameter(
+            "w13_weight_chan_scale",
+            torch.nn.Parameter(w13_weight_chan_scale, requires_grad=False),
+        )
+        layer.register_parameter(
+            "w2_weight_chan_scale",
+            torch.nn.Parameter(w2_weight_chan_scale, requires_grad=False),
+        )
+
+        # The scales are stored as (E, N, K // 128) but the kernel expects
+        # (E, K // 128, N) in row-major format, so we need to permute the last 2 dims
+        # and make it contiguous
+        w13_weight_scale_packed = ops.cutlass_pack_scale_fp8(
+            w13_weight_scale.permute(0, 2, 1).contiguous()
+        )
+        replace_parameter(layer, "w13_weight_scale", w13_weight_scale_packed)
+        w2_weight_scale_packed = ops.cutlass_pack_scale_fp8(
+            w2_weight_scale.permute(0, 2, 1).contiguous()
+        )
+        replace_parameter(layer, "w2_weight_scale", w2_weight_scale_packed)
+
+    def maybe_make_prepare_finalize(
+        self,
+        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+    ) -> mk.FusedMoEPrepareAndFinalize | None:
+        return super().maybe_make_prepare_finalize(routing_tables)
+
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        # Store quantization scales; both per-group and per-channel
+        # Note we haven't specified the group size here because
+        # the quant config logic assumes group-wise scaling
+        # and channel-wise scaling are exclusive.
+        return int4_w4afp8_moe_quant_config(
+            w1_scale=layer.w13_weight_scale,  # group scale
+            w2_scale=layer.w2_weight_scale,  # group scale
+            g1_alphas=layer.w13_weight_chan_scale,
+            g2_alphas=layer.w2_weight_chan_scale,
+            per_act_token_quant=True,  # always use dynamc per-token
+            per_out_ch_quant=True,  # always use per-channel
+        )
+
+    def select_gemm_impl(
+        self,
+        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        assert self.moe_quant_config is not None
+        assert (
+            prepare_finalize.activation_format == FusedMoEActivationFormat.Standard
+        ), "BatchedExperts not supported"
+
+        from vllm.model_executor.layers.fused_moe import CutlassExpertsW4A8Fp8
+
+        experts: FusedMoEPermuteExpertsUnpermute
+
+        logger.debug("CutlassExpertsW4A8Fp8(%s)", self.__class__.__name__)
+        experts = CutlassExpertsW4A8Fp8(
+            out_dtype=self.moe.in_dtype,
+            a_strides1=self.a_strides1_c_strides2,
+            a_strides2=self.a_strides2,
+            b_strides1=self.b_strides1,
+            b_strides2=self.b_strides2,
+            c_strides1=self.c_strides1,
+            c_strides2=self.a_strides1_c_strides2,
+            s_strides1=self.s_strides1,
+            s_strides2=self.s_strides2,
+            quant_config=self.moe_quant_config,
+            group_size=self.group_size,
+        )
+
+        num_dispatchers = prepare_finalize.num_dispatchers()
+        self.disable_expert_map = (
+            num_dispatchers > 1 or not experts.supports_expert_map()
+        )
+
+        return experts
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ):
+        if enable_eplb:
+            raise NotImplementedError(
+                "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
+            )
+        assert self.moe_quant_config is not None
+        topk_weights, topk_ids, _ = layer.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+        )
+
+        from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+            cutlass_moe_w4a8_fp8,
+        )
+
+        return cutlass_moe_w4a8_fp8(
+            x,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            topk_weights,
+            topk_ids,
+            quant_config=self.moe_quant_config,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            expert_map=None if self.disable_expert_map else expert_map,
+            a_strides1=self.a_strides1_c_strides2,
+            a_strides2=self.a_strides2,
+            b_strides1=self.b_strides1,
+            b_strides2=self.b_strides2,
+            c_strides1=self.c_strides1,
+            c_strides2=self.a_strides1_c_strides2,
+            s_strides1=self.s_strides1,
+            s_strides2=self.s_strides2,
+            group_size=self.group_size,
+        )
+
+    @property
+    def supports_eplb(self) -> bool:
+        return False
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
index a23961e89..9a25e08cb 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
@@ -128,14 +128,15 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
             ),
         )
 
-        # TODO(czhu): allocate the packed fp8 scales memory here?
-        # the scales will be expanded by 8x via `cutlass_pack_scale_fp8`
+        # After loading, we will transform bf16 -> fp8 ->
+        # expand by 8x via `cutlass_pack_scale_fp8`
+        # and construct per-channel fp32 scales.
         weight_scale_args = {
             "weight_loader": weight_loader,
             "data": torch.empty(
                 output_size_per_partition,
                 scales_and_zp_size,
-                dtype=torch.float8_e4m3fn,
+                dtype=params_dtype,
             ),
         }
 
@@ -152,17 +153,9 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
             data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader
         )
 
-        # per-channel scales
-        weight_chan_scale = ChannelQuantScaleParameter(
-            data=torch.empty((output_size_per_partition, 1), dtype=torch.float32),
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-
         layer.register_parameter("weight_packed", weight)
         layer.register_parameter("weight_scale", weight_scale)
         layer.register_parameter("weight_shape", weight_shape)
-        layer.register_parameter("weight_chan_scale", weight_chan_scale)
 
         self.kernel = kernel_type(
             mp_linear_kernel_config,
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
index 8ef6457c9..c9c1a3abf 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
@@ -6,7 +6,11 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
+    convert_bf16_scales_to_fp8,
+    convert_packed_uint4b8_to_signed_int4_inplace,
+)
 from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
@@ -48,7 +52,6 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
                 "CUTLASS W4A8, only supported int4",
             )
 
-        # TODO(czhu): support -1 (column-wise)
         if c.group_size != 128:
             return False, "Only group_size 128 is supported"
 
@@ -71,9 +74,9 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
     #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
     #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
     def process_weights_after_loading(self, layer: torch.nn.Module):
-        # TODO(czhu): optimize speed/mem usage
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
+            convert_packed_uint4b8_to_signed_int4_inplace(x.data)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
             x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t())
             return x
@@ -85,10 +88,18 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
             x.data = ops.cutlass_pack_scale_fp8(x.data)
             return x
 
+        w_s = getattr(layer, self.w_s_name)
+        fp8_scales, chan_scales = convert_bf16_scales_to_fp8(self.quant_fp8, w_s.data)
+        w_s.data = fp8_scales
+
+        # register per-channel scales
+        layer.register_parameter(
+            "weight_chan_scale", torch.nn.Parameter(chan_scales, requires_grad=False)
+        )
+
         # Encode/reorder weights and pack scales
         self._transform_param(layer, self.w_q_name, transform_w_q)
         self._transform_param(layer, self.w_s_name, transform_w_s)
-        self._transform_param(layer, "weight_chan_scale", lambda x: x)
 
     def apply_weights(
         self,
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 92ee8c498..d01263f82 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This file is used for /tests and /benchmarks"""
 
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from dataclasses import dataclass
 from types import MappingProxyType
 from typing import ClassVar, NamedTuple
@@ -691,3 +691,51 @@ def cutlass_fp4_supported() -> bool:
     capability_tuple = current_platform.get_device_capability()
     capability = -1 if capability_tuple is None else capability_tuple.to_int()
     return cutlass_scaled_mm_supports_fp4(capability)
+
+
+def convert_bf16_scales_to_fp8(
+    quant_fp8: Callable, scales: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Convert a BF16 scale tensor into the pair of (fp8_scales, channel_scales)
+    expected by W4A8 GEMM kernels.
+    """
+    assert scales.is_contiguous(), (
+        f"scale tensor must be contiguous, got {scales.stride()=}"
+    )
+    assert scales.is_cuda, "scales must be on gpu"
+
+    orig_shape = scales.shape
+    k_groups = orig_shape[-1]
+    flat_scales = scales.view(-1, k_groups)
+
+    fp8_scales, chan_scales = quant_fp8(flat_scales)
+    fp8_scales = (fp8_scales.float() / 8.0).to(torch.float8_e4m3fn)
+    chan_scales *= 8.0
+
+    # restore original shape
+    fp8_scales = fp8_scales.view(orig_shape)
+    chan_scales = chan_scales.view(orig_shape[:-1], -1)
+
+    return fp8_scales, chan_scales
+
+
+def convert_packed_uint4b8_to_signed_int4_inplace(t: torch.Tensor) -> torch.Tensor:
+    """
+    Convert int4b8 (packed to int32) to signed int4
+    """
+    assert t.is_cuda, "tensor must be on gpu"
+    assert t.dtype == torch.int32, f"expected int32 packed weights but got {t.dtype}"
+
+    # loop through the 8 4-bit nibbles in each int32 entry
+    for i in range(8):
+        shift = 4 * i
+        # extract the i-th 4-bit nibble
+        nib = (t >> shift) & 0xF
+        # clear the original nibble by masking out
+        t &= ~(0xF << shift)
+        # convert int4b8 [0..15] to signed int4 [-8..7] by subtracting 8
+        # and update in-place
+        t |= ((nib - 8) & 0xF) << shift
+
+    return t
-- 
GitLab


From 03b91f726214b4a022f73a084ee283001e5bba0c Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Mon, 8 Dec 2025 23:44:28 -0500
Subject: [PATCH 218/258] [Bugfix] Fix compressed-tensors models failing to
 load with transformers backend (#30287)

Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../compressed_tensors/compressed_tensors.py  | 39 ++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 21f4cfe51..f83558421 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -116,16 +116,37 @@ class CompressedTensorsConfig(QuantizationConfig):
         return "compressed-tensors"
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
-        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(self.target_scheme_map)
-        self.ignore = hf_to_vllm_mapper.apply_list(self.ignore)
-        self.sparsity_scheme_map = hf_to_vllm_mapper.apply_dict(
-            self.sparsity_scheme_map
-        )
-        self.sparsity_ignore_list = hf_to_vllm_mapper.apply_list(
-            self.sparsity_ignore_list
-        )
+        """
+        Transform layer paths in config targets to match vLLM's naming.
+
+        The WeightsMapper is designed for weight paths, but some backends
+        (e.g. transformers) use broad prefix mappings like "" -> "model."
+        which would incorrectly transform non-path targets.
+
+        compressed-tensors targets can be:
+        - Layer paths: "layers.0.self_attn.q_proj" -> transformed
+        - Module class names: "Linear" -> preserved (no ".")
+        - Regex patterns: "re:.*proj" -> preserved (starts with "re:")
+        """
+
+        def _map_target(target: str) -> str | None:
+            is_layer_path = "." in target and not target.startswith("re:")
+            if is_layer_path:
+                return hf_to_vllm_mapper._map_name(target)
+            return target
+
+        def _apply_dict(d: dict) -> dict:
+            return {k: v for t, v in d.items() if (k := _map_target(t)) is not None}
+
+        def _apply_list(lst: list) -> list:
+            return [t for x in lst if (t := _map_target(x)) is not None]
+
+        self.target_scheme_map = _apply_dict(self.target_scheme_map)
+        self.ignore = _apply_list(self.ignore)
+        self.sparsity_scheme_map = _apply_dict(self.sparsity_scheme_map)
+        self.sparsity_ignore_list = _apply_list(self.sparsity_ignore_list)
         if self.kv_cache_scheme is not None:
-            self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict(self.kv_cache_scheme)
+            self.kv_cache_scheme = _apply_dict(self.kv_cache_scheme)
 
     def get_quant_method(
         self,
-- 
GitLab


From 4c6fd258808ed42fc98a94f3a849f5fc9efebc20 Mon Sep 17 00:00:00 2001
From: Or Ozeri <oro@il.ibm.com>
Date: Tue, 9 Dec 2025 06:46:09 +0200
Subject: [PATCH 219/258] kv_transfer: Rename the shared storage connectors
 (#30201)

Signed-off-by: Or Ozeri <oro@il.ibm.com>
---
 .../scripts/hardware_ci/run-xpu-test.sh       |  2 +-
 docs/features/disagg_encoder.md               |  6 +-
 docs/features/disagg_prefill.md               |  4 +-
 .../decode_example.py                         |  2 +-
 .../prefill_example.py                        |  2 +-
 .../kv_load_failure_recovery/README.md        |  4 +-
 .../decode_example.py                         |  6 +-
 ....py => load_recovery_example_connector.py} | 20 ++---
 .../prefill_example.py                        |  2 +-
 .../disaggregated_encoder/README.md           |  6 +-
 .../disagg_1e1p1d_example.sh                  |  4 +-
 .../disagg_1e1pd_example.sh                   |  4 +-
 tests/distributed/test_kvlayout.py            |  2 +-
 tests/v1/core/test_scheduler.py               |  6 +-
 tests/v1/core/utils.py                        |  4 +-
 .../integration/run_epd_correctness_test.sh   |  8 +-
 ...nector.py => test_ec_example_connector.py} | 86 +++++++++----------
 tests/v1/engine/test_engine_core.py           |  4 +-
 .../unit/test_backwards_compatibility.py      |  8 +-
 ...connector.py => test_example_connector.py} |  6 +-
 .../unit/test_kv_connector_lifecyle.py        | 10 +--
 .../kv_connector/unit/test_multi_connector.py | 22 ++---
 tests/v1/kv_connector/unit/utils.py           | 10 +--
 ...rage_connector.py => example_connector.py} |  8 +-
 .../ec_transfer/ec_connector/factory.py       |  6 +-
 .../kv_transfer/kv_connector/factory.py       |  6 +-
 ...rage_connector.py => example_connector.py} | 10 +--
 27 files changed, 129 insertions(+), 129 deletions(-)
 rename examples/offline_inference/kv_load_failure_recovery/{rogue_shared_storage_connector.py => load_recovery_example_connector.py} (88%)
 rename tests/v1/ec_connector/unit/{test_ec_shared_storage_connector.py => test_ec_example_connector.py} (90%)
 rename tests/v1/kv_connector/unit/{test_shared_storage_connector.py => test_example_connector.py} (97%)
 rename vllm/distributed/ec_transfer/ec_connector/{shared_storage_connector.py => example_connector.py} (96%)
 rename vllm/distributed/kv_transfer/kv_connector/v1/{shared_storage_connector.py => example_connector.py} (98%)

diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 1d5dba3f2..dfc9db512 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -47,6 +47,6 @@ docker run \
     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
     pytest -v -s v1/structured_output
     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
     pytest -v -s v1/test_serial_utils.py
 '
diff --git a/docs/features/disagg_encoder.md b/docs/features/disagg_encoder.md
index 7d40af706..f18a0e85e 100644
--- a/docs/features/disagg_encoder.md
+++ b/docs/features/disagg_encoder.md
@@ -32,14 +32,14 @@ Design doc: <https://docs.google.com/document/d/1aed8KtC6XkXtdoV87pWT0a8OJlZ-Cpn
 
 ## 2  Usage Example
 
-The current reference pathway is **SharedStorageConnector**.  
+The current reference pathway is **ExampleConnector**.  
 Below ready-to-run scripts shows the workflow:
 
 1 Encoder instance + 1 PD instance:
-`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_encoder_example.sh`
+`examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh`
 
 1 Encoder instance + 1 Prefill instance + 1 Decode instance:
-`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_epd_example.sh`
+`examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh`
 
 ---
 
diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md
index fd4f249f2..dc5e11ea2 100644
--- a/docs/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@@ -21,14 +21,14 @@ Please refer to [examples/online_serving/disaggregated_prefill.sh](../../example
 
 Now supports 5 types of connectors:
 
-- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
+- **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
 - **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
 - **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
 - **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
 - **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as:
 
   ```bash
-  --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
+  --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"ExampleConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
   ```
 
 For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
index 8f3d1a5c0..2d575840e 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -30,7 +30,7 @@ def main():
         max_num_batched_tokens=64,
         max_num_seqs=16,
         kv_transfer_config=KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={"shared_storage_path": "local_storage"},
         ),
diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
index 0bfe7ec0e..207c6daeb 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -26,7 +26,7 @@ def main():
         enforce_eager=True,
         gpu_memory_utilization=0.8,
         kv_transfer_config=KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={"shared_storage_path": "local_storage"},
         ),
diff --git a/examples/offline_inference/kv_load_failure_recovery/README.md b/examples/offline_inference/kv_load_failure_recovery/README.md
index 230a16812..1f29a6ff5 100644
--- a/examples/offline_inference/kv_load_failure_recovery/README.md
+++ b/examples/offline_inference/kv_load_failure_recovery/README.md
@@ -10,7 +10,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
 - `decode_example.py` – performs the decode stage. Accepts:
     - `--simulate-failure`: simulates KV load failure using a custom connector.
     - `--async-load`: enables asynchronous KV loading mode.
-- `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
+- `load_recovery_example_connector.py` – defines `LoadRecoveryExampleConnector`, a subclass of `ExampleConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
 - `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages:
     1. Normal decode (baseline).
     2. Decode with simulated sync KV load failure.
@@ -20,7 +20,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
 
 ## How It Works
 
-- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
+- The test dynamically loads `LoadRecoveryExampleConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
 - The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
 - If recovery fails, the script prints a unified diff of the output mismatch and exits with error.
 
diff --git a/examples/offline_inference/kv_load_failure_recovery/decode_example.py b/examples/offline_inference/kv_load_failure_recovery/decode_example.py
index 69523f56e..d0df54167 100644
--- a/examples/offline_inference/kv_load_failure_recovery/decode_example.py
+++ b/examples/offline_inference/kv_load_failure_recovery/decode_example.py
@@ -35,13 +35,13 @@ def main():
 
     if args.simulate_failure:
         ktc = KVTransferConfig(
-            kv_connector="RogueSharedStorageConnector",
+            kv_connector="LoadRecoveryExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={
                 "shared_storage_path": "local_storage",
                 "async_load": args.async_load,
             },
-            kv_connector_module_path="rogue_shared_storage_connector",
+            kv_connector_module_path="load_recovery_example_connector",
         )
         out_file = (
             "async_decode_recovered_output.txt"
@@ -50,7 +50,7 @@ def main():
         )
     else:
         ktc = KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={
                 "shared_storage_path": "local_storage",
diff --git a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py b/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
similarity index 88%
rename from examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py
rename to examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
index 5b2acea4c..7aab07f8a 100644
--- a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py
+++ b/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
@@ -10,9 +10,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorMetadata,
     KVConnectorRole,
 )
-from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
-    SharedStorageConnector,
-    SharedStorageConnectorMetadata,
+from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (
+    ExampleConnector,
+    ExampleConnectorMetadata,
 )
 from vllm.forward_context import ForwardContext
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
@@ -26,15 +26,15 @@ logging.basicConfig(level=logging.INFO)
 
 
 @dataclass
-class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
+class LoadRecoveryExampleConnectorMetadata(ExampleConnectorMetadata):
     req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)
 
     @classmethod
-    def from_base(cls, base: SharedStorageConnectorMetadata):
+    def from_base(cls, base: ExampleConnectorMetadata):
         return cls(requests=base.requests)
 
 
-class RogueSharedStorageConnector(SharedStorageConnector):
+class LoadRecoveryExampleConnector(ExampleConnector):
     def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
         super().__init__(vllm_config=vllm_config, role=role)
         self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
@@ -45,7 +45,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
         self._req_to_block_ids: dict[str, list[int]] = dict()
 
     def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
-        assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
+        assert isinstance(connector_metadata, LoadRecoveryExampleConnectorMetadata)
         index, failed_request = next(
             (
                 (i, x)
@@ -84,7 +84,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
     ) -> tuple[set[str] | None, set[str] | None]:
         if self._async_load:
             meta = self._get_connector_metadata()
-            assert isinstance(meta, RogueSharedStorageConnectorMetadata)
+            assert isinstance(meta, LoadRecoveryExampleConnectorMetadata)
             if meta.req_to_block_ids:
                 return None, set(meta.req_to_block_ids)
 
@@ -126,9 +126,9 @@ class RogueSharedStorageConnector(SharedStorageConnector):
     ) -> KVConnectorMetadata:
         if not self._async_load:
             base = super().build_connector_meta(scheduler_output)
-            meta = RogueSharedStorageConnectorMetadata.from_base(base)
+            meta = LoadRecoveryExampleConnectorMetadata.from_base(base)
         else:
-            meta = RogueSharedStorageConnectorMetadata()
+            meta = LoadRecoveryExampleConnectorMetadata()
             if self._requests_need_load:
                 for req_id, request in self._requests_need_load.items():
                     meta.add_request(
diff --git a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
index 047b81c82..ee4a84fd9 100644
--- a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
+++ b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
@@ -26,7 +26,7 @@ def main():
         enforce_eager=True,
         gpu_memory_utilization=0.8,
         kv_transfer_config=KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={"shared_storage_path": "local_storage"},
         ),
diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md
index 5813a3cec..b2c3bb974 100644
--- a/examples/online_serving/disaggregated_encoder/README.md
+++ b/examples/online_serving/disaggregated_encoder/README.md
@@ -50,12 +50,12 @@ The vllm instances and `disagg_encoder_proxy` supports local URIs with ```{"url"
 
 ## EC connector and KV transfer
 
-The `ECSharedStorageConnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration:
+The `ECExampleonnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration:
 
 ```bash
 # Add to encoder instance: 
 --ec-transfer-config '{
-    "ec_connector": "ECSharedStorageConnector",
+    "ec_connector": "ECExampleConnector",
     "ec_role": "ec_producer",
     "ec_connector_extra_config": {
         "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -64,7 +64,7 @@ The `ECSharedStorageConnector` is used to store the encoder cache on local disk
 
 # Add to prefill/prefill+decode instance: 
 --ec-transfer-config '{
-    "ec_connector": "ECSharedStorageConnector",
+    "ec_connector": "ECExampleConnector",
     "ec_role": "ec_consumer",
     "ec_connector_extra_config": {
         "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
index 57489df64..95a418374 100644
--- a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
@@ -102,7 +102,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
     --max-num-seqs 128 \
     --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
     --ec-transfer-config '{
-        "ec_connector": "ECSharedStorageConnector",
+        "ec_connector": "ECExampleConnector",
         "ec_role": "ec_producer",
         "ec_connector_extra_config": {
             "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -126,7 +126,7 @@ vllm serve "$MODEL" \
     --max-num-seqs 128 \
     --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
     --ec-transfer-config '{
-        "ec_connector": "ECSharedStorageConnector",
+        "ec_connector": "ECExampleConnector",
         "ec_role": "ec_consumer",
         "ec_connector_extra_config": {
             "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
index 6073e0580..c4a591d74 100644
--- a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
@@ -96,7 +96,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
     --max-num-seqs 128 \
     --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
     --ec-transfer-config '{
-        "ec_connector": "ECSharedStorageConnector",
+        "ec_connector": "ECExampleConnector",
         "ec_role": "ec_producer",
         "ec_connector_extra_config": {
             "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -117,7 +117,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
     --max-num-seqs 128 \
     --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
     --ec-transfer-config '{
-        "ec_connector": "ECSharedStorageConnector",
+        "ec_connector": "ECExampleConnector",
         "ec_role": "ec_consumer",
         "ec_connector_extra_config": {
             "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py
index b190b2820..c8177f1c7 100644
--- a/tests/distributed/test_kvlayout.py
+++ b/tests/distributed/test_kvlayout.py
@@ -61,7 +61,7 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
         kv_role="kv_both",
         kv_connector_extra_config={
             "connectors": [
-                {"kv_connector": "SharedStorageConnector", "kv_role": "kv_both"},
+                {"kv_connector": "ExampleConnector", "kv_role": "kv_both"},
                 {"kv_connector": "NixlConnector", "kv_role": "kv_both"},
             ]
         },
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index c6c4a5085..1999e9f6c 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1536,7 +1536,7 @@ def create_scheduler_with_priority(
     )
     kv_transfer_config = (
         KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={"shared_storage_path": "local_storage"},
         )
@@ -1552,7 +1552,7 @@ def create_scheduler_with_priority(
 
     ec_transfer_config = (
         ECTransferConfig(
-            ec_connector="ECSharedStorageConnector",
+            ec_connector="ECExampleConnector",
             ec_role=ec_role,
             ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"},
         )
@@ -2413,7 +2413,7 @@ def _assert_right_ec_connector_metadata(
     metadata_dict = {mm_data.mm_hash: mm_data for mm_data in metadata.mm_datas}
 
     # Check all required identifiers exist in metadata; and no extra
-    # In ECSharedStorageConnector format
+    # In ECExampleConnector format
     # NOTE: even having same identifier, the mm_features can be different
     # since their mm_position can be in different offsets, etc
     identifiers_dict = {f.identifier for f in mm_features_list}
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index f5ba613d3..531b9c595 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -108,7 +108,7 @@ def create_scheduler(
         )
     elif use_kv_connector:
         kv_transfer_config = KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={"shared_storage_path": "local_storage"},
         )
@@ -121,7 +121,7 @@ def create_scheduler(
 
     ec_transfer_config = (
         ECTransferConfig(
-            ec_connector="ECSharedStorageConnector",
+            ec_connector="ECExampleConnector",
             ec_role=ec_role,
             ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"},
         )
diff --git a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
index 55dd39c0a..0c2666306 100644
--- a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
+++ b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh
@@ -148,7 +148,7 @@ run_epd_1e_1pd() {
         --max-num-seqs 128 \
         --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
         --ec-transfer-config '{
-            "ec_connector": "ECSharedStorageConnector",
+            "ec_connector": "ECExampleConnector",
             "ec_role": "ec_producer",
             "ec_connector_extra_config": {
                 "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -167,7 +167,7 @@ run_epd_1e_1pd() {
         --max-num-seqs 128 \
         --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
         --ec-transfer-config '{
-            "ec_connector": "ECSharedStorageConnector",
+            "ec_connector": "ECExampleConnector",
             "ec_role": "ec_consumer",
             "ec_connector_extra_config": {
                 "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -348,7 +348,7 @@ run_epd_1e_1p_1d() {
         --max-num-seqs 128 \
         --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
         --ec-transfer-config '{
-            "ec_connector": "ECSharedStorageConnector",
+            "ec_connector": "ECExampleConnector",
             "ec_role": "ec_producer",
             "ec_connector_extra_config": {
                 "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -369,7 +369,7 @@ run_epd_1e_1p_1d() {
         --max-num-seqs 128 \
         --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
         --ec-transfer-config '{
-            "ec_connector": "ECSharedStorageConnector",
+            "ec_connector": "ECExampleConnector",
             "ec_role": "ec_consumer",
             "ec_connector_extra_config": {
                 "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
diff --git a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py b/tests/v1/ec_connector/unit/test_ec_example_connector.py
similarity index 90%
rename from tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py
rename to tests/v1/ec_connector/unit/test_ec_example_connector.py
index a58daa262..7e9eb2131 100644
--- a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py
+++ b/tests/v1/ec_connector/unit/test_ec_example_connector.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Unit tests for ECSharedStorageConnector.
+Unit tests for ECExampleConnector.
 """
 
 import os
@@ -13,9 +13,9 @@ import torch
 
 from vllm.config import VllmConfig
 from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorRole
-from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (
-    ECSharedStorageConnector,
-    ECSharedStorageConnectorMetadata,
+from vllm.distributed.ec_transfer.ec_connector.example_connector import (
+    ECExampleConnector,
+    ECExampleConnectorMetadata,
     MMMeta,
 )
 from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange
@@ -81,12 +81,12 @@ def mock_request_with_3_mm():
 
 
 # ------------------ Unit Tests ------------------ #
-class TestECSharedStorageConnectorBasics:
+class TestECExampleConnectorBasics:
     """Test basic EC connector functionality."""
 
     def test_initialization_producer(self, mock_vllm_config_producer, temp_storage):
         """Test connector initializes correctly as producer."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -98,7 +98,7 @@ class TestECSharedStorageConnectorBasics:
 
     def test_initialization_consumer(self, mock_vllm_config_consumer, temp_storage):
         """Test connector initializes correctly as consumer."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
@@ -109,11 +109,11 @@ class TestECSharedStorageConnectorBasics:
 
     def test_role_assignment(self, mock_vllm_config_producer):
         """Test role is correctly assigned."""
-        scheduler_connector = ECSharedStorageConnector(
+        scheduler_connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
-        worker_connector = ECSharedStorageConnector(
+        worker_connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -133,7 +133,7 @@ class TestCacheExistence:
     ):
         """Test has_caches returns True when all 3 caches exist."""
         # Test for producer first
-        producer = ECSharedStorageConnector(
+        producer = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -154,7 +154,7 @@ class TestCacheExistence:
         assert all(producer_result), f"Expected all True, got {producer_result}"
 
         # Also test consumer can check if cache exists
-        consumer = ECSharedStorageConnector(
+        consumer = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -170,7 +170,7 @@ class TestCacheExistence:
         self, mock_vllm_config_producer, mock_request_with_3_mm
     ):
         """Test has_caches returns False when no caches exist."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -186,7 +186,7 @@ class TestCacheExistence:
         self, mock_vllm_config_producer, mock_request_with_3_mm
     ):
         """Test has_caches with some caches existing (1 of 3)."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -213,7 +213,7 @@ class TestStateManagement:
         self, mock_vllm_config_producer, mock_request_with_3_mm
     ):
         """Test state update after allocation for 3 MM items."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -238,7 +238,7 @@ class TestStateManagement:
         self, mock_vllm_config_producer, mock_request_with_3_mm
     ):
         """Test metadata building for 3 MM items."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -252,7 +252,7 @@ class TestStateManagement:
         metadata = connector.build_connector_meta(scheduler_output)
 
         # Assert
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert isinstance(metadata, ECExampleConnectorMetadata)
         assert len(metadata.mm_datas) == 3
         assert metadata.mm_datas[0].mm_hash == "img_hash_1"
         assert metadata.mm_datas[0].num_token == 100
@@ -266,7 +266,7 @@ class TestStateManagement:
 
     def test_build_connector_meta_empty(self, mock_vllm_config_producer):
         """Test metadata building with empty state."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -274,14 +274,14 @@ class TestStateManagement:
         scheduler_output = Mock(spec=SchedulerOutput)
         metadata = connector.build_connector_meta(scheduler_output)
 
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert isinstance(metadata, ECExampleConnectorMetadata)
         assert len(metadata.mm_datas) == 0
 
     def test_state_cleared_after_metadata_build(
         self, mock_vllm_config_producer, mock_request_with_3_mm
     ):
         """Test that state is properly cleared after building metadata."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
@@ -310,7 +310,7 @@ class TestCacheSaving:
         self, mock_vllm_config_producer, mock_request_with_3_mm, temp_storage
     ):
         """Test cache saving as producer for 3 different MM items."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -336,7 +336,7 @@ class TestCacheSaving:
 
     def test_save_caches_consumer_skips(self, mock_vllm_config_consumer):
         """Test cache saving is skipped for consumer."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
@@ -366,7 +366,7 @@ class TestCacheLoading:
     ):
         """Test consumer loads 3 caches from storage."""
         # First, create producer to save caches
-        producer = ECSharedStorageConnector(
+        producer = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -379,13 +379,13 @@ class TestCacheLoading:
             producer.save_caches(saved_caches, mm_hash)
 
         # Now consumer loads
-        consumer = ECSharedStorageConnector(
+        consumer = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
 
         # Setup metadata for all 3
-        metadata = ECSharedStorageConnectorMetadata()
+        metadata = ECExampleConnectorMetadata()
         for mm_hash in mm_hashes:
             metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100))
         consumer.bind_connector_metadata(metadata)
@@ -410,7 +410,7 @@ class TestCacheLoading:
     ):
         """Test cache loading skips already cached items."""
         # Setup: producer saves cache
-        producer = ECSharedStorageConnector(
+        producer = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -420,12 +420,12 @@ class TestCacheLoading:
         producer.save_caches({mm_hash: saved_cache}, mm_hash)
 
         # Consumer setup
-        consumer = ECSharedStorageConnector(
+        consumer = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
 
-        metadata = ECSharedStorageConnectorMetadata()
+        metadata = ECExampleConnectorMetadata()
         metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100))
         consumer.bind_connector_metadata(metadata)
 
@@ -444,13 +444,13 @@ class TestCacheLoading:
 
     def test_start_load_caches_empty_metadata(self, mock_vllm_config_consumer):
         """Test loading with empty metadata does nothing."""
-        consumer = ECSharedStorageConnector(
+        consumer = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
 
         # Setup empty metadata
-        metadata = ECSharedStorageConnectorMetadata()
+        metadata = ECExampleConnectorMetadata()
         consumer.bind_connector_metadata(metadata)
 
         # Load (should not raise)
@@ -466,7 +466,7 @@ class TestFilenameGeneration:
 
     def test_generate_foldername(self, mock_vllm_config_producer, temp_storage):
         """Test folder name generation."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -479,7 +479,7 @@ class TestFilenameGeneration:
 
     def test_generate_filename(self, mock_vllm_config_producer, temp_storage):
         """Test filename generation."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -493,7 +493,7 @@ class TestFilenameGeneration:
 
     def test_generate_filename_consistency(self, mock_vllm_config_producer):
         """Test filename generation is consistent."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -510,12 +510,12 @@ class TestMetadataBindingLifecycle:
 
     def test_bind_connector_metadata(self, mock_vllm_config_consumer):
         """Test binding connector metadata."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
 
-        metadata = ECSharedStorageConnectorMetadata()
+        metadata = ECExampleConnectorMetadata()
         metadata.add_mm_data(MMMeta.make_meta("hash_1", 100))
 
         connector.bind_connector_metadata(metadata)
@@ -524,12 +524,12 @@ class TestMetadataBindingLifecycle:
 
     def test_clear_connector_metadata(self, mock_vllm_config_consumer):
         """Test clearing connector metadata."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
 
-        metadata = ECSharedStorageConnectorMetadata()
+        metadata = ECExampleConnectorMetadata()
         connector.bind_connector_metadata(metadata)
 
         connector.clear_connector_metadata()
@@ -538,12 +538,12 @@ class TestMetadataBindingLifecycle:
 
     def test_get_connector_metadata(self, mock_vllm_config_consumer):
         """Test getting connector metadata."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
 
-        metadata = ECSharedStorageConnectorMetadata()
+        metadata = ECExampleConnectorMetadata()
         connector.bind_connector_metadata(metadata)
 
         retrieved = connector._get_connector_metadata()
@@ -552,7 +552,7 @@ class TestMetadataBindingLifecycle:
 
     def test_get_connector_metadata_not_set(self, mock_vllm_config_consumer):
         """Test getting metadata when not set raises."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
@@ -566,7 +566,7 @@ class TestEdgeCases:
 
     def test_save_empty_cache(self, mock_vllm_config_producer):
         """Test saving empty tensor."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.WORKER,
         )
@@ -579,12 +579,12 @@ class TestEdgeCases:
 
     def test_load_nonexistent_cache(self, mock_vllm_config_consumer):
         """Test loading cache that doesn't exist raises error."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_consumer,
             role=ECConnectorRole.WORKER,
         )
 
-        metadata = ECSharedStorageConnectorMetadata()
+        metadata = ECExampleConnectorMetadata()
         metadata.add_mm_data(MMMeta.make_meta("nonexistent_hash", 100))
         connector.bind_connector_metadata(metadata)
 
@@ -596,7 +596,7 @@ class TestEdgeCases:
 
     def test_has_caches_empty_request(self, mock_vllm_config_producer):
         """Test has_caches with request that has no MM data."""
-        connector = ECSharedStorageConnector(
+        connector = ECExampleConnector(
             vllm_config=mock_vllm_config_producer,
             role=ECConnectorRole.SCHEDULER,
         )
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 48be8c15a..5fa16897b 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -507,7 +507,7 @@ def test_encoder_instance_zero_kv_cache(
     )
     kv_transfer_config = (
         KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
             kv_role="kv_both",
             kv_connector_extra_config={"shared_storage_path": "local_storage"},
         )
@@ -515,7 +515,7 @@ def test_encoder_instance_zero_kv_cache(
         else None
     )
     ec_transfer_config = ECTransferConfig(
-        ec_connector="ECSharedStorageConnector",
+        ec_connector="ECExampleConnector",
         ec_role=ec_role,
         ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test_encoder"},
     )
diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
index 7cd23805c..0d29ca5fc 100644
--- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py
+++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
@@ -218,12 +218,12 @@ def test_internal_connector_uses_new_signature():
     Test that internal connectors (registered in factory) always use the new
     signature and get kv_cache_config.
     """
-    from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
-        SharedStorageConnector,
+    from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (
+        ExampleConnector,
     )
 
     vllm_config = create_vllm_config()
-    vllm_config.kv_transfer_config.kv_connector = "SharedStorageConnector"
+    vllm_config.kv_transfer_config.kv_connector = "ExampleConnector"
 
     scheduler = create_scheduler(vllm_config)
     kv_cache_config = scheduler.kv_cache_config
@@ -233,7 +233,7 @@ def test_internal_connector_uses_new_signature():
     )
 
     assert connector is not None
-    assert isinstance(connector, SharedStorageConnector)
+    assert isinstance(connector, ExampleConnector)
     assert connector._kv_cache_config is not None
     assert connector._kv_cache_config == kv_cache_config
 
diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py
similarity index 97%
rename from tests/v1/kv_connector/unit/test_shared_storage_connector.py
rename to tests/v1/kv_connector/unit/test_example_connector.py
index ff4697a97..75edb79fb 100644
--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_example_connector.py
@@ -119,16 +119,16 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
 )
 def test_shared_storage_connector_hashes(tmp_path):
     """
-    Tests that SharedStorageConnector saves KV to the storage locations
+    Tests that ExampleConnector saves KV to the storage locations
     with proper hashes; that are unique for inputs with identical text but
     different images (same size), or same multiple images but different orders.
     """
     # Using tmp_path as the storage path to store KV
     print(f"KV storage path at: {str(tmp_path)}")
 
-    # Configure the SharedStorageConnector
+    # Configure the ExampleConnector
     kv_transfer_config = KVTransferConfig(
-        kv_connector="SharedStorageConnector",
+        kv_connector="ExampleConnector",
         kv_role="kv_both",
         kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
     )
diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
index d0a6eeae6..4ba6b2201 100644
--- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
+++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa: E501
-    SharedStorageConnectorMetadata,
+from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (  # noqa: E501
+    ExampleConnectorMetadata,
 )
 from vllm.distributed.kv_transfer.kv_transfer_state import (
     ensure_kv_transfer_initialized,
@@ -11,7 +11,7 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
 from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 
-# Importing utils registers TestSharedStorageConnector with the factory
+# Importing utils registers TestExampleConnector with the factory
 from .utils import create_vllm_config
 
 
@@ -26,13 +26,13 @@ def _make_empty_scheduler_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        kv_connector_metadata=SharedStorageConnectorMetadata(),
+        kv_connector_metadata=ExampleConnectorMetadata(),
     )
 
 
 def test_kv_connector_mixin_clears_metadata():
     vllm_config = create_vllm_config()
-    vllm_config.kv_transfer_config.kv_connector = "TestSharedStorageConnector"
+    vllm_config.kv_transfer_config.kv_connector = "TestExampleConnector"
     vllm_config.kv_transfer_config.kv_role = "kv_both"
     vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = "unit"
 
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index ffa7d884d..9b6d52e7c 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -77,9 +77,9 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
         "https://github.com/ROCm/pytorch/issues/2822"
     ),
 )
-def test_multi_shared_storage_connector_consistency():
+def test_multi_example_connector_consistency():
     """
-    Tests that MultiConnector with two SharedStorageConnectors saves
+    Tests that MultiConnector with two ExampleConnectors saves
     identical KV cache data to separate storage locations.
     """
     storage_1_path = Path("storage_1/")
@@ -89,14 +89,14 @@ def test_multi_shared_storage_connector_consistency():
     storage_1_path.mkdir()
     storage_2_path.mkdir()
 
-    # Configure MultiConnector with two SharedStorageConnectors
+    # Configure MultiConnector with two ExampleConnectors
     kv_transfer_config = KVTransferConfig(
         kv_connector="MultiConnector",
         kv_role="kv_both",
         kv_connector_extra_config={
             "connectors": [
                 {
-                    "kv_connector": "TestSharedStorageConnector",
+                    "kv_connector": "TestExampleConnector",
                     "kv_role": "kv_both",
                     "kv_connector_extra_config": {
                         "shared_storage_path": str(storage_1_path),
@@ -105,7 +105,7 @@ def test_multi_shared_storage_connector_consistency():
                     "kv_connector_module_path": "tests.v1.kv_connector.unit.utils",
                 },
                 {
-                    "kv_connector": "TestSharedStorageConnector",
+                    "kv_connector": "TestExampleConnector",
                     "kv_role": "kv_both",
                     "kv_connector_extra_config": {
                         "shared_storage_path": str(storage_2_path),
@@ -427,7 +427,7 @@ class TestMultiConnectorStats:
 
     def test_build_kv_connector_stats_skips_connectors_without_custom_stats(self):
         """Test that connectors without custom stats (return None) are skipped."""
-        # SharedStorageConnector doesn't override build_kv_connector_stats,
+        # ExampleConnector doesn't override build_kv_connector_stats,
         # so it returns None and should be skipped
         serialized_data = {
             "NixlConnector": {
@@ -440,7 +440,7 @@ class TestMultiConnectorStats:
                     "num_failed_notifications": [],
                 }
             },
-            "SharedStorageConnector": {"data": {"some_field": [1, 2, 3]}},
+            "ExampleConnector": {"data": {"some_field": [1, 2, 3]}},
         }
 
         stats = MultiConnector.build_kv_connector_stats(data=serialized_data)
@@ -451,8 +451,8 @@ class TestMultiConnectorStats:
         assert len(stats.data) == 1
         assert "NixlConnector" in stats.data
         assert isinstance(stats.data["NixlConnector"], NixlKVConnectorStats)
-        # SharedStorageConnector should be skipped (returns None)
-        assert "SharedStorageConnector" not in stats.data
+        # ExampleConnector should be skipped (returns None)
+        assert "ExampleConnector" not in stats.data
 
     def test_build_kv_connector_stats_handles_malformed_data(self):
         """Test that malformed data raises appropriate errors."""
@@ -527,13 +527,13 @@ class TestMultiConnectorStats:
         )
 
         stats2 = MultiKVConnectorStats(
-            data={"SharedStorageConnector": KVConnectorStats(data={"field": [1, 2]})}
+            data={"ExampleConnector": KVConnectorStats(data={"field": [1, 2]})}
         )
 
         result = stats1.aggregate(stats2)
 
         assert "NixlConnector" in result.data
-        assert "SharedStorageConnector" in result.data
+        assert "ExampleConnector" in result.data
 
     def test_reduce(self):
         """Test that reduce() correctly reduces all nested connector stats."""
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 58f1a7282..5cdb1f84b 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -24,8 +24,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorMetadata,
     KVConnectorRole,
 )
-from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
-    SharedStorageConnector,
+from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (  # noqa
+    ExampleConnector,
 )
 from vllm.utils.hashing import sha256
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
@@ -264,10 +264,10 @@ def create_model_runner_output(
     )
 
 
-class TestSharedStorageConnector(SharedStorageConnector):
+class TestExampleConnector(ExampleConnector):
     def __init__(self, config: VllmConfig, role, kv_cache_config):
         self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
-        self._connector = SharedStorageConnector(config, role)
+        self._connector = ExampleConnector(config, role)
         self.call_record: dict[str, int] = defaultdict(int)
         # Use a unique temp file per connector
         self._event_file = (
@@ -394,7 +394,7 @@ class MockKVConnector(KVConnectorBase_V1):
 
 
 KVConnectorFactory.register_connector(
-    "TestSharedStorageConnector", __name__, TestSharedStorageConnector.__name__
+    "TestExampleConnector", __name__, TestExampleConnector.__name__
 )
 
 KVConnectorFactory.register_connector(
diff --git a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
similarity index 96%
rename from vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py
rename to vllm/distributed/ec_transfer/ec_connector/example_connector.py
index c8388141d..5f2eff5a8 100644
--- a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py
+++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
@@ -32,7 +32,7 @@ class MMMeta:
 
 
 @dataclass
-class ECSharedStorageConnectorMetadata(ECConnectorMetadata):
+class ECExampleConnectorMetadata(ECConnectorMetadata):
     mm_datas: list[MMMeta]
 
     def __init__(self):
@@ -42,7 +42,7 @@ class ECSharedStorageConnectorMetadata(ECConnectorMetadata):
         self.mm_datas.append(mm_data)
 
 
-class ECSharedStorageConnector(ECConnectorBase):
+class ECExampleConnector(ECConnectorBase):
     # NOTE: This is Simple debug implementation of the EC connector.
     # It save / load the EC cache to / from the disk.
 
@@ -76,7 +76,7 @@ class ECSharedStorageConnector(ECConnectorBase):
 
         # Get the metadata
         metadata: ECConnectorMetadata = self._get_connector_metadata()
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert isinstance(metadata, ECExampleConnectorMetadata)
         assert encoder_cache is not None
         if metadata is None:
             logger.warning(
@@ -160,7 +160,7 @@ class ECSharedStorageConnector(ECConnectorBase):
         Args:
             scheduler_output (SchedulerOutput): the scheduler output object.
         """
-        meta = ECSharedStorageConnectorMetadata()
+        meta = ECExampleConnectorMetadata()
         for mm_hash, num_encoder_token in self._mm_datas_need_loads.items():
             meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token))
         self._mm_datas_need_loads.clear()
diff --git a/vllm/distributed/ec_transfer/ec_connector/factory.py b/vllm/distributed/ec_transfer/ec_connector/factory.py
index e51b32e6f..32f36ffbb 100644
--- a/vllm/distributed/ec_transfer/ec_connector/factory.py
+++ b/vllm/distributed/ec_transfer/ec_connector/factory.py
@@ -79,7 +79,7 @@ class ECConnectorFactory:
 # only load the files corresponding to the current connector.
 
 ECConnectorFactory.register_connector(
-    "ECSharedStorageConnector",
-    "vllm.distributed.ec_transfer.ec_connector.shared_storage_connector",
-    "ECSharedStorageConnector",
+    "ECExampleConnector",
+    "vllm.distributed.ec_transfer.ec_connector.example_connector",
+    "ECExampleConnector",
 )
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 02f51a1dc..02d9a1ec9 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -144,9 +144,9 @@ class KVConnectorFactory:
 # only load the files corresponding to the current connector.
 
 KVConnectorFactory.register_connector(
-    "SharedStorageConnector",
-    "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector",
-    "SharedStorageConnector",
+    "ExampleConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.example_connector",
+    "ExampleConnector",
 )
 
 KVConnectorFactory.register_connector(
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
similarity index 98%
rename from vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
rename to vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
index ed641cfc4..41243fc86 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
@@ -65,7 +65,7 @@ class ReqMeta:
 
 
 @dataclass
-class SharedStorageConnectorMetadata(KVConnectorMetadata):
+class ExampleConnectorMetadata(KVConnectorMetadata):
     requests: list[ReqMeta] = field(default_factory=list)
 
     def add_request(
@@ -81,7 +81,7 @@ class SharedStorageConnectorMetadata(KVConnectorMetadata):
         )
 
 
-class SharedStorageConnector(KVConnectorBase_V1):
+class ExampleConnector(KVConnectorBase_V1):
     # NOTE: This is Simple debug implementation of the KV connector.
     # It save / load the KV cache to / from the disk.
     # It does extra work which will overwrite the existing prefix-cache in GPU
@@ -157,7 +157,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
 
         # Get the metadata
         metadata: KVConnectorMetadata = self._get_connector_metadata()
-        assert isinstance(metadata, SharedStorageConnectorMetadata)
+        assert isinstance(metadata, ExampleConnectorMetadata)
 
         if metadata is None:
             logger.warning(
@@ -241,7 +241,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
             return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...]
 
         connector_metadata = self._get_connector_metadata()
-        assert isinstance(connector_metadata, SharedStorageConnectorMetadata)
+        assert isinstance(connector_metadata, ExampleConnectorMetadata)
         for request in connector_metadata.requests:
             if request.is_store:
                 filename = self._generate_filename_debug(
@@ -315,7 +315,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
         Args:
             scheduler_output (SchedulerOutput): the scheduler output object.
         """
-        meta = SharedStorageConnectorMetadata()
+        meta = ExampleConnectorMetadata()
 
         total_need_load = 0
         for new_req in scheduler_output.scheduled_new_reqs:
-- 
GitLab


From 4b03b502119f857bbead7064a29267b9ea8999e5 Mon Sep 17 00:00:00 2001
From: liangel-02 <liangel@meta.com>
Date: Mon, 8 Dec 2025 23:46:35 -0500
Subject: [PATCH 220/258] update torchao safetensors impl (#30155)

Signed-off-by: Angel Li <liangel@meta.com>
---
 vllm/model_executor/model_loader/weight_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 0496b7a84..610e6a620 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -641,7 +641,6 @@ def safetensors_weights_iterator(
     if safetensors_load_strategy == "eager":
         loading_desc += " (eager)"
 
-    state_dict = {}
     leftover_state_dict: dict[str, torch.Tensor] = {}
 
     for st_file in tqdm(
@@ -667,6 +666,7 @@ def safetensors_weights_iterator(
             )
 
             with safe_open(st_file, framework="pt") as f:
+                state_dict = {}
                 for name in f.keys():  # noqa: SIM118
                     state_dict[name] = f.get_tensor(name)
 
-- 
GitLab


From e130845984b77b248436bdbe8f3afdf7b3107a62 Mon Sep 17 00:00:00 2001
From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Date: Tue, 9 Dec 2025 04:55:39 +0000
Subject: [PATCH 221/258] [CPU][CI] Enable fused MoE tests in Arm CI (#30132)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
---
 .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index b5f6b2494..9c6e7766b 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -40,7 +40,8 @@ function cpu_tests() {
   docker exec cpu-test bash -c "
     set -e
     pytest -x -v -s tests/kernels/test_onednn.py
-    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
 
   # basic online serving
   docker exec cpu-test bash -c '
-- 
GitLab


From c2e1987a6e794b745be4c060ae4bbd06864d8028 Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli.lin@intel.com>
Date: Tue, 9 Dec 2025 13:16:44 +0800
Subject: [PATCH 222/258] [Doc] update Intel GPU MM status in Feature x
 Hardware matrix (#30294)

Signed-off-by: Lin, Fanli <fanli.lin@intel.com>
---
 docs/features/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/features/README.md b/docs/features/README.md
index 684802301..e9e523292 100644
--- a/docs/features/README.md
+++ b/docs/features/README.md
@@ -68,8 +68,8 @@ th:not(:first-child) {
 | CUDA graph                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | [❌](https://github.com/vllm-project/vllm/issues/26970)        |
 | [pooling](../models/pooling_models.md)                    | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❌     | ✅        |
-| [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | [🟠](https://github.com/vllm-project/vllm/issues/26965)       |
-| [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     | ✅       |
+| [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
+| [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     | ✅        |
 | <abbr title="Logprobs">logP</abbr>                        | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Async Output Processing">async output</abbr> | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ❌     | ✅        |
-- 
GitLab


From 58d5b3f51455706bf4f1f2360a0feb83d161147e Mon Sep 17 00:00:00 2001
From: Tsukasa OI <floss_llm@irq.a4lg.com>
Date: Tue, 9 Dec 2025 14:30:05 +0900
Subject: [PATCH 223/258] [Model][Quantization] Restore MoE + GGUF models
 support (incl. Qwen3 MoE) by allowing Sideload Parameters (#30116)

Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../layers/quantization/gguf.py               |  1 +
 .../model_loader/gguf_loader.py               | 24 ++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index bcdfafb50..ee819df29 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -82,6 +82,7 @@ class GGUFConfig(QuantizationConfig):
                 return UnquantizedEmbeddingMethod()
             return GGUFEmbeddingMethod(self)
         elif isinstance(layer, FusedMoE):
+            # TODO: Select UnquantizedFusedMoEMethod on unquantized layers.
             return GGUFMoEMethod(self, layer.moe_config)
         return None
 
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index 74052f72c..7f94bd234 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -4,6 +4,7 @@ import os
 from collections.abc import Generator
 
 import gguf
+import regex as re
 import torch
 import torch.nn as nn
 from huggingface_hub import hf_hub_download
@@ -94,6 +95,7 @@ class GGUFModelLoader(BaseModelLoader):
             hasattr(config, "vision_config") and config.vision_config is not None
         )
         gguf_to_hf_name_map = {}
+        sideload_params: list[re.Pattern] = []
         # hack: ggufs have a different name than transformers
         if model_type == "cohere":
             model_type = "command-r"
@@ -118,6 +120,12 @@ class GGUFModelLoader(BaseModelLoader):
                 gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                     f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                 )
+                sideload_params.append(
+                    re.compile(
+                        f"model\\.layers\\.{idx}"
+                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
+                    )
+                )
         if model_type in ("qwen2_moe", "qwen3_moe"):
             model_type = model_type.replace("_", "")
             # GGUF layer map assumes that we will have a merged expert weights
@@ -132,6 +140,12 @@ class GGUFModelLoader(BaseModelLoader):
                 gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                     f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                 )
+                sideload_params.append(
+                    re.compile(
+                        f"model\\.layers\\.{idx}"
+                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
+                    )
+                )
 
         arch = None
         for key, value in gguf.MODEL_ARCH_NAMES.items():
@@ -241,7 +255,15 @@ class GGUFModelLoader(BaseModelLoader):
                 # Parameter not in manual overrides either
                 unmapped_params.append(hf_name)
 
-        # All parameters must be mapped: both vision/projector and backbone
+        # All parameters (except those initialized by other means) must be mapped:
+        # both vision/projector and backbone
+        if unmapped_params:
+            unmapped_params = list(
+                filter(
+                    lambda x: not any(re.fullmatch(p, x) for p in sideload_params),
+                    unmapped_params,
+                )
+            )
         if unmapped_params:
             raise RuntimeError(
                 f"Failed to map GGUF parameters "
-- 
GitLab


From e4605d225e020154bc98efd15a05e11b83eaefb7 Mon Sep 17 00:00:00 2001
From: Yongtao Huang <yongtaoh2022@gmail.com>
Date: Tue, 9 Dec 2025 14:50:06 +0800
Subject: [PATCH 224/258] [Misc] Fix safetensors import for safe_open (#30300)

Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
---
 vllm/lora/lora_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py
index db170f13a..f5e36697e 100644
--- a/vllm/lora/lora_model.py
+++ b/vllm/lora/lora_model.py
@@ -3,7 +3,7 @@
 
 import os
 
-import safetensors.torch
+import safetensors
 import torch
 
 from vllm.logger import init_logger
-- 
GitLab


From aed846917fee3b27df876d21ade4d3a30d4de402 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 9 Dec 2025 02:24:01 -0500
Subject: [PATCH 225/258] [Attention] Make `split_decodes_and_prefills(...,
 require_uniform=True)` support padding (#29644)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
---
 .../v1/attention/test_attention_splitting.py  | 25 ++++++++++++++++++-
 vllm/v1/attention/backends/utils.py           | 10 +++++---
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py
index f60861e34..f08e2f480 100644
--- a/tests/v1/attention/test_attention_splitting.py
+++ b/tests/v1/attention/test_attention_splitting.py
@@ -154,7 +154,10 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata):
 
 
 def apply_split_decodes_and_prefills(
-    query_lens: list[int], decode_threshold: int, require_uniform: bool
+    query_lens: list[int],
+    decode_threshold: int,
+    require_uniform: bool,
+    padded_num_tokens: int | None = None,
 ):
     """Helper function to apply split_decodes_and_prefills and return
     the results."""
@@ -165,6 +168,10 @@ def apply_split_decodes_and_prefills(
         block_size=16,
         device=device,
     )
+
+    if padded_num_tokens is not None:
+        common_metadata.num_actual_tokens = padded_num_tokens
+
     return split_decodes_and_prefills(
         common_metadata,
         decode_threshold=decode_threshold,
@@ -271,6 +278,22 @@ def test_split_decodes_and_prefills_uniform_mixed_batch_non_uniform_decodes():
     assert num_prefill_tokens == (sum(query_lens) - 2)  # rest of the tokens
 
 
+def test_split_decodes_and_prefills_uniform_padded_batch_all_same():
+    """uniform batch where all query lengths are identical with 0 length padded reqs."""
+    # All query lengths are 2, with decode_threshold=3 (so 2 <= 3)
+    # This triggers the padded uniform path at line 891
+    query_lens = [2, 2, 2, 0]
+    padded_num_tokens = 8
+    num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
+        apply_split_decodes_and_prefills(query_lens, 3, True, padded_num_tokens)
+    )
+    # With uniform batch, all requests are treated as decodes
+    assert num_decodes == 4
+    assert num_prefills == 0
+    assert num_decode_tokens == padded_num_tokens
+    assert num_prefill_tokens == 0
+
+
 @pytest.mark.parametrize(
     "seq_lens,query_lens,split_point,expected_first_reqs,expected_second_reqs",
     [
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 8edfbb514..5200bc48b 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -883,11 +883,15 @@ def split_decodes_and_prefills(
         return 0, num_reqs, 0, num_tokens
 
     if require_uniform:
+        # check if we are in a padded uniform batch; this is used for full-CGs, some
+        # requests may have a query length of 0 but since they are padding its fine
+        # to treat them as decodes (ensures num_decodes matches the captured size)
+        if torch.all((query_lens == query_lens[0]) | (query_lens == 0)):
+            assert num_reqs * query_lens[0] == num_tokens, "tokens not padded correctly"
+            return num_reqs, 0, num_tokens, 0  # all decodes
         is_prefill = query_lens != query_lens[0]
     else:
-        # 0-query len indicates a padded request; leave this at the back
-        # of the batch with the prefills
-        is_prefill = (query_lens > decode_threshold) | (query_lens == 0)
+        is_prefill = query_lens > decode_threshold
 
     if not torch.any(is_prefill):
         return num_reqs, 0, num_tokens, 0
-- 
GitLab


From aeb82b1930454498fccc7e91f7c4e0f360cf658a Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Tue, 9 Dec 2025 01:33:34 -0600
Subject: [PATCH 226/258] [CI] Fix Flaky test_eagle_max_len Test (#30306)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 tests/v1/spec_decode/test_max_len.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index 81da8609a..15a6bd265 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -82,7 +82,7 @@ def test_eagle_max_len(
                 len(o.prompt_token_ids)
                 < 80
                 < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
-                < 200
+                <= 200
             ), (
                 "This test is only meaningful if the output "
                 "is longer than the eagle max length"
-- 
GitLab


From 9c32df6101b24040c824dee5b0e5543a94dc6f45 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <yuqi.wang@daocloud.io>
Date: Tue, 9 Dec 2025 16:04:02 +0800
Subject: [PATCH 227/258] [Bugfix] Qwen 3 VL Embedding loading (#30303)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/model_executor/models/adapters.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 70f203b9f..9ba76f312 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -175,9 +175,14 @@ def _create_pooling_model_cls(orig_cls: _T) -> _T:
             self.vllm_config = vllm_config
 
             # These are not used in pooling models
-            for attr in ("lm_head", "logits_processor"):
-                if hasattr(self, attr):
-                    delattr(self, attr)
+            objects_to_clean = [self]
+            if language_model := getattr(self, "language_model", None):
+                objects_to_clean.append(language_model)
+
+            for obj in objects_to_clean:
+                for attr in ("lm_head", "logits_processor"):
+                    if hasattr(obj, attr):
+                        delattr(obj, attr)
 
             # If the model already defines a pooler instance, don't overwrite it
             if not getattr(self, "pooler", None):
-- 
GitLab


From 67475a6e81abea915857f82e6f10d80b03b842c9 Mon Sep 17 00:00:00 2001
From: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
Date: Tue, 9 Dec 2025 16:22:14 +0800
Subject: [PATCH 228/258] [DCP][Bugfix][CI] Fix accuracy issue of DCP when
 using FLASH_ATTN_MLA (#30309)

Signed-off-by: FENP <yuanyongjie.yyj@antgroup.com>
---
 tests/distributed/test_context_parallel.py      | 5 ++++-
 vllm/v1/attention/backends/mla/flashattn_mla.py | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index 3cb533dcc..aa47f28a3 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -123,8 +123,11 @@ class CPTestSettings:
 
 CP_TEXT_GENERATION_MODELS = {
     "deepseek-ai/DeepSeek-V2-Lite-Chat": [
+        CPTestSettings.detailed(dcp_multipliers=[1]),
         CPTestSettings.detailed(
-            dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64
+            dcp_multipliers=[0.5],
+            cp_kv_cache_interleave_size=64,
+            attn_backend="FLASHMLA",
         ),
     ],
     "Qwen/Qwen2.5-1.5B-Instruct": [
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index eccf4ec79..b28814ace 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -105,13 +105,14 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
         vllm_config: VllmConfig,
         device: torch.device,
     ):
+        interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size
         super().__init__(
             kv_cache_spec,
             layer_names,
             vllm_config,
             device,
             FlashAttnMLAMetadata,
-            supports_dcp_with_varlen=True,
+            supports_dcp_with_varlen=(interleave_size == 1),
         )
         self.max_num_splits = 0  # No upper bound on the number of splits.
         self.fa_aot_schedule = get_flash_attn_version() == 3
-- 
GitLab


From c72ea1072313b30fc8b4701697520fdd0b56fc35 Mon Sep 17 00:00:00 2001
From: Hubert de La Jonquiere <hubert@hcompany.ai>
Date: Tue, 9 Dec 2025 11:54:08 +0100
Subject: [PATCH 229/258] [Structured Output][Reasoning] Improves decoding
 throughput for models using single-token reasoning endings. (#30056)

---
 docs/features/reasoning_outputs.md            |  3 ++
 .../test_base_thinking_reasoning_parser.py    | 35 +++++++++++++++++++
 .../test_deepseekv3_reasoning_parser.py       |  1 +
 .../test_reasoning_structured_output.py       |  1 +
 vllm/reasoning/abs_reasoning_parsers.py       | 25 +++++++++++++
 vllm/reasoning/basic_parsers.py               |  6 ++++
 .../reasoning/deepseek_v3_reasoning_parser.py |  5 +++
 vllm/reasoning/holo2_reasoning_parser.py      |  5 +++
 vllm/reasoning/identity_reasoning_parser.py   |  5 +++
 vllm/v1/structured_output/__init__.py         |  4 ++-
 10 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index 3315c0949..93cca2385 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
 
         def is_reasoning_end(self, input_ids: list[int]) -> bool:
             return self.end_token_id in input_ids
+
+        def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
+            return self.end_token_id in delta_token_ids
         ...
     ```
 
diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py
index 34e9483de..165e91a2c 100644
--- a/tests/reasoning/test_base_thinking_reasoning_parser.py
+++ b/tests/reasoning/test_base_thinking_reasoning_parser.py
@@ -132,6 +132,41 @@ class TestBaseThinkingReasoningParserMethods:
             is False
         )
 
+    def test_is_reasoning_end_streaming(self, test_tokenizer):
+        """Test the is_reasoning_end_streaming method."""
+        parser = TestThinkingReasoningParser(test_tokenizer)
+        end_token_id = parser.end_token_id
+        start_token_id = parser.start_token_id
+
+        assert (
+            parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id])
+            is True
+        )
+        assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False
+        assert parser.is_reasoning_end_streaming([], []) is False
+        assert (
+            parser.is_reasoning_end_streaming(
+                [1, start_token_id, 2, end_token_id], [end_token_id]
+            )
+            is True
+        )
+        assert (
+            parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False
+        )
+        assert (
+            parser.is_reasoning_end_streaming(
+                [1, start_token_id, 2, end_token_id, 2, start_token_id, 2],
+                [2],
+            )
+            is False
+        )
+        assert (
+            parser.is_reasoning_end_streaming(
+                [1, start_token_id, 2, end_token_id, 2, 2], [2]
+            )
+            is False
+        )
+
     def test_extract_content_ids(self, test_tokenizer):
         """Test the extract_content_ids method."""
         parser = TestThinkingReasoningParser(test_tokenizer)
diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py
index 6e8f0e8dc..874fdef77 100644
--- a/tests/reasoning/test_deepseekv3_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py
@@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
     input_tokens = tokenizer.tokenize(input_text)
     input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
     assert parser.is_reasoning_end(input_ids) is True
+    assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True
 
     # Test extract_content_ids returns all input_ids
     assert parser.extract_content_ids(input_ids) == input_ids
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
index 70047a993..ba52af3ad 100644
--- a/tests/v1/structured_output/test_reasoning_structured_output.py
+++ b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -70,6 +70,7 @@ class TestReasoningStructuredOutput:
         request.use_structured_output = True
         request.prompt_token_ids = [1, 2, 3, 4, 5]
         request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
+        request.num_computed_tokens = 5
         return request
 
     def test_should_fill_bitmask_with_enable_in_reasoning(
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index d0661d1f2..bf593ca4e 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -63,6 +63,31 @@ class ReasoningParser:
             True if the reasoning content ends in the input_ids.
         """
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        """
+        Check if the reasoning content ends in the input_ids on a
+        decode step.
+
+        It is used in structured engines like `xgrammar` to check if the
+        reasoning content ends in the model output during a decode step.
+        `input_ids` the entire model output and `delta_ids` are the last few
+        computed tokens of the model output (like during a decode step).
+
+        Parameters:
+        input_ids: list[int]
+            The entire model output.
+        delta_ids: list[int]
+            The last few computed tokens of the model output at the current decode step.
+
+        Returns:
+        bool
+            True if the reasoning content ends in the `delta_ids` on a
+            decode step.
+        """
+        return self.is_reasoning_end(input_ids)
+
     @abstractmethod
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index e78ac4a5e..43067ca4a 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -74,6 +74,12 @@ class BaseThinkingReasoningParser(ReasoningParser):
                 return True
         return False
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        end_token_id = self.end_token_id
+        return end_token_id in delta_ids
+
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
         Extract the content after the end tokens
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index afdf73262..6604f70ba 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         return self._parser.is_reasoning_end(input_ids)
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
+
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         return self._parser.extract_content_ids(input_ids)
 
diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py
index 76de1c077..f80190d28 100644
--- a/vllm/reasoning/holo2_reasoning_parser.py
+++ b/vllm/reasoning/holo2_reasoning_parser.py
@@ -56,6 +56,11 @@ class Holo2ReasoningParser(ReasoningParser):
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         return self._parser.is_reasoning_end(input_ids)
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
+
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         return self._parser.extract_content_ids(input_ids)
 
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
index e92f8add0..e998e071e 100644
--- a/vllm/reasoning/identity_reasoning_parser.py
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser):
         # Always return True, since we never treat reasoning specially
         return True
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        return True
+
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         # Identity: return all tokens as content
         return input_ids
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 5ee88178c..4dd478804 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -339,7 +339,9 @@ class StructuredOutputManager:
             return True
 
         # Check if reasoning ends in *this* step
-        if self.reasoner.is_reasoning_end(request.all_token_ids):
+        if self.reasoner.is_reasoning_end_streaming(
+            request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
+        ):
             # Reasoning just ended, so we shouldn't advance til
             # next pass
             structured_req.reasoning_ended = True
-- 
GitLab


From 03416eada6c01770fb71c3d988fc3c74958d8f5e Mon Sep 17 00:00:00 2001
From: haoyangli-amd <lihaoyang0109@gmail.com>
Date: Tue, 9 Dec 2025 19:28:50 +0800
Subject: [PATCH 230/258] [bugfix][quantization] Fix fp8 per_tensor scale shape
 (#30257)

Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com>
---
 vllm/_custom_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 6bbfe11b6..6d862c581 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1726,7 +1726,7 @@ def scaled_fp8_quant(
                 output, input, scale, scale_ub
             )
         else:
-            scale = torch.empty((1, 1), device=input.device, dtype=torch.float32)
+            scale = torch.empty(1, device=input.device, dtype=torch.float32)
             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         assert scale.numel() == 1, f"{scale.shape}"
-- 
GitLab


From 1166c31cc78073378a16509fbbbed4cb4f040a4d Mon Sep 17 00:00:00 2001
From: Dongjie Zou <85092850+baonudesifeizhai@users.noreply.github.com>
Date: Tue, 9 Dec 2025 07:20:21 -0500
Subject: [PATCH 231/258] [Bugfix]: Fix glm46 awq marlin moe wna16
 compatibility (#30210)

Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
---
 .../layers/fused_moe/fused_moe.py             | 45 +++++++++++++++++++
 .../layers/quantization/moe_wna16.py          |  9 ++--
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index f3c158ee2..0b83a3f5c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -895,6 +895,48 @@ def get_moe_configs(
     return None
 
 
+def _ensure_block_size_k_divisible(
+    size_k: int, block_size_k: int, group_size: int
+) -> int:
+    """Ensure block_size_k is a divisor of size_k and divisible by group_size.
+
+    This ensures BLOCK_SIZE_K compatibility with MoeWNA16 CUDA kernel which
+    requires size_k % BLOCK_SIZE_K == 0 and BLOCK_SIZE_K % group_size == 0.
+
+    Args:
+        size_k: The size_k dimension that must be divisible by result.
+        block_size_k: Preferred block size (will be adjusted if needed).
+        group_size: The result must be divisible by this.
+
+    Returns:
+        A valid BLOCK_SIZE_K that divides size_k and is divisible by group_size.
+    """
+    # Fast path: already valid
+    if size_k % block_size_k == 0 and block_size_k % group_size == 0:
+        return block_size_k
+
+    # Find the largest value that:
+    # 1. Divides size_k (size_k % candidate == 0)
+    # 2. Is divisible by group_size (candidate % group_size == 0)
+    # 3. Is <= block_size_k (prefer smaller values close to block_size_k)
+    #
+    # Strategy: Search from min(block_size_k, size_k) down to group_size,
+    # stepping by group_size to ensure divisibility by group_size
+    max_search = min(block_size_k, size_k)
+    start = (max_search // group_size) * group_size
+    for candidate in range(start, group_size - 1, -group_size):
+        if size_k % candidate == 0:
+            return candidate
+
+    # Fallback: if group_size divides size_k, use it
+    # This should always be true with correct group_size configuration
+    if size_k % group_size == 0:
+        return group_size
+
+    # This should not happen with correct group_size, but ensure divisibility
+    return size_k
+
+
 def get_moe_wna16_block_config(
     config: dict[str, int],
     use_moe_wna16_cuda: bool,
@@ -960,6 +1002,9 @@ def get_moe_wna16_block_config(
             # at the same time.
             block_size_n = 1024
 
+        # Ensure BLOCK_SIZE_K is a divisor of size_k for CUDA kernel compatibility
+        block_size_k = _ensure_block_size_k_divisible(size_k, block_size_k, group_size)
+
         return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k}
 
 
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index cf348290a..8570b8c33 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -60,7 +60,7 @@ class MoeWNA16Config(QuantizationConfig):
 
         if self.linear_quant_method == "gptq":
             self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
-        elif self.linear_quant_method == "awq":
+        elif self.linear_quant_method in ("awq", "awq_marlin"):
             capability_tuple = current_platform.get_device_capability()
             device_capability = (
                 -1 if capability_tuple is None else capability_tuple.to_int()
@@ -107,7 +107,7 @@ class MoeWNA16Config(QuantizationConfig):
         if linear_quant_method == "gptq":
             has_zp = not cls.get_from_keys(config, ["sym"])
             modules_to_not_convert = []
-        elif linear_quant_method == "awq":
+        elif linear_quant_method in ("awq", "awq_marlin"):
             has_zp = cls.get_from_keys(config, ["zero_point"])
             modules_to_not_convert = cls.get_from_keys_or(
                 config, ["modules_to_not_convert"], None
@@ -184,7 +184,7 @@ class MoeWNA16Config(QuantizationConfig):
                     return GPTQConfig.from_config(self.full_config).get_quant_method(
                         layer, prefix
                     )
-            elif self.linear_quant_method == "awq":
+            elif self.linear_quant_method in ("awq", "awq_marlin"):
                 if self.use_marlin and check_marlin_supports_layer(
                     layer, self.group_size
                 ):
@@ -468,7 +468,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
             shard_size = layer.intermediate_size_per_partition
 
             # convert gptq and awq weight to a standard format
-            if layer.quant_config.linear_quant_method == "awq":
+            # awq_marlin uses the same weight format as awq
+            if layer.quant_config.linear_quant_method in ("awq", "awq_marlin"):
                 assert layer.quant_config.weight_bits == 4
                 if "weight" in weight_name:
                     loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
-- 
GitLab


From ee14644ba9a3696c83ede2c948b73ebc3e1ffb33 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Tue, 9 Dec 2025 22:27:37 +0800
Subject: [PATCH 232/258] [ROCm] Aiter Quant Kernels (#25552)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 vllm/_aiter_ops.py                            | 87 +++++++++++++++++++
 .../layers/quantization/input_quant_fp8.py    | 31 +++++++
 vllm/platforms/rocm.py                        |  7 +-
 3 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index 35920d826..94bbc9b00 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -9,6 +9,8 @@ import vllm.envs as envs
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
 
+_FP8_DTYPE = current_platform.fp8_dtype()
+
 
 def is_aiter_found() -> bool:
     from importlib.util import find_spec
@@ -467,6 +469,59 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
     return torch.empty_like(x), torch.empty_like(residual)
 
 
+def _rocm_aiter_per_tensor_quant_impl(
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter.ops.quant import per_tensor_quant_hip
+
+    return per_tensor_quant_hip(x, scale, quant_dtype)
+
+
+def _rocm_aiter_per_tensor_quant_fake(
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty_like(x, dtype=quant_dtype), torch.empty(
+        1, dtype=torch.float32, device=x.device
+    )
+
+
+def _rocm_aiter_per_token_quant_impl(
+    x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter.ops.quant import dynamic_per_token_scaled_quant
+
+    assert quant_dtype in [torch.int8, _FP8_DTYPE]
+
+    out_shape = x.shape
+    out = torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device)
+    if scale is None:
+        scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device)
+    dynamic_per_token_scaled_quant(
+        out,
+        x,
+        scale,
+        scale_ub=None,
+        shuffle_scale=False,
+        num_rows=None,
+        num_rows_factor=1,
+    )
+    return out, scale
+
+
+def _rocm_aiter_per_token_quant_fake(
+    x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
+) -> tuple[torch.Tensor, torch.Tensor]:
+    out_shape = x.shape
+    return (
+        torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device),
+        torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device),
+    )
+
+
 # Global flag to ensure ops are registered only once
 _OPS_REGISTERED = False
 
@@ -665,6 +720,22 @@ class rocm_aiter_ops:
                 dispatch_key=current_platform.dispatch_key,
             )
 
+            direct_register_custom_op(
+                op_name="rocm_aiter_per_tensor_quant",
+                op_func=_rocm_aiter_per_tensor_quant_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_per_tensor_quant_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_per_token_quant",
+                op_func=_rocm_aiter_per_token_quant_impl,
+                mutates_args=["scale"],
+                fake_impl=_rocm_aiter_per_token_quant_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
             _OPS_REGISTERED = True
 
     @staticmethod
@@ -859,6 +930,22 @@ class rocm_aiter_ops:
             kv_scale=kv_scale,
         )
 
+    @staticmethod
+    def per_tensor_quant(
+        x: torch.Tensor,
+        quant_dtype: torch.dtype,
+        scale: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return torch.ops.vllm.rocm_aiter_per_tensor_quant(x, quant_dtype, scale)
+
+    @staticmethod
+    def per_token_quant(
+        x: torch.Tensor,
+        quant_dtype: torch.dtype,
+        scale: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return torch.ops.vllm.rocm_aiter_per_token_quant(x, quant_dtype, scale)
+
     @staticmethod
     def triton_fp4_gemm_dynamic_qaunt(
         x: torch.Tensor,
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index 7ded8eea7..a5db086fb 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -5,6 +5,7 @@ import torch
 import torch.nn.functional as F
 
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
@@ -45,10 +46,13 @@ class QuantFP8(CustomOp):
         super().__init__()
         self.static = static
         self.group_shape = group_shape
+        self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN
         self.num_token_padding = num_token_padding
         self.column_major_scales = column_major_scales
         self.use_ue8m0 = use_ue8m0
 
+        self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled()
+
         self.is_group_quant = group_shape.is_per_group()
         if self.is_group_quant:
             assert not static, "Group quantization only supports dynamic mode"
@@ -92,6 +96,33 @@ class QuantFP8(CustomOp):
             use_per_token_if_dynamic=self.use_per_token_if_dynamic,
         )
 
+    def forward_hip(
+        self,
+        x: torch.Tensor,
+        scale: torch.Tensor | None = None,
+        scale_ub: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        use_aiter_quant = (
+            not self.is_group_quant
+            and self.use_aiter
+            and scale_ub is None
+            and x.is_contiguous()
+        )
+        use_aiter_per_tensor_quant = (
+            use_aiter_quant and self.group_shape == GroupShape.PER_TENSOR
+        )
+        use_aiter_per_token_quant = (
+            use_aiter_quant and self.group_shape == GroupShape.PER_TOKEN
+        )
+
+        if use_aiter_per_tensor_quant:
+            return rocm_aiter_ops.per_tensor_quant(x, _FP8_DTYPE, scale)
+        if use_aiter_per_token_quant:
+            return rocm_aiter_ops.per_token_quant(x, _FP8_DTYPE, scale)
+
+        # Fallback to CUDA implementation
+        return self.forward_cuda(x, scale, scale_ub)
+
     def forward_native(
         self,
         x: torch.Tensor,
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index ff0fc7851..f7adecbd8 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -381,6 +381,8 @@ class RocmPlatform(Platform):
         compilation_config = vllm_config.compilation_config
         parallel_config = vllm_config.parallel_config
         is_eager_execution = compilation_config == CUDAGraphMode.NONE
+        use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
+        use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled()
 
         if compilation_config.cudagraph_mode.has_full_cudagraphs():
             # decode context parallel does not support full cudagraphs
@@ -400,8 +402,6 @@ class RocmPlatform(Platform):
                 )
                 compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
-        use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
-
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
 
@@ -415,6 +415,9 @@ class RocmPlatform(Platform):
         ):
             compilation_config.custom_ops.append("+rms_norm")
 
+        if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
+            compilation_config.custom_ops.append("+quant_fp8")
+
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         if model_arch in _ROCM_UNSUPPORTED_MODELS:
-- 
GitLab


From 5c213d2899f5a2d439c8d771a0abc156a5412a2b Mon Sep 17 00:00:00 2001
From: Julien Denize <40604584+juliendenize@users.noreply.github.com>
Date: Tue, 9 Dec 2025 15:55:38 +0100
Subject: [PATCH 233/258] [BUGFIX] Mistral tool call parser v11+ (#30332)

Signed-off-by: juliendenize <julien.denize@mistral.ai>
---
 tests/tool_use/test_mistral_tool_parser.py    | 16 +++++++++
 .../tool_parsers/mistral_tool_parser.py       | 36 +++++++++----------
 2 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/tests/tool_use/test_mistral_tool_parser.py b/tests/tool_use/test_mistral_tool_parser.py
index e5deb7f40..2dd0399cb 100644
--- a/tests/tool_use/test_mistral_tool_parser.py
+++ b/tests/tool_use/test_mistral_tool_parser.py
@@ -615,6 +615,7 @@ def test_extract_tool_calls_streaming(
         "single_tool_weather",
         "multiple_tool_calls",
         "content_before_tool",
+        "complex",
     ],
     argnames=["model_output", "expected_tool_calls", "expected_content"],
     argvalues=[
@@ -673,6 +674,21 @@ def test_extract_tool_calls_streaming(
             ],
             "bla",
         ),
+        (
+            # Complex
+            """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="bash",
+                        arguments=json.dumps(
+                            {"command": "print(\"hello world!\")\nre.compile(r'{}')"}
+                        ),
+                    )
+                )
+            ],
+            "",
+        ),
     ],
 )
 def test_extract_tool_calls_streaming_one_chunk(
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index aa5089ffe..bc827f045 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -99,12 +99,7 @@ class MistralToolParser(ToolParser):
         self.bot_token = "[TOOL_CALLS]"
         self.bot_token_id = self.vocab.get(self.bot_token)
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
-        if not _is_pre_v11_tokeniser(self.model_tokenizer):
-            self.fn_name_regex = re.compile(
-                r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL
-            )
-        else:
-            self.fn_name_regex = None
+        self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer)
 
         if self.bot_token_id is None:
             raise RuntimeError(
@@ -148,23 +143,24 @@ class MistralToolParser(ToolParser):
         tool_content = model_output.replace(self.bot_token, "").strip()
 
         try:
-            # we first try to directly load the json as parsing very nested
-            # jsons is difficult
             try:
-                if self.fn_name_regex:
+                if not self._is_pre_v11:
                     function_call_arr = []
                     for single_tool_content in model_output.split(self.bot_token):
-                        matches = self.fn_name_regex.findall(single_tool_content)
-
-                        for match in matches:
-                            fn_name = match[0]
-                            args = match[1]
-
-                            # fn_name is encoded outside serialized json dump
-                            # only arguments are serialized
-                            function_call_arr.append(
-                                {"name": fn_name, "arguments": json.loads(args)}
-                            )
+                        if "{" not in single_tool_content:
+                            continue
+
+                        end_name = single_tool_content.find("{")
+                        fn_name, args = (
+                            single_tool_content[:end_name],
+                            single_tool_content[end_name:],
+                        )
+
+                        # fn_name is encoded outside serialized json dump
+                        # only arguments are serialized
+                        function_call_arr.append(
+                            {"name": fn_name, "arguments": json.loads(args)}
+                        )
                 else:
                     function_call_arr = json.loads(tool_content)
             except json.JSONDecodeError:
-- 
GitLab


From 5dcd593baf9ceaad68a0e5e16f20f6ccccd10797 Mon Sep 17 00:00:00 2001
From: quanliu <18646313696@163.com>
Date: Tue, 9 Dec 2025 23:01:38 +0800
Subject: [PATCH 234/258] [Feature] Batch-Invariant Support for FA2 and LoRA
 (#30018)

Signed-off-by: quanliu <18646313696@163.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
---
 tests/v1/determinism/test_batch_invariance.py | 10 ++++++++++
 tests/v1/determinism/utils.py                 | 10 ++++++++--
 vllm/model_executor/layers/batch_invariant.py |  6 +++++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py
index fc953a66f..1c45e7fe3 100644
--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -10,6 +10,7 @@ from utils import (
     BACKENDS,
     _extract_step_logprobs,
     _random_prompt,
+    is_device_capability_below_90,
     resolve_model_name,
     skip_unsupported,
 )
@@ -17,6 +18,8 @@ from utils import (
 import vllm.model_executor.layers.batch_invariant as batch_invariant
 from vllm import LLM, SamplingParams
 
+IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
+
 
 @skip_unsupported
 @pytest.mark.timeout(1000)
@@ -190,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
         max_model_len=8192,
         dtype="bfloat16",  # not everything is supported
         gpu_memory_utilization=0.9,
+        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
     )
 
     # Use more realistic prompts for better token generation
@@ -393,6 +397,8 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
         gpu_memory_utilization=0.9,
         max_model_len=2048,
         dtype="bfloat16",
+        enable_prefix_caching=False,
+        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
     )
 
     prompt = "the capital of france is"
@@ -459,6 +465,7 @@ def test_logprobs_without_batch_invariance_should_fail(
         max_num_seqs=32,
         max_model_len=8192,
         dtype="bfloat16",
+        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
     )
 
     # build ragged prompts to change shapes significantly across BS=1 vs BS=N
@@ -682,6 +689,7 @@ def test_decode_logprobs_match_prefill_logprobs(
         max_num_seqs=32,
         max_model_len=8192,
         dtype="bfloat16",
+        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
     )
 
     # Use a few test prompts
@@ -925,6 +933,8 @@ def LLM_with_max_seqs(
         max_model_len=max_model_len,
         dtype="bfloat16",
         tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
+        enable_prefix_caching=False,
+        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
         # Enable for MOE models
         # enable_expert_parallel=True,
     )
diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py
index 6aab50cf8..a8013ed22 100644
--- a/tests/v1/determinism/utils.py
+++ b/tests/v1/determinism/utils.py
@@ -11,8 +11,10 @@ from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
 
 skip_unsupported = pytest.mark.skipif(
-    not (current_platform.is_cuda() and current_platform.has_device_capability(90)),
-    reason="Requires CUDA and >= Hopper (SM90)",
+    not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
+    # Supports testing on Ampere and Ada Lovelace devices.
+    # Note: For devices with SM < 90, batch invariance does not support CUDA Graphs.
+    reason="Requires CUDA and >= Ampere (SM80)",
 )
 
 BACKENDS: list[str] = [
@@ -97,3 +99,7 @@ def _extract_step_logprobs(request_output):
             return t, inner.token_ids
 
     return None, None
+
+
+def is_device_capability_below_90() -> bool:
+    return not current_platform.has_device_capability(90)
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 4cab47f41..b14e7dad7 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -935,7 +935,11 @@ def enable_batch_invariant_mode():
 
     # Batch invariant matmuls are no longer needed after cublas overrides
     if not is_torch_equal_or_newer("2.10.0.dev"):
-        if current_platform.is_device_capability(100):
+        if (
+            current_platform.is_device_capability(100)
+            or current_platform.is_device_capability(80)
+            or current_platform.is_device_capability(89)
+        ):
             # For PyTorch 2.9, B200 uses GEMV for bs=1
             # Requires https://github.com/pytorch/pytorch/pull/166735
             _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
-- 
GitLab


From 56037dfa2fc10a5f28e919d2c26e4c2abbd313a6 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 9 Dec 2025 10:36:12 -0500
Subject: [PATCH 235/258] [BugFix] Fix `assert  batch_descriptor.num_tokens ==
 num_tokens_padded` (#30173)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 tests/v1/cudagraph/test_cudagraph_dispatch.py |  4 +-
 vllm/forward_context.py                       |  2 +-
 vllm/v1/cudagraph_dispatcher.py               |  4 +-
 vllm/v1/spec_decode/eagle.py                  |  2 +-
 vllm/v1/worker/dp_utils.py                    | 49 ++++++++++++++-----
 vllm/v1/worker/gpu_model_runner.py            | 37 ++++++++------
 6 files changed, 65 insertions(+), 33 deletions(-)

diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index b86534d3d..0e71d6c63 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -161,10 +161,10 @@ class TestCudagraphDispatcher:
         assert rt_mode == CUDAGraphMode.NONE
         assert key == BatchDescriptor(num_tokens=15)
 
-        # 4. Cascade attention should have a fall back mode
+        # 4. disable_full should have a fall back mode (e.g., cascade attention)
         desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False)
         rt_mode, key = dispatcher.dispatch(
-            num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True
+            num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True
         )
         if "PIECEWISE" in cudagraph_mode_str:  # string contains check
             assert rt_mode == CUDAGraphMode.PIECEWISE
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 173d36626..033cc1f54 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -292,7 +292,7 @@ def set_forward_context(
         if num_tokens_across_dp is None:
             assert ubatch_slices is None
             assert num_tokens is not None
-            _, num_tokens_across_dp = coordinate_batch_across_dp(
+            _, num_tokens_across_dp, _ = coordinate_batch_across_dp(
                 num_tokens_unpadded=num_tokens,
                 parallel_config=vllm_config.parallel_config,
                 allow_microbatching=False,
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index ef0f8d9e6..8a3500c0a 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -145,7 +145,7 @@ class CudagraphDispatcher:
         num_tokens: int,
         uniform_decode: bool,
         has_lora: bool,
-        use_cascade_attn: bool = False,
+        disable_full: bool = False,
     ) -> tuple[CUDAGraphMode, BatchDescriptor]:
         """
         Given conditions(e.g.,batch descriptor and if using cascade attention),
@@ -165,7 +165,7 @@ class CudagraphDispatcher:
         )
         relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
 
-        if not use_cascade_attn:
+        if not disable_full:
             # check if key exists for full cudagraph
             if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]:
                 return CUDAGraphMode.FULL, batch_desc
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 31428db2d..9f7859a5c 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1258,7 +1258,7 @@ class EagleProposer:
         num_tokens_padded: int,
     ) -> tuple[int, torch.Tensor]:
         # TODO(Flechman): support DBO ubatching
-        should_ubatch, num_toks_across_dp = coordinate_batch_across_dp(
+        should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp(
             num_tokens_unpadded=num_tokens_unpadded,
             parallel_config=self.vllm_config.parallel_config,
             allow_microbatching=False,
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 5da55d740..1b9646e19 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -40,16 +40,18 @@ def _run_ar(
     should_dp_pad: bool,
     orig_num_tokens_per_ubatch: int,
     padded_num_tokens_per_ubatch: int,
+    cudagraph_mode: int,
     parallel_config: ParallelConfig,
 ) -> torch.Tensor:
     dp_size = parallel_config.data_parallel_size
     dp_rank = parallel_config.data_parallel_rank
     device, group = _get_device_and_group(parallel_config)
-    tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32)
+    tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32)
     tensor[0][dp_rank] = orig_num_tokens_per_ubatch
     tensor[1][dp_rank] = padded_num_tokens_per_ubatch
     tensor[2][dp_rank] = 1 if should_ubatch else 0
     tensor[3][dp_rank] = 1 if should_dp_pad else 0
+    tensor[4][dp_rank] = cudagraph_mode
     dist.all_reduce(tensor, group=group)
     return tensor
 
@@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch
         return num_tokens_across_dp.cpu()
 
 
+def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
+    """
+    Synchronize cudagraph_mode across DP ranks by taking the minimum.
+    If any rank has NONE (0), all ranks use NONE.
+    This ensures all ranks send consistent values (all padded or all unpadded).
+    """
+    return int(tensor[4, :].min().item())
+
+
 def _synchronize_dp_ranks(
     num_tokens_unpadded: int,
     num_tokens_padded: int,
     should_attempt_ubatching: bool,
     should_attempt_dp_padding: bool,
+    cudagraph_mode: int,
     parallel_config: ParallelConfig,
-) -> tuple[bool, torch.Tensor | None]:
+) -> tuple[bool, torch.Tensor | None, int]:
     """
     1. Decides if each DP rank is going to microbatch. Either all ranks
     run with microbatching or none of them do.
@@ -104,10 +116,13 @@ def _synchronize_dp_ranks(
     When running microbatched or if should_attempt_dp_padding is True, all
     ranks will be padded out so that the run with the same number of tokens
 
+    3. Synchronizes cudagraph_mode across ranks by taking the minimum.
+
     Returns: tuple[
         should_ubatch: Are all DP ranks going to microbatch
         num_tokens_after_padding: A tensor containing the total number of
         tokens per-microbatch for each DP rank including any DP padding.
+        synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
     ]
 
     """
@@ -121,6 +136,7 @@ def _synchronize_dp_ranks(
         should_dp_pad=should_attempt_dp_padding,
         orig_num_tokens_per_ubatch=num_tokens_unpadded,
         padded_num_tokens_per_ubatch=num_tokens_padded,
+        cudagraph_mode=cudagraph_mode,
         parallel_config=parallel_config,
     )
 
@@ -148,7 +164,10 @@ def _synchronize_dp_ranks(
         should_dp_pad,
     )
 
-    return should_ubatch, num_tokens_after_padding
+    # Synchronize cudagraph_mode across ranks (take min)
+    synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
+
+    return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode
 
 
 def coordinate_batch_across_dp(
@@ -159,7 +178,8 @@ def coordinate_batch_across_dp(
     num_tokens_padded: int | None = None,
     uniform_decode: bool | None = None,
     num_scheduled_tokens_per_request: np.ndarray | None = None,
-) -> tuple[bool, torch.Tensor | None]:
+    cudagraph_mode: int = 0,
+) -> tuple[bool, torch.Tensor | None, int]:
     """
     Coordinates amongst all DP ranks to determine if and how the full batch
     should be split into microbatches.
@@ -175,6 +195,7 @@ def coordinate_batch_across_dp(
             only contains single token decodes
         num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
             number of tokens per request.
+        cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL)
 
     Returns: tuple[
         ubatch_slices: if this is set then all DP ranks have agreed to
@@ -183,12 +204,13 @@ def coordinate_batch_across_dp(
         tokens per-microbatch for each DP rank including padding. Will be
         padded up to the max value across all DP ranks when allow_dp_padding
         is True.
+        synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
     ]
 
     """
     if parallel_config.data_parallel_size == 1:
         # Early exit.
-        return False, None
+        return False, None, cudagraph_mode
 
     # If the caller has explicitly enabled microbatching.
     should_attempt_ubatching = False
@@ -204,12 +226,15 @@ def coordinate_batch_across_dp(
     if num_tokens_padded is None:
         num_tokens_padded = num_tokens_unpadded
 
-    (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks(
-        num_tokens_unpadded,
-        num_tokens_padded,
-        should_attempt_ubatching,
-        allow_dp_padding,
-        parallel_config,
+    (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = (
+        _synchronize_dp_ranks(
+            num_tokens_unpadded,
+            num_tokens_padded,
+            should_attempt_ubatching,
+            allow_dp_padding,
+            cudagraph_mode,
+            parallel_config,
+        )
     )
 
-    return (should_ubatch, num_tokens_after_padding)
+    return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 22a3f9d8d..766c2acd0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2788,17 +2788,19 @@ class GPUModelRunner(
         )
 
         dispatch_cudagraph = (
-            lambda num_tokens: self.cudagraph_dispatcher.dispatch(
+            lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
                 num_tokens=num_tokens,
                 has_lora=has_lora,
-                use_cascade_attn=use_cascade_attn,
                 uniform_decode=uniform_decode,
+                disable_full=disable_full,
             )
             if not force_eager
             else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
         )
 
-        cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded)
+        cudagraph_mode, batch_descriptor = dispatch_cudagraph(
+            num_tokens_padded, use_cascade_attn
+        )
         num_tokens_padded = batch_descriptor.num_tokens
 
         # Extra coordination when running data-parallel since we need to coordinate
@@ -2813,23 +2815,28 @@ class GPUModelRunner(
                 self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
             )
 
-            should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp(
-                num_tokens_unpadded=num_tokens,
-                parallel_config=self.parallel_config,
-                allow_microbatching=allow_microbatching,
-                allow_dp_padding=allow_dp_padding,
-                num_tokens_padded=num_tokens_padded,
-                uniform_decode=uniform_decode,
-                num_scheduled_tokens_per_request=num_scheduled_tokens_np,
+            should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = (
+                coordinate_batch_across_dp(
+                    num_tokens_unpadded=num_tokens,
+                    parallel_config=self.parallel_config,
+                    allow_microbatching=allow_microbatching,
+                    allow_dp_padding=allow_dp_padding,
+                    num_tokens_padded=num_tokens_padded,
+                    uniform_decode=uniform_decode,
+                    num_scheduled_tokens_per_request=num_scheduled_tokens_np,
+                    cudagraph_mode=cudagraph_mode.value,
+                )
             )
 
-            # Extract DP padding if there is any
+            # Extract DP-synced values
             if num_tokens_across_dp is not None:
                 dp_rank = self.parallel_config.data_parallel_rank
                 num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
-
-                # Re-dispatch with DP padding
-                cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded)
+                # Re-dispatch with DP padding so we have the correct batch_descriptor
+                cudagraph_mode, batch_descriptor = dispatch_cudagraph(
+                    num_tokens_padded,
+                    disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
+                )
                 # Assert to make sure the agreed upon token count is correct otherwise
                 # num_tokens_across_dp will no-longer be valid
                 assert batch_descriptor.num_tokens == num_tokens_padded
-- 
GitLab


From 83319b44c26af45de4753c74f55a07df8c637a25 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Tue, 9 Dec 2025 10:40:37 -0500
Subject: [PATCH 236/258] [Compile] Fix torch warning `TensorFloat32 tensor
 cores for float32 matrix multiplication available but not enabled` (#29897)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tests/v1/e2e/test_async_scheduling.py | 2 ++
 vllm/envs.py                          | 9 +++++++++
 vllm/v1/worker/gpu_worker.py          | 4 ++++
 3 files changed, 15 insertions(+)

diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index 945276376..838d05f04 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -124,6 +124,8 @@ def run_tests(
     with monkeypatch.context() as m:
         # avoid precision errors
         m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+        # lock matmul precision to full FP32
+        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
         # m.setenv("VLLM_BATCH_INVARIANT", "1")
         outputs: list[tuple[str, list, list]] = []
         for n, (
diff --git a/vllm/envs.py b/vllm/envs.py
index 91d1b0107..bda9e6e42 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -75,6 +75,7 @@ if TYPE_CHECKING:
     VLLM_MM_INPUT_CACHE_GIB: int = 4
     VLLM_TARGET_DEVICE: str = "cuda"
     VLLM_MAIN_CUDA_VERSION: str = "12.9"
+    VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
@@ -452,6 +453,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Main CUDA version of vLLM. This follows PyTorch but can be overridden.
     "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
     or "12.9",
+    # Controls PyTorch float32 matmul precision mode within vLLM workers.
+    # Valid options mirror torch.set_float32_matmul_precision
+    "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
+        "VLLM_FLOAT32_MATMUL_PRECISION",
+        "highest",
+        ["highest", "high", "medium"],
+        case_sensitive=False,
+    ),
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index a46ec2bd1..24a3533a1 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -79,6 +79,10 @@ class Worker(WorkerBase):
             is_driver_worker=is_driver_worker,
         )
 
+        # configure float32 matmul precision according to vLLM env.
+        precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
+        torch.set_float32_matmul_precision(precision)
+
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils.import_utils import init_cached_hf_modules
-- 
GitLab


From 804e3468c04b1a43c0019d2835dabc74b779c1fc Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Tue, 9 Dec 2025 11:31:30 -0600
Subject: [PATCH 237/258] Update AMD test definitions (2025-12-08) (#30298)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
---
 .buildkite/test-amd.yaml | 188 +++++++++++++++++++++++++++------------
 1 file changed, 130 insertions(+), 58 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 6950ad774..4038d3283 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -398,7 +398,8 @@ steps:
   timeout_in_minutes: 25
   gpu: h100
   source_file_dependencies:
-    - vllm/
+    - vllm/v1/attention
+    - vllm/model_executor/layers
     - tests/v1/determinism/
   commands:
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -440,23 +441,29 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
+    # for basic
+    - python3 offline_inference/basic/chat.py
     - python3 offline_inference/basic/generate.py --model facebook/opt-125m
     - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    # for multi-modal models
     - python3 offline_inference/audio_language.py --seed 0
     - python3 offline_inference/vision_language.py --seed 0
     - python3 offline_inference/vision_language_pooling.py --seed 0
     - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
     # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
     - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -718,6 +725,18 @@ steps:
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
+- label: LM Eval Small Models # 53min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
 - label: OpenAI API correctness # 10min
   timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -727,7 +746,7 @@ steps:
   - csrc/
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
-  commands: # LMEval
+  commands: # LMEval+Transcription WER check
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
   - pytest -s entrypoints/openai/correctness/
 
@@ -963,6 +982,19 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1 # 60min
   timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
@@ -1098,7 +1130,6 @@ steps:
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - vllm/model_executor/layers/fused_moe/layer.py
   - tests/compile/test_fusion_attn.py
   - tests/compile/test_silu_mul_quant_fusion.py
   - tests/compile/distributed/test_fusion_all_reduce.py
@@ -1132,12 +1163,25 @@ steps:
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
   - tests/compile/distributed/test_fusions_e2e.py
-  - tests/compile/fullgraph/test_full_graph.py
   commands:
     - nvidia-smi
     # Run all e2e fusion tests
     - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
 
+- label: Blackwell GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
 - label: Blackwell Quantized MoE Test
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
@@ -1155,6 +1199,16 @@ steps:
   commands:
     - pytest -s -v tests/quantization/test_blackwell_moe.py
 
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
 #####  1 GPU test  #####
 #####  multi gpus test  #####
 
@@ -1397,6 +1451,39 @@ steps:
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py
 
+
+- label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
   mirror_hardwares: [amdexperimental]
@@ -1440,29 +1527,6 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
-- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 120
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
-  timeout_in_minutes: 70
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: LM Eval Large Models (4 Card)
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
@@ -1478,21 +1542,6 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-- label: LM Eval Large Models (H100) # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
-    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
 - label: ROCm LM Eval Large Models (8 Card)
   mirror_hardwares: [amdproduction]
   agent_pool: mi325_8
@@ -1517,6 +1566,20 @@ steps:
     - uv pip install --system 'gpt-oss[eval]==0.0.5'
     - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
 
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
 - label: DeepSeek V2-Lite Accuracy
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
@@ -1550,17 +1613,26 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
 
-##### RL Integration Tests #####
-- label: Prime-RL Integration Test # 15min
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_2
+  agent_pool: mi325_4
   # grade: Blocking
-  timeout_in_minutes: 30
+  gpu: h100
   optional: true
-  num_gpus: 2
+  num_gpus: 4
   working_dir: "/vllm-workspace"
-  source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
   commands:
-    - bash .buildkite/scripts/run-prime-rl-test.sh
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
-- 
GitLab


From 0b6a8a304cd8bab21383c4c2904064f6f6f2fd62 Mon Sep 17 00:00:00 2001
From: Ilya Markov <markovilya197@gmail.com>
Date: Tue, 9 Dec 2025 18:57:55 +0100
Subject: [PATCH 238/258] [BugFix] Fix non detected failing tests (#30277)

Signed-off-by: ilmarkov <markovilya197@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  8 +-
 .../fullgraph/test_multimodal_compile.py      |  1 -
 tests/compile/test_compile_ranges.py          |  6 ++
 tests/compile/test_pass_manager.py            | 73 ++++++++++---------
 vllm/compilation/inductor_pass.py             |  8 +-
 vllm/compilation/piecewise_backend.py         | 19 +++++
 6 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0a99994e2..8fc3587f7 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -468,7 +468,9 @@ steps:
   # tests covered elsewhere.
   # Use `find` to launch multiple instances of pytest so that
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -482,7 +484,9 @@ steps:
   # as it is a heavy test that is covered in other steps.
   # Use `find` to launch multiple instances of pytest so that
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
diff --git a/tests/compile/fullgraph/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py
index e2897b227..621f6a51a 100644
--- a/tests/compile/fullgraph/test_multimodal_compile.py
+++ b/tests/compile/fullgraph/test_multimodal_compile.py
@@ -17,7 +17,6 @@ def test_compile():
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
-@pytest.mark.xfail
 def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
     """Test that Qwen2.5-VL vision submodules are compiled.
 
diff --git a/tests/compile/test_compile_ranges.py b/tests/compile/test_compile_ranges.py
index d849a8617..14ae8233f 100644
--- a/tests/compile/test_compile_ranges.py
+++ b/tests/compile/test_compile_ranges.py
@@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache):
     vllm_config = VllmConfig(
         scheduler_config=SchedulerConfig(
             max_num_batched_tokens=8192,
+            max_model_len=8192,
+            is_encoder_decoder=False,
         ),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
@@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges():
     VllmConfig(
         scheduler_config=SchedulerConfig(
             max_num_batched_tokens=8192,
+            max_model_len=8192,
+            is_encoder_decoder=False,
         ),
         compilation_config=compilation_config,
     )
@@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
     )
     scheduler_config = SchedulerConfig(
         max_num_batched_tokens=8192,
+        max_model_len=8192,
+        is_encoder_decoder=False,
     )
     torch.set_default_device("cuda")
 
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index 6d0ba6b65..6ed77b008 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -5,9 +5,14 @@ import copy
 import pytest
 import torch
 
-from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
+from vllm.compilation.inductor_pass import (
+    CallableInductorPass,
+    InductorPass,
+    pass_context,
+)
 from vllm.compilation.pass_manager import PostGradPassManager
 from vllm.config import ModelConfig, VllmConfig
+from vllm.config.utils import Range
 
 
 # dummy custom pass that doesn't inherit
@@ -42,35 +47,37 @@ class ProperPass(InductorPass):
     ],
 )
 def test_pass_manager_uuid(callable):
-    # Some passes need dtype to be set
-    config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
-
-    pass_manager = PostGradPassManager()
-    pass_manager.configure(config)
-
-    # Check that UUID is different if the same pass is added 2x
-    pass_manager.add(callable)
-    uuid1 = pass_manager.uuid()
-    pass_manager.add(callable)
-    uuid2 = pass_manager.uuid()
-    assert uuid1 != uuid2
-
-    # UUID should be the same as the original one,
-    # as we constructed in the same way.
-    pass_manager2 = PostGradPassManager()
-    pass_manager2.configure(config)
-    pass_manager2.add(callable)
-    assert uuid1 == pass_manager2.uuid()
-
-    # UUID should be different due to config change
-    config2 = copy.deepcopy(config)
-    config2.compilation_config.pass_config.fuse_norm_quant = (
-        not config2.compilation_config.pass_config.fuse_norm_quant
-    )
-    config2.compilation_config.pass_config.fuse_act_quant = (
-        not config2.compilation_config.pass_config.fuse_act_quant
-    )
-    pass_manager3 = PostGradPassManager()
-    pass_manager3.configure(config2)
-    pass_manager3.add(callable)
-    assert uuid1 != pass_manager3.uuid()
+    # Set the pass context as PassManager uuid uses it
+    with pass_context(Range(start=1, end=8)):
+        # Some passes need dtype to be set
+        config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
+
+        pass_manager = PostGradPassManager()
+        pass_manager.configure(config)
+
+        # Check that UUID is different if the same pass is added 2x
+        pass_manager.add(callable)
+        uuid1 = pass_manager.uuid()
+        pass_manager.add(callable)
+        uuid2 = pass_manager.uuid()
+        assert uuid1 != uuid2
+
+        # UUID should be the same as the original one,
+        # as we constructed in the same way.
+        pass_manager2 = PostGradPassManager()
+        pass_manager2.configure(config)
+        pass_manager2.add(callable)
+        assert uuid1 == pass_manager2.uuid()
+
+        # UUID should be different due to config change
+        config2 = copy.deepcopy(config)
+        config2.compilation_config.pass_config.fuse_norm_quant = (
+            not config2.compilation_config.pass_config.fuse_norm_quant
+        )
+        config2.compilation_config.pass_config.fuse_act_quant = (
+            not config2.compilation_config.pass_config.fuse_act_quant
+        )
+        pass_manager3 = PostGradPassManager()
+        pass_manager3.configure(config2)
+        pass_manager3.add(callable)
+        assert uuid1 != pass_manager3.uuid()
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index 8159b817f..dbf154eeb 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from __future__ import annotations
+
 import functools
 import hashlib
 import inspect
@@ -8,15 +10,17 @@ import json
 import types
 from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import fx
 from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
 
-from vllm.config.utils import Range
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
+if TYPE_CHECKING:
+    from vllm.config.utils import Range
+
 if is_torch_equal_or_newer("2.6"):
     from torch._inductor.custom_graph_pass import CustomGraphPass
 else:
diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py
index 129b9b5de..a15c69376 100644
--- a/vllm/compilation/piecewise_backend.py
+++ b/vllm/compilation/piecewise_backend.py
@@ -53,8 +53,27 @@ class PiecewiseBackend:
         self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
 
         self.is_full_graph = total_piecewise_compiles == 1
+        # TODO: we need to generalize encoder compilation to other models
+        self.is_encoder_compilation = vllm_backend.prefix in [
+            "Qwen2_5_VisionPatchEmbed",
+            "Qwen2_5_VisionPatchMerger",
+            "Qwen2_5_VisionBlock",
+        ]
 
         self.compile_ranges = self.compilation_config.get_compile_ranges()
+        if self.is_encoder_compilation:
+            # For encoder compilation we use the max int32 value
+            # to set the upper bound of the compile ranges
+            max_int32 = 2**31 - 1
+            last_compile_range = self.compile_ranges[-1]
+            assert (
+                last_compile_range.end
+                == vllm_config.scheduler_config.max_num_batched_tokens
+            )
+            self.compile_ranges[-1] = Range(
+                start=last_compile_range.start, end=max_int32
+            )
+
         log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
         logger.debug_once(log_string)
 
-- 
GitLab


From 9e6562a3f625279fd7c8b9ac53c30fed3b01f5b9 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 9 Dec 2025 09:59:54 -0800
Subject: [PATCH 239/258] [Model Runner V2] Fix Triton warning on tl.where
 (#30355)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu/sample/penalties.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py
index c8d4b7d81..b4fcc822e 100644
--- a/vllm/v1/worker/gpu/sample/penalties.py
+++ b/vllm/v1/worker/gpu/sample/penalties.py
@@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel(
                 mask=packed_block < tl.cdiv(vocab_size, 32),
             )
             prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1
+            prompt_bin_mask = prompt_bin_mask.to(tl.int1)
             prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)
 
             # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
-- 
GitLab


From d471b2aff09028f9c62e861f760a74fd8f99081d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 9 Dec 2025 10:00:49 -0800
Subject: [PATCH 240/258] [Model Runner V2] Support num NaNs in logits (#30187)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu/async_utils.py      | 41 +++++++++++++------------
 vllm/v1/worker/gpu/metrics/__init__.py |  0
 vllm/v1/worker/gpu/metrics/logits.py   | 42 ++++++++++++++++++++++++++
 vllm/v1/worker/gpu/model_runner.py     |  2 +-
 vllm/v1/worker/gpu/sample/min_p.py     |  4 +--
 vllm/v1/worker/gpu/sample/output.py    | 14 +++++++++
 vllm/v1/worker/gpu/sample/sampler.py   | 12 ++++++--
 7 files changed, 89 insertions(+), 26 deletions(-)
 create mode 100644 vllm/v1/worker/gpu/metrics/__init__.py
 create mode 100644 vllm/v1/worker/gpu/metrics/logits.py
 create mode 100644 vllm/v1/worker/gpu/sample/output.py

diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py
index f6bc607c1..a2e3decad 100644
--- a/vllm/v1/worker/gpu/async_utils.py
+++ b/vllm/v1/worker/gpu/async_utils.py
@@ -2,14 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import contextmanager
 
+import numpy as np
 import torch
 
 from vllm.v1.outputs import (
     AsyncModelRunnerOutput,
     LogprobsTensors,
     ModelRunnerOutput,
-    SamplerOutput,
 )
+from vllm.v1.worker.gpu.sample.output import SamplerOutput
 
 
 class AsyncOutput(AsyncModelRunnerOutput):
@@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
         with torch.cuda.stream(self.copy_stream):
             self.copy_stream.wait_stream(default_stream)
 
-            # NOTE(woosuk): We must ensure that CPU tensors are not freed
-            # before the device-to-host copy is fully completed. For instance,
-            # operations like
-            # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
-            # are unsafe because the underlying CPU tensor can be prematurely freed and
-            # reused by other tensors before the asynchronous copy finishes, potentially
-            # causing race conditions. To prevent this, we delay freeing by holding
-            # references until the copy event signals completion.
-            # Likewise, we also need to keep the reference to the GPU tensors.
-            # This is done by keeping the reference to sampler_output and
-            # model_runner_output.
-            self.sampled_token_ids = sampler_output.sampled_token_ids.to(
-                "cpu", non_blocking=True
-            )
+            self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids)
             if sampler_output.logprobs_tensors is not None:
                 self.logprobs_tensors: LogprobsTensors | None = (
                     sampler_output.logprobs_tensors.to_cpu_nonblocking()
                 )
             else:
                 self.logprobs_tensors = None
-            self.num_sampled_tokens_cpu = num_sampled_tokens.to(
-                "cpu", non_blocking=True
-            )
+            if sampler_output.num_nans is not None:
+                self.num_nans = async_copy_to_np(sampler_output.num_nans)
+            else:
+                self.num_nans = None
+            self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens)
             self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
             if self.model_runner_output.prompt_logprobs_dict:
                 for k, v in self.model_runner_output.prompt_logprobs_dict.items():
@@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput):
 
     def get_output(self) -> ModelRunnerOutput:
         self.copy_event.synchronize()
-        num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy()
 
         # NOTE(woosuk): The following code is to ensure compatibility with
         # the existing model runner.
@@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
         # rather than Python lists.
         sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist()
         num_reqs = len(sampled_token_ids)
+        num_sampled_tokens = self.num_sampled_tokens_np.tolist()
         for i in range(num_reqs):
-            del sampled_token_ids[i][num_sampled_tokens_np[i] :]
+            del sampled_token_ids[i][num_sampled_tokens[i] :]
         self.model_runner_output.sampled_token_ids = sampled_token_ids
 
+        if self.num_nans is not None:
+            num_nans = self.num_nans.tolist()
+            self.model_runner_output.num_nans_in_logits = {
+                req_id: num_nans[i]
+                for i, req_id in enumerate(self.model_runner_output.req_ids)
+            }
+
         if self.logprobs_tensors is not None:
             self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
         self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
@@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None):
     finally:
         if event is not None:
             event.record()
+
+
+def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
+    return x.to("cpu", non_blocking=True).numpy()
diff --git a/vllm/v1/worker/gpu/metrics/__init__.py b/vllm/v1/worker/gpu/metrics/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/v1/worker/gpu/metrics/logits.py b/vllm/v1/worker/gpu/metrics/logits.py
new file mode 100644
index 000000000..fd7b30bea
--- /dev/null
+++ b/vllm/v1/worker/gpu/metrics/logits.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch._inductor.runtime.triton_helpers import libdevice
+
+from vllm.triton_utils import tl, triton
+
+
+@triton.jit
+def _num_nans_kernel(
+    logits_ptr,
+    logits_stride,
+    num_nans_ptr,
+    vocab_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+    req_idx = tl.program_id(0)
+    num_nans = 0
+    for i in range(0, vocab_size, BLOCK_SIZE):
+        block = i + tl.arange(0, BLOCK_SIZE)
+        mask = block < vocab_size
+        logits = tl.load(
+            logits_ptr + req_idx * logits_stride + block, mask=mask, other=0
+        )
+        logits = logits.to(tl.float32)
+        is_nan = libdevice.isnan(logits).to(tl.int1)
+        num_nans += tl.sum(is_nan).to(tl.int32)
+    tl.store(num_nans_ptr + req_idx, num_nans)
+
+
+def get_num_nans(logits: torch.Tensor) -> torch.Tensor:
+    num_reqs, vocab_size = logits.shape
+    BLOCK_SIZE = 8192
+    num_nans = torch.empty(num_reqs, dtype=torch.int32, device=logits.device)
+    _num_nans_kernel[(num_reqs,)](
+        logits,
+        logits.stride(0),
+        num_nans,
+        vocab_size,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return num_nans
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 464f7b7bd..9f4c6edfb 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -25,7 +25,6 @@ from vllm.v1.outputs import (
     LogprobsTensors,
     ModelRunnerOutput,
 )
-from vllm.v1.sample.sampler import SamplerOutput
 from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier
 from vllm.v1.worker.gpu.attn_utils import (
     build_attn_metadata,
@@ -53,6 +52,7 @@ from vllm.v1.worker.gpu.sample.metadata import (
     SamplingMetadata,
     expand_sampling_metadata,
 )
+from vllm.v1.worker.gpu.sample.output import SamplerOutput
 from vllm.v1.worker.gpu.sample.sampler import Sampler
 from vllm.v1.worker.gpu.spec_decode import init_speculator
 from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
diff --git a/vllm/v1/worker/gpu/sample/min_p.py b/vllm/v1/worker/gpu/sample/min_p.py
index 063881800..c98a42cb2 100644
--- a/vllm/v1/worker/gpu/sample/min_p.py
+++ b/vllm/v1/worker/gpu/sample/min_p.py
@@ -39,9 +39,7 @@ def _min_p_kernel(
         tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask)
 
 
-def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None:
-    if min_p is None:
-        return
+def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None:
     num_reqs, vocab_size = logits.shape
     BLOCK_SIZE = 1024
     _min_p_kernel[(num_reqs,)](
diff --git a/vllm/v1/worker/gpu/sample/output.py b/vllm/v1/worker/gpu/sample/output.py
new file mode 100644
index 000000000..13e8cf1d6
--- /dev/null
+++ b/vllm/v1/worker/gpu/sample/output.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+
+import torch
+
+from vllm.v1.outputs import LogprobsTensors
+
+
+@dataclass
+class SamplerOutput:
+    sampled_token_ids: torch.Tensor
+    logprobs_tensors: LogprobsTensors | None
+    num_nans: torch.Tensor | None
diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py
index 9a4224d8f..84a3e1867 100644
--- a/vllm/v1/worker/gpu/sample/sampler.py
+++ b/vllm/v1/worker/gpu/sample/sampler.py
@@ -3,13 +3,15 @@
 
 import torch
 
+import vllm.envs as envs
 from vllm.config.model import LogprobsMode
-from vllm.v1.outputs import SamplerOutput
 from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
+from vllm.v1.worker.gpu.metrics.logits import get_num_nans
 from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
 from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs
 from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.sample.min_p import apply_min_p
+from vllm.v1.worker.gpu.sample.output import SamplerOutput
 from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature
 
 
@@ -21,12 +23,16 @@ class Sampler:
         if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]:
             raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}")
         self.logprobs_mode = logprobs_mode
+        self.compute_nans = envs.VLLM_COMPUTE_NANS_IN_LOGITS  # False by default.
 
     def __call__(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> SamplerOutput:
+        # NOTE(woosuk): We intentionally compute num_nans before sampling to make clear
+        # that num_nans is computed before applying penalties and temperature.
+        num_nans = get_num_nans(logits) if self.compute_nans else None
         sampled, processed_logits = self.sample(logits, sampling_metadata)
         if sampling_metadata.max_num_logprobs is not None:
             logits = (
@@ -49,6 +55,7 @@ class Sampler:
             # token per request.
             sampled_token_ids=sampled.view(-1, 1),
             logprobs_tensors=logprobs_tensors,
+            num_nans=num_nans,
         )
         return sampler_output
 
@@ -63,7 +70,8 @@ class Sampler:
         # Apply penalties and temperature in place.
         apply_penalties_and_temperature(logits, sampling_metadata)
         # Apply min_p in place.
-        apply_min_p(logits, sampling_metadata.min_p)
+        if sampling_metadata.min_p is not None:
+            apply_min_p(logits, sampling_metadata.min_p)
         # Apply top_k and/or top_p. This might return a new tensor.
         logits = apply_top_k_top_p(
             logits, sampling_metadata.top_k, sampling_metadata.top_p
-- 
GitLab


From e858bfe05167a3bbb064e283da5a1a7709dee24e Mon Sep 17 00:00:00 2001
From: Benjamin Chislett <bchislett@nvidia.com>
Date: Tue, 9 Dec 2025 13:29:33 -0500
Subject: [PATCH 241/258] [Cleanup] Refactor profiling env vars into a CLI
 config (#29912)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 benchmarks/auto_tune/auto_tune.sh             |   5 +-
 .../benchmark_serving_structured_output.py    |   3 +-
 docs/api/README.md                            |   1 +
 docs/contributing/profiling.md                |  23 +-
 .../offline_inference/simple_profiling.py     |  13 +-
 tests/v1/worker/test_gpu_profiler.py          |  71 ++++---
 vllm/benchmarks/latency.py                    |  12 +-
 vllm/benchmarks/serve.py                      |   3 +-
 vllm/benchmarks/throughput.py                 |   3 +-
 vllm/config/__init__.py                       |   3 +
 vllm/config/profiler.py                       | 199 ++++++++++++++++++
 vllm/config/vllm.py                           |   5 +
 vllm/engine/arg_utils.py                      |   6 +-
 vllm/entrypoints/llm.py                       |  17 ++
 vllm/entrypoints/serve/profile/api_router.py  |  17 +-
 vllm/envs.py                                  | 118 +++++------
 vllm/profiler/{gpu_profiler.py => wrapper.py} |  72 ++++---
 vllm/v1/engine/async_llm.py                   |  22 +-
 vllm/v1/worker/cpu_worker.py                  |  36 +---
 vllm/v1/worker/gpu_worker.py                  |  18 +-
 vllm/v1/worker/tpu_worker.py                  |   4 +-
 vllm/v1/worker/xpu_worker.py                  |  42 +---
 22 files changed, 437 insertions(+), 256 deletions(-)
 create mode 100644 vllm/config/profiler.py
 rename vllm/profiler/{gpu_profiler.py => wrapper.py} (73%)

diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 56b721cbb..25baa9cbd 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -96,8 +96,9 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
-            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+        local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+        VLLM_SERVER_DEV_MODE=1 \
+            vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
         VLLM_SERVER_DEV_MODE=1 \
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index df122b4c5..a4e1b163d 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -963,8 +963,7 @@ def create_argument_parser():
     parser.add_argument(
         "--profile",
         action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
     )
     parser.add_argument(
         "--result-dir",
diff --git a/docs/api/README.md b/docs/api/README.md
index d3a141f32..d51329ec2 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.MultiModalConfig][]
 - [vllm.config.PoolerConfig][]
 - [vllm.config.StructuredOutputsConfig][]
+- [vllm.config.ProfilerConfig][]
 - [vllm.config.ObservabilityConfig][]
 - [vllm.config.KVTransferConfig][]
 - [vllm.config.CompilationConfig][]
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index 65382afbe..cbce14ce9 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -5,16 +5,15 @@
 
 ## Profile with PyTorch Profiler
 
-We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables:
+We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
+when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
 
-- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default
-- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
-- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
-- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
-- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
-- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default
-
-The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
+- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
+- `torch_profiler_with_memory` to record memory, off by default
+- `torch_profiler_with_stack` to enable recording stack information, on by default
+- `torch_profiler_with_flops` to enable recording FLOPs, off by default
+- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default
+- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default
 
 When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
 
@@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
 #### OpenAI Server
 
 ```bash
-VLLM_TORCH_PROFILER_DIR=./vllm_profile \
-    vllm serve meta-llama/Llama-3.1-8B-Instruct
+vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
 ```
 
 vllm bench command:
@@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with `
 
 ```bash
 # server
-VLLM_TORCH_CUDA_PROFILE=1 \
 nsys profile \
     --trace-fork-before-exec=true \
     --cuda-graph-trace=node \
     --capture-range=cudaProfilerApi \
     --capture-range-end repeat \
-    vllm serve meta-llama/Llama-3.1-8B-Instruct
+    vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda
 
 # client
 vllm bench serve \
diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py
index 46858fffa..e8a75cd03 100644
--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
@@ -1,14 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
 import time
 
 from vllm import LLM, SamplingParams
 
-# enable torch profiler, can also be set on cmd line
-os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
-
 # Sample prompts.
 prompts = [
     "Hello, my name is",
@@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 def main():
     # Create an LLM.
-    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
+    llm = LLM(
+        model="facebook/opt-125m",
+        tensor_parallel_size=1,
+        profiler_config={
+            "profiler": "torch",
+            "torch_profiler_dir": "./vllm_profile",
+        },
+    )
 
     llm.start_profile()
 
diff --git a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py
index f7255fae0..933ea42f1 100644
--- a/tests/v1/worker/test_gpu_profiler.py
+++ b/tests/v1/worker/test_gpu_profiler.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-import vllm.envs as envs
-from vllm.profiler.gpu_profiler import WorkerProfiler
+from vllm.config import ProfilerConfig
+from vllm.profiler.wrapper import WorkerProfiler
 
 
 class ConcreteWorkerProfiler(WorkerProfiler):
@@ -11,11 +11,11 @@ class ConcreteWorkerProfiler(WorkerProfiler):
     A basic implementation of a worker profiler for testing purposes.
     """
 
-    def __init__(self):
+    def __init__(self, profiler_config: ProfilerConfig):
         self.start_call_count = 0
         self.stop_call_count = 0
         self.should_fail_start = False
-        super().__init__()
+        super().__init__(profiler_config)
 
     def _start(self) -> None:
         if self.should_fail_start:
@@ -26,17 +26,19 @@ class ConcreteWorkerProfiler(WorkerProfiler):
         self.stop_call_count += 1
 
 
-@pytest.fixture(autouse=True)
-def reset_mocks():
-    """Fixture to reset mocks and env variables before each test."""
-    envs.VLLM_PROFILER_DELAY_ITERS = 0
-    envs.VLLM_PROFILER_MAX_ITERS = 0
+@pytest.fixture
+def default_profiler_config():
+    return ProfilerConfig(
+        profiler="torch",
+        torch_profiler_dir="/tmp/mock",
+        delay_iterations=0,
+        max_iterations=0,
+    )
 
 
-def test_immediate_start_stop():
+def test_immediate_start_stop(default_profiler_config):
     """Test standard start without delay."""
-    profiler = ConcreteWorkerProfiler()
-
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
     profiler.start()
     assert profiler._running is True
     assert profiler._active is True
@@ -48,10 +50,10 @@ def test_immediate_start_stop():
     assert profiler.stop_call_count == 1
 
 
-def test_delayed_start():
+def test_delayed_start(default_profiler_config):
     """Test that profiler waits for N steps before actually starting."""
-    envs.VLLM_PROFILER_DELAY_ITERS = 2
-    profiler = ConcreteWorkerProfiler()
+    default_profiler_config.delay_iterations = 2
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
 
     # User requests start
     profiler.start()
@@ -71,10 +73,10 @@ def test_delayed_start():
     assert profiler.start_call_count == 1
 
 
-def test_max_iterations():
+def test_max_iterations(default_profiler_config):
     """Test that profiler stops automatically after max iterations."""
-    envs.VLLM_PROFILER_MAX_ITERS = 2
-    profiler = ConcreteWorkerProfiler()
+    default_profiler_config.max_iterations = 2
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
 
     profiler.start()
     assert profiler._running is True
@@ -95,12 +97,11 @@ def test_max_iterations():
     assert profiler.stop_call_count == 1
 
 
-def test_delayed_start_and_max_iters():
+def test_delayed_start_and_max_iters(default_profiler_config):
     """Test combined delayed start and max iterations."""
-    envs.VLLM_PROFILER_DELAY_ITERS = 2
-    envs.VLLM_PROFILER_MAX_ITERS = 2
-    profiler = ConcreteWorkerProfiler()
-
+    default_profiler_config.delay_iterations = 2
+    default_profiler_config.max_iterations = 2
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
     profiler.start()
 
     # Step 1
@@ -127,9 +128,9 @@ def test_delayed_start_and_max_iters():
     assert profiler.stop_call_count == 1
 
 
-def test_idempotency():
+def test_idempotency(default_profiler_config):
     """Test that calling start/stop multiple times doesn't break logic."""
-    profiler = ConcreteWorkerProfiler()
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
 
     # Double Start
     profiler.start()
@@ -142,10 +143,10 @@ def test_idempotency():
     assert profiler.stop_call_count == 1  # Should only stop once
 
 
-def test_step_inactive():
+def test_step_inactive(default_profiler_config):
     """Test that stepping while inactive does nothing."""
-    envs.VLLM_PROFILER_DELAY_ITERS = 2
-    profiler = ConcreteWorkerProfiler()
+    default_profiler_config.delay_iterations = 2
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
 
     # Not started yet
     profiler.step()
@@ -155,9 +156,9 @@ def test_step_inactive():
     assert profiler.start_call_count == 0
 
 
-def test_start_failure():
+def test_start_failure(default_profiler_config):
     """Test behavior when the underlying _start method raises exception."""
-    profiler = ConcreteWorkerProfiler()
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
     profiler.should_fail_start = True
 
     profiler.start()
@@ -168,9 +169,9 @@ def test_start_failure():
     assert profiler.start_call_count == 0  # Logic failed inside start
 
 
-def test_shutdown():
+def test_shutdown(default_profiler_config):
     """Test that shutdown calls stop only if running."""
-    profiler = ConcreteWorkerProfiler()
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
 
     # Case 1: Not running
     profiler.shutdown()
@@ -182,10 +183,10 @@ def test_shutdown():
     assert profiler.stop_call_count == 1
 
 
-def test_mixed_delay_and_stop():
+def test_mixed_delay_and_stop(default_profiler_config):
     """Test manual stop during the delay period."""
-    envs.VLLM_PROFILER_DELAY_ITERS = 5
-    profiler = ConcreteWorkerProfiler()
+    default_profiler_config.delay_iterations = 5
+    profiler = ConcreteWorkerProfiler(default_profiler_config)
 
     profiler.start()
     profiler.step()
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index b4f175183..99c1c846f 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -12,7 +12,6 @@ from typing import Any
 import numpy as np
 from tqdm import tqdm
 
-import vllm.envs as envs
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
@@ -79,12 +78,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
 
 
 def main(args: argparse.Namespace):
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
     engine_args = EngineArgs.from_cli_args(args)
+    if args.profile and not engine_args.profiler_config.profiler == "torch":
+        raise ValueError(
+            "The torch profiler is not enabled. Please provide profiler_config."
+        )
 
     # Lazy import to avoid importing LLM when the bench command is not selected.
     from vllm import LLM, SamplingParams
@@ -144,7 +142,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)
 
     if args.profile:
-        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
+        profile_dir = engine_args.profiler_config.torch_profiler_dir
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 568290aa8..2e2054a8a 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1097,8 +1097,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--profile",
         action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
     )
     parser.add_argument(
         "--save-result",
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index ea693613f..d824e982b 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -655,8 +655,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--profile",
         action="store_true",
         default=False,
-        help="Use Torch Profiler. The env variable "
-        "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
     )
 
     # prefix repetition dataset
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 0f84f3ca9..0e91dd574 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig
 from vllm.config.observability import ObservabilityConfig
 from vllm.config.parallel import EPLBConfig, ParallelConfig
 from vllm.config.pooler import PoolerConfig
+from vllm.config.profiler import ProfilerConfig
 from vllm.config.scheduler import SchedulerConfig
 from vllm.config.speculative import SpeculativeConfig
 from vllm.config.speech_to_text import SpeechToTextConfig
@@ -89,6 +90,8 @@ __all__ = [
     "SpeechToTextConfig",
     # From vllm.config.structured_outputs
     "StructuredOutputsConfig",
+    # From vllm.config.profiler
+    "ProfilerConfig",
     # From vllm.config.utils
     "ConfigType",
     "SupportsMetricsInfo",
diff --git a/vllm/config/profiler.py b/vllm/config/profiler.py
new file mode 100644
index 000000000..76cc546f3
--- /dev/null
+++ b/vllm/config/profiler.py
@@ -0,0 +1,199 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from typing import Any, Literal
+
+from pydantic import Field, model_validator
+from pydantic.dataclasses import dataclass
+from typing_extensions import Self
+
+import vllm.envs as envs
+from vllm.config.utils import config
+from vllm.logger import init_logger
+from vllm.utils.hashing import safe_hash
+
+logger = init_logger(__name__)
+
+ProfilerKind = Literal["torch", "cuda"]
+
+
+@config
+@dataclass
+class ProfilerConfig:
+    """Dataclass which contains profiler config for the engine."""
+
+    profiler: ProfilerKind | None = None
+    """Which profiler to use. Defaults to None. Options are:
+
+    - 'torch': Use PyTorch profiler.\n
+    - 'cuda': Use CUDA profiler."""
+
+    torch_profiler_dir: str = ""
+    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
+    worker's traces (CPU & GPU) will be saved under this directory. Note that
+    it must be an absolute path."""
+
+    torch_profiler_with_stack: bool = True
+    """If `True`, enables stack tracing in the torch profiler. Enabled by default."""
+
+    torch_profiler_with_flops: bool = False
+    """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""
+
+    torch_profiler_use_gzip: bool = True
+    """If `True`, saves torch profiler traces in gzip format. Enabled by default"""
+
+    torch_profiler_dump_cuda_time_total: bool = True
+    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""
+
+    torch_profiler_record_shapes: bool = False
+    """If `True`, records tensor shapes in the torch profiler. Disabled by default."""
+
+    torch_profiler_with_memory: bool = False
+    """If `True`, enables memory profiling in the torch profiler.
+    Disabled by default."""
+
+    ignore_frontend: bool = False
+    """If `True`, disables the front-end profiling of AsyncLLM when using the 
+    'torch' profiler. This is needed to reduce overhead when using delay/limit options,
+    since the front-end profiling does not track iterations and will capture the
+    entire range.
+    """
+
+    delay_iterations: int = Field(default=0, ge=0)
+    """Number of engine iterations to skip before starting profiling.
+    Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
+    """
+
+    max_iterations: int = Field(default=0, ge=0)
+    """Maximum number of engine iterations to profile after starting profiling.
+    Defaults to 0, meaning no limit.
+    """
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> None:
+        """Get field from env var if set, with deprecation warning."""
+
+        if envs.is_set(env_var_name):
+            value = getattr(envs, env_var_name)
+            logger.warning_once(
+                "Using %s environment variable is deprecated and will be removed in "
+                "v0.14.0 or v1.0.0, whichever is soonest. Please use "
+                "--profiler-config.%s command line argument or "
+                "ProfilerConfig(%s=...) config field instead.",
+                env_var_name,
+                field_name,
+                field_name,
+            )
+            return value
+        return None
+
+    def _set_from_env_if_set(
+        self,
+        field_name: str,
+        env_var_name: str,
+        to_bool: bool = True,
+        to_int: bool = False,
+    ) -> None:
+        """Set field from env var if set, with deprecation warning."""
+        value = self._get_from_env_if_set(field_name, env_var_name)
+        if value is not None:
+            if to_bool:
+                value = value == "1"
+            if to_int:
+                value = int(value)
+            setattr(self, field_name, value)
+
+    @model_validator(mode="after")
+    def _validate_profiler_config(self) -> Self:
+        maybe_use_cuda_profiler = self._get_from_env_if_set(
+            "profiler", "VLLM_TORCH_CUDA_PROFILE"
+        )
+        if maybe_use_cuda_profiler is not None:
+            self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
+        else:
+            self._set_from_env_if_set(
+                "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
+            )
+            if self.torch_profiler_dir:
+                self.profiler = "torch"
+                self._set_from_env_if_set(
+                    "torch_profiler_record_shapes",
+                    "VLLM_TORCH_PROFILER_RECORD_SHAPES",
+                )
+                self._set_from_env_if_set(
+                    "torch_profiler_with_memory",
+                    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
+                )
+                self._set_from_env_if_set(
+                    "torch_profiler_with_stack",
+                    "VLLM_TORCH_PROFILER_WITH_STACK",
+                )
+                self._set_from_env_if_set(
+                    "torch_profiler_with_flops",
+                    "VLLM_TORCH_PROFILER_WITH_FLOPS",
+                )
+                self._set_from_env_if_set(
+                    "ignore_frontend",
+                    "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
+                )
+                self._set_from_env_if_set(
+                    "torch_profiler_use_gzip",
+                    "VLLM_TORCH_PROFILER_USE_GZIP",
+                )
+                self._set_from_env_if_set(
+                    "torch_profiler_dump_cuda_time_total",
+                    "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
+                )
+
+        self._set_from_env_if_set(
+            "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
+        )
+        self._set_from_env_if_set(
+            "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
+        )
+
+        has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
+        if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
+            logger.warning_once(
+                "Using 'torch' profiler with delay_iterations or max_iterations "
+                "while ignore_frontend is False may result in high overhead."
+            )
+
+        profiler_dir = self.torch_profiler_dir
+        if profiler_dir and self.profiler != "torch":
+            raise ValueError(
+                "torch_profiler_dir is only applicable when profiler is set to 'torch'"
+            )
+        if self.profiler == "torch" and not profiler_dir:
+            raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")
+
+        if profiler_dir:
+            is_gs_path = (
+                profiler_dir.startswith("gs://")
+                and profiler_dir[5:]
+                and profiler_dir[5] != "/"
+            )
+            if not is_gs_path:
+                self.torch_profiler_dir = os.path.abspath(
+                    os.path.expanduser(profiler_dir)
+                )
+
+        return self
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index a74413536..614a3226c 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -39,6 +39,7 @@ from .lora import LoRAConfig
 from .model import ModelConfig
 from .observability import ObservabilityConfig
 from .parallel import ParallelConfig
+from .profiler import ProfilerConfig
 from .scheduler import SchedulerConfig
 from .speculative import SpeculativeConfig
 from .structured_outputs import StructuredOutputsConfig
@@ -218,6 +219,8 @@ class VllmConfig:
     You can specify the full compilation config like so:
     `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
     """
+    profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig)
+    """Profiling configuration."""
     kv_transfer_config: KVTransferConfig | None = None
     """The configurations for distributed KV cache transfer."""
     kv_events_config: KVEventsConfig | None = None
@@ -296,6 +299,8 @@ class VllmConfig:
             vllm_factors.append("None")
         if self.structured_outputs_config:
             vllm_factors.append(self.structured_outputs_config.compute_hash())
+        if self.profiler_config:
+            vllm_factors.append(self.profiler_config.compute_hash())
         else:
             vllm_factors.append("None")
         vllm_factors.append(self.observability_config.compute_hash())
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ceac5407a..2f307a7cc 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -50,6 +50,7 @@ from vllm.config import (
     ObservabilityConfig,
     ParallelConfig,
     PoolerConfig,
+    ProfilerConfig,
     SchedulerConfig,
     SpeculativeConfig,
     StructuredOutputsConfig,
@@ -532,6 +533,8 @@ class EngineArgs:
     worker_cls: str = ParallelConfig.worker_cls
     worker_extension_cls: str = ParallelConfig.worker_extension_cls
 
+    profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config")
+
     kv_transfer_config: KVTransferConfig | None = None
     kv_events_config: KVEventsConfig | None = None
 
@@ -1164,7 +1167,7 @@ class EngineArgs:
         vllm_group.add_argument(
             "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
         )
-
+        vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"])
         vllm_group.add_argument(
             "--optimization-level", **vllm_kwargs["optimization_level"]
         )
@@ -1782,6 +1785,7 @@ class EngineArgs:
             kv_transfer_config=self.kv_transfer_config,
             kv_events_config=self.kv_events_config,
             ec_transfer_config=self.ec_transfer_config,
+            profiler_config=self.profiler_config,
             additional_config=self.additional_config,
             optimization_level=self.optimization_level,
         )
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 913324fd5..5d5c4a1cd 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -20,6 +20,7 @@ from vllm.beam_search import (
 from vllm.config import (
     CompilationConfig,
     PoolerConfig,
+    ProfilerConfig,
     StructuredOutputsConfig,
     is_init_field,
 )
@@ -211,6 +212,7 @@ class LLM:
         structured_outputs_config: dict[str, Any]
         | StructuredOutputsConfig
         | None = None,
+        profiler_config: dict[str, Any] | ProfilerConfig | None = None,
         kv_cache_memory_bytes: int | None = None,
         compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
         logits_processors: list[str | type[LogitsProcessor]] | None = None,
@@ -282,6 +284,20 @@ class LLM:
         else:
             structured_outputs_instance = StructuredOutputsConfig()
 
+        if profiler_config is not None:
+            if isinstance(profiler_config, dict):
+                profiler_config_instance = ProfilerConfig(
+                    **{
+                        k: v
+                        for k, v in profiler_config.items()
+                        if is_init_field(ProfilerConfig, k)
+                    }
+                )
+            else:
+                profiler_config_instance = profiler_config
+        else:
+            profiler_config_instance = ProfilerConfig()
+
         # warn about single-process data parallel usage.
         _dp_size = int(kwargs.get("data_parallel_size", 1))
         _distributed_executor_backend = kwargs.get("distributed_executor_backend")
@@ -324,6 +340,7 @@ class LLM:
             mm_processor_kwargs=mm_processor_kwargs,
             pooler_config=pooler_config,
             structured_outputs_config=structured_outputs_instance,
+            profiler_config=profiler_config_instance,
             compilation_config=compilation_config_instance,
             logits_processors=logits_processors,
             **kwargs,
diff --git a/vllm/entrypoints/serve/profile/api_router.py b/vllm/entrypoints/serve/profile/api_router.py
index 166f13764..eeed6b45e 100644
--- a/vllm/entrypoints/serve/profile/api_router.py
+++ b/vllm/entrypoints/serve/profile/api_router.py
@@ -5,7 +5,7 @@
 from fastapi import APIRouter, FastAPI, Request
 from fastapi.responses import Response
 
-import vllm.envs as envs
+from vllm.config import ProfilerConfig
 from vllm.engine.protocol import EngineClient
 from vllm.logger import init_logger
 
@@ -35,15 +35,12 @@ async def stop_profile(raw_request: Request):
 
 
 def attach_router(app: FastAPI):
-    if envs.VLLM_TORCH_PROFILER_DIR:
+    profiler_config = getattr(app.state.args, "profiler_config", None)
+    assert profiler_config is None or isinstance(profiler_config, ProfilerConfig)
+    if profiler_config is not None and profiler_config.profiler is not None:
         logger.warning_once(
-            "Torch Profiler is enabled in the API server. This should ONLY be "
-            "used for local development!"
+            "Profiler with mode '%s' is enabled in the "
+            "API server. This should ONLY be used for local development!",
+            profiler_config.profiler,
         )
-    elif envs.VLLM_TORCH_CUDA_PROFILE:
-        logger.warning_once(
-            "CUDA Profiler is enabled in the API server. This should ONLY be "
-            "used for local development!"
-        )
-    if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
         app.include_router(router)
diff --git a/vllm/envs.py b/vllm/envs.py
index bda9e6e42..8246109eb 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -89,20 +89,23 @@ if TYPE_CHECKING:
     VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: list[str] | None = None
     VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
-    VLLM_TORCH_CUDA_PROFILE: bool = False
+    # Deprecated env variables for profiling, kept for backward compatibility
+    # See also vllm/config/profiler.py and `--profiler-config` argument
+    VLLM_TORCH_CUDA_PROFILE: str | None = None
     VLLM_TORCH_PROFILER_DIR: str | None = None
-    VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
-    VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
-    VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False
+    VLLM_TORCH_PROFILER_RECORD_SHAPES: str | None = None
+    VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: str | None = None
+    VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: str | None = None
+    VLLM_TORCH_PROFILER_WITH_STACK: str | None = None
+    VLLM_TORCH_PROFILER_WITH_FLOPS: str | None = None
+    VLLM_TORCH_PROFILER_USE_GZIP: str | None = None
+    VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: str | None = None
+    VLLM_PROFILER_DELAY_ITERS: str | None = None
+    VLLM_PROFILER_MAX_ITERS: str | None = None
+    # End of deprecated env variables for profiling
     VLLM_USE_AOT_COMPILE: bool = False
     VLLM_USE_BYTECODE_HOOK: bool = False
     VLLM_FORCE_AOT_LOAD: bool = False
-    VLLM_TORCH_PROFILER_WITH_STACK: bool = True
-    VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
-    VLLM_PROFILER_DELAY_ITERS: int = 0
-    VLLM_PROFILER_MAX_ITERS: int = 0
-    VLLM_TORCH_PROFILER_USE_GZIP: bool = True
-    VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True
     VLLM_USE_TRITON_AWQ: bool = False
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
@@ -850,71 +853,52 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
         "VLLM_LORA_RESOLVER_CACHE_DIR", None
     ),
-    # Enables torch CUDA profiling if set.
-    # On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered.
-    "VLLM_TORCH_CUDA_PROFILE": lambda: bool(
-        os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0"
-    ),
+    # Enables torch CUDA profiling if set to 1.
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),
     # Enables torch profiler if set.
-    # Both AsyncLLM's CPU traces as well as workers'
-    # traces (CPU & GPU) will be saved under this directory.
-    # Note that it must be an absolute path.
-    "VLLM_TORCH_PROFILER_DIR": lambda: (
-        None
-        if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None
-        else (
-            val
-            if val.startswith("gs://") and val[5:] and val[5] != "/"
-            else os.path.abspath(os.path.expanduser(val))
-        )
-    ),
-    # Enable torch profiler to record shapes if set
-    # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will
-    # not record shapes.
-    "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"
-    ),
-    # Enable torch profiler to profile memory if set
-    # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler
-    # will not profile memory.
-    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"
-    ),
-    # Enable torch profiler to profile stack if set
-    # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL
-    # profile stack by default.
-    "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
-    ),
-    # Enable torch profiler to profile flops if set
-    # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will
-    # not profile flops.
-    "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
-    ),
-    # Disable torch profiling of the AsyncLLMEngine process.
-    # If set to 1, will not profile the engine process.
-    "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0"
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"),
+    # Enable torch profiler to record shapes if set to 1.
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: (
+        os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES")
+    ),
+    # Enable torch profiler to profile memory if set to 1.
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: (
+        os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY")
+    ),
+    # Enable torch profiler to profile stack if set to 1.
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_WITH_STACK": lambda: (
+        os.getenv("VLLM_TORCH_PROFILER_WITH_STACK")
+    ),
+    # Enable torch profiler to profile flops if set to 1.
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: (
+        os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS")
+    ),
+    # Disable torch profiling of the AsyncLLMEngine process if set to 1.
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: (
+        os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM")
     ),
     # Delay number of iterations before starting profiling when using
     # the torch/torch CUDA profiler. If set to 0, will start profiling immediately.
-    "VLLM_PROFILER_DELAY_ITERS": lambda: int(
-        os.getenv("VLLM_PROFILER_DELAY_ITERS", "0")
-    ),
+    # Deprecated, see profiler_config.
+    "VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")),
     # Maximum number of iterations to profile when using the torch/torch CUDA profiler.
     # If set to 0, will not limit the number of iterations.
-    "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")),
+    "VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"),
     # Control whether torch profiler gzip-compresses profiling files.
-    # Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default).
-    "VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0"
-    ),
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"),
     # Control whether torch profiler dumps the self_cuda_time_total table.
-    # Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping
-    # (enabled by default).
-    "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool(
-        os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0"
+    # Set to 0 to disable dumping the table.
+    # Deprecated, see profiler_config.
+    "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: (
+        os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL")
     ),
     # If set, vLLM will use Triton implementations of AWQ.
     "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/wrapper.py
similarity index 73%
rename from vllm/profiler/gpu_profiler.py
rename to vllm/profiler/wrapper.py
index 798c61522..a44a6a5ee 100644
--- a/vllm/profiler/gpu_profiler.py
+++ b/vllm/profiler/wrapper.py
@@ -3,26 +3,27 @@
 
 from abc import ABC, abstractmethod
 from contextlib import nullcontext
+from typing import Literal
 
 import torch
 from typing_extensions import override
 
-import vllm.envs as envs
+from vllm.config import ProfilerConfig
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
 class WorkerProfiler(ABC):
-    def __init__(self) -> None:
-        self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS
+    def __init__(self, profiler_config: ProfilerConfig) -> None:
+        self._delay_iters = profiler_config.delay_iterations
         if self._delay_iters > 0:
             logger.info_once(
                 "GPU profiling will start "
                 f"{self._delay_iters} steps after start_profile."
             )
 
-        self._max_iters = envs.VLLM_PROFILER_MAX_ITERS
+        self._max_iters = profiler_config.max_iterations
         if self._max_iters > 0:
             logger.info_once(
                 "GPU profiling will stop "
@@ -133,12 +134,27 @@ class WorkerProfiler(ABC):
         return nullcontext()
 
 
+TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"]
+TorchProfilerActivityMap = {
+    "CPU": torch.profiler.ProfilerActivity.CPU,
+    "CUDA": torch.profiler.ProfilerActivity.CUDA,
+    "XPU": torch.profiler.ProfilerActivity.XPU,
+}
+
+
 class TorchProfilerWrapper(WorkerProfiler):
-    def __init__(self, worker_name: str, local_rank: int) -> None:
-        super().__init__()
+    def __init__(
+        self,
+        profiler_config: ProfilerConfig,
+        worker_name: str,
+        local_rank: int,
+        activities: list[TorchProfilerActivity],
+    ) -> None:
+        super().__init__(profiler_config)
 
         self.local_rank = local_rank
-        torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+        self.profiler_config = profiler_config
+        torch_profiler_trace_dir = profiler_config.torch_profiler_dir
         if local_rank in (None, 0):
             logger.info(
                 "Torch profiling enabled. Traces will be saved to: %s",
@@ -147,24 +163,23 @@ class TorchProfilerWrapper(WorkerProfiler):
             logger.debug(
                 "Profiler config: record_shapes=%s,"
                 "profile_memory=%s,with_stack=%s,with_flops=%s",
-                envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-                envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-                envs.VLLM_TORCH_PROFILER_WITH_STACK,
-                envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
+                profiler_config.torch_profiler_record_shapes,
+                profiler_config.torch_profiler_with_memory,
+                profiler_config.torch_profiler_with_stack,
+                profiler_config.torch_profiler_with_flops,
             )
+
+        self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1
         self.profiler = torch.profiler.profile(
-            activities=[
-                torch.profiler.ProfilerActivity.CPU,
-                torch.profiler.ProfilerActivity.CUDA,
-            ],
-            record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-            profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-            with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
-            with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
+            activities=[TorchProfilerActivityMap[activity] for activity in activities],
+            record_shapes=profiler_config.torch_profiler_record_shapes,
+            profile_memory=profiler_config.torch_profiler_with_memory,
+            with_stack=profiler_config.torch_profiler_with_stack,
+            with_flops=profiler_config.torch_profiler_with_flops,
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
                 torch_profiler_trace_dir,
                 worker_name=worker_name,
-                use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
+                use_gzip=profiler_config.torch_profiler_use_gzip,
             ),
         )
 
@@ -176,9 +191,10 @@ class TorchProfilerWrapper(WorkerProfiler):
     def _stop(self) -> None:
         self.profiler.stop()
 
-        if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL:
-            rank = self.local_rank
-            profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
+        profiler_config = self.profiler_config
+        rank = self.local_rank
+        if profiler_config.torch_profiler_dump_cuda_time_total:
+            profiler_dir = profiler_config.torch_profiler_dir
             profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
             sort_key = "self_cuda_time_total"
             table = self.profiler.key_averages().table(sort_by=sort_key)
@@ -189,6 +205,12 @@ class TorchProfilerWrapper(WorkerProfiler):
             # only print profiler results on rank 0
             if rank == 0:
                 print(table)
+        if self.dump_cpu_time_total and rank == 0:
+            logger.info(
+                self.profiler.key_averages().table(
+                    sort_by="self_cpu_time_total", row_limit=50
+                )
+            )
 
     @override
     def annotate_context_manager(self, name: str):
@@ -196,8 +218,8 @@ class TorchProfilerWrapper(WorkerProfiler):
 
 
 class CudaProfilerWrapper(WorkerProfiler):
-    def __init__(self) -> None:
-        super().__init__()
+    def __init__(self, profiler_config: ProfilerConfig) -> None:
+        super().__init__(profiler_config)
         # Note: lazy import to avoid dependency issues if CUDA is not available.
         import torch.cuda.profiler as cuda_profiler
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index fd7e04dc0..931d13be3 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -166,32 +166,24 @@ class AsyncLLM(EngineClient):
             pass
 
         if (
-            envs.VLLM_TORCH_PROFILER_DIR
-            and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM
+            vllm_config.profiler_config.profiler == "torch"
+            and not vllm_config.profiler_config.ignore_frontend
         ):
+            profiler_dir = vllm_config.profiler_config.torch_profiler_dir
             logger.info(
                 "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s",  # noqa: E501
-                envs.VLLM_TORCH_PROFILER_DIR,
+                profiler_dir,
             )
-            if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0:
-                logger.warning_once(
-                    "Torch profiler received max_iters or delay_iters setting. These "
-                    "are not compatible with the AsyncLLM profiler and will be ignored "
-                    "for the AsyncLLM process. Engine process profiling will still "
-                    "respect these settings. Consider setting "
-                    "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable "
-                    "AsyncLLM profiling."
-                )
             worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
             self.profiler = torch.profiler.profile(
                 activities=[
                     torch.profiler.ProfilerActivity.CPU,
                 ],
-                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
+                with_stack=vllm_config.profiler_config.torch_profiler_with_stack,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    envs.VLLM_TORCH_PROFILER_DIR,
+                    profiler_dir,
                     worker_name=worker_name,
-                    use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
+                    use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip,
                 ),
             )
         else:
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index b080fea1d..e54b995ab 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -13,6 +13,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
+from vllm.profiler.wrapper import TorchProfilerWrapper
 from vllm.v1.worker.cpu_model_runner import CPUModelRunner
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 
@@ -38,30 +39,17 @@ class CPUWorker(Worker):
 
         self.parallel_config.disable_custom_all_reduce = True
 
-        # Torch profiler. Enabled and configured through env vars:
-        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
+        # Torch profiler. Enabled and configured through profiler_config.
         self.profiler: Any | None = None
-        if envs.VLLM_TORCH_PROFILER_DIR:
-            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+        profiler_config = vllm_config.profiler_config
+        if profiler_config.profiler == "torch":
             worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
-            logger.info(
-                "Profiling enabled. Traces will be saved to: %s",
-                torch_profiler_trace_dir,
+            self.profiler = TorchProfilerWrapper(
+                profiler_config,
+                worker_name=worker_name,
+                local_rank=self.local_rank,
+                activities=["CPU"],
             )
-            self.profiler = torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                ],
-                record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-                profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
-                with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False
-                ),
-            )
-        else:
-            self.profiler = None
 
     def init_device(self):
         # Setup OpenMP threads affinity.
@@ -202,9 +190,3 @@ class CPUWorker(Worker):
             self.profiler.start()
         else:
             self.profiler.stop()
-            if self.local_rank == 0:
-                logger.info(
-                    self.profiler.key_averages().table(
-                        sort_by="self_cpu_time_total", row_limit=50
-                    )
-                )
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 24a3533a1..f2b6a1f76 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -38,7 +38,7 @@ from vllm.model_executor import set_random_seed
 from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
 from vllm.platforms import current_platform
-from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper
+from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
 from vllm.utils.mem_constants import GiB_bytes
@@ -92,17 +92,19 @@ class Worker(WorkerBase):
         # Buffers saved before sleep
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
 
-        # Torch/CUDA profiler. Enabled and configured through env vars:
-        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
-        # VLLM_TORCH_CUDA_PROFILE=1
+        # Torch/CUDA profiler. Enabled and configured through profiler_config.
         self.profiler: Any | None = None
-        if envs.VLLM_TORCH_PROFILER_DIR:
+        profiler_config = vllm_config.profiler_config
+        if profiler_config.profiler == "torch":
             worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
             self.profiler = TorchProfilerWrapper(
-                worker_name=worker_name, local_rank=self.local_rank
+                profiler_config,
+                worker_name=worker_name,
+                local_rank=self.local_rank,
+                activities=["CPU", "CUDA"],
             )
-        elif envs.VLLM_TORCH_CUDA_PROFILE:
-            self.profiler = CudaProfilerWrapper()
+        elif profiler_config.profiler == "cuda":
+            self.profiler = CudaProfilerWrapper(profiler_config)
         else:
             self.profiler = None
 
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index ce18ca6c3..7a10ac198 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -98,10 +98,10 @@ class TPUWorker:
         # MP runtime is initialized.
         self.profiler = None
         self.profile_dir = None
-        if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:
+        if vllm_config.profiler_config.profiler == "torch" and self.rank < 1:
             # For TPU, we can only have 1 active profiler session for 1 profiler
             # server. So we only profile on rank0.
-            self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR
+            self.profile_dir = vllm_config.profiler_config.torch_profiler_dir
             logger.info(
                 "Profiling enabled. Traces will be saved to: %s", self.profile_dir
             )
diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py
index 267369c73..1faa1a24f 100644
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -6,12 +6,12 @@ from typing import Any
 import torch
 import torch.distributed
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import get_world_group
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
+from vllm.profiler.wrapper import TorchProfilerWrapper
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 from vllm.v1.worker.xpu_model_runner import XPUModelRunner
 
@@ -36,41 +36,17 @@ class XPUWorker(Worker):
         assert device_config.device_type == "xpu"
         assert current_platform.is_xpu()
 
-        # Torch profiler. Enabled and configured through env vars:
-        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
+        # Torch profiler. Enabled and configured through profiler_config.
         self.profiler: Any | None = None
-        if envs.VLLM_TORCH_PROFILER_DIR:
-            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+        profiler_config = vllm_config.profiler_config
+        if profiler_config.profiler == "torch":
             worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
-            logger.info(
-                "Profiling enabled. Traces will be saved to: %s",
-                torch_profiler_trace_dir,
+            self.profiler = TorchProfilerWrapper(
+                profiler_config,
+                worker_name=worker_name,
+                local_rank=self.local_rank,
+                activities=["CPU", "XPU"],
             )
-            logger.debug(
-                "Profiler config: record_shapes=%s,"
-                "profile_memory=%s,with_stack=%s,with_flops=%s",
-                envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-                envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-                envs.VLLM_TORCH_PROFILER_WITH_STACK,
-                envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
-            )
-            self.profiler = torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.XPU,
-                ],
-                record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-                profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
-                with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    torch_profiler_trace_dir,
-                    worker_name=worker_name,
-                    use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
-                ),
-            )
-        else:
-            self.profiler = None
 
     # we provide this function due to `torch.xpu.mem_get_info()` doesn't
     # return correct free_gpu_memory on intel client GPU. We need to
-- 
GitLab


From 95501a70ec69d182b124774ff708c3050ab4e91e Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 9 Dec 2025 13:51:19 -0500
Subject: [PATCH 242/258] [BugFix] Fix DeepSeek-R1 hang with DP and MTP
 (#30119)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/v1/worker/gpu_model_runner.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 766c2acd0..7398defd7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -4168,10 +4168,19 @@ class GPUModelRunner(
 
             if self.speculative_config and self.speculative_config.use_eagle():
                 assert isinstance(self.drafter, EagleProposer)
+                # Eagle currently only supports PIECEWISE cudagraphs.
+                # Therefore only use cudagraphs if the main model uses PIECEWISE
+                # NOTE(lucas): this is a hack, need to clean up.
                 use_cudagraphs = (
-                    cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE)
-                    and not self.speculative_config.enforce_eager
-                )
+                    (
+                        is_graph_capturing
+                        and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+                    )
+                    or (
+                        not is_graph_capturing
+                        and cudagraph_runtime_mode != CUDAGraphMode.NONE
+                    )
+                ) and not self.speculative_config.enforce_eager
 
                 # Note(gnovack) - We need to disable cudagraphs for one of the two
                 # lora cases when cudagraph_specialize_lora is enabled. This is a
-- 
GitLab


From b37bf51e7594133772f4f06446dd5aa08aa5be0e Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 9 Dec 2025 13:52:20 -0500
Subject: [PATCH 243/258] [CI/Test] Fix FP8 per-tensor quant test reference
 scale shape (#30352)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 tests/kernels/quant_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 830d43569..e29f66dca 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant(
         .clamp(fp8_traits_min, fp8_traits_max)
         .to(FP8_DTYPE)
     )
-    return ref_out, ref_scale.view((1, 1))
+    return ref_out, ref_scale.view(1)
 
 
 def native_w8a8_block_matmul(
-- 
GitLab


From 73a484caa1ad320d6e695f098c25c479a71e6774 Mon Sep 17 00:00:00 2001
From: Tsukasa OI <floss_llm@irq.a4lg.com>
Date: Wed, 10 Dec 2025 04:13:10 +0900
Subject: [PATCH 244/258] [Model][Quantization] Fix / Add GGUF support for
 Qwen2 MoE models (#30307)

Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
---
 vllm/model_executor/models/qwen2_moe.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 5a4287400..cbc618f1a 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module):
                             continue
                         else:
                             name = remapped_kv_scale_name
+                    # GGUF: make sure that shared_expert_gate is a 2D tensor.
+                    if (
+                        "mlp.shared_expert_gate" in name
+                        and len(loaded_weight.shape) == 1
+                    ):
+                        loaded_weight = loaded_weight[None, :]
                     param = params_dict[name]
                     weight_loader = getattr(
                         param, "weight_loader", default_weight_loader
-- 
GitLab


From 7cab92fd45ce6ad7fd78d705ef31b12d3beebb4b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 9 Dec 2025 20:03:16 +0000
Subject: [PATCH 245/258] Bump actions/checkout from 6.0.0 to 6.0.1 (#30233)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/cleanup_pr_body.yml  | 2 +-
 .github/workflows/macos-smoke-test.yml | 2 +-
 .github/workflows/pre-commit.yml       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 56fbe5ca7..df8910837 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,7 +13,7 @@ jobs:
 
     steps:
       - name: Checkout repository
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
 
       - name: Set up Python
         uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index 3a12c4b3a..e80a5c0cc 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
     timeout-minutes: 30
 
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v6.0.1
 
       - uses: astral-sh/setup-uv@v7
         with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index a03b979ad..1041653c2 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
     - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
       with:
         python-version: "3.12"
-- 
GitLab


From f8dacc66b69bfbd9a9addf542572487035f0a1db Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 9 Dec 2025 20:12:14 +0000
Subject: [PATCH 246/258] Bump actions/stale from 10.1.0 to 10.1.1 (#30234)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/stale.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index c8a52f1a6..44bf71db5 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -15,7 +15,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
-- 
GitLab


From 7618dc973dd1e56a46162bc7bd6e7625143bead0 Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Tue, 9 Dec 2025 14:18:17 -0600
Subject: [PATCH 247/258] [CI/Build] Make test_mha_attn.py run on correct
 platform only and check for flash_attn_varlen_func in layer.py (#29145)

---
 tests/kernels/attention/test_mha_attn.py | 11 +++++++++--
 vllm/attention/layer.py                  |  5 ++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index ae3c63cc6..639abdf6f 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -26,7 +26,14 @@ def clear_cache():
     _cached_get_attn_backend.cache_clear()
 
 
-@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
+devices = ["cpu"]
+if current_platform.is_cuda():
+    devices.append("cuda")
+if current_platform.is_rocm():
+    devices.append("hip")
+
+
+@pytest.mark.parametrize("device", devices)
 def test_mha_attn_platform(device: str):
     """
     Test the attention selector between different platform and device.
@@ -46,7 +53,7 @@ def test_mha_attn_platform(device: str):
             patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
         ):
             attn = MultiHeadAttention(16, 64, scale=1)
-            assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
+            assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
     else:
         # Test CUDA with head_size=64 (divisible by 32)
         # - should use vLLM's FlashAttention
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 7e5adfe07..c77fc0fad 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -89,7 +89,10 @@ def maybe_get_vit_flash_attn_backend(
         if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
             from aiter import flash_attn_varlen_func
         else:
-            from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+            try:
+                from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+            except ImportError:
+                flash_attn_varlen_func = None
     else:
         flash_attn_varlen_func = None
 
-- 
GitLab


From 00e5cbb96789fd9d083b4015cdcff318e4de3808 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Tue, 9 Dec 2025 16:48:25 -0500
Subject: [PATCH 248/258] [MoE][Refactor] Remove most arguments to
 FusedMoEMethodBase.apply (#29066)

Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
---
 .../layers/fused_moe/__init__.py              |   6 +-
 .../layers/fused_moe/fused_moe_method_base.py |  18 --
 .../fused_moe/fused_moe_modular_method.py     |  26 +-
 vllm/model_executor/layers/fused_moe/layer.py |  74 ++---
 .../fused_moe/unquantized_fused_moe_method.py | 224 +++++----------
 .../layers/quantization/awq_marlin.py         |  26 +-
 .../layers/quantization/bitsandbytes.py       |  26 +-
 .../compressed_tensors_moe.py                 | 254 +++++-------------
 .../layers/quantization/experts_int8.py       |  26 +-
 .../model_executor/layers/quantization/fp8.py | 104 +++----
 .../layers/quantization/gguf.py               |  25 +-
 .../layers/quantization/gptq_marlin.py        |  26 +-
 .../layers/quantization/ipex_quant.py         |  30 +--
 .../layers/quantization/modelopt.py           | 106 +++-----
 .../layers/quantization/moe_wna16.py          |  26 +-
 .../layers/quantization/mxfp4.py              |  97 +++----
 .../layers/quantization/quark/quark_moe.py    |  72 ++---
 .../model_executor/layers/quantization/rtn.py |  24 +-
 18 files changed, 318 insertions(+), 872 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 1e145a8fc..d71cfc5ad 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -4,7 +4,10 @@
 from contextlib import contextmanager
 from typing import Any
 
-from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    RoutingMethodType,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
@@ -49,6 +52,7 @@ __all__ = [
     "FusedMoEPermuteExpertsUnpermute",
     "FusedMoEActivationFormat",
     "FusedMoEPrepareAndFinalize",
+    "RoutingMethodType",
     "SharedFusedMoE",
     "activation_without_mul",
     "override_config",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
index ef7090c34..8c9d8a277 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
-from collections.abc import Callable
 
 import torch
 
@@ -100,22 +99,5 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
index b33e7fd8a..1947423bf 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 
 import torch
 
@@ -97,23 +96,6 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         topk_weights, topk_ids, zero_expert_result = layer.select_experts(
             hidden_states=x,
@@ -127,10 +109,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=self.allow_inplace,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            expert_map=None if self.disable_expert_map else expert_map,
+            activation=layer.activation,
+            global_num_experts=layer.global_num_experts,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            expert_map=None if self.disable_expert_map else layer.expert_map,
         )
 
         if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 5df348609..e63538206 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -33,10 +33,6 @@ from vllm.model_executor.layers.fused_moe.config import (
     RoutingMethodType,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEPermuteExpertsUnpermute,
-    FusedMoEPrepareAndFinalize,
-)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     init_aiter_topK_meta_data,
 )
@@ -57,11 +53,8 @@ from vllm.utils.torch_utils import (
 from vllm.v1.worker.ubatching import dbo_current_ubatch_id
 
 if current_platform.is_cuda_alike():
-    from .fused_moe import eplb_map_to_physical_and_record, fused_experts
+    from .fused_moe import eplb_map_to_physical_and_record
 else:
-    fused_experts = None  # type: ignore
-    FusedMoEPermuteExpertsUnpermute = object  # type: ignore
-    FusedMoEPrepareAndFinalize = object  # type: ignore
 
     def _eplb_map_to_physical_and_record(
         topk_ids: torch.Tensor,
@@ -483,7 +476,7 @@ class FusedMoE(CustomOp):
                 enable_eplb=self.enable_eplb,
             )
 
-            self.expert_map: torch.Tensor | None
+            self._expert_map: torch.Tensor | None
             local_num_experts, expert_map, expert_mask = determine_expert_map(
                 ep_size=self.ep_size,
                 ep_rank=self.ep_rank,
@@ -493,7 +486,7 @@ class FusedMoE(CustomOp):
                 return_expert_mask=self.rocm_aiter_fmoe_enabled,
             )
             self.local_num_experts = local_num_experts
-            self.register_buffer("expert_map", expert_map)
+            self.register_buffer("_expert_map", expert_map)
             self.register_buffer("expert_mask", expert_mask)
             self._maybe_init_expert_routing_tables()
             logger.info_once(
@@ -506,10 +499,10 @@ class FusedMoE(CustomOp):
                 self.expert_placement_strategy,
                 self.local_num_experts,
                 self.global_num_experts,
-                get_compressed_expert_map(self.expert_map),
+                get_compressed_expert_map(self._expert_map),
             )
         else:
-            self.local_num_experts, self.expert_map, self.expert_mask = (
+            self.local_num_experts, self._expert_map, self.expert_mask = (
                 self.global_num_experts,
                 None,
                 None,
@@ -781,7 +774,7 @@ class FusedMoE(CustomOp):
                 ),
             )
 
-        if self.expert_map is None:
+        if self._expert_map is None:
             return None
 
         routing_tables = self.ensure_round_robin_expert_routing_tables(
@@ -789,7 +782,7 @@ class FusedMoE(CustomOp):
             ep_size=self.ep_size,
             ep_rank=self.ep_rank,
             local_num_experts=self.local_num_experts,
-            device=self.expert_map.device,
+            device=self._expert_map.device,
         )
 
         global_to_physical, physical_to_global, local_global = routing_tables
@@ -840,8 +833,8 @@ class FusedMoE(CustomOp):
 
     def update_expert_map(self):
         # ep_size and ep_rank should already be updated
-        assert self.expert_map is not None
-        with self.expert_map.device:
+        assert self._expert_map is not None
+        with self._expert_map.device:
             local_num_experts, expert_map, expert_mask = determine_expert_map(
                 ep_size=self.ep_size,
                 ep_rank=self.ep_rank,
@@ -851,7 +844,7 @@ class FusedMoE(CustomOp):
                 return_expert_mask=self.rocm_aiter_fmoe_enabled,
             )
             self.local_num_experts = local_num_experts
-            self.register_buffer("expert_map", expert_map)
+            self.register_buffer("_expert_map", expert_map)
             self.register_buffer("expert_mask", expert_mask)
             self._maybe_init_expert_routing_tables()
             if self.aiter_fmoe_shared_expert_enabled:
@@ -1068,9 +1061,9 @@ class FusedMoE(CustomOp):
             expert_data.copy_(loaded_weight)
 
     def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
-        if self.expert_map is None:
+        if self._expert_map is None:
             return expert_id
-        return self.expert_map[expert_id].item()
+        return self._expert_map[expert_id].item()
 
     def _init_aiter_shared_experts_topK_buffer(
         self, vllm_config: VllmConfig, dp_size: int
@@ -1744,6 +1737,12 @@ class FusedMoE(CustomOp):
                 reduce_output(fused_output)[..., :og_hidden_states],
             )
 
+    @property
+    def expert_map(self) -> torch.Tensor | None:
+        return (
+            self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask
+        )
+
     def forward_cuda(
         self,
         hidden_states: torch.Tensor,
@@ -1805,24 +1804,6 @@ class FusedMoE(CustomOp):
                 layer=self,
                 x=staged_hidden_states,
                 router_logits=staged_router_logits,
-                top_k=self.top_k,
-                renormalize=self.renormalize,
-                use_grouped_topk=self.use_grouped_topk,
-                global_num_experts=self.global_num_experts,
-                expert_map=self.expert_map
-                if not self.rocm_aiter_fmoe_enabled
-                else self.expert_mask,
-                topk_group=self.topk_group,
-                num_expert_group=self.num_expert_group,
-                custom_routing_function=self.custom_routing_function,
-                scoring_func=self.scoring_func,
-                routed_scaling_factor=self.routed_scaling_factor,
-                e_score_correction_bias=self.e_score_correction_bias,
-                activation=self.activation,
-                enable_eplb=self.enable_eplb,
-                expert_load_view=self.expert_load_view,
-                logical_to_physical_map=self.logical_to_physical_map,
-                logical_replica_count=self.logical_replica_count,
             )
 
             if has_separate_shared_experts:
@@ -1968,25 +1949,6 @@ class FusedMoE(CustomOp):
                 if do_naive_dispatch_combine
                 else hidden_states,
                 router_logits=router_logits,
-                top_k=self.top_k,
-                renormalize=self.renormalize,
-                use_grouped_topk=self.use_grouped_topk,
-                global_num_experts=self.global_num_experts,
-                expert_map=self.expert_map
-                if not self.rocm_aiter_fmoe_enabled
-                else self.expert_mask,
-                topk_group=self.topk_group,
-                num_expert_group=self.num_expert_group,
-                custom_routing_function=self.custom_routing_function,
-                scoring_func=self.scoring_func,
-                routed_scaling_factor=self.routed_scaling_factor,
-                e_score_correction_bias=self.e_score_correction_bias,
-                activation=self.activation,
-                apply_router_weight_on_input=self.apply_router_weight_on_input,
-                enable_eplb=self.enable_eplb,
-                expert_load_view=self.expert_load_view,
-                logical_to_physical_map=self.logical_to_physical_map,
-                logical_replica_count=self.logical_replica_count,
             )
 
             if has_separate_shared_experts:
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 48e5a8907..6182f10aa 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 
 import torch
 import torch.nn.functional as F
@@ -269,53 +268,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
 
     def apply(
         self,
-        layer: torch.nn.Module,
+        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if enable_eplb:
-            assert expert_load_view is not None
-            assert logical_to_physical_map is not None
-            assert logical_replica_count is not None
-
         return self.forward(
-            x=x,
             layer=layer,
+            x=x,
             router_logits=router_logits,
-            top_k=top_k,
-            renormalize=renormalize,
-            use_grouped_topk=use_grouped_topk,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            enable_eplb=enable_eplb,
-            expert_load_view=expert_load_view,
-            logical_to_physical_map=logical_to_physical_map,
-            logical_replica_count=logical_replica_count,
         )
 
     def get_fused_moe_quant_config(
@@ -333,24 +293,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
         x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
         router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         topk_weights, topk_ids, zero_expert_result = layer.select_experts(
             hidden_states=x,
@@ -364,9 +307,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 w2=layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                expert_map=expert_map,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                expert_map=layer.expert_map,
+                activation=layer.activation,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         elif self.flashinfer_cutlass_moe_enabled:
             return self.flashinfer_cutlass_moe(
@@ -375,8 +318,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 w2=layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                activation=layer.activation,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
             result = fused_experts(
@@ -386,11 +329,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 inplace=True,
-                activation=activation,
+                activation=layer.activation,
                 quant_config=self.moe_quant_config,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
             )
 
         if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
@@ -405,148 +348,101 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
         x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
         router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if (
-            enable_eplb is not False
-            or expert_load_view is not None
-            or logical_to_physical_map is not None
-            or logical_replica_count is not None
+            layer.enable_eplb is not False
+            or layer.expert_load_view is not None
+            or layer.logical_to_physical_map is not None
+            or layer.logical_replica_count is not None
         ):
             raise NotImplementedError("Expert load balancing is not supported for CPU.")
+
         return layer.cpu_fused_moe(
             layer,
             x,
-            use_grouped_topk,
-            top_k,
+            layer.use_grouped_topk,
+            layer.top_k,
             router_logits,
-            renormalize,
-            topk_group,
-            num_expert_group,
-            global_num_experts,
-            expert_map,
-            custom_routing_function,
-            scoring_func,
-            routed_scaling_factor,
-            e_score_correction_bias,
-            apply_router_weight_on_input,
-            activation,
+            layer.renormalize,
+            layer.topk_group,
+            layer.num_expert_group,
+            layer.global_num_experts,
+            layer.expert_map,
+            layer.custom_routing_function,
+            layer.scoring_func,
+            layer.routed_scaling_factor,
+            layer.e_score_correction_bias,
+            layer.apply_router_weight_on_input,
+            layer.activation,
         )
 
     def forward_xpu(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
         x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
         router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if (
-            enable_eplb is not False
-            or expert_load_view is not None
-            or logical_to_physical_map is not None
-            or logical_replica_count is not None
+            layer.enable_eplb is not False
+            or layer.expert_load_view is not None
+            or layer.logical_to_physical_map is not None
+            or layer.logical_replica_count is not None
         ):
             raise NotImplementedError("Expert load balancing is not supported for XPU.")
         return layer.ipex_fusion(
             x,
-            use_grouped_topk,
-            top_k,
+            layer.use_grouped_topk,
+            layer.top_k,
             router_logits,
-            renormalize,
-            topk_group,
-            num_expert_group,
-            custom_routing_function=custom_routing_function,
+            layer.renormalize,
+            layer.topk_group,
+            layer.num_expert_group,
+            custom_routing_function=layer.custom_routing_function,
         )
 
     def forward_tpu(
         self,
         layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
         x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
         router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert not use_grouped_topk
-        assert num_expert_group is None
-        assert topk_group is None
-        assert custom_routing_function is None
-        assert apply_router_weight_on_input is False
-        if scoring_func != "softmax":
+        assert not layer.use_grouped_topk
+        assert layer.num_expert_group is None
+        assert layer.topk_group is None
+        assert layer.custom_routing_function is None
+        assert layer.apply_router_weight_on_input is False
+        if layer.scoring_func != "softmax":
             raise NotImplementedError(
                 "Only softmax scoring function is supported for TPU."
             )
-        if e_score_correction_bias is not None:
+        if layer.e_score_correction_bias is not None:
             raise NotImplementedError(
                 "Expert score correction bias is not supported for TPU."
             )
-        assert activation == "silu", f"{activation} is not supported for TPU."
-        assert routed_scaling_factor == 1.0, (
-            f"routed_scaling_factor {routed_scaling_factor} is not supported for TPU."
+        assert layer.activation == "silu", (
+            f"{layer.activation} is not supported for TPU."
+        )
+        assert layer.routed_scaling_factor == 1.0, (
+            f"routed_scaling_factor {layer.routed_scaling_factor} is "
+            "not supported for TPU."
         )
         if (
-            enable_eplb is not False
-            or expert_load_view is not None
-            or logical_to_physical_map is not None
-            or logical_replica_count is not None
+            layer.enable_eplb is not False
+            or layer.expert_load_view is not None
+            or layer.logical_to_physical_map is not None
+            or layer.logical_replica_count is not None
         ):
             raise NotImplementedError("Expert load balancing is not supported for TPU.")
         return fused_moe_pallas(
             hidden_states=x,
             w1=layer.w13_weight,
             w2=layer.w2_weight,
-            topk=top_k,
+            topk=layer.top_k,
             gating_output=router_logits,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            renormalize=renormalize,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
+            renormalize=layer.renormalize,
         )
 
     if current_platform.is_tpu():
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index d463e181f..16aa4f1e2 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from typing import TYPE_CHECKING, Any, Optional
 
 import torch
@@ -669,25 +668,8 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert activation == "silu", "Only SiLU activation is supported."
+        assert layer.activation == "silu", "Only SiLU activation is supported."
 
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
@@ -708,9 +690,9 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
             input_global_scale1=getattr(layer, "w13_input_global_scale", None),
             input_global_scale2=getattr(layer, "w2_input_global_scale", None),
             quant_type_id=self.quant_type.id,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             w1_zeros=layer.w13_qzeros,
             w2_zeros=layer.w2_qzeros,
             workspace=layer.workspace,
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 1e57fa218..1fd959cb3 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from typing import Any, Union
 
 import torch
@@ -498,23 +497,6 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -534,10 +516,10 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=True,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            activation=layer.activation,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
         )
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 619162272..5ad26f931 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
-from collections.abc import Callable
 from enum import Enum
 
 import torch
@@ -558,31 +557,14 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert activation == "silu", "Only SiLU activation is supported."
+        assert layer.activation == "silu", "Only SiLU activation is supported."
 
         if (
             self.allow_flashinfer
             and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
         ):
-            if enable_eplb:
+            if layer.enable_eplb:
                 raise NotImplementedError(
                     "EPLB not supported for `CompressedTensorsW4A4MoEMethod` yet."
                 )
@@ -591,12 +573,12 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 layer=layer,
                 x=x,
                 router_logits=router_logits,
-                top_k=top_k,
-                global_num_experts=global_num_experts,
-                num_expert_group=num_expert_group,
-                topk_group=topk_group,
-                custom_routing_function=custom_routing_function,
-                e_score_correction_bias=e_score_correction_bias,
+                top_k=layer.top_k,
+                global_num_experts=layer.global_num_experts,
+                num_expert_group=layer.num_expert_group,
+                topk_group=layer.topk_group,
+                custom_routing_function=layer.custom_routing_function,
+                e_score_correction_bias=layer.e_score_correction_bias,
             )
 
         topk_weights, topk_ids, _ = layer.select_experts(
@@ -619,9 +601,9 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 global_scale1=layer.w13_weight_scale_2,
                 global_scale2=layer.w2_weight_scale_2,
                 quant_type_id=scalar_types.float4_e2m1f.id,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
                 input_dtype=self.marlin_input_dtype,
                 workspace=layer.workspace,
             )
@@ -646,15 +628,15 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 topk_ids=topk_ids,
                 quant_config=self.moe_quant_config,
                 inplace=False,  # TODO(shuw): fix later, now output is high prec
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                activation=layer.activation,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
             from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 
-            assert expert_map is None, (
+            assert layer.expert_map is None, (
                 "Expert Parallelism / expert_map "
                 "is currently not supported for "
                 "CompressedTensorsW4A4Nvfp4MoEMethod."
@@ -670,7 +652,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 quant_config=self.moe_quant_config,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 # TODO(bnell): derive these from arguments
                 m=x.shape[0],
                 n=layer.w2_weight.shape[2] * 2,
@@ -1188,23 +1170,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
@@ -1215,7 +1180,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
 
         if self.use_marlin:
-            assert activation == "silu", f"{activation} not supported for Marlin MoE."
+            assert layer.activation == "silu", (
+                f"{layer.activation} not supported for Marlin MoE."
+            )
             return fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1228,9 +1195,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 topk_weights,
                 topk_ids,
                 quant_type_id=scalar_types.float8_e4m3fn.id,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
                 input_dtype=self.marlin_input_dtype,
                 workspace=layer.workspace,
             )
@@ -1248,9 +1215,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 w2=layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
+                activation=layer.activation,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
             )
 
@@ -1270,10 +1237,12 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                     topk_weights=topk_weights,
                     topk_ids=topk_ids,
                     inplace=True,
-                    activation=activation,
-                    apply_router_weight_on_input=apply_router_weight_on_input,
-                    global_num_experts=global_num_experts,
-                    expert_map=None if self.disable_expert_map else expert_map,
+                    activation=layer.activation,
+                    apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                    global_num_experts=layer.global_num_experts,
+                    expert_map=None
+                    if self.disable_expert_map
+                    else layer.expert_map,  # ???
                     quant_config=self.moe_quant_config,
                 )
             else:
@@ -1290,9 +1259,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                     topk_weights,
                     topk_ids,
                     quant_config=self.moe_quant_config,
-                    activation=activation,
-                    global_num_experts=global_num_experts,
-                    expert_map=None if self.disable_expert_map else expert_map,
+                    activation=layer.activation,
+                    global_num_experts=layer.global_num_experts,
+                    expert_map=None if self.disable_expert_map else layer.expert_map,
                     ab_strides1=self.ab_strides1_c_strides2,
                     ab_strides2=self.ab_strides2,
                     c_strides1=self.c_strides1,
@@ -1314,10 +1283,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 inplace=True,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                activation=layer.activation,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
             )
 
@@ -1437,23 +1406,6 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -1469,10 +1421,10 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=True,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            activation=layer.activation,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
         )
 
@@ -1814,25 +1766,10 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert activation == "silu", f"{activation} not supported for Marlin MoE."
+        assert layer.activation == "silu", (
+            f"{layer.activation} not supported for Marlin MoE."
+        )
 
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
@@ -1853,9 +1790,9 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             input_global_scale1=getattr(layer, "w13_input_global_scale", None),
             input_global_scale2=getattr(layer, "w2_input_global_scale", None),
             quant_type_id=self.quant_type.id,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             g_idx1=layer.w13_weight_g_idx,
             g_idx2=layer.w2_weight_g_idx,
             sort_indices1=layer.w13_g_idx_sort_indices,
@@ -2057,23 +1994,6 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -2089,10 +2009,10 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=True,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            activation=layer.activation,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
         )
 
@@ -2372,32 +2292,15 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
 
     def apply(
         self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet."
-        assert activation in ("silu", "swigluoai", "swiglu"), (
+        assert not layer.enable_eplb, "EPLB not supported for W4A8-int MoE yet."
+        assert layer.activation in ("silu", "swigluoai", "swiglu"), (
             "Only SiLU/SwiGLUGU/SwiGLUUG are supported."
         )
-        assert expert_map is None, """expert_map/EP not implemented
+        assert layer.expert_map is None, """expert_map/EP not implemented
         for CPU dyn-4bit MoE."""
 
         def _act_kind(s: str) -> int:
@@ -2414,15 +2317,9 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         topk_weights, topk_ids = select_experts(
             hidden_states=x,
             router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias,
+            top_k=layer.top_k,
+            use_grouped_topk=layer.use_grouped_topk,
+            renormalize=layer.renormalize,
         )
 
         return torch.ops._C.dynamic_4bit_int_moe(
@@ -2435,8 +2332,8 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
             layer.w2_in_features,
             layer.w13_out_features,
             layer.group_size,
-            apply_router_weight_on_input,
-            int(_act_kind(activation)),
+            layer.apply_router_weight_on_input,
+            int(_act_kind(layer.activation)),
         )
 
 
@@ -2707,28 +2604,11 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
     def apply(
         self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ):
-        if enable_eplb:
+        if layer.enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
             )
@@ -2749,9 +2629,9 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             topk_weights,
             topk_ids,
             quant_config=self.moe_quant_config,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            expert_map=None if self.disable_expert_map else expert_map,
+            activation=layer.activation,
+            global_num_experts=layer.global_num_experts,
+            expert_map=None if self.disable_expert_map else layer.expert_map,
             a_strides1=self.a_strides1_c_strides2,
             a_strides2=self.a_strides2,
             b_strides1=self.b_strides1,
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 7ebe40ec8..11097cf36 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from typing import Any, Optional
 
 import torch
@@ -140,23 +139,6 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -172,10 +154,10 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=True,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            activation=layer.activation,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
         )
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 419ddd91b..8567f64b9 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from enum import Enum
 from functools import partial
 from typing import TYPE_CHECKING, Any, Optional
@@ -1242,41 +1241,20 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if enable_eplb:
-            assert expert_load_view is not None
-            assert logical_to_physical_map is not None
-            assert logical_replica_count is not None
-            assert isinstance(layer, FusedMoE)
-
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
-            assert activation == "silu", (
-                f"Expected 'silu' activation but got {activation}"
+            if layer.enable_eplb:
+                raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.")
+            assert layer.activation == "silu", (
+                f"Expected 'silu' activation but got {layer.activation}"
             )
 
             if self.block_quant:
                 import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe  # noqa: E501, F401
 
                 e_score_correction_bias = (
-                    e_score_correction_bias.to(x.dtype)
-                    if e_score_correction_bias is not None
+                    layer.e_score_correction_bias.to(x.dtype)
+                    if layer.e_score_correction_bias is not None
                     else None
                 )
                 routing_method_type = layer.routing_method_type
@@ -1290,29 +1268,31 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     w13_weight_scale_inv=layer.w13_weight_scale_inv,
                     w2_weight=layer.w2_weight,
                     w2_weight_scale_inv=layer.w2_weight_scale_inv,
-                    global_num_experts=global_num_experts,
-                    top_k=top_k,
-                    num_expert_group=num_expert_group,
-                    topk_group=topk_group,
+                    global_num_experts=layer.global_num_experts,
+                    top_k=layer.top_k,
+                    num_expert_group=layer.num_expert_group,
+                    topk_group=layer.topk_group,
                     intermediate_size=layer.intermediate_size_per_partition,
                     expert_offset=layer.ep_rank * layer.local_num_experts,
                     local_num_experts=layer.local_num_experts,
                     block_shape=self.weight_block_size,
                     routing_method_type=routing_method_type,
-                    routed_scaling=routed_scaling_factor,
+                    routed_scaling=layer.routed_scaling_factor,
                 )
             else:
-                assert not renormalize and custom_routing_function is not None
+                assert (
+                    not layer.renormalize and layer.custom_routing_function is not None
+                )
                 result = apply_flashinfer_per_tensor_scale_fp8(
                     layer=layer,
                     hidden_states=x,
                     router_logits=router_logits,
-                    routing_bias=e_score_correction_bias,
-                    global_num_experts=global_num_experts,
-                    top_k=top_k,
-                    num_expert_group=num_expert_group,
-                    topk_group=topk_group,
-                    apply_router_weight_on_input=apply_router_weight_on_input,
+                    routing_bias=layer.e_score_correction_bias,
+                    global_num_experts=layer.global_num_experts,
+                    top_k=layer.top_k,
+                    num_expert_group=layer.num_expert_group,
+                    topk_group=layer.topk_group,
+                    apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 )
 
         select_result = layer.select_experts(
@@ -1333,13 +1313,15 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
+                activation=layer.activation,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
             )
         elif self.use_marlin:
-            assert activation == "silu", f"{activation} not supported for Marlin MoE."
+            assert layer.activation == "silu", (
+                f"{layer.activation} not supported for Marlin MoE."
+            )
             result = fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1352,20 +1334,22 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 topk_weights,
                 topk_ids,
                 quant_type_id=scalar_types.float8_e4m3fn.id,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
                 input_dtype=self.marlin_input_dtype,
                 workspace=layer.workspace,
             )
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            assert activation == "silu", (
-                f"Expected 'silu' activation but got {activation}"
+            assert layer.activation == "silu", (
+                f"Expected 'silu' activation but got {layer.activation}"
             )
             if not self.block_quant:
-                assert not renormalize and custom_routing_function is not None
-                assert scoring_func == "sigmoid", (
-                    f"Expected 'sigmoid' scoring func but got {scoring_func}"
+                assert (
+                    not layer.renormalize and layer.custom_routing_function is not None
+                )
+                assert layer.scoring_func == "sigmoid", (
+                    f"Expected 'sigmoid' scoring func but got {layer.scoring_func}"
                 )
             # Delegate to CUTLASS FlashInfer path; function already bound with
             # use_deepseek_fp8_block_scale for block-quant when applicable
@@ -1375,10 +1359,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 topk_weights,
                 topk_ids,
                 inplace=False,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                activation=layer.activation,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
@@ -1390,10 +1374,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 inplace=True,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
+                activation=layer.activation,
+                global_num_experts=layer.global_num_experts,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
                 allow_deep_gemm=self.allow_deep_gemm,
                 allow_cutlass_block_scaled_grouped_gemm=(
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index ee819df29..13aa2bcad 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable, Mapping
+from collections.abc import Mapping
 from types import MappingProxyType
 from typing import Any, Optional
 
@@ -625,26 +625,9 @@ class GGUFMoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert activation == "silu", "Only SiLU activation is supported."
-        if apply_router_weight_on_input:
+        assert layer.activation == "silu", "Only SiLU activation is supported."
+        if layer.apply_router_weight_on_input:
             raise NotImplementedError(
                 "Apply router weight on input is not supported for"
                 "fused GGUF MoE method."
@@ -662,7 +645,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
             topk_ids,
             layer.w13_qweight_type.weight_type,
             layer.w2_qweight_type.weight_type,
-            activation,
+            layer.activation,
         )
 
 
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 56034e113..8d1715f52 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from copy import deepcopy
 from typing import Any, Optional
 
@@ -790,25 +789,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert activation == "silu", "Only SiLU activation is supported."
+        assert layer.activation == "silu", "Only SiLU activation is supported."
 
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
@@ -829,9 +811,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             input_global_scale1=getattr(layer, "w13_input_global_scale", None),
             input_global_scale2=getattr(layer, "w2_input_global_scale", None),
             quant_type_id=self.quant_type.id,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             g_idx1=layer.w13_g_idx,
             g_idx2=layer.w2_g_idx,
             sort_indices1=layer.w13_g_idx_sort_indices,
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index a1571afba..463c74c1c 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from typing import Any, Optional
 
 import torch
@@ -440,31 +439,14 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
         layer: torch.nn.Module,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return layer.ipex_fusion(
             x,
-            use_grouped_topk,
-            top_k,
+            layer.use_grouped_topk,
+            layer.top_k,
             router_logits,
-            renormalize,
-            topk_group,
-            num_expert_group,
-            custom_routing_function=custom_routing_function,
+            layer.renormalize,
+            layer.topk_group,
+            layer.num_expert_group,
+            custom_routing_function=layer.custom_routing_function,
         )
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 034e97a71..e825cb33c 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from fnmatch import fnmatch
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -707,43 +706,27 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             if layer.enable_eplb:
                 raise NotImplementedError(
                     "EPLB not supported for `ModelOptFp8MoEMethod` yet."
                 )
-            assert activation == "silu", (
-                f"Expected 'silu' activation but got {activation}"
+            assert layer.activation == "silu", (
+                f"Expected 'silu' activation but got {layer.activation}"
             )
-            assert not renormalize
+
+            assert not layer.renormalize
             return apply_flashinfer_per_tensor_scale_fp8(
                 layer=layer,
                 hidden_states=x,
                 router_logits=router_logits,
-                routing_bias=e_score_correction_bias,
-                global_num_experts=global_num_experts,
-                top_k=top_k,
-                num_expert_group=num_expert_group,
-                topk_group=topk_group,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                routing_bias=layer.e_score_correction_bias,
+                global_num_experts=layer.global_num_experts,
+                top_k=layer.top_k,
+                num_expert_group=layer.num_expert_group,
+                topk_group=layer.topk_group,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
 
         # Expert selection
@@ -753,9 +736,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         )
 
         if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            assert activation in ("silu", "relu2_no_mul"), (
+            assert layer.activation in ("silu", "relu2_no_mul"), (
                 "Expected activation to be in ('silu', 'relu2_no_mul'),"
-                f"but got {activation}"
+                f"but got {layer.activation}"
             )
             return flashinfer_cutlass_moe_fp8(
                 x,
@@ -763,10 +746,10 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
                 topk_weights,
                 topk_ids,
                 inplace=False,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                activation=layer.activation,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
             from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
@@ -780,11 +763,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 inplace=True,
-                activation=activation,
+                activation=layer.activation,
                 quant_config=self.moe_quant_config,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
 
 
@@ -1504,23 +1487,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if not self.moe.is_act_and_mul:
             assert (
@@ -1535,7 +1501,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             self.allow_flashinfer
             and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
         ):
-            if enable_eplb:
+            if layer.enable_eplb:
                 raise NotImplementedError(
                     "EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
                 )
@@ -1543,12 +1509,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 layer=layer,
                 x=x,
                 router_logits=router_logits,
-                top_k=top_k,
-                global_num_experts=global_num_experts,
-                num_expert_group=num_expert_group,
-                topk_group=topk_group,
-                custom_routing_function=custom_routing_function,
-                e_score_correction_bias=e_score_correction_bias,
+                top_k=layer.top_k,
+                global_num_experts=layer.global_num_experts,
+                num_expert_group=layer.num_expert_group,
+                topk_group=layer.topk_group,
+                custom_routing_function=layer.custom_routing_function,
+                e_score_correction_bias=layer.e_score_correction_bias,
             )
 
         topk_weights, topk_ids, _ = layer.select_experts(
@@ -1571,9 +1537,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 global_scale1=layer.w13_weight_scale_2,
                 global_scale2=layer.w2_weight_scale_2,
                 quant_type_id=scalar_types.float4_e2m1f.id,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
                 input_dtype=self.marlin_input_dtype,
             )
 
@@ -1604,10 +1570,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 topk_ids=topk_ids,
                 quant_config=self.moe_quant_config,
                 inplace=False,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                activation=layer.activation,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
             # If no modular kernel is provided, use cutlass_moe_fp4 for TP case
@@ -1622,8 +1588,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 quant_config=self.moe_quant_config,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                expert_map=layer.expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 # TODO: derive from arguments
                 m=x.shape[0],
                 n=layer.w2_weight.shape[2] * 2,
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 8570b8c33..0131a330f 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from typing import Any, Optional
 
 import torch
@@ -362,27 +361,10 @@ class MoeWNA16Method(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        assert activation == "silu", "Only SiLU activation is supported."
+        assert layer.activation == "silu", "Only SiLU activation is supported."
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -395,9 +377,9 @@ class MoeWNA16Method(FusedMoEMethodBase):
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=True,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
         )
 
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 5d330e837..6eae4e9e6 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Callable
 from enum import Enum
 from typing import Optional
 
@@ -892,25 +891,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        if enable_eplb:
+        if layer.enable_eplb:
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
@@ -933,26 +915,26 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 global_scale1=None,
                 global_scale2=None,
                 quant_type_id=scalar_types.float4_e2m1f.id,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                activation=activation,
-                expert_map=expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                activation=layer.activation,
+                expert_map=layer.expert_map,
                 input_dtype=self.marlin_input_dtype,
             )
 
         assert _can_support_mxfp4(
-            use_grouped_topk,
-            topk_group,
-            num_expert_group,
-            expert_map,
-            custom_routing_function,
-            e_score_correction_bias,
-            apply_router_weight_on_input,
-            scoring_func,
-            activation,
-            expert_load_view,
-            logical_to_physical_map,
-            logical_replica_count,
+            layer.use_grouped_topk,
+            layer.topk_group,
+            layer.num_expert_group,
+            layer.expert_map,
+            layer.custom_routing_function,
+            layer.e_score_correction_bias,
+            layer.apply_router_weight_on_input,
+            layer.scoring_func,
+            layer.activation,
+            layer.expert_load_view,
+            layer.logical_to_physical_map,
+            layer.logical_replica_count,
         ), "MXFP4 are not supported with this configuration."
 
         if (
@@ -988,8 +970,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 None,  # output1_scale_scalar
                 None,  # output1_scale_gate_scalar
                 None,  # output2_scale_scalar
-                global_num_experts,
-                top_k,
+                layer.global_num_experts,
+                layer.top_k,
                 None,  # n_group
                 None,  # topk_group
                 self.intermediate_size,  # padded to multiple of 256
@@ -997,7 +979,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 self.num_experts,  # local num experts
                 None,
                 None,
-                1 if renormalize else 0,  # routing_method_type, renormalize
+                1 if layer.renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
                 tune_max_num_tokens=max(self.max_capture_size, 1),
             )[0]
@@ -1081,12 +1063,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
                 gating_output=router_logits,
-                topk=top_k,
-                renormalize=renormalize,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                topk=layer.top_k,
+                renormalize=layer.renormalize,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
             raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
@@ -1138,37 +1120,20 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
         layer: torch.nn.Module,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        assert activation == "swigluoai", (
+        assert layer.activation == "swigluoai", (
             "Only swiglu_oai activation is supported for IPEX MXFP4 MoE"
         )
         hidden_size_pad = round_up(self.original_hidden_size, 128)
         x_pad = torch.nn.functional.pad(x, (0, hidden_size_pad - x.size(-1)))
         hidden_states = layer.ipex_fusion(
             x_pad,
-            use_grouped_topk,
-            top_k,
+            layer.use_grouped_topk,
+            layer.top_k,
             router_logits,
-            renormalize,
-            topk_group,
-            num_expert_group,
+            layer.renormalize,
+            layer.topk_group,
+            layer.num_expert_group,
             activation="swiglu_oai",
         )
         hidden_states = hidden_states[..., : self.original_hidden_size].contiguous()
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 9e2b21343..d84e22d1f 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable
 from typing import Any
 
 import torch
@@ -337,23 +336,6 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
@@ -371,13 +353,15 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 w2=layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
+                activation=layer.activation,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 quant_config=self.moe_quant_config,
-                expert_map=expert_map,
+                expert_map=layer.expert_map,
             )
         elif self.use_marlin:
-            assert activation == "silu", f"{activation} not supported for Marlin MoE."
+            assert layer.activation == "silu", (
+                f"{layer.activation} not supported for Marlin MoE."
+            )
             return fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -390,9 +374,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 topk_weights,
                 topk_ids,
                 quant_type_id=scalar_types.float8_e4m3fn.id,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
             )
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
@@ -404,10 +388,10 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 inplace=True,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
+                activation=layer.activation,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                global_num_experts=layer.global_num_experts,
+                expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
             )
 
@@ -597,23 +581,6 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
@@ -631,9 +598,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
                 layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                activation=activation,
+                activation=layer.activation,
                 quant_config=self.moe_quant_config,
-                expert_map=expert_map,
+                expert_map=layer.expert_map,
             )
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
@@ -645,10 +612,11 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 inplace=True,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
+                activation=layer.activation,
+                global_num_experts=layer.global_num_experts,
+                apply_router_weight_on_input=layer.apply_router_weight_on_input,
+                expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
             )
+
         return out
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index 7b51b8280..b2ecb0b17 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -3,7 +3,6 @@
 # Copyright © 2025, Oracle and/or its affiliates.
 
 import os
-from collections.abc import Callable
 from typing import Any, Optional
 
 import numpy as np
@@ -359,23 +358,6 @@ class RTNMoEMethod(FusedMoEMethodBase):
         layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         topk_weights, topk_ids, _ = layer.select_experts(
             hidden_states=x,
@@ -394,9 +376,9 @@ class RTNMoEMethod(FusedMoEMethodBase):
             topk_weights,
             topk_ids,
             quant_type_id=self.quant_config.quant_type.id,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
             workspace=workspace,
         )
 
-- 
GitLab


From fccd5325874321f34daef0a8ed4d1b15b26ca34e Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Tue, 9 Dec 2025 16:54:32 -0500
Subject: [PATCH 249/258] [Quantization] FP8 Weight Reloading for Quantized RL
 Rollout (#28480)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 tests/quantization/test_fp8.py                |  88 ++++++++++
 .../model_executor/layers/quantization/fp8.py | 151 +++++++++---------
 .../layers/quantization/kv_cache.py           |   7 +
 .../layers/quantization/utils/fp8_utils.py    |   7 +-
 .../quantization/utils/marlin_utils_fp8.py    |  11 +-
 .../layers/quantization/utils/w8a8_utils.py   |   5 +-
 vllm/model_executor/utils.py                  |  25 +++
 7 files changed, 207 insertions(+), 87 deletions(-)

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 7bcac9ad7..622031865 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -10,10 +10,14 @@ import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.quantization.fp8 import (
+    Fp8Config,
     Fp8KVCacheMethod,
     Fp8LinearMethod,
+    Fp8MoEMethod,
 )
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.platforms import current_platform
 
 MODELS = [
@@ -261,3 +265,87 @@ def test_scaled_fp8_quant(dtype) -> None:
             torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), inv_scale_nc, dtype
         ),
     )
+
+
+@pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod])
+# FP8 weight reloading does not support online quantization
+@pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True])  # skip False
+@pytest.mark.parametrize("weight_block_size", [None, [1, 1]])
+# any postprocessing that is applied to the weights such as padding and repacking
+# (excluding device sharding) must also be applied to the reloaded weights
+#
+# this is the case for marlin as well as per-tensor Fp8MoEMethod
+@pytest.mark.parametrize("use_marlin", [False])  # skip True
+def test_fp8_reloading(
+    method_cls, is_checkpoint_fp8_serialized, weight_block_size, use_marlin, dist_init
+):
+    if is_checkpoint_fp8_serialized is False:
+        pytest.skip("FP8 weight reloading does not support online quantization")
+
+    if method_cls is Fp8MoEMethod and weight_block_size is None:
+        pytest.skip(
+            "FP8 Tensor weight reloading does not support fusing w13_weight_scale. "
+            "If this is your use case, consider using a restore function like #26327"
+        )
+
+    with torch.device("cuda:0"):
+        config = Fp8Config(
+            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+            weight_block_size=weight_block_size,
+        )
+
+        if method_cls is Fp8LinearMethod:
+            layer = torch.nn.Linear(1, 1)
+            method = method_cls(config)
+            method.create_weights(
+                layer=layer,
+                input_size_per_partition=1,
+                output_partition_sizes=[1],
+                input_size=1,
+                output_size=1,
+                params_dtype=torch.bfloat16,
+                weight_loader=default_weight_loader,
+            )
+
+        else:
+            layer = FusedMoE(
+                num_experts=1,
+                top_k=1,
+                hidden_size=1,
+                intermediate_size=1,
+            )
+            method = method_cls(config, layer)
+            method.create_weights(
+                layer=layer,
+                num_experts=1,
+                hidden_size=1,
+                intermediate_size_per_partition=1,
+                params_dtype=torch.bfloat16,
+                weight_loader=default_weight_loader,
+            )
+
+        method.use_marlin = use_marlin
+
+    # capture weights format during loading
+    original_metadata = [
+        (name, param.shape, getattr(param, "weight_loader", default_weight_loader))
+        for name, param in layer.named_parameters()
+    ]
+
+    # test loading
+    for name, shape, _ in original_metadata:
+        param = getattr(layer, name)
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, torch.zeros(shape))  # cannot use empty
+
+    method.process_weights_after_loading(layer)
+
+    # test reloading works after loading
+    # assuming that no reshaping occurred
+    for name, shape, original_weight_loader in original_metadata:
+        param = getattr(layer, name)
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        assert weight_loader is original_weight_loader
+        weight_loader(param, torch.zeros(shape))  # cannot use empty
+
+    method.process_weights_after_loading(layer)
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 8567f64b9..60dde9eb5 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -94,7 +94,7 @@ from vllm.model_executor.parameter import (
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 from vllm.utils.deep_gemm import (
@@ -548,46 +548,50 @@ class Fp8LinearMethod(LinearMethodBase):
             assert not self.act_q_static
             size_k_first = False
 
-            weight, weight_scale = process_fp8_weight_block_strategy(
+            weight, weight_scale_inv = process_fp8_weight_block_strategy(
                 layer.weight, layer.weight_scale_inv
             )
-            # Delete the weight_scale_inv parameter to avoid confusion
-            # with the weight_scale parameter
-            del layer.weight_scale_inv
+
+            # Update layer with new values
+            replace_parameter(layer, "weight", weight.data)
+            replace_parameter(layer, "weight_scale_inv", weight_scale_inv.data)
 
         # If checkpoint not serialized fp8, quantize the weights.
-        elif not self.quant_config.is_checkpoint_fp8_serialized:
-            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
-            weight = qweight.t()
+        else:
+            if not self.quant_config.is_checkpoint_fp8_serialized:
+                qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
+                weight = qweight.t()
+
+            # If checkpoint is fp8 per-tensor, handle that there are N scales for N
+            # shards in a fused module
+            else:
+                weight = layer.weight
+                weight_scale = layer.weight_scale
+
+                # If using w8a8, torch._scaled_mm needs per tensor, so
+                # requantize the logical shards as a single weight.
+                if not self.use_marlin:
+                    weight, weight_scale, input_scale = (
+                        process_fp8_weight_tensor_strategy(
+                            weight,
+                            weight_scale,
+                            layer.logical_widths,
+                            getattr(layer, "input_scale", None),
+                        )
+                    )
+                    if self.act_q_static:
+                        assert input_scale is not None
+                        input_scale = input_scale.max()
+                weight = weight.t()
 
-        # If checkpoint is fp8 per-tensor, handle that there are N scales for N
-        # shards in a fused module
+            # Update layer with new values.
+            replace_parameter(layer, "weight", weight.data)
+            replace_parameter(layer, "weight_scale", weight_scale.data)
+
+        if input_scale is not None:
+            replace_parameter(layer, "input_scale", input_scale)
         else:
-            weight = layer.weight
-            weight_scale = layer.weight_scale
-
-            # If using w8a8, torch._scaled_mm needs per tensor, so
-            # requantize the logical shards as a single weight.
-            if not self.use_marlin:
-                weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy(
-                    weight,
-                    weight_scale,
-                    layer.logical_widths,
-                    getattr(layer, "input_scale", None),
-                )
-                if self.act_q_static:
-                    assert input_scale is not None
-                    input_scale = input_scale.max()
-            weight = weight.t()
-
-        # Update layer with new values.
-        layer.weight = Parameter(weight.data, requires_grad=False)
-        layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
-        layer.input_scale = (
-            Parameter(input_scale, requires_grad=False)
-            if input_scale is not None
-            else None
-        )
+            layer.input_scale = None
 
         if self.use_marlin:
             prepare_fp8_layer_for_marlin(
@@ -614,7 +618,7 @@ class Fp8LinearMethod(LinearMethodBase):
                 return self.w8a8_block_fp8_linear.apply(
                     input=x,
                     weight=layer.weight,
-                    weight_scale=layer.weight_scale,
+                    weight_scale=layer.weight_scale_inv,
                     input_scale=layer.input_scale,
                     bias=bias,
                 )
@@ -643,10 +647,15 @@ class Fp8LinearMethod(LinearMethodBase):
                 return torch.nn.functional.linear(x, weight_bf16.t(), bias)
 
         if self.use_marlin:
+            if self.block_quant:
+                weight_scale = layer.weight_scale_inv
+            else:
+                weight_scale = layer.weight_scale
+
             return apply_fp8_marlin_linear(
                 input=x,
                 weight=layer.weight,
-                weight_scale=layer.weight_scale,
+                weight_scale=weight_scale,
                 workspace=layer.workspace,
                 size_n=layer.output_size_per_partition,
                 size_k=layer.input_size_per_partition,
@@ -660,7 +669,7 @@ class Fp8LinearMethod(LinearMethodBase):
             return self.w8a8_block_fp8_linear.apply(
                 input=x,
                 weight=layer.weight,
-                weight_scale=layer.weight_scale,
+                weight_scale=layer.weight_scale_inv,
                 input_scale=layer.input_scale,
                 bias=bias,
             )
@@ -937,22 +946,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 w2_weight_scale_inv = layer.w2_weight_scale_inv
 
             # torch.compile() cannot use Parameter subclasses.
-            layer.w13_weight = Parameter(w13_weight, requires_grad=False)
-            layer.w13_weight_scale_inv = Parameter(
-                w13_weight_scale_inv, requires_grad=False
-            )
-            layer.w2_weight = Parameter(w2_weight, requires_grad=False)
-            layer.w2_weight_scale_inv = Parameter(
-                w2_weight_scale_inv, requires_grad=False
-            )
+            replace_parameter(layer, "w13_weight", w13_weight)
+            replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv)
+            replace_parameter(layer, "w2_weight", w2_weight)
+            replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv)
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
                 shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                     layer.w13_weight.data, layer.w2_weight.data
                 )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+                replace_parameter(layer, "w13_weight", shuffled_w13)
+                replace_parameter(layer, "w2_weight", shuffled_w2)
 
             # DeepGemm scales need to be transposed and aligned. We try to do
             # it ahead of time for performance reasons.
@@ -990,13 +995,14 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
             # Re-initialize w13_scale because we directly quantize
             # merged w13 weights and generate a single scaling factor.
-            layer.w13_weight_scale = torch.nn.Parameter(
+            replace_parameter(
+                layer,
+                "w13_weight_scale",
                 torch.ones(
                     layer.local_num_experts,
                     dtype=torch.float32,
                     device=w13_weight.device,
                 ),
-                requires_grad=False,
             )
             for expert in range(layer.local_num_experts):
                 w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
@@ -1005,16 +1011,17 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
                     ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
                 )
-            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            replace_parameter(layer, "w13_weight", w13_weight)
+            replace_parameter(layer, "w2_weight", w2_weight)
+
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
                 shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                     layer.w13_weight, layer.w2_weight
                 )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+                replace_parameter(layer, "w13_weight", shuffled_w13)
+                replace_parameter(layer, "w2_weight", shuffled_w2)
         # If checkpoint is fp8, we need to handle that the
         # MoE kernels require single activation scale and single weight
         # scale for w13 per expert.
@@ -1035,12 +1042,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                         "fp8 MoE layer. Using the maximum across experts "
                         "for each layer."
                     )
-                layer.w13_input_scale = torch.nn.Parameter(
-                    layer.w13_input_scale.max(), requires_grad=False
-                )
-                layer.w2_input_scale = torch.nn.Parameter(
-                    layer.w2_input_scale.max(), requires_grad=False
-                )
+                replace_parameter(layer, "w13_input_scale", layer.w13_input_scale.max())
+                replace_parameter(layer, "w2_input_scale", layer.w2_input_scale.max())
             if current_platform.is_fp8_fnuz():
                 # Normalize the weights and scales
                 w13_weight, w13_weight_scale, w13_input_scale = (
@@ -1054,22 +1057,14 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     )
                 )
                 # Reset the parameter
-                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-                layer.w13_weight_scale = torch.nn.Parameter(
-                    w13_weight_scale, requires_grad=False
-                )
+                replace_parameter(layer, "w13_weight", w13_weight)
+                replace_parameter(layer, "w13_weight_scale", w13_weight_scale)
                 if w13_input_scale is not None:
-                    layer.w13_input_scale = torch.nn.Parameter(
-                        w13_input_scale, requires_grad=False
-                    )
-                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
-                layer.w2_weight_scale = torch.nn.Parameter(
-                    w2_weight_scale, requires_grad=False
-                )
+                    replace_parameter(layer, "w13_input_scale", w13_input_scale)
+                replace_parameter(layer, "w2_weight", w2_weight)
+                replace_parameter(layer, "w2_weight_scale", w2_weight_scale)
                 if w2_input_scale is not None:
-                    layer.w2_input_scale = torch.nn.Parameter(
-                        w2_input_scale, requires_grad=False
-                    )
+                    replace_parameter(layer, "w2_input_scale", w2_input_scale)
 
             # Fp8 moe kernel needs single weight scale for w13 per expert.
             # We take the max then dequant and requant each expert.
@@ -1093,12 +1088,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     layer.w13_weight, layer.w2_weight
                 )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+                replace_parameter(layer, "w13_weight", shuffled_w13)
+                replace_parameter(layer, "w2_weight", shuffled_w2)
 
-            layer.w13_weight_scale = torch.nn.Parameter(
-                max_w13_scales, requires_grad=False
-            )
+            replace_parameter(layer, "w13_weight_scale", max_w13_scales)
 
             if self.flashinfer_moe_backend is not None:
                 # NOTE: weights have to be swapped since the activation is
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index 78456dcf1..f0497a872 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -45,6 +45,13 @@ class BaseKVCacheMethod(QuantizeMethodBase):
         raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # skip if there are no weights to process (for example, weight reloading)
+        if not hasattr(layer, "q_scale"):
+            assert not hasattr(layer, "k_scale")
+            assert not hasattr(layer, "v_scale")
+            assert not hasattr(layer, "prob_scale")
+            return
+
         # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0
         # regardless whether the kv-scale is available in the checkpoint.
         # No need to process kv scales after loading if we are going to
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 366c5778f..f5200d7d3 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -27,6 +27,7 @@ from vllm.model_executor.parameter import (
     ChannelQuantScaleParameter,
     PerTensorScaleParameter,
 )
+from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import (
@@ -1404,12 +1405,12 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module):
     if should_use_deepgemm:
         dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block(
             wq=layer.weight.data,
-            ws=layer.weight_scale.data,
+            ws=layer.weight_scale_inv.data,
             quant_block_shape=tuple(layer.weight_block_size),
             use_e8m0=is_deep_gemm_e8m0_used(),
         )
-        layer.weight = torch.nn.Parameter(dg_weight, requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(dg_weight_scale, requires_grad=False)
+        replace_parameter(layer, "weight", dg_weight)
+        replace_parameter(layer, "weight_scale_inv", dg_weight_scale)
 
 
 def expert_weight_is_col_major(x: torch.Tensor) -> bool:
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index e6b4f567c..c67e4f437 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     marlin_quant_input,
     should_use_atomic_add_reduce,
 )
+from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
@@ -130,7 +131,7 @@ def prepare_fp8_layer_for_marlin(
         size_n=part_size_n,
         num_bits=8,
     )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
+    replace_parameter(layer, "weight", marlin_qweight)
 
     # WEIGHT SCALES
     # Permute scales
@@ -138,7 +139,6 @@ def prepare_fp8_layer_for_marlin(
         scales = layer.weight_scale.to(layer.orig_dtype)
     elif "weight_scale_inv" in dir(layer):
         scales = layer.weight_scale_inv.to(layer.orig_dtype)
-        del layer.weight_scale_inv
 
     group_size = -1 if weight_block_size is None else weight_block_size[1]
 
@@ -177,12 +177,15 @@ def prepare_fp8_layer_for_marlin(
     )
     if input_dtype != torch.float8_e4m3fn:
         marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
+    if hasattr(layer, "weight_scale"):
+        replace_parameter(layer, "weight_scale", marlin_scales)
+    elif hasattr(layer, "weight_scale_inv"):
+        replace_parameter(layer, "weight_scale_inv", marlin_scales)
 
     if hasattr(layer, "bias") and layer.bias is not None:
         assert layer.bias.shape == (part_size_n,)
         bias = marlin_permute_bias(layer.bias)
-        layer.bias = torch.nn.Parameter(bias, requires_grad=False)
+        replace_parameter(layer, "bias", bias)
 
 
 def prepare_moe_fp8_layer_for_marlin(
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index fceed3e55..428792241 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -118,8 +118,11 @@ def requantize_with_max_scale(
     # from disk in this case. Skip requantization in this case (since)
     # we already are quantized with the single scale.
     # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
+    #
+    # Extra note: upon weight reloading weight_scale.ndim == 0
     unfused_module_in_checkpoint = (
-        weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min
+        weight_scale.ndim != 0
+        and weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min
     )
 
     # If unfused checkpoint, need requanize with the single scale.
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 8aad59e84..b89371d98 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -50,6 +50,31 @@ def set_weight_attrs(
         setattr(weight, key, value)
 
 
+def replace_parameter(layer: torch.nn.Module, param_name: str, new_data: torch.Tensor):
+    """
+    Replace a parameter of a layer while maintaining the ability to reload the weight.
+    Called within implementations of the `process_weights_after_loading` method.
+
+    This function should not be called on weights which are tied/shared
+
+    Args:
+        layer: Layer containing parameter to replace
+        param_name: Name of parameter to replace
+        new_data: New data of the new parameter
+    """
+    # should not be used on a tied/shared param
+    if isinstance(new_data, torch.nn.Parameter):
+        new_data = new_data.data
+    new_param = torch.nn.Parameter(new_data, requires_grad=False)
+
+    old_param: torch.nn.Parameter | None = getattr(layer, param_name, None)
+    if old_param is not None and hasattr(old_param, "weight_loader"):
+        weight_loader = old_param.weight_loader
+        set_weight_attrs(new_param, {"weight_loader": weight_loader})
+
+    setattr(layer, param_name, new_param)
+
+
 def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
     parent_map = getattr(model, "packed_modules_mapping", None)
     parent_map = copy.deepcopy(parent_map) if parent_map is not None else {}
-- 
GitLab


From 3c680f4a17057d7994af8fbb1dc8c2d98307c890 Mon Sep 17 00:00:00 2001
From: Charlie Fu <charlifu@amd.com>
Date: Tue, 9 Dec 2025 16:39:26 -0600
Subject: [PATCH 250/258] [Rocm][torch.compile] Adding layernorm + fp8 block
 quant and silu + fp8 block quant for Aiter (#25693)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: charlifu <charlifu@amd.com>
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
Signed-off-by: Charlie Fu <Charlie.Fu@amd.com>
Co-authored-by: Micah Williamson <micah.williamson@amd.com>
Co-authored-by: wuhuikx <hattie.wu@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
---
 tests/compile/test_fusion.py                  |  98 ++++++-
 tests/compile/test_silu_mul_quant_fusion.py   |  62 ++++-
 vllm/_aiter_ops.py                            | 214 ++++++++++++----
 vllm/compilation/pass_manager.py              |  11 +
 vllm/compilation/rocm_aiter_fusion.py         | 242 ++++++++++++++++++
 .../layers/quantization/utils/fp8_utils.py    |  43 +++-
 6 files changed, 610 insertions(+), 60 deletions(-)
 create mode 100644 vllm/compilation/rocm_aiter_fusion.py

diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 2ad34a798..6b72c595c 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -1,10 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import itertools
+
 import pytest
 import torch
 
 import vllm.plugins
+from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops
 from vllm.compilation.fusion import FUSED_OPS, FusedRMSQuantKey, RMSNormQuantFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
 from vllm.compilation.matcher_utils import QUANT_OPS
@@ -152,13 +155,79 @@ GROUP_SHAPES = [
 ]
 
 
+class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
+    def __init__(self, hidden_size: int, eps: float, **kwargs):
+        super().__init__()
+        self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
+            weight_group_shape=GroupShape(128, 128),
+            act_quant_group_shape=GroupShape(1, 128),
+            cutlass_block_fp8_supported=False,
+            use_aiter_and_is_supported=True,
+        )
+        self.w = [
+            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
+            for _ in range(3)
+        ]
+
+        scale_hidden_size = (hidden_size + 128 - 1) // 128
+        self.wscale = [
+            torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32)
+            for _ in range(3)
+        ]
+
+        self.norm_weight = [torch.ones(hidden_size) for _ in range(4)]
+        self.eps = eps
+
+    def forward(self, x):
+        # avoid having graph input be an arg to a pattern directly
+        x = resid = torch.relu(x)
+        y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps)
+
+        x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0])
+        # make sure resid is used for replacement to work
+        y2, resid = rocm_aiter_ops.rms_norm2d_with_add(
+            x2, resid, self.norm_weight[1], self.eps
+        )
+
+        x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1])
+
+        y3, resid = rocm_aiter_ops.rms_norm2d_with_add(
+            x3, resid, self.norm_weight[2], self.eps
+        )
+
+        x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2])
+
+        y4, resid = rocm_aiter_ops.rms_norm2d_with_add(
+            x4, resid, self.norm_weight[3], self.eps
+        )
+        return y4
+
+    def ops_in_model_before(self):
+        return [
+            torch.ops.vllm.rocm_aiter_rms_norm,
+            torch.ops.vllm.rocm_aiter_group_fp8_quant,
+        ]
+
+    def ops_in_model_before_partial(self):
+        return []
+
+    def ops_in_model_after(self):
+        return [
+            torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant,
+            torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant,
+        ]
+
+
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [256])
 @pytest.mark.parametrize("num_tokens", [257])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("group_shape", GROUP_SHAPES)
-@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
-@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
+@pytest.mark.parametrize(
+    "model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op",
+    list(itertools.product([TestModel], [True, False], [True, False]))
+    + [(TestRmsnormGroupFp8QuantModel, False, False)],
+)
 # cuda_force_torch used to test torch code path on platforms that
 # cutlass_fp8_supported() == True.
 @pytest.mark.parametrize(
@@ -173,10 +242,14 @@ def test_fusion_rmsnorm_quant(
     num_tokens,
     eps,
     group_shape,
+    model_class,
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
     cuda_force_torch,
 ):
+    if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND:
+        pytest.skip("AITER is not supported on this GPU.")
+
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
     torch.manual_seed(1)
@@ -209,12 +282,24 @@ def test_fusion_rmsnorm_quant(
     with vllm.config.set_current_vllm_config(vllm_config):
         # Reshape pass is needed for the fusion pass to work
         noop_pass = NoOpEliminationPass(vllm_config)
-        fusion_pass = RMSNormQuantFusionPass(vllm_config)
+        if model_class is TestRmsnormGroupFp8QuantModel:
+            from vllm.compilation.rocm_aiter_fusion import (
+                RocmAiterRMSNormFp8GroupQuantFusionPass,
+            )
+
+            fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config)
+        else:
+            fusion_pass = RMSNormQuantFusionPass(vllm_config)
         cleanup_pass = PostCleanupPass(vllm_config)
 
         backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
         backend2 = TestBackend(noop_pass, cleanup_pass)
-        model = TestModel(hidden_size, eps, group_shape, cuda_force_torch)
+        model = model_class(
+            hidden_size=hidden_size,
+            eps=eps,
+            group_shape=group_shape,
+            cuda_force_torch=cuda_force_torch,
+        )
         # First dimension dynamic
         x = torch.rand(num_tokens, hidden_size)
         torch._dynamo.mark_dynamic(x, 0)
@@ -243,7 +328,10 @@ def test_fusion_rmsnorm_quant(
         # there's a risk that the fused add doesn't get included in the
         # replacement and only the rms part gets fused with quant.
         # Hence, we check only 2 add nodes are left (final fused rmsnorm add).
-        if not enable_rms_norm_custom_op:
+        if (
+            not enable_rms_norm_custom_op
+            and model_class is not TestRmsnormGroupFp8QuantModel
+        ):
             n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
             # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
             assert n_add_nodes(backend.graph_pre_pass) == 7
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index c336a4595..eb0dee8d4 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -7,6 +7,7 @@ import torch
 
 import vllm.envs as envs
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
+from vllm._aiter_ops import IS_AITER_FOUND
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.compilation.activation_quant_fusion import (
     FUSED_OPS,
@@ -24,6 +25,7 @@ from vllm.config import (
     set_current_vllm_config,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
     kFp8StaticTensorSym,
@@ -126,6 +128,39 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
         return [FUSED_OPS[kNvfp4Quant]]
 
 
+class TestSiluMulGroupFp8QuantModel(torch.nn.Module):
+    def __init__(self, hidden_size: int, **kwargs):
+        super().__init__()
+        self.silu_and_mul = SiluAndMul()
+        self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
+            weight_group_shape=GroupShape(128, 128),
+            act_quant_group_shape=GroupShape(1, 128),
+            cutlass_block_fp8_supported=False,
+            use_aiter_and_is_supported=True,
+        )
+        self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
+
+        scale_hidden_size = (hidden_size + 128 - 1) // 128
+        self.wscale = torch.rand(
+            (scale_hidden_size, scale_hidden_size), dtype=torch.float32
+        )
+
+        self.enable_silu_mul_custom_op = self.silu_and_mul.enabled()
+
+    def forward(self, x):
+        y = self.silu_and_mul(x)
+        x2 = self.w8a8_block_fp8_linear.apply(y, self.w, self.wscale)
+        return x2
+
+    def ops_in_model_before(self):
+        return [
+            SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
+        ]
+
+    def ops_in_model_after(self):
+        return [torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant]
+
+
 @pytest.mark.parametrize("num_tokens", [32, 64])
 @pytest.mark.parametrize("hidden_size", [128, 256])
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@@ -133,7 +168,10 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
 @pytest.mark.parametrize(
     "model_class, enable_quant_fp8_custom_op, cuda_force_torch",
     list(itertools.product([TestSiluMulFp8QuantModel], [True, False], [True, False]))
-    + [(TestSiluMulNvfp4QuantModel, False, False)],
+    + [
+        (TestSiluMulNvfp4QuantModel, False, False),
+        (TestSiluMulGroupFp8QuantModel, False, False),
+    ],
 )
 # cuda_force_torch used to test torch code path on platforms that
 # cutlass_fp8_supported() == True.
@@ -144,13 +182,19 @@ def test_fusion_silu_and_mul_quant(
     num_tokens: int,
     hidden_size: int,
     dtype: torch.dtype,
-    model_class: type[TestSiluMulFp8QuantModel | TestSiluMulNvfp4QuantModel],
+    model_class: type[
+        TestSiluMulFp8QuantModel
+        | TestSiluMulNvfp4QuantModel
+        | TestSiluMulGroupFp8QuantModel
+    ],
     enable_silu_mul_custom_op: bool,
     enable_quant_fp8_custom_op: bool,
     cuda_force_torch: bool,
 ):
     if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported():
         pytest.skip("NVFP4 is not supported on this GPU.")
+    if model_class is TestSiluMulGroupFp8QuantModel and not IS_AITER_FOUND:
+        pytest.skip("AITER is not supported on this GPU.")
 
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
@@ -173,9 +217,15 @@ def test_fusion_silu_and_mul_quant(
     )
 
     with set_current_vllm_config(config):
-        fusion_pass = ActivationQuantFusionPass(config)
+        fusion_passes = [ActivationQuantFusionPass(config)]
+        if IS_AITER_FOUND:
+            from vllm.compilation.rocm_aiter_fusion import (
+                RocmAiterSiluMulFp8GroupQuantFusionPass,
+            )
+
+            fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)]
 
-        passes = [NoOpEliminationPass(config), fusion_pass, PostCleanupPass(config)]
+        passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)]
         backend = TestBackend(*passes)
         model = model_class(
             hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x
@@ -194,12 +244,14 @@ def test_fusion_silu_and_mul_quant(
             atol, rtol = 1e-3, 1e-3
         elif model_class == TestSiluMulNvfp4QuantModel:
             atol, rtol = 1e-1, 1e-1
+        elif model_class == TestSiluMulGroupFp8QuantModel:
+            atol, rtol = 5e-2, 5e-2
 
         torch.testing.assert_close(
             result[0].to(dtype=dtype), result2[0].to(dtype=dtype), atol=atol, rtol=rtol
         )
 
-        assert fusion_pass.matched_count == 1
+        assert sum([p.matched_count for p in fusion_passes]) == 1
 
         # In pre-nodes, quant op should be present and fused kernels should not
         backend.check_before_ops(model.ops_in_model_before())
diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index 94bbc9b00..010817e79 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -24,6 +24,15 @@ def is_aiter_found() -> bool:
 # we keep this global outside to not cause torch compile breaks.
 IS_AITER_FOUND = is_aiter_found()
 
+# Can't use dtypes.fp8 directly inside an op
+# because it returns wrong result on gfx942.
+# This is a workaround to get the correct FP8 dtype.
+# This might because that the get_gfx() is wrapped as a custom op.
+if IS_AITER_FOUND:
+    from aiter import dtypes
+
+    AITER_FP8_DTYPE = dtypes.fp8
+
 
 def if_aiter_supported(func: Callable) -> Callable:
     """Decorator that only executes the function if
@@ -45,36 +54,6 @@ def if_aiter_supported(func: Callable) -> Callable:
     return wrapper
 
 
-def _rocm_aiter_group_fp8_quant_impl(
-    x: torch.Tensor,
-    group_size: int,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size"
-    from aiter import QuantType, dtypes, get_hip_quant
-
-    aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128)
-    return aiter_per1x128_quant(x.contiguous(), quant_dtype=dtypes.fp8)
-
-
-def _rocm_aiter_group_fp8_quant_fake(
-    x: torch.Tensor,
-    group_size: int,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    from aiter import dtypes
-
-    M, N = x.shape
-    x_fp8 = torch.empty((M, N), dtype=dtypes.fp8, device=x.device)
-    out_bs = torch.empty(
-        (
-            M,
-            (N + group_size - 1) // group_size,
-        ),
-        dtype=torch.float32,
-        device=x.device,
-    )
-    return x_fp8, out_bs
-
-
 def _rocm_aiter_fused_moe_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
@@ -522,6 +501,142 @@ def _rocm_aiter_per_token_quant_fake(
     )
 
 
+def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant
+
+    (x_quant, x_quant_scales), _, _, res = fused_rms_fp8_group_quant(
+        x,
+        weight,
+        variance_epsilon,
+        None,
+        None,
+        None,
+        group_size=group_size,
+        dtype_quant=AITER_FP8_DTYPE,
+        res1=residual,
+    )
+    return (x_quant, x_quant_scales, res)
+
+
+def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    M, N = x.shape
+    scale_shape = (M, (N + group_size - 1) // group_size)
+    return (
+        torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device),
+        torch.empty(scale_shape, dtype=torch.float32, device=x.device),
+        torch.empty_like(residual, device=residual.device),
+    )
+
+
+def _rocm_aiter_rmsnorm_fp8_group_quant_impl(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant
+
+    (x_quant, x_quant_scales), _, _, res = fused_rms_fp8_group_quant(
+        x,
+        weight,
+        variance_epsilon,
+        None,
+        None,
+        None,
+        group_size=group_size,
+        dtype_quant=AITER_FP8_DTYPE,
+        res1=None,
+    )
+    return (x_quant, x_quant_scales)
+
+
+def _rocm_aiter_rmsnorm_fp8_group_quant_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    M, N = x.shape
+    scale_shape = (M, (N + group_size - 1) // group_size)
+    return (
+        torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device),
+        torch.empty(scale_shape, dtype=torch.float32, device=x.device),
+    )
+
+
+def _rocm_aiter_group_fp8_quant_impl(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size"
+    from aiter import QuantType, get_hip_quant
+
+    aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128)
+    return aiter_per1x128_quant(x.contiguous(), quant_dtype=AITER_FP8_DTYPE)
+
+
+def _rocm_aiter_group_fp8_quant_fake(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    M, N = x.shape
+    x_fp8 = torch.empty((M, N), dtype=AITER_FP8_DTYPE, device=x.device)
+    out_bs = torch.empty(
+        (
+            M,
+            (N + group_size - 1) // group_size,
+        ),
+        dtype=torch.float32,
+        device=x.device,
+    )
+    return x_fp8, out_bs
+
+
+def _rocm_aiter_act_mul_and_fp8_group_quant_impl(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter.ops.triton.activation import act_mul_and_fp8_group_quant
+
+    return act_mul_and_fp8_group_quant(
+        x,
+        activation="silu",
+        group_size=group_size,
+        dtype_quant=AITER_FP8_DTYPE,
+    )
+
+
+def _rocm_aiter_act_mul_and_fp8_group_quant_fake(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    M, N = x.shape
+    assert N % 2 == 0
+    N_half = N // 2
+    x_fp8 = torch.empty((M, N_half), dtype=AITER_FP8_DTYPE, device=x.device)
+    out_bs = torch.empty(
+        (
+            M,
+            (N_half + group_size - 1) // group_size,
+        ),
+        dtype=torch.float32,
+        device=x.device,
+    )
+    return x_fp8, out_bs
+
+
 # Global flag to ensure ops are registered only once
 _OPS_REGISTERED = False
 
@@ -557,7 +672,7 @@ class rocm_aiter_ops:
     @if_aiter_supported
     def is_linear_fp8_enaled(cls) -> bool:
         """ "Verifies device specs and availability of env variable."""
-        return cls.is_linear_enabled() and current_platform.is_fp8_fnuz()
+        return cls.is_linear_enabled()
 
     @classmethod
     @if_aiter_supported
@@ -632,14 +747,6 @@ class rocm_aiter_ops:
             )
 
             # register all the custom ops here
-            direct_register_custom_op(
-                op_name="rocm_aiter_group_fp8_quant",
-                op_func=_rocm_aiter_group_fp8_quant_impl,
-                mutates_args=[],
-                fake_impl=_rocm_aiter_group_fp8_quant_fake,
-                dispatch_key=current_platform.dispatch_key,
-            )
-
             direct_register_custom_op(
                 op_name="rocm_aiter_asm_moe_tkw1",
                 op_func=_rocm_aiter_asm_moe_tkw1_impl,
@@ -699,27 +806,46 @@ class rocm_aiter_ops:
             direct_register_custom_op(
                 op_name="rocm_aiter_gemm_a8w8_blockscale",
                 op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
-                mutates_args=[],
                 fake_impl=_rocm_aiter_gemm_a8w8_blockscale_fake,
-                dispatch_key=current_platform.dispatch_key,
             )
 
             direct_register_custom_op(
                 op_name="rocm_aiter_rms_norm",
                 op_func=_rocm_aiter_rms_norm_impl,
-                mutates_args=[],
                 fake_impl=_rocm_aiter_rms_norm_fake,
-                dispatch_key=current_platform.dispatch_key,
             )
 
             direct_register_custom_op(
                 op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
                 op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl,
-                mutates_args=[],
                 fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake,
                 dispatch_key=current_platform.dispatch_key,
             )
 
+            direct_register_custom_op(
+                op_name="rocm_aiter_rmsnorm_fp8_group_quant",
+                op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl,
+                fake_impl=_rocm_aiter_rmsnorm_fp8_group_quant_fake,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_rmsnorm_with_add_fp8_group_quant",
+                op_func=_rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl,
+                fake_impl=_rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_act_mul_and_fp8_group_quant",
+                op_func=_rocm_aiter_act_mul_and_fp8_group_quant_impl,
+                fake_impl=_rocm_aiter_act_mul_and_fp8_group_quant_fake,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_group_fp8_quant",
+                op_func=_rocm_aiter_group_fp8_quant_impl,
+                fake_impl=_rocm_aiter_group_fp8_quant_fake,
+            )
+
             direct_register_custom_op(
                 op_name="rocm_aiter_per_tensor_quant",
                 op_func=_rocm_aiter_per_tensor_quant_impl,
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 6848bfb6a..4ebb386f7 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -5,6 +5,7 @@ import functools
 from torch import fx as fx
 
 from vllm import envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -13,6 +14,12 @@ from vllm.utils.system_utils import set_env_var
 from .post_cleanup import PostCleanupPass
 from .vllm_inductor_pass import VllmInductorPass
 
+if rocm_aiter_ops.is_enabled():
+    from vllm.compilation.rocm_aiter_fusion import (
+        RocmAiterRMSNormFp8GroupQuantFusionPass,
+        RocmAiterSiluMulFp8GroupQuantFusionPass,
+    )
+
 if current_platform.is_cuda_alike():
     from .activation_quant_fusion import ActivationQuantFusionPass
     from .fusion import RMSNormQuantFusionPass
@@ -109,8 +116,12 @@ class PostGradPassManager(CustomGraphPass):
 
             if self.pass_config.fuse_norm_quant:
                 self.passes += [RMSNormQuantFusionPass(config)]
+                if rocm_aiter_ops.is_enabled():
+                    self.passes += [RocmAiterRMSNormFp8GroupQuantFusionPass(config)]
             if self.pass_config.fuse_act_quant:
                 self.passes += [ActivationQuantFusionPass(config)]
+                if rocm_aiter_ops.is_enabled():
+                    self.passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)]
 
             if self.pass_config.fuse_attn_quant:
                 self.passes += [AttnFusionPass(config)]
diff --git a/vllm/compilation/rocm_aiter_fusion.py b/vllm/compilation/rocm_aiter_fusion.py
new file mode 100644
index 000000000..8b5db9de3
--- /dev/null
+++ b/vllm/compilation/rocm_aiter_fusion.py
@@ -0,0 +1,242 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+import torch
+import torch._inductor.pattern_matcher as pm
+from torch import fx
+from torch._inductor.pattern_matcher import PatternMatcherPass
+from torch._ops import OpOverload
+
+import vllm.model_executor.layers.quantization.utils.fp8_utils  # noqa: F401
+from vllm.compilation.activation_quant_fusion import ActivationQuantPattern
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+from .fusion import empty_bf16
+from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherSiluAndMul
+from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
+
+logger = init_logger(__name__)
+FP8_DTYPE = current_platform.fp8_dtype()
+
+AITER_RMS_GROUP_QUANT_OP = torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default
+AITER_RMS_ADD_GROUP_QUANT_OP = (
+    torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default
+)
+
+AITER_RMS_OP = torch.ops.vllm.rocm_aiter_rms_norm.default
+AITER_RMS_ADD_OP = torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default
+
+AITER_GROUP_FP8_QUANT_OP = torch.ops.vllm.rocm_aiter_group_fp8_quant.default
+TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default
+
+FUSED_SILU_MUL_QUANT_OP = torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default
+
+
+class AiterRMSFp8GroupQuantPattern:
+    """
+    This pattern fuses aiter rms_norm & group fp8 quant custom
+    ops into an aiter rms_norm_group_fp8_quant op.
+    """
+
+    def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload):
+        self.epsilon = epsilon
+        self.quant_dtype = quant_dtype
+        self.quant_op = quant_op
+
+    def register(self, pm_pass: PatternMatcherPass):
+        def pattern(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ):
+            at1 = AITER_RMS_OP(x=input, weight=weight, variance_epsilon=self.epsilon)
+
+            at2 = self.quant_op(at1, 128)
+
+            return at2[0], at2[1]
+
+        def replacement(
+            input: torch.Tensor,
+            weight: torch.Tensor,
+        ):
+            at = AITER_RMS_GROUP_QUANT_OP(
+                x=input,
+                weight=weight,
+                variance_epsilon=self.epsilon,
+                group_size=128,
+            )
+
+            return at[0], at[1]
+
+        inputs = [
+            empty_bf16(5, 4),  # input
+            empty_bf16(1, 5),  # weight
+        ]
+
+        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
+
+
+class AiterFusedAddRMSFp8GroupQuantPattern:
+    """
+    This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops
+    into a aiter rms_norm_with_add_group_fp8_quant op.
+    """
+
+    def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload):
+        self.epsilon = epsilon
+        self.quant_dtype = quant_dtype
+        self.quant_op = quant_op
+
+    def register(self, pm_pass: PatternMatcherPass):
+        def pattern(
+            input: torch.Tensor,
+            residual: torch.Tensor,
+            weight: torch.Tensor,
+        ):
+            at1 = AITER_RMS_ADD_OP(
+                x=input,
+                residual=residual,
+                weight=weight,
+                variance_epsilon=self.epsilon,
+            )
+
+            at2 = self.quant_op(at1[0], 128)
+
+            # result, scale, residual
+            return at2[0], at2[1], at1[1]
+
+        def replacement(
+            input: torch.Tensor,
+            residual: torch.Tensor,
+            weight: torch.Tensor,
+        ):
+            at = AITER_RMS_ADD_GROUP_QUANT_OP(
+                x=input,
+                residual=residual,
+                weight=weight,
+                variance_epsilon=self.epsilon,
+                group_size=128,
+            )
+
+            # result, scale, residual
+            return at[0], at[1], at[2]
+
+        inputs = [
+            empty_bf16(5, 4),  # input
+            empty_bf16(5, 4),  # residual
+            empty_bf16(1, 5),  # weight
+        ]
+
+        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
+
+
+class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass):
+    """
+    This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op.
+    It also supports fused_add_rms_norm.
+    """
+
+    @enable_fake_mode
+    def __init__(self, config: VllmConfig):
+        super().__init__(config)
+
+        self.patterns: PatternMatcherPass = PatternMatcherPass(
+            pass_name="rocm_aiter_rms_norm_fp8_group_quant_fusion_pass"
+        )
+
+        # Make sure fused add patterns are before simple rms norm,
+        # as the latter is a subset of the former in torch ops
+        for epsilon in [1e-5, 1e-6]:
+            # Fuse rms_norm + dynamic group fp8 quant
+            for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]:
+                AiterRMSFp8GroupQuantPattern(epsilon, FP8_DTYPE, quant_op).register(
+                    self.patterns
+                )
+
+                AiterFusedAddRMSFp8GroupQuantPattern(
+                    epsilon, FP8_DTYPE, quant_op
+                ).register(self.patterns)
+
+        self.dump_patterns(config, self.patterns)
+
+    @VllmInductorPass.time_and_log
+    def __call__(self, graph: fx.Graph):
+        self.matched_count = self.patterns.apply(graph)
+        logger.debug("Replaced %s patterns", self.matched_count)
+
+    def uuid(self) -> Any:
+        fusion_patterns = [
+            AiterRMSFp8GroupQuantPattern,
+            AiterFusedAddRMSFp8GroupQuantPattern,
+        ]
+        return self.hash_source(self, *fusion_patterns)
+
+
+class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern):
+    """
+    This pattern fuses aiter silu_and_mul & group fp8 quant custom
+    ops into an aiter silu_and_mul_group_fp8_quant op.
+    """
+
+    def __init__(self, quant_op: OpOverload):
+        self.silu_and_mul_matcher = MatcherSiluAndMul()
+        self.quant_op = quant_op
+
+    def register(self, pm_pass: PatternMatcherPass):
+        def pattern(
+            input: torch.Tensor,
+        ):
+            at1 = self.silu_and_mul_matcher(input)
+            at2 = self.quant_op(at1, 128)
+            return at2[0], at2[1]
+
+        def replacement(
+            input: torch.Tensor,
+        ):
+            at = FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128)
+            return at[0], at[1]
+
+        inputs = [
+            self.silu_and_mul_matcher.inputs()[0],
+        ]
+
+        pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
+
+
+class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass):
+    """
+    This pass fuses a pre-defined set of custom ops into fused ops.
+    It uses the torch pattern matcher to find the patterns and replace them.
+
+    Because patterns can only be registered once, the pass is a singleton.
+    This will be addressed in a future version of PyTorch:
+    https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980
+    """
+
+    @enable_fake_mode
+    def __init__(self, config: VllmConfig):
+        super().__init__(config)
+
+        self.patterns: PatternMatcherPass = PatternMatcherPass(
+            pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass"
+        )
+
+        for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]:
+            AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns)
+
+        self.dump_patterns(config, self.patterns)
+
+    @VllmInductorPass.time_and_log
+    def __call__(self, graph: torch.fx.Graph):
+        self.matched_count = self.patterns.apply(graph)
+        logger.debug("Replaced %s patterns", self.matched_count)
+
+    def uuid(self):
+        fusion_patterns = [
+            ActivationQuantPattern,
+            AiterSiluMulFp8GroupQuantPattern,
+        ]
+        return VllmInductorPass.hash_source(self, *fusion_patterns)
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index f5200d7d3..b459d5947 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -196,6 +196,39 @@ direct_register_custom_op(
 )
 
 
+def _triton_per_token_group_quant_fp8_impl(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return per_token_group_quant_fp8(
+        x, group_size, column_major_scales=False, use_ue8m0=False
+    )
+
+
+def _triton_per_token_group_quant_fp8_fake(
+    x: torch.Tensor,
+    group_size: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    M, N = x.shape
+    x_fp8 = torch.empty((M, N), dtype=current_platform.fp8_dtype(), device=x.device)
+    out_bs = torch.empty(
+        (
+            M,
+            (N + group_size - 1) // group_size,
+        ),
+        dtype=torch.float32,
+        device=x.device,
+    )
+    return x_fp8, out_bs
+
+
+direct_register_custom_op(
+    "triton_per_token_group_quant_fp8",
+    _triton_per_token_group_quant_fp8_impl,
+    fake_impl=_triton_per_token_group_quant_fp8_fake,
+)
+
+
 # TODO fix ROCm->Triton custom path:
 #  https://github.com/vllm-project/vllm/issues/14397
 class W8A8BlockFp8LinearOp:
@@ -341,17 +374,15 @@ class W8A8BlockFp8LinearOp:
 
         if input_scale is not None:
             q_input = input_2d
-        # MI350 case uses triton kernel
         elif use_triton:
-            q_input, input_scale = per_token_group_quant_fp8(
+            q_input, input_scale = torch.ops.vllm.triton_per_token_group_quant_fp8(
                 input_2d,
                 self.act_quant_group_shape.col,
-                column_major_scales=False,
-                use_ue8m0=False,
             )
-        # MI300 uses tuned AITER ASM/C++ kernel
         else:
-            q_input, input_scale = rocm_aiter_ops.group_fp8_quant(input_2d)
+            q_input, input_scale = rocm_aiter_ops.group_fp8_quant(
+                input_2d, self.act_quant_group_shape.col
+            )
 
         return gemm_a8w8_blockscale_op(
             q_input,
-- 
GitLab


From 2e7054da065504a4786d251f4c5bd099a9ddab86 Mon Sep 17 00:00:00 2001
From: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com>
Date: Tue, 9 Dec 2025 15:51:32 -0800
Subject: [PATCH 251/258] Improve wvsplitK tile and balance heristics. (#29937)

Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
---
 csrc/rocm/skinny_gemms.cu | 97 +++++++++++++++++++--------------------
 1 file changed, 48 insertions(+), 49 deletions(-)

diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu
index 2ef579a1b..8ebe55cef 100644
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
 }
 #endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
+// Find the min val of div2 that doesn't increase N/(div1*div2)
 int mindiv(int N, int div1, int div2) {
   int nPrRnd = div1 * div2;
-  int rnds0 = N / nPrRnd;
-  nPrRnd -= div1 * 3;
-  int rnds3 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds4 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds5 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds6 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds7 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds8 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds9 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rtn = div2;
-  if (rnds0 == rnds3) rtn = div2 - 3;
-  if (rnds0 == rnds4) rtn = div2 - 4;
-  if (rnds0 == rnds5) rtn = div2 - 5;
-  if (rnds0 == rnds6) rtn = div2 - 6;
-  if (rnds0 == rnds7) rtn = div2 - 7;
-  if (rnds0 == rnds8) rtn = div2 - 8;
-  if (rnds0 == rnds9) rtn = div2 - 9;
-  return rtn;
+  int rnds[13];
+  for (int i = 0; i < 13; i++) {
+    rnds[i] = (N + nPrRnd - 1) / nPrRnd;
+    nPrRnd -= div1;
+  }
+  for (int i = 12; i >= 0; i--)
+    if (rnds[0] == rnds[i]) return (div2 - i);
 }
 
 torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
@@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const int max_lds_len = get_lds_size() / 2;
 
-#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
-                 _N)                                                          \
-  {                                                                           \
-    dim3 block(64, _WvPrGrp);                                                 \
-    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {              \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);              \
-      wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N>          \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    } else if (K_in * N_in <= max_lds_len * 1.2) {                            \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp);              \
-      wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N>              \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    } else {                                                                  \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp);              \
-      wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N>          \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    }                                                                         \
+#define WVSPLITK(_YTILE, _UNRL, _N)                                        \
+  {                                                                        \
+    dim3 block(64, 16);                                                    \
+    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16);                    \
+    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0))              \
+      wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+    else if (K_in * N_in <= max_lds_len * 1.2)                             \
+      wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                   \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+    else                                                                   \
+      wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+  }
+
+#define WVSPLIT_TILE(_sYT, __N)                           \
+  {                                                       \
+    bool fit_lds = (K_in * N_in <= max_lds_len);          \
+    if (_sYT <= 1)                                        \
+      WVSPLITK(1, 4, __N)                                 \
+    else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
+      WVSPLITK(2, 2, __N)                                 \
+    else if (_sYT <= 4 * 3)                               \
+      WVSPLITK(3, 2, __N)                                 \
+    else if (__N == 4)                                    \
+      WVSPLITK(4, 1, __N)                                 \
+    else                                                  \
+      WVSPLITK(4, 2, __N)                                 \
   }
 
   AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
             ? reinterpret_cast<const fptype*>(in_bias->data_ptr())
             : nullptr;
     fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
+
+    // first shoot for biggest tile-size that keeps all simd busy,
+    // then cut the active waves to balance their distribution...
+    int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
+
     switch (N_in) {
       case 1:
-        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
+        WVSPLIT_TILE(sYT, 1)
         break;
       case 2:
-        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
+        WVSPLIT_TILE(sYT, 2)
         break;
       case 3:
-        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
+        WVSPLIT_TILE(sYT, 3)
         break;
       case 4:
-        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
+        WVSPLIT_TILE(sYT, 4)
         break;
       default:
         throw std::runtime_error(
-- 
GitLab


From 03b5f940fdcff25024ce5d37c357d770344a8f20 Mon Sep 17 00:00:00 2001
From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com>
Date: Wed, 10 Dec 2025 08:15:01 +0800
Subject: [PATCH 252/258] [V1][Spec Decode] Optimize Medusa proposer to avoid
 GPU-CPU sync (#29723)

Signed-off-by: dongbo910220 <1275604947@qq.com>
---
 vllm/v1/spec_decode/medusa.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py
index 12b903cca..989478f34 100644
--- a/vllm/v1/spec_decode/medusa.py
+++ b/vllm/v1/spec_decode/medusa.py
@@ -38,16 +38,16 @@ class MedusaProposer:
         self,
         target_hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
-    ) -> list[list[int]]:
+    ) -> torch.Tensor:
         # Generate blocks and compute logits
         blocks = self.model(target_hidden_states)
         logits = self.model.compute_logits(blocks)
 
-        # Get draft tokens and transpose the result
-        # TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU
-        # synchronization.
-        draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits]
-        return [list(row) for row in zip(*draft_tokens)]
+        # Compute argmax for each Medusa head and stack into a single tensor
+        # Shape: [batch_size, num_heads]
+        draft_tokens = torch.stack([logit.argmax(dim=-1) for logit in logits], dim=1)
+
+        return draft_tokens
 
     def load_model(self, target_model: nn.Module) -> None:
         from vllm.compilation.backends import set_model_tag
-- 
GitLab


From 4c2e10ea19b9053924d66f30f3d7121fbd9684f8 Mon Sep 17 00:00:00 2001
From: PatrykSaffer <patryk.saffer@mistral.ai>
Date: Wed, 10 Dec 2025 01:47:07 +0100
Subject: [PATCH 253/258] [Bugfix] Fix cuda graph sizes when running with
 speculative decoding (#30330)

Signed-off-by: Patryk Saffer <patryk.saffer99@gmail.com>
Signed-off-by: PatrykSaffer <patryk.saffer@mistral.ai>
Co-authored-by: Patryk Saffer <patryk.saffer99@gmail.com>
---
 vllm/config/vllm.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 614a3226c..8f27db001 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1047,8 +1047,14 @@ class VllmConfig:
                 self.compilation_config.max_cudagraph_capture_size
             )
             if max_cudagraph_capture_size is None:
+                decode_query_len = 1
+                if (
+                    self.speculative_config
+                    and self.speculative_config.num_speculative_tokens
+                ):
+                    decode_query_len += self.speculative_config.num_speculative_tokens
                 max_cudagraph_capture_size = min(
-                    self.scheduler_config.max_num_seqs * 2, 512
+                    self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
                 )
             max_num_tokens = self.scheduler_config.max_num_batched_tokens
             max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
-- 
GitLab


From 2e7035dd8cc2e6c907873462b4ac0bb9f08e0abb Mon Sep 17 00:00:00 2001
From: ElizaWszola <ewszola@redhat.com>
Date: Wed, 10 Dec 2025 02:17:25 +0100
Subject: [PATCH 254/258] [Bugfix] Fix fp8 DeepGemm compilation issues (#30336)

---
 vllm/model_executor/layers/quantization/utils/fp8_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index b459d5947..e12fe61bf 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -31,7 +31,6 @@ from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import (
-    DeepGemmQuantScaleFMT,
     fp8_gemm_nt,
     is_deep_gemm_e8m0_used,
     is_deep_gemm_supported,
@@ -248,6 +247,7 @@ class W8A8BlockFp8LinearOp:
         self.act_quant_group_shape = act_quant_group_shape
         self.is_deep_gemm_supported = is_deep_gemm_supported()
         self.is_hopper = current_platform.is_device_capability(90)
+        self.is_blackwell = current_platform.is_device_capability(100)
         self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
 
         # Get the correct blockscale mul and input quant operations.
@@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
     ) -> torch.Tensor:
-        if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
+        if self.use_deep_gemm_e8m0 and self.is_blackwell:
             q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
                 input_2d,
                 group_size=self.act_quant_group_shape.col,
-- 
GitLab


From abe93bce5952ed8adf90d4b77af6ed3515958620 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 9 Dec 2025 20:18:10 -0500
Subject: [PATCH 255/258] [Attention] Make seq_lens_cpu optional in
 CommonAttentionMetadata to enable true async spec-decode (#29624)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
---
 tests/v1/attention/utils.py                 |   4 +-
 tests/v1/e2e/test_async_spec_decode.py      | 131 ++++++++++++++++++++
 tests/v1/spec_decode/test_tree_attention.py |   4 +-
 vllm/attention/layers/cross_attention.py    |   2 +-
 vllm/v1/attention/backends/gdn_attn.py      |   2 +-
 vllm/v1/attention/backends/utils.py         |  66 +++++++---
 vllm/v1/spec_decode/eagle.py                |  20 +--
 vllm/v1/worker/gpu/attn_utils.py            |   4 +-
 vllm/v1/worker/gpu_model_runner.py          |   4 +-
 9 files changed, 200 insertions(+), 37 deletions(-)
 create mode 100644 tests/v1/e2e/test_async_spec_decode.py

diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 6cab129c1..4dcaf9d90 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -106,8 +106,8 @@ def create_common_attn_metadata(
         query_start_loc=query_start_loc,
         query_start_loc_cpu=query_start_loc_cpu,
         seq_lens=seq_lens,
-        seq_lens_cpu=seq_lens_cpu,
-        num_computed_tokens_cpu=num_computed_tokens_cpu,
+        _seq_lens_cpu=seq_lens_cpu,
+        _num_computed_tokens_cpu=num_computed_tokens_cpu,
         num_reqs=batch_spec.batch_size,
         num_actual_tokens=num_tokens,
         max_query_len=max_query_len,
diff --git a/tests/v1/e2e/test_async_spec_decode.py b/tests/v1/e2e/test_async_spec_decode.py
new file mode 100644
index 000000000..561f37a52
--- /dev/null
+++ b/tests/v1/e2e/test_async_spec_decode.py
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test that verifies no implicit GPU-CPU synchronization occurs during
+speculative decoding generation under expected conditions.
+"""
+
+import multiprocessing
+import sys
+import traceback
+
+import pytest
+import torch
+
+
+@pytest.fixture
+def sync_tracker():
+    """
+    Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
+    lazy init syncs. Prints stack traces immediately when syncs occur.
+    """
+    from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+
+    # Shared counter for cross-process communication (inherited by fork)
+    sync_count = multiprocessing.Value("i", 0)
+
+    # Save original property
+    original_prop = CommonAttentionMetadata.seq_lens_cpu
+    original_fget = original_prop.fget
+
+    # Create tracking wrapper
+    def tracking_seq_lens_cpu(self):
+        if self._seq_lens_cpu is None:
+            # Increment counter
+            with sync_count.get_lock():
+                sync_count.value += 1
+                count = sync_count.value
+            # Print stack trace immediately (shows in subprocess output)
+            print(f"\n{'=' * 60}", file=sys.stderr)
+            print(f"SYNC #{count}: seq_lens_cpu lazy init triggered!", file=sys.stderr)
+            print(f"{'=' * 60}", file=sys.stderr)
+            traceback.print_stack(file=sys.stderr)
+            print(f"{'=' * 60}\n", file=sys.stderr)
+            sys.stderr.flush()
+        return original_fget(self)
+
+    # Apply patch
+    CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu)
+
+    class SyncTracker:
+        @property
+        def count(self) -> int:
+            return sync_count.value
+
+        def assert_no_sync(self, msg: str = ""):
+            count = sync_count.value
+            assert count == 0, (
+                f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered "
+                f"{count} times. See stack traces above. {msg}"
+            )
+
+    yield SyncTracker()
+
+    # Restore original property
+    CommonAttentionMetadata.seq_lens_cpu = original_prop
+    torch._dynamo.reset()
+
+
+# Test configurations: (model, spec_model, method, num_spec_tokens, backend_env)
+SPEC_DECODE_CONFIGS = [
+    pytest.param(
+        "meta-llama/Llama-3.2-1B-Instruct",
+        "nm-testing/Llama3_2_1B_speculator.eagle3",
+        "eagle3",
+        2,
+        id="eagle3-llama",
+    ),
+    pytest.param(
+        "eagle618/deepseek-v3-random",
+        "eagle618/eagle-deepseek-v3-random",
+        "eagle",
+        2,
+        id="eagle-mla-deepseek",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "model,spec_model,method,num_spec_tokens",
+    SPEC_DECODE_CONFIGS,
+)
+def test_no_sync_with_spec_decode(
+    sync_tracker,
+    model: str,
+    spec_model: str,
+    method: str,
+    num_spec_tokens: int,
+):
+    """
+    Test that no implicit GPU-CPU sync occurs during speculative decoding
+    generation.
+    """
+    # Import vLLM AFTER sync_tracker fixture has applied the patch
+    from vllm import LLM, SamplingParams
+    from vllm.distributed import cleanup_dist_env_and_memory
+
+    llm = LLM(
+        model=model,
+        max_model_len=256,
+        speculative_config={
+            "method": method,
+            "num_speculative_tokens": num_spec_tokens,
+            "model": spec_model,
+        },
+        enforce_eager=True,
+        async_scheduling=True,
+    )
+
+    outputs = llm.generate(
+        ["Hello, my name is"],
+        SamplingParams(temperature=0, max_tokens=10),
+    )
+
+    assert len(outputs) == 1
+    assert len(outputs[0].outputs[0].text) > 0
+
+    del llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    sync_tracker.assert_no_sync()
diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py
index a4ee53008..0afeeb891 100644
--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -88,8 +88,8 @@ def forward_attention(
         query_start_loc=query_start_loc,
         query_start_loc_cpu=query_start_loc.cpu(),
         seq_lens=seq_lens,
-        seq_lens_cpu=seq_lens.cpu(),
-        num_computed_tokens_cpu=context_lens.cpu(),
+        _seq_lens_cpu=seq_lens.cpu(),
+        _num_computed_tokens_cpu=context_lens.cpu(),
         num_reqs=batch_size,
         num_actual_tokens=num_actual_tokens,
         max_query_len=max_query_len,
diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py
index 068fd0a0e..cfd203bdd 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -103,7 +103,7 @@ def create_cross_attention_backend(
             # needed here to know how many tokens to attend to from the cached
             # cross-attention KV cache.
             new_metadata.seq_lens = common_attn_metadata.encoder_seq_lens
-            new_metadata.seq_lens_cpu = torch.from_numpy(
+            new_metadata._seq_lens_cpu = torch.from_numpy(
                 common_attn_metadata.encoder_seq_lens_cpu
             )
 
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index e921f8c3d..3a2f92d99 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -370,6 +370,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
 
         num_accepted_tokens = torch.diff(m.query_start_loc)
         num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu()
-        m.num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
+        m._num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
 
         return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu)
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 5200bc48b..79a1f7d47 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -18,7 +18,7 @@ from typing import (
 
 import numpy as np
 import torch
-from typing_extensions import runtime_checkable
+from typing_extensions import deprecated, runtime_checkable
 
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.utils.math_utils import cdiv
@@ -66,11 +66,6 @@ class CommonAttentionMetadata:
     """(batch_size + 1,), the start location of each request in query Tensor"""
 
     seq_lens: torch.Tensor
-    seq_lens_cpu: torch.Tensor
-    """(batch_size,), the length of each request including both computed tokens
-    and newly scheduled tokens"""
-
-    num_computed_tokens_cpu: torch.Tensor
     """(batch_size,), the number of computed tokens for each request"""
 
     num_reqs: int
@@ -81,7 +76,7 @@ class CommonAttentionMetadata:
     max_query_len: int
     """Longest query in batch"""
     max_seq_len: int
-    """Longest context length in batch"""
+    """Longest context length (may be an upper bound)"""
 
     block_table_tensor: torch.Tensor
     slot_mapping: torch.Tensor
@@ -100,6 +95,40 @@ class CommonAttentionMetadata:
     dcp_local_seq_lens_cpu: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""
 
+    # WARNING: Deprecated fields. Will be removed in a future release (v0.14.0)
+    _seq_lens_cpu: torch.Tensor | None = None
+    _num_computed_tokens_cpu: torch.Tensor | None = None
+
+    @property
+    @deprecated(
+        """
+    Prefer using device seq_lens directly to avoid implicit H<>D sync.
+    If a CPU copy is needed, use `seq_lens.cpu()` instead.
+    Will be removed in a future release (v0.14.0)
+    """
+    )
+    def seq_lens_cpu(self) -> torch.Tensor:
+        if self._seq_lens_cpu is None:
+            self._seq_lens_cpu = self.seq_lens.to("cpu")
+        return self._seq_lens_cpu
+
+    @property
+    @deprecated(
+        """
+    Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
+    async scheduling. If a CPU copy is needed, it can be derived from 
+    query_start_loc_cpu and seq_lens.
+    Will be removed in a future release (v0.14.0)
+    """
+    )
+    def num_computed_tokens_cpu(self) -> torch.Tensor:
+        if self._num_computed_tokens_cpu is None:
+            query_seq_lens = (
+                self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1]
+            )
+            self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
+        return self._num_computed_tokens_cpu
+
     # TODO(lucas): remove once we have FULL-CG spec-decode support
     def unpadded(
         self, num_actual_tokens: int, num_actual_reqs: int
@@ -109,8 +138,12 @@ class CommonAttentionMetadata:
             query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
             query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
             seq_lens=self.seq_lens[:num_actual_reqs],
-            seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs],
-            num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs],
+            _seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs]
+            if self._seq_lens_cpu is not None
+            else None,
+            _num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs]
+            if self._num_computed_tokens_cpu is not None
+            else None,
             num_reqs=num_actual_reqs,
             num_actual_tokens=num_actual_tokens,
             max_query_len=self.max_query_len,
@@ -224,14 +257,14 @@ def _make_metadata_with_slice(
         query_start_loc=query_start_loc,
         query_start_loc_cpu=query_start_loc_cpu,
         seq_lens=seq_lens,
-        seq_lens_cpu=seq_lens_cpu,
-        num_computed_tokens_cpu=num_computed_tokens_cpu,
         num_reqs=num_requests,
         num_actual_tokens=num_actual_tokens,
         max_query_len=max_query_len,
         max_seq_len=max_seq_len,
         block_table_tensor=block_table_tensor,
         slot_mapping=slot_mapping,
+        _seq_lens_cpu=seq_lens_cpu,
+        _num_computed_tokens_cpu=num_computed_tokens_cpu,
     )
 
 
@@ -689,9 +722,7 @@ def make_local_attention_virtual_batches(
     return CommonAttentionMetadata(
         query_start_loc_cpu=query_start_loc_cpu,
         query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True),
-        seq_lens_cpu=seq_lens_cpu,
         seq_lens=seq_lens_cpu.to(device=device, non_blocking=True),
-        num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
         num_reqs=len(seq_lens_cpu),
         num_actual_tokens=common_attn_metadata.num_actual_tokens,
         max_query_len=seqlens_q_local.max(),
@@ -699,6 +730,8 @@ def make_local_attention_virtual_batches(
         block_table_tensor=block_table_local,
         slot_mapping=common_attn_metadata.slot_mapping,
         causal=True,
+        _seq_lens_cpu=seq_lens_cpu,
+        _num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
     )
 
 
@@ -719,7 +752,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
     logits_indices = logits_indices_padded[:num_logits_indices]
     num_reqs = common_attn_metadata.num_reqs
     query_start_loc = common_attn_metadata.query_start_loc
-    seq_lens = common_attn_metadata.seq_lens
     # Example inputs
     # num_reqs: 3
     # generation_indices:  [14, 18, 19, 27]
@@ -748,9 +780,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
     common_attn_metadata = CommonAttentionMetadata(
         query_start_loc=decode_query_start_loc,
         query_start_loc_cpu=decode_query_start_loc.to("cpu", non_blocking=True),
-        seq_lens=seq_lens,
-        seq_lens_cpu=seq_lens.to("cpu", non_blocking=True),
-        num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
+        seq_lens=common_attn_metadata.seq_lens,
         num_reqs=num_reqs,
         num_actual_tokens=total_num_decode_tokens,
         max_query_len=decode_max_query_len,
@@ -758,6 +788,8 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
         block_table_tensor=common_attn_metadata.block_table_tensor,
         slot_mapping=common_attn_metadata.slot_mapping,
         causal=True,
+        _seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
+        _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
     )
     return common_attn_metadata
 
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 9f7859a5c..4cc78ae9d 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -440,16 +440,16 @@ class EagleProposer:
             # of main model.
             # Increment the sequence lengths.
             common_attn_metadata.seq_lens += 1
-            # This is an out-of-place operation to avoid modifying the original tensor.
-            common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1
             # For the requests that exceed the max model length, we set the
             # sequence length to 1 to minimize their overheads in attention.
-
             common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
 
-            common_attn_metadata.num_computed_tokens_cpu = (
-                common_attn_metadata.seq_lens_cpu - 1
-            )
+            # Also update the CPU-side shadow; NOTE: this is hacky and should be
+            # removed in when common_attn_metadata.seq_lens_cpu is deprecated.
+            if common_attn_metadata._seq_lens_cpu is not None:
+                common_attn_metadata._seq_lens_cpu += 1
+            if common_attn_metadata._num_computed_tokens_cpu is not None:
+                common_attn_metadata._num_computed_tokens_cpu += 1
 
             # Compute the slot mapping.
             if self.uses_mrope:
@@ -656,8 +656,8 @@ class EagleProposer:
             query_start_loc=common_attn_metadata.query_start_loc,
             seq_lens=common_attn_metadata.seq_lens,
             query_start_loc_cpu=query_start_loc_cpu,
-            seq_lens_cpu=common_attn_metadata.seq_lens_cpu,
-            num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
+            _seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
+            _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
             num_reqs=common_attn_metadata.num_reqs,
             num_actual_tokens=total_num_tokens,
             max_query_len=new_query_len_per_req.max().item(),
@@ -932,8 +932,8 @@ class EagleProposer:
             query_start_loc=new_query_start_loc_cpu.to(device, non_blocking=True),
             seq_lens=new_seq_lens_cpu.to(device, non_blocking=True),
             query_start_loc_cpu=new_query_start_loc_cpu,
-            seq_lens_cpu=new_seq_lens_cpu,
-            num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
+            _seq_lens_cpu=new_seq_lens_cpu,
+            _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
             num_reqs=common_attn_metadata.num_reqs,
             num_actual_tokens=total_num_tokens,
             max_query_len=new_query_len_per_req.max().item(),
diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py
index 5aa1a33d8..6386f1a08 100644
--- a/vllm/v1/worker/gpu/attn_utils.py
+++ b/vllm/v1/worker/gpu/attn_utils.py
@@ -168,9 +168,9 @@ def build_attn_metadata(
             query_start_loc=query_start_loc_gpu,
             query_start_loc_cpu=query_start_loc_cpu,
             seq_lens=seq_lens,
-            seq_lens_cpu=seq_lens_cpu,
+            _seq_lens_cpu=seq_lens_cpu,
             max_seq_len=max_seq_len,
-            num_computed_tokens_cpu=num_computed_tokens_cpu,
+            _num_computed_tokens_cpu=num_computed_tokens_cpu,
             num_reqs=num_reqs,
             num_actual_tokens=num_tokens,
             max_query_len=max_query_len,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7398defd7..f6f89d6eb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1626,8 +1626,8 @@ class GPUModelRunner(
                 query_start_loc=query_start_loc,
                 query_start_loc_cpu=query_start_loc_cpu,
                 seq_lens=seq_lens,
-                seq_lens_cpu=seq_lens_cpu,
-                num_computed_tokens_cpu=num_computed_tokens_cpu,
+                _seq_lens_cpu=seq_lens_cpu,
+                _num_computed_tokens_cpu=num_computed_tokens_cpu,
                 num_actual_tokens=num_tokens_padded,
                 num_reqs=num_reqs_padded,
                 max_query_len=max_query_len,
-- 
GitLab


From c3487aca3425f532730c3433cfbd44e880fce2a8 Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Tue, 9 Dec 2025 18:13:13 -0800
Subject: [PATCH 256/258] [responsesAPI][6] Fix multi turn MCP tokenization
 (#30230)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
---
 tests/entrypoints/test_responses_utils.py | 52 ++++++++++++++++---
 vllm/entrypoints/constants.py             |  2 +
 vllm/entrypoints/context.py               |  6 ++-
 vllm/entrypoints/openai/serving_engine.py |  1 +
 vllm/entrypoints/responses_utils.py       | 62 +++++++++++++++++++++--
 5 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py
index 3951bd484..a52296711 100644
--- a/tests/entrypoints/test_responses_utils.py
+++ b/tests/entrypoints/test_responses_utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
 )
@@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import (
 )
 
 from vllm.entrypoints.responses_utils import (
-    construct_chat_message_with_tool_call,
+    _construct_single_message_from_response_item,
+    construct_chat_messages_with_tool_call,
     convert_tool_responses_to_completions_format,
 )
 
@@ -42,7 +44,43 @@ class TestResponsesUtils:
 
         assert result == {"type": "function", "function": input_tool}
 
-    def test_construct_chat_message_with_tool_call(self):
+    def test_construct_chat_messages_with_tool_call(self):
+        """Test construction of chat messages with tool calls."""
+        reasoning_item = ResponseReasoningItem(
+            id="lol",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Leroy Jenkins",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status=None,
+        )
+        mcp_tool_item = ResponseFunctionToolCall(
+            id="mcp_123",
+            call_id="call_123",
+            type="function_call",
+            status="completed",
+            name="python",
+            arguments='{"code": "123+456"}',
+        )
+        input_items = [reasoning_item, mcp_tool_item]
+        messages = construct_chat_messages_with_tool_call(input_items)
+
+        assert len(messages) == 1
+        message = messages[0]
+        assert message["role"] == "assistant"
+        assert message["reasoning"] == "Leroy Jenkins"
+        assert message["tool_calls"][0]["id"] == "call_123"
+        assert message["tool_calls"][0]["function"]["name"] == "python"
+        assert (
+            message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}'
+        )
+
+    def test_construct_single_message_from_response_item(self):
         item = ResponseReasoningItem(
             id="lol",
             summary=[],
@@ -56,7 +94,7 @@ class TestResponsesUtils:
             encrypted_content=None,
             status=None,
         )
-        formatted_item = construct_chat_message_with_tool_call(item)
+        formatted_item = _construct_single_message_from_response_item(item)
         assert formatted_item["role"] == "assistant"
         assert formatted_item["reasoning"] == "Leroy Jenkins"
 
@@ -74,7 +112,7 @@ class TestResponsesUtils:
             status=None,
         )
 
-        formatted_item = construct_chat_message_with_tool_call(item)
+        formatted_item = _construct_single_message_from_response_item(item)
         assert formatted_item["role"] == "assistant"
         assert (
             formatted_item["reasoning"]
@@ -88,7 +126,7 @@ class TestResponsesUtils:
             output="1234",
             status="completed",
         )
-        formatted_item = construct_chat_message_with_tool_call(tool_call_output)
+        formatted_item = _construct_single_message_from_response_item(tool_call_output)
         assert formatted_item["role"] == "tool"
         assert formatted_item["content"] == "1234"
         assert formatted_item["tool_call_id"] == "temp"
@@ -102,7 +140,7 @@ class TestResponsesUtils:
             status=None,
         )
         with pytest.raises(ValueError):
-            construct_chat_message_with_tool_call(item)
+            _construct_single_message_from_response_item(item)
 
         output_item = ResponseOutputMessage(
             id="msg_bf585bbbe3d500e0",
@@ -119,6 +157,6 @@ class TestResponsesUtils:
             type="message",
         )
 
-        formatted_item = construct_chat_message_with_tool_call(output_item)
+        formatted_item = _construct_single_message_from_response_item(output_item)
         assert formatted_item["role"] == "assistant"
         assert formatted_item["content"] == "dongyi"
diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/constants.py
index b5bcccc35..5726ee073 100644
--- a/vllm/entrypoints/constants.py
+++ b/vllm/entrypoints/constants.py
@@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints.
 # These constants help mitigate header abuse attacks
 H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304  # 4 MB
 H11_MAX_HEADER_COUNT_DEFAULT = 256
+
+MCP_PREFIX = "mcp_"
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 01ddab473..c70eaaa08 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -19,6 +19,7 @@ from vllm import envs
 from vllm.entrypoints.chat_utils import (
     ChatTemplateContentFormatOption,
 )
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.parser.harmony_utils import (
     get_encoding,
     get_streamable_parser_for_assistant,
@@ -303,7 +304,7 @@ class ParsableContext(ConversationContext):
         result_str = result.content[0].text
 
         message = ResponseFunctionToolCallOutputItem(
-            id=f"fco_{random_uuid()}",
+            id=f"mcpo_{random_uuid()}",
             type="function_call_output",
             call_id=f"call_{random_uuid()}",
             output=result_str,
@@ -385,6 +386,9 @@ class ParsableContext(ConversationContext):
         if not self.parser.response_messages:
             return []
         last_msg = self.parser.response_messages[-1]
+        # change this to a mcp_ function call
+        last_msg.id = f"{MCP_PREFIX}{random_uuid()}"
+        self.parser.response_messages[-1] = last_msg
         if last_msg.name == "code_interpreter":
             return await self.call_python_tool(self._tool_sessions["python"], last_msg)
         elif last_msg.name == "web_search_preview":
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 99936f588..44b0f1842 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1339,6 +1339,7 @@ class OpenAIServing:
                 )
                 engine_prompt = engine_prompts[0]
                 request_prompt = request_prompts[0]
+                prompt_text, _, _ = self._get_prompt_components(request_prompt)
 
             # Update the sampling params.
             sampling_params.max_tokens = self.max_model_len - len(
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
index fbc137bac..99080fa43 100644
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
 
 from vllm import envs
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionMessageParam,
     ResponseInputOutputItem,
@@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context(
                 )
             if isinstance(output_messages[-1], ResponseFunctionToolCall):
                 mcp_message = McpCall(
-                    id=f"mcp_{random_uuid()}",
+                    id=f"{MCP_PREFIX}{random_uuid()}",
                     arguments=output_messages[-1].arguments,
                     name=output_messages[-1].name,
                     server_label=output_messages[
                         -1
                     ].name,  # TODO: store the server label
-                    type="mcp_call",
+                    type=f"{MCP_PREFIX}call",
                     status="completed",
                     output=message.output,
                     # TODO: support error output
@@ -98,12 +99,63 @@ def construct_input_messages(
     if isinstance(request_input, str):
         messages.append({"role": "user", "content": request_input})
     else:
-        for item in request_input:
-            messages.append(construct_chat_message_with_tool_call(item))
+        input_messages = construct_chat_messages_with_tool_call(request_input)
+        messages.extend(input_messages)
     return messages
 
 
-def construct_chat_message_with_tool_call(
+def _maybe_combine_reasoning_and_tool_call(
+    item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam]
+) -> ChatCompletionMessageParam | None:
+    """Many models treat MCP calls and reasoning as a single message.
+    This function checks if the last message is a reasoning message and
+    the current message is a tool call"""
+    if not (
+        isinstance(item, ResponseFunctionToolCall) and item.id.startswith(MCP_PREFIX)
+    ):
+        return None
+    if len(messages) == 0:
+        return None
+    last_message = messages[-1]
+    if not (
+        last_message.get("role") == "assistant"
+        and last_message.get("reasoning") is not None
+    ):
+        return None
+
+    last_message["tool_calls"] = [
+        ChatCompletionMessageToolCallParam(
+            id=item.call_id,
+            function=FunctionCallTool(
+                name=item.name,
+                arguments=item.arguments,
+            ),
+            type="function",
+        )
+    ]
+    return last_message
+
+
+def construct_chat_messages_with_tool_call(
+    input_messages: list[ResponseInputOutputItem],
+) -> list[ChatCompletionMessageParam]:
+    """This function wraps _construct_single_message_from_response_item
+    Because some chatMessages come from multiple response items
+    for example a reasoning item and a MCP tool call are two response items
+    but are one chat message
+    """
+    messages: list[ChatCompletionMessageParam] = []
+    for item in input_messages:
+        maybe_combined_message = _maybe_combine_reasoning_and_tool_call(item, messages)
+        if maybe_combined_message is not None:
+            messages[-1] = maybe_combined_message
+        else:
+            messages.append(_construct_single_message_from_response_item(item))
+
+    return messages
+
+
+def _construct_single_message_from_response_item(
     item: ResponseInputOutputItem,
 ) -> ChatCompletionMessageParam:
     if isinstance(item, ResponseFunctionToolCall):
-- 
GitLab


From b75f826fca4febb17a76c12a45d5e315111c7618 Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Tue, 9 Dec 2025 20:28:37 -0600
Subject: [PATCH 257/258] [CI/Build][AMD] Skip quantization kernels tests that
 require CUTLASS or e4m3fn when not supported by platform (#30020)

Signed-off-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
---
 tests/kernels/quantization/test_block_fp8.py    | 17 ++++++++++++++---
 .../quantization/test_cutlass_scaled_mm.py      |  3 +++
 tests/kernels/quantization/test_cutlass_w4a8.py |  3 +++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index d0e4f6554..32c77b9a0 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -54,6 +54,10 @@ def setup_cuda():
     torch.set_default_device("cuda")
 
 
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="This platform supports e4m3fnuz, not e4m3fn.",
+)
 @pytest.mark.parametrize(
     "num_tokens,d,dtype,group_size,seed",
     itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
@@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     torch.manual_seed(seed)
     factor_for_scale = 1e-2
-    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_info = torch.finfo(current_platform.fp8_dtype())
     fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
     A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
-    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
 
     B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
-    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
 
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
@@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     assert rel_diff < 0.001
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform."
+)
 @torch.inference_mode()
 def test_w8a8_block_fp8_cutlass_matmul():
     # Test simple case where weight.shape % 128 != 0,
@@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul():
     assert rel_diff < 0.001
 
 
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="This platform supports e4m3fnuz, not e4m3fn.",
+)
 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
     itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index de595b0a3..bc4744df7 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -15,6 +15,9 @@ from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 
+if not current_platform.is_cuda():
+    pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
+
 MNK_FACTORS = [
     (1, 256, 128),
     (1, 16384, 1024),
diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py
index cccef28f5..8cfc993fe 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
@@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
+if not current_platform.is_cuda():
+    pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
+
 # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
 #  unit tests to a common utility function. Currently the use of
 #  `is_quant_method_supported` conflates kernels with quantization methods
-- 
GitLab


From 7d80c73d4277187d0468f15a22bba959ce853261 Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Tue, 9 Dec 2025 20:35:49 -0600
Subject: [PATCH 258/258] [CI] Reduce Flakiness For
 test_spec_decode.py::test_suffix_decoding_acceptance (#30367)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 tests/v1/e2e/test_spec_decode.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 416b582df..8c904a8cd 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -191,8 +191,8 @@ def test_suffix_decoding_acceptance(
     # Expect the acceptance rate to improve.
     assert first_accept_rate < last_accept_rate
 
-    # Heuristic: expect at least 82.5% acceptance rate at the end.
-    assert last_accept_rate > 0.825
+    # Heuristic: expect at least 80.0% acceptance rate at the end.
+    assert last_accept_rate > 0.80
 
     del spec_llm
     torch.cuda.empty_cache()
-- 
GitLab