Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/v1/attention/test_attention_splitting.py
+++ b/tests/v1/attention/test_attention_splitting.py
@@ -323,6 +323,7 @@ def test_prefill_split_across_ubatches(
        num_tokens,
        batch_spec.batch_size,
        split_point=split_point,
+        num_ubatches=2,
    )
    assert ubatch_slices is not None and len(ubatch_slices) == 2


--- a/tests/v1/attention/test_batch_reordering.py
+++ b/tests/v1/attention/test_batch_reordering.py
@@ -98,6 +98,27 @@ REORDER_TEST_CASES = {
        expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5],
        expected_modified=True,
    ),
+    "new_request_single_token_prefill": ReorderTestCase(
+        requests=[
+            (100, 0),
+            (1, 0),  # New request with only 1 token (STILL prefill)
+            (50, 100),
+            (1, 10),
+        ],
+        # Only index 3 is a true decode (has num_computed_tokens > 0)
+        expected_order=[3, 2, 0, 1],
+        expected_modified=True,
+    ),
+    "multiple_new_requests_single_token_prefill": ReorderTestCase(
+        requests=[
+            (1, 0),  # New prefill (1 token, no computed)
+            (1, 0),  # New prefill (1 token, no computed)
+            (1, 50),
+            (200, 0),
+        ],
+        expected_order=[2, 1, 0, 3],
+        expected_modified=True,
+    ),
 }



--- a/tests/v1/attention/test_chunked_local_attention.py
+++ b/tests/v1/attention/test_chunked_local_attention.py
@@ -172,7 +172,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
    )

    # Call the function
-    result = make_local_attention_virtual_batches(
+    result, _ = make_local_attention_virtual_batches(
        attn_chunk_size, common_attn_metadata, block_size
    )


--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -18,15 +18,15 @@ from tests.v1.attention.utils import (
    try_get_attention_backend,
 )
 from vllm import _custom_ops as ops
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.ops.flashmla import is_flashmla_dense_supported
-from vllm.attention.utils.fa_utils import flash_attn_supports_mla
 from vllm.config.vllm import set_current_vllm_config
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backend import CommonAttentionMetadata
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
 from vllm.v1.attention.backends.mla.common import QueryLenSupport
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 from vllm.v1.kv_cache_interface import FullAttentionSpec

 BACKENDS_TO_TEST = [
@@ -154,12 +154,12 @@ def create_and_prepopulate_kv_cache(
        MLA KV cache tensor
    """
    batch_size = len(kv_c_contexts)
-    seq_lens = common_attn_metadata.seq_lens_cpu
+    seq_lens = common_attn_metadata.seq_lens.cpu()
    query_lens = (
        common_attn_metadata.query_start_loc_cpu[1:]
        - common_attn_metadata.query_start_loc_cpu[:-1]
    )
-    context_lens = common_attn_metadata.num_computed_tokens_cpu
+    context_lens = seq_lens - query_lens
    block_table = common_attn_metadata.block_table_tensor
    slot_mapping = common_attn_metadata.slot_mapping

@@ -394,7 +394,11 @@ def run_attention_backend(
 @pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-R1"])
 @pytest.mark.parametrize("tensor_parallel_size", [1, 4, 8, 16])
 def test_backend_correctness(
-    dist_init, batch_spec_name: str, model: str, tensor_parallel_size: int
+    default_vllm_config,
+    dist_init,
+    batch_spec_name: str,
+    model: str,
+    tensor_parallel_size: int,
 ):
    """
    Test that all backends produce similar outputs to a reference implementation

--- a/tests/v1/attention/test_rocm_attention_backends_selection.py
+++ b/tests/v1/attention/test_rocm_attention_backends_selection.py
@@ -7,8 +7,9 @@ from unittest.mock import MagicMock, patch
 import pytest
 import torch

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import AttentionSelectorConfig

 # ROCm-specific attention backend selection tests
 pytestmark = pytest.mark.skipif(
@@ -94,26 +95,20 @@ def mock_on_gfx9():
            None,
            AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
        ),
-        # Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
-        (
-            {"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
-            None,
-            AttentionBackendEnum.ROCM_ATTN.get_path(),
-        ),
-        # Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
+        # Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
        (
            {"VLLM_ROCM_USE_AITER": "1"},
            "TRITON_ATTN",
            AttentionBackendEnum.TRITON_ATTN.get_path(),
        ),
-        # Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
+        # Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
        # (explicitly disabled)
        (
            {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"},
            None,
            AttentionBackendEnum.TRITON_ATTN.get_path(),
        ),
-        # Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
+        # Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
        (
            {"VLLM_ROCM_USE_AITER": "1"},
            "ROCM_ATTN",
@@ -150,8 +145,7 @@ def test_standard_attention_backend_selection(
    # Get the backend class path
    from vllm.platforms.rocm import RocmPlatform

-    backend_path = RocmPlatform.get_attn_backend_cls(
-        selected_backend=backend_enum,
+    attn_selector_config = AttentionSelectorConfig(
        head_size=128,
        dtype=torch.float16,
        kv_cache_dtype="auto",
@@ -160,6 +154,11 @@ def test_standard_attention_backend_selection(
        has_sink=False,
        use_sparse=False,
    )
+
+    backend_path = RocmPlatform.get_attn_backend_cls(
+        selected_backend=backend_enum, attn_selector_config=attn_selector_config
+    )
+
    assert backend_path == expected_backend_path


@@ -273,8 +272,16 @@ def test_mla_backend_selection(

        if should_raise:
            with pytest.raises(ValueError):
-                RocmPlatform.get_attn_backend_cls(
-                    selected_backend=backend_enum,
+                # Build the selector config once for the call expected to raise.
+                attn_selector_config = AttentionSelectorConfig(
                    head_size=128,
                    dtype=torch.float16,
                    kv_cache_dtype="auto",
@@ -283,9 +290,13 @@ def test_mla_backend_selection(
                    has_sink=False,
                    use_sparse=False,
                )
+                backend_path = RocmPlatform.get_attn_backend_cls(
+                    selected_backend=backend_enum,
+                    attn_selector_config=attn_selector_config,
+                )
+
        else:
-            backend_path = RocmPlatform.get_attn_backend_cls(
-                selected_backend=backend_enum,
+            attn_selector_config = AttentionSelectorConfig(
                head_size=128,
                dtype=torch.float16,
                kv_cache_dtype="auto",
@@ -294,6 +305,11 @@ def test_mla_backend_selection(
                has_sink=False,
                use_sparse=False,
            )
+
+            backend_path = RocmPlatform.get_attn_backend_cls(
+                selected_backend=backend_enum, attn_selector_config=attn_selector_config
+            )
+
            assert backend_path == expected_backend_path


@@ -309,8 +325,7 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
            match="only supported on gfx9",
        ),
    ):
-        RocmPlatform.get_attn_backend_cls(
-            selected_backend=AttentionBackendEnum.ROCM_AITER_FA,
+        attn_selector_config = AttentionSelectorConfig(
            head_size=128,
            dtype=torch.float16,
            kv_cache_dtype="auto",
@@ -320,6 +335,11 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
            use_sparse=False,
        )

+        RocmPlatform.get_attn_backend_cls(
+            selected_backend=AttentionBackendEnum.ROCM_AITER_FA,
+            attn_selector_config=attn_selector_config,
+        )
+

 def test_sparse_not_supported(mock_vllm_config):
    """Test that sparse attention is not supported on ROCm."""
@@ -328,8 +348,7 @@ def test_sparse_not_supported(mock_vllm_config):
    with pytest.raises(
        AssertionError, match="Sparse MLA backend on ROCm only supports block size 1"
    ):
-        RocmPlatform.get_attn_backend_cls(
-            selected_backend=None,
+        attn_selector_config = AttentionSelectorConfig(
            head_size=128,
            dtype=torch.float16,
            kv_cache_dtype="auto",
@@ -338,3 +357,7 @@ def test_sparse_not_supported(mock_vllm_config):
            has_sink=False,
            use_sparse=True,
        )
+
+        RocmPlatform.get_attn_backend_cls(
+            selected_backend=None, attn_selector_config=attn_selector_config
+        )
--- a/tests/v1/attention/test_sparse_mla_backends.py
+++ b/tests/v1/attention/test_sparse_mla_backends.py
@@ -22,15 +22,16 @@ from tests.v1.attention.utils import (
    create_vllm_config,
 )
 from vllm import _custom_ops as ops
-from vllm.attention.ops import flashmla
 from vllm.config import set_current_vllm_config
 from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mla.flashmla_sparse import (
    FlashMLASparseBackend,
    triton_convert_req_index_to_global_index,
 )
 from vllm.v1.attention.backends.utils import split_prefill_chunks
+from vllm.v1.attention.ops import flashmla
 from ...utils import models_path_prefix

 SPARSE_BACKEND_BATCH_SPECS = {
@@ -125,8 +126,16 @@ def _quantize_dequantize_fp8_ds_mla(
    reason="FlashMLASparseBackend requires CUDA 9.0 or higher",
 )
 def test_sparse_backend_decode_correctness(
-    dist_init, batch_name, kv_cache_dtype, tensor_parallel_size, workspace_init
+    default_vllm_config,
+    dist_init,
+    batch_name,
+    kv_cache_dtype,
+    tensor_parallel_size,
+    workspace_init,
 ):
+    if current_platform.is_rocm():
+        pytest.skip("ROCm does not support fp8_ds_mla data type for kv cache.")
+
    if not torch.cuda.is_available():
        pytest.skip("CUDA is required for sparse MLA decode test")

@@ -295,7 +304,7 @@ def test_sparse_backend_decode_correctness(
    positions = np.arange(starts[-1], dtype=np.int32) - np.repeat(
        starts[:-1], seg_lengths
    )
-    seq_lengths = np.asarray(common_attn_metadata.seq_lens_cpu, dtype=np.int32)
+    seq_lengths = np.asarray(common_attn_metadata.seq_lens.cpu(), dtype=np.int32)
    prefix_lengths = seq_lengths - seg_lengths
    positions += np.repeat(prefix_lengths, seg_lengths)


--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -7,8 +7,6 @@ from dataclasses import dataclass
 import pytest
 import torch

-from vllm.attention.backends.abstract import AttentionImpl
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
    CacheConfig,
    CompilationConfig,
@@ -20,10 +18,12 @@ from vllm.config import (
    VllmConfig,
 )
 from vllm.config.model import ModelDType
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionImpl,
    AttentionMetadataBuilder,
    CommonAttentionMetadata,
 )
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import FullAttentionSpec


@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
 @dataclass
 class BackendConfig:
+    """Named bundle of attention and compilation settings for one backend."""
+
    name: str
-    env_vars: dict
-    comp_config: dict  # compilation config
+    # Attention options keyed by name, e.g. {"backend": "FLASH_ATTN"}.
+    attention_config: dict
+    # Compilation options, e.g. {"cudagraph_mode": "FULL"}.
+    comp_config: dict
+    # NOTE(review): presumably restricts the config to one GPU architecture;
+    # no value is visible in this view - confirm against the config table.
    specific_gpu_arch: tuple | None = None


@@ -259,10 +259,10 @@ full_cg_backend_configs = {
    # FA3 on Hopper
    "FA3": BackendConfig(
        name="FA3",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "3",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 3,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
        },
        comp_config={
            "cudagraph_mode": "FULL",
@@ -272,9 +272,7 @@ full_cg_backend_configs = {
    # FlashMLA on Hopper
    "FlashMLA": BackendConfig(
        name="FlashMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-        },
+        attention_config={"backend": "FLASHMLA"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
@@ -283,9 +281,7 @@ full_cg_backend_configs = {
    # Cutlass MLA on Blackwell
    "CutlassMLA": BackendConfig(
        name="CutlassMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
-        },
+        attention_config={"backend": "CUTLASS_MLA"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
@@ -294,9 +290,7 @@ full_cg_backend_configs = {
    # FlashInfer MLA on Blackwell
    "FlashInferMLA": BackendConfig(
        name="FlashInferMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
-        },
+        attention_config={"backend": "FLASHINFER_MLA"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
@@ -305,9 +299,9 @@ full_cg_backend_configs = {
    # FlashAttention MLA on Hopper
    "FlashAttentionMLA": BackendConfig(
        name="FlashAttentionMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN_MLA",
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
        },
        comp_config={
            "cudagraph_mode": "FULL_DECODE_ONLY",
@@ -317,10 +311,10 @@ full_cg_backend_configs = {
    # FA2
    "FA2": BackendConfig(
        name="FA2",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "2",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 2,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
        },
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
@@ -329,7 +323,7 @@ full_cg_backend_configs = {
    # Triton Attention
    "TritonAttn": BackendConfig(
        name="TritonAttn",
-        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+        attention_config={"backend": "TRITON_ATTN"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
@@ -337,14 +331,17 @@ full_cg_backend_configs = {
    # FlashInfer
    "FlashInfer": BackendConfig(
        name="FlashInfer",
-        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+        attention_config={"backend": "FLASHINFER"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
    ),
    "RocmAttn": BackendConfig(
        name="RocmAttn",
-        env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
+        attention_config={
+            "backend": "ROCM_ATTN",
+            "use_prefill_decode_attention": True,
+        },
        comp_config={
            "cudagraph_mode": "FULL",
        },

--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1800,3 +1800,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
        )
    )
    assert block_hashes[1] == expected_hash2
+
+
+def test_auto_fit_max_model_len():
+    """Check that a max_model_len of -1 is auto-fitted to the KV cache memory."""
+
+    def make_auto_fit_config() -> VllmConfig:
+        # A user-supplied -1 is simulated by overriding original_max_model_len
+        # on a config that was built with max_model_len=1024.
+        fresh_model_config = ModelConfig(max_model_len=1024)
+        fresh_model_config.original_max_model_len = -1
+        return VllmConfig(model_config=fresh_model_config)
+
+    vllm_config = make_auto_fit_config()
+
+    bytes_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
+    layer_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+
+    # Case 1: plenty of memory, so max_model_len keeps its value of 1024.
+    ample_memory = bytes_per_block_per_layer * 2 * 1024
+    get_kv_cache_configs(vllm_config, [layer_specs], [ample_memory])
+    assert vllm_config.model_config.max_model_len == 1024
+
+    # Case 2: start from a fresh config and offer only 32 blocks' worth of
+    # memory for the 2 layers (32 * 16 = 512 tokens), which cannot hold 1024.
+    vllm_config = make_auto_fit_config()
+    scarce_memory = bytes_per_block_per_layer * 2 * 32
+    get_kv_cache_configs(vllm_config, [layer_specs], [scarce_memory])
+
+    # The limit must have shrunk, but must stay positive.
+    fitted_len = vllm_config.model_config.max_model_len
+    assert fitted_len < 1024
+    assert fitted_len > 0
+
+
+def test_auto_fit_max_model_len_not_triggered():
+    """Check that max_model_len is kept when original_max_model_len is not -1."""
+    # original_max_model_len is left at its default here (expected to be None,
+    # not -1), so the auto-fit path should not run.
+    vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
+
+    bytes_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    layer_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+    available_memory = bytes_per_block_per_layer * 2 * 32
+
+    # Plain KV cache configuration; the configured limit must survive it.
+    get_kv_cache_configs(vllm_config, [layer_specs], [available_memory])
+    assert vllm_config.model_config.max_model_len == 16
--- a/tests/v1/core/test_kv_sharing.py
+++ b/tests/v1/core/test_kv_sharing.py
@@ -11,7 +11,9 @@ pytestmark = pytest.mark.cpu_test


 def new_kv_cache_spec():
-    return FullAttentionSpec(16, 1, 1, torch.float32, False)
+    """Return a minimal float32 FullAttentionSpec with a block size of 16."""
+    spec_kwargs = dict(
+        block_size=16,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+    )
+    return FullAttentionSpec(**spec_kwargs)


 def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():

--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -35,6 +35,7 @@ from vllm.v1.kv_cache_interface import (
    FullAttentionSpec,
    KVCacheConfig,
    KVCacheGroupSpec,
+    MambaSpec,
    SlidingWindowSpec,
 )

@@ -94,35 +95,105 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer"],
-                FullAttentionSpec(block_size, 1, 1, torch.float32),
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
            )
        ],
    )


 def make_kv_cache_config_hybrid_model(
-    block_size: int, num_blocks: int
+    block_size: int, num_blocks: int, second_spec_type: str = "sliding_window"
 ) -> KVCacheConfig:
+    """
+    Build a three-group KVCacheConfig: one full-attention group plus two
+    groups that share a single secondary spec.
+
+    Args:
+        block_size: Block size used by every spec.
+        num_blocks: Number of blocks in the KV cache.
+        second_spec_type: Spec used for "layer2" and "layer3"; either
+            "sliding_window" (window of 2 * block_size) or "mamba".
+
+    Raises:
+        ValueError: If second_spec_type is not one of the supported names.
+    """
+    if second_spec_type == "sliding_window":
+        second_spec = SlidingWindowSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+            sliding_window=2 * block_size,
+        )
+    elif second_spec_type == "mamba":
+        second_spec = MambaSpec(
+            block_size=block_size,
+            shapes=(1, 1),
+            dtypes=(torch.float32,),
+        )
+    else:
+        # Fail loudly here instead of hitting an unbound `second_spec` below.
+        raise ValueError(f"Unsupported second_spec_type: {second_spec_type!r}")
+
    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer1"],
-                FullAttentionSpec(block_size, 1, 1, torch.float32),
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
            ),
            KVCacheGroupSpec(
                ["layer2"],
-                SlidingWindowSpec(
-                    block_size, 1, 1, torch.float32, sliding_window=2 * block_size
-                ),
+                second_spec,
            ),
            KVCacheGroupSpec(
                ["layer3"],
+                second_spec,
+            ),
+        ],
+    )
+
+
+def make_kv_cache_config_three_types(
+    block_size: int, num_blocks: int, third_spec_type: str = "mamba"
+) -> KVCacheConfig:
+    """
+    Build a three-group KVCacheConfig: full attention, a sliding window of
+    2 * block_size, and a selectable third spec.
+
+    Args:
+        block_size: Block size used by every spec.
+        num_blocks: Number of blocks in the KV cache.
+        third_spec_type: Spec used for "layer3"; either "mamba" or
+            "sliding_window" (window of 4 * block_size).
+
+    Raises:
+        ValueError: If third_spec_type is not one of the supported names.
+    """
+    if third_spec_type == "mamba":
+        third_spec = MambaSpec(
+            block_size=block_size,
+            shapes=(1, 1),
+            dtypes=(torch.float32,),
+        )
+    elif third_spec_type == "sliding_window":
+        third_spec = SlidingWindowSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+            sliding_window=4 * block_size,
+        )
+    else:
+        # Fail loudly here instead of hitting an unbound `third_spec` below.
+        raise ValueError(f"Unsupported third_spec_type: {third_spec_type!r}")
+
+    return KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec(
+                ["layer1"],
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
+            ),
+            KVCacheGroupSpec(
+                ["layer2"],
                SlidingWindowSpec(
-                    block_size, 1, 1, torch.float32, sliding_window=2 * block_size
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                    sliding_window=2 * block_size,
                ),
            ),
+            KVCacheGroupSpec(
+                ["layer3"],
+                third_spec,
+            ),
        ],
    )

@@ -406,6 +477,184 @@ def test_prefill_hybrid_model():
    )


+def _make_hybrid_kv_cache_config(
+    block_size: int, num_blocks: int, spec_types: list[str]
+) -> KVCacheConfig:
+    """
+    Build a KVCacheConfig with one single-layer group per requested spec type.
+
+    Group ``i`` is named ``layer{i}`` and receives a freshly constructed spec,
+    so a type name that appears twice yields two separate spec objects.
+
+    Args:
+        block_size: The block size for KV cache.
+        num_blocks: The number of blocks in the KV cache.
+        spec_types: Spec type names, one per group. Supported names:
+            - "full": FullAttentionSpec
+            - "sliding_window": SlidingWindowSpec with window=2*block_size
+            - "sliding_window_large": SlidingWindowSpec with window=4*block_size
+            - "mamba": MambaSpec
+
+    Raises:
+        KeyError: If a name in ``spec_types`` is not supported.
+    """
+
+    def _full_spec():
+        return FullAttentionSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+        )
+
+    def _sliding_spec(window_in_blocks: int):
+        return SlidingWindowSpec(
+            block_size=block_size,
+            num_kv_heads=1,
+            head_size=1,
+            dtype=torch.float32,
+            sliding_window=window_in_blocks * block_size,
+        )
+
+    def _mamba_spec():
+        return MambaSpec(
+            block_size=block_size,
+            shapes=(1, 1),
+            dtypes=(torch.float32,),
+        )
+
+    # Builders are only invoked on lookup, so each group gets its own spec.
+    builders = {
+        "full": _full_spec,
+        "sliding_window": lambda: _sliding_spec(2),
+        "sliding_window_large": lambda: _sliding_spec(4),
+        "mamba": _mamba_spec,
+    }
+
+    groups = []
+    for index, spec_name in enumerate(spec_types):
+        groups.append(KVCacheGroupSpec([f"layer{index}"], builders[spec_name]()))
+
+    return KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=[],
+        kv_cache_groups=groups,
+    )
+
+
+# Test cases covering various combinations of KV cache spec types:
+# - Varying number of groups (2, 3, or 4)
+# - 0, 1, or 2 full attention groups
+# - Sliding window with different window sizes
+# - Interleaved group IDs (full attn and other types mixed)
+# - Mamba spec combinations
+# Every entry must be a distinct spec list; identical lists only re-run the
+# same test under a different id.
+_HYBRID_MODEL_TEST_CASES = [
+    # 2 groups: 1 full + 1 other
+    pytest.param(["full", "sliding_window"], id="2g-full+sw"),
+    pytest.param(["full", "mamba"], id="2g-full+mamba"),
+    # 2 groups: 0 full (all other types)
+    pytest.param(["sliding_window", "mamba"], id="2g-sw+mamba"),
+    pytest.param(["sliding_window", "sliding_window_large"], id="2g-sw+sw_large"),
+    # 3 groups: 1 full + 2 others (same type)
+    pytest.param(["full", "sliding_window", "sliding_window"], id="3g-full+2sw"),
+    pytest.param(["full", "mamba", "mamba"], id="3g-full+2mamba"),
+    # 3 groups: 1 full + 2 others (different types)
+    pytest.param(["full", "sliding_window", "mamba"], id="3g-full+sw+mamba"),
+    pytest.param(
+        ["full", "sliding_window", "sliding_window_large"],
+        id="3g-full+sw+sw_large",
+    ),
+    # 3 groups: 2 full + 1 other
+    pytest.param(["full", "full", "sliding_window"], id="3g-2full+sw"),
+    pytest.param(["full", "full", "mamba"], id="3g-2full+mamba"),
+    # 4 groups: interleaved (full, other, full, other); the sliding-window
+    # case also mixes the two window sizes
+    pytest.param(
+        ["full", "sliding_window", "full", "sliding_window_large"],
+        id="4g-interleaved-full+sw+sw_large",
+    ),
+    pytest.param(
+        ["full", "mamba", "full", "mamba"],
+        id="4g-interleaved-full+mamba",
+    ),
+    # 4 groups: 0 full (all other types)
+    pytest.param(
+        ["sliding_window", "mamba", "sliding_window_large", "mamba"],
+        id="4g-sw+mamba+sw_large+mamba",
+    ),
+    # 4 groups: 2 full + 2 others (grouped)
+    pytest.param(
+        ["full", "full", "sliding_window", "mamba"],
+        id="4g-2full+sw+mamba",
+    ),
+]
+
+
+@pytest.mark.parametrize("spec_types", _HYBRID_MODEL_TEST_CASES)
+def test_prefill_hybrid_model_combinations(spec_types: list[str]):
+    """
+    Exercise prefix caching on a hybrid model built from ``spec_types``.
+
+    A first request fills three full blocks plus a partial one. A second
+    request sharing those three blocks must then report three blocks' worth of
+    computed tokens and one block list per KV cache group.
+
+    The parametrization covers:
+    - Various combinations (full attn + other attn types)
+    - Varying number of groups (2, 3, or 4)
+    - 0, 1, or 2 full attention groups in the combination
+    - Two sliding_window attn groups with different window sizes
+    - Interleaved group IDs (full attn and other types alternating)
+    - Mamba spec with other attention types
+    """
+    block_size = 16
+    group_count = len(spec_types)
+
+    # Ten blocks per KV cache group.
+    manager = KVCacheManager(
+        _make_hybrid_kv_cache_config(block_size, 10 * group_count, spec_types),
+        max_model_len=8192,
+        enable_caching=True,
+        hash_block_size=block_size,
+    )
+
+    # Three complete blocks (48 tokens) shared by both requests.
+    shared_prefix: list[int] = []
+    for token in range(3):
+        shared_prefix.extend([token] * block_size)
+    first_prompt = shared_prefix + [3] * 7
+    second_prompt = shared_prefix + [4] * 5
+
+    # Request 0 starts from a cold cache.
+    req0 = make_request("0", first_prompt, block_size, sha256)
+    hit_blocks, hit_tokens = manager.get_computed_blocks(req0)
+
+    assert len(req0.block_hashes) == 3
+    assert not hit_blocks.blocks[0]  # nothing cached yet
+    assert hit_tokens == 0
+
+    allocated = manager.allocate_slots(
+        req0, 55, len(hit_blocks.blocks[0]) * block_size, hit_blocks
+    )
+    assert allocated is not None
+    # One block-id list per group.
+    assert len(allocated.get_block_ids()) == group_count
+
+    # Request 1 shares the prefix and must hit the cache in every group.
+    req1 = make_request("1", second_prompt, block_size, sha256)
+    hit_blocks, hit_tokens = manager.get_computed_blocks(req1)
+
+    assert hit_tokens == 3 * block_size
+    assert len(hit_blocks.blocks) == group_count
+
+    # Only the tokens beyond the cached prefix still need slots.
+    allocated = manager.allocate_slots(
+        req1,
+        len(second_prompt) - hit_tokens,
+        hit_tokens,
+        hit_blocks,
+    )
+    assert allocated is not None
+    assert len(allocated.get_block_ids()) == group_count
+
+    for finished in (req0, req1):
+        manager.free(finished)
+
+
 def test_prefill_plp():
    """Test prefill with APC and some prompt logprobs (plp) requests.

@@ -1356,6 +1605,69 @@ def test_kv_cache_events(blocks_to_cache: int):
    assert len(manager.block_pool.cached_block_hash_to_block) == 0


+def test_null_parent_block_hash():
+    """
+    Check that a BlockStored event carries the logical parent hash when the
+    physical parent block is the hash-less null block.
+    """
+    block_size = 1
+    num_cached_blocks = 2
+    num_full_blocks = 4
+
+    pool = BlockPool(
+        num_gpu_blocks=8,
+        enable_caching=True,
+        hash_block_size=block_size,
+        enable_kv_cache_events=True,
+    )
+
+    req = make_request(
+        "req_null_parent",
+        prompt_token_ids=[10, 11, 12, 13],
+        block_size=block_size,
+        hash_fn=sha256,
+    )
+    assert len(req.block_hashes) == num_full_blocks
+
+    # Physical parent is `null_block` (no hash), while the logical parent hash
+    # still exists in `request.block_hashes[num_cached_blocks - 1]`.
+    assert pool.null_block.block_hash is None
+    new_blocks = pool.get_new_blocks(num_full_blocks - 1)
+    # Both slices are unpacked so `blocks` is a flat list of blocks; without
+    # the leading `*` the first entry would be a nested list.
+    blocks = [
+        *new_blocks[: num_cached_blocks - 1],
+        pool.null_block,  # physical parent
+        *new_blocks[num_cached_blocks - 1 :],
+    ]
+
+    pool.cache_full_blocks(
+        request=req,
+        blocks=blocks,
+        num_cached_blocks=num_cached_blocks,
+        num_full_blocks=num_full_blocks,
+        block_size=block_size,
+        kv_cache_group_id=0,
+    )
+
+    events = pool.take_events()
+    assert len(events) == 1
+    event = events[0]
+    assert isinstance(event, BlockStored)
+
+    expected_parent = kv_cache_utils.maybe_convert_block_hash(
+        req.block_hashes[num_cached_blocks - 1]
+    )
+    assert event.parent_block_hash == expected_parent
+    assert event.parent_block_hash is not None
+
+    expected_new_hashes = [
+        kv_cache_utils.maybe_convert_block_hash(h)
+        for h in req.block_hashes[num_cached_blocks:num_full_blocks]
+    ]
+    assert event.block_hashes == expected_new_hashes
+
+    # Ensure we didn't accidentally assign a hash to the null block.
+    assert pool.null_block.block_hash is None
+    # Sanity check: newly cached physical blocks should have hashes assigned.
+    assert blocks[num_cached_blocks].block_hash is not None
+    assert blocks[num_full_blocks - 1].block_hash is not None
+
+
 @pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
 def test_kv_cache_events_with_lora(blocks_to_cache: int):
    """Test BlockStored events contain correct lora_id when using LoRA requests."""
@@ -1553,15 +1865,20 @@ def test_different_block_size():
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer1"],
-                FullAttentionSpec(block_size * 2, 1, 1, torch.float16),
+                FullAttentionSpec(
+                    block_size=block_size * 2,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float16,
+                ),
            ),
            KVCacheGroupSpec(
                ["layer2"],
                SlidingWindowSpec(
-                    block_size,
-                    1,
-                    1,
-                    torch.float32,
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
                    sliding_window=2 * block_size,
                ),
            ),

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1264,10 +1264,11 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
    assert len(scheduler.waiting) == 0


+@pytest.mark.parametrize("is_async", [False, True])
 @pytest.mark.parametrize(
    "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
 )
-def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
+def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role):
    """
    Test whether scheduler with KVConnector is able to handle
    unable to allocate (run out of blocks in allocate_slots().
@@ -1280,7 +1281,9 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
    NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
    scheduler = create_scheduler(
        enable_prefix_caching=True,
-        use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False),
+        use_kv_connector=mock_kv(
+            matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async
+        ),
        block_size=BLOCK_SIZE,
        num_blocks=NUM_BLOCKS,
        # encoder connector should not affect test results
@@ -1318,6 +1321,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):

    # All can be scheduled - 1st token.
    output = scheduler.schedule()
+    if is_async:
+        assert len(scheduler.waiting) == 2
+        assert scheduler.running == []
+        _step_until_kv_transfer_finished(scheduler, req_ids)
+        output = scheduler.schedule()
+
    _assert_right_scheduler_output(
        output,
        # 2 remote kv cache hits.
@@ -1370,6 +1379,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
    # Restarts the preempted request - generate 3rd token.
    # This will have a local and remote cache hit.
    output = scheduler.schedule()
+    if is_async:
+        waiting_req_ids = [req.request_id for req in scheduler.waiting]
+        assert len(waiting_req_ids) == 1
+        _step_until_kv_transfer_finished(scheduler, waiting_req_ids)
+        output = scheduler.schedule()
+
    _assert_right_scheduler_output(
        output,
        # 1 remote kv_cache hit!
@@ -1380,6 +1395,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
    )
    assert len(scheduler.running) == 1
    assert len(scheduler.waiting) == 0
+    assert output.scheduled_cached_reqs.num_reqs == 1
+    assert output.scheduled_new_reqs == []
    _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
    assert len(scheduler.running) == 1
    assert len(scheduler.waiting) == 0
@@ -1392,6 +1409,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
        num_requests=0,
        expected_num_scheduled_tokens=1,
    )
+    assert output.scheduled_cached_reqs.num_reqs == 1
+    assert output.scheduled_new_reqs == []
    assert len(scheduler.running) == 1
    _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
    assert len(scheduler.running) == 0
@@ -1577,7 +1596,13 @@ def create_scheduler_with_priority(
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
-                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+                ["layer"],
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
            )
        ],
    )
@@ -2288,7 +2313,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
    # 4th Schedule - this should trigger the resumption
    output = scheduler.schedule()
    scheduled_cached_reqs = output.scheduled_cached_reqs
-    resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption

    assert len(output.scheduled_new_reqs) == 0
    assert scheduled_cached_reqs.num_reqs == 1
@@ -2296,14 +2320,14 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
    assert len(scheduler.running) == 1

    # Preempted request resumed in scheduled_cached_reqs
-    assert len(resumed_from_preemption) == 1
-    assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
-    assert resumed_from_preemption[0]
+    assert len(scheduled_cached_reqs.resumed_req_ids) == 1
+    assert len(scheduled_cached_reqs.all_token_ids) == 1
    assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
-    assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
+    assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids
+    assert request_low.request_id in scheduled_cached_reqs.all_token_ids
    # Resumed tokens include 30 prompt tokens and 2 decoded tokens
-    assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 32
-    assert scheduled_cached_reqs.resumed_req_token_ids[0][31] == 100
+    assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 32
+    assert scheduled_cached_reqs.all_token_ids[request_low.request_id][31] == 100


 @pytest.mark.parametrize(
@@ -3126,7 +3150,6 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
    # 4th Schedule - this should trigger req_low resumption from waiting
    output = scheduler.schedule()
    scheduled_cached_reqs = output.scheduled_cached_reqs
-    resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption

    assert len(output.scheduled_new_reqs) == 0
    assert scheduled_cached_reqs.num_reqs == 1
@@ -3134,14 +3157,14 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
    assert len(scheduler.running) == 1

    # Preempted request resumed in scheduled_cached_reqs
-    assert len(resumed_from_preemption) == 1
-    assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
-    assert resumed_from_preemption[0]
+    assert len(scheduled_cached_reqs.resumed_req_ids) == 1
+    assert len(scheduled_cached_reqs.all_token_ids) == 1
    assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
-    assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
+    assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids
+    assert request_low.request_id in scheduled_cached_reqs.all_token_ids
    ## Resumed tokens include 94 prompt tokens and 2 decoded tokens
-    assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 96
-    assert scheduled_cached_reqs.resumed_req_token_ids[0][95] == 100
+    assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 96
+    assert scheduled_cached_reqs.all_token_ids[request_low.request_id][95] == 100
    assert scheduler.running[0].request_id == request_low.request_id
    assert request_high.request_id in output.finished_req_ids

@@ -3330,3 +3353,28 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
 # ==============================================================================
 # EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests end
 # ==============================================================================
+
+
+def test_prepend_skipped_requests_order():
+    scheduler = create_scheduler(max_num_seqs=1, use_kv_connector=True)
+    requests = create_requests(num_requests=4)
+    for request in requests:
+        scheduler.add_request(request)
+
+    # 4 requests waiting, capture their order
+    expected_waiting_reqs = list(scheduler.waiting)
+
+    # simulate that the first 2 waiting requests are waiting for remote KVs
+    for req in expected_waiting_reqs[:2]:
+        req.status = RequestStatus.WAITING_FOR_REMOTE_KVS
+
+    # schedule step
+    # expect the first 2 waiting requests to be skipped, the third to run,
+    # and the fourth to remain waiting
+    scheduler.schedule()
+
+    # pop the third request which is expected to be running
+    expected_waiting_reqs.pop(2)
+
+    # verify waiting order is preserved
+    assert list(scheduler.waiting) == expected_waiting_reqs
--- a/tests/v1/core/test_single_type_kv_cache_manager.py
+++ b/tests/v1/core/test_single_type_kv_cache_manager.py
@@ -21,13 +21,23 @@ from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowS
 pytestmark = pytest.mark.cpu_test


-def get_sliding_window_manager(sliding_window_spec, block_pool):
-    return SlidingWindowManager(sliding_window_spec, block_pool, kv_cache_group_id=0)
+def get_sliding_window_manager(sliding_window_spec, block_pool, enable_caching=True):
+    return SlidingWindowManager(
+        sliding_window_spec,
+        block_pool,
+        enable_caching=enable_caching,
+        kv_cache_group_id=0,
+    )


-def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool):
+def get_chunked_local_attention_manager(
+    chunked_local_attention_spec, block_pool, enable_caching=True
+):
    return ChunkedLocalAttentionManager(
-        chunked_local_attention_spec, block_pool, kv_cache_group_id=0
+        chunked_local_attention_spec,
+        block_pool,
+        enable_caching=enable_caching,
+        kv_cache_group_id=0,
    )


@@ -332,11 +342,53 @@ def test_get_num_blocks_to_allocate():
    ]

    assert (
-        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
+        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
+        == 20
    )
    assert (
-        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
+        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
+        == 15
+    )
+
+
+def test_evictable_cached_blocks_not_double_allocated():
+    block_size = 2
+    sliding_window_length = 2 * block_size
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=sliding_window_length,
+    )
+
+    block_pool = BlockPool(
+        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
+    )
+    manager = get_sliding_window_manager(sliding_window_spec, block_pool)
+
+    request_id = "req"
+    evictable_block = block_pool.blocks[1]  # ref_cnt == 0, eviction candidate
+
+    num_blocks_to_allocate = manager.get_num_blocks_to_allocate(
+        request_id=request_id,
+        num_tokens=2 * block_size,
+        new_computed_blocks=[evictable_block],
+        total_computed_tokens=block_size,
+    )
+    # Free capacity check should count evictable cached blocks, but allocation
+    # should only allocate the truly new block.
+    assert num_blocks_to_allocate == 2
+
+    manager.allocate_new_computed_blocks(
+        request_id,
+        [evictable_block],
+        num_local_computed_tokens=block_size,
+        num_external_computed_tokens=0,
    )
+    new_blocks = manager.allocate_new_blocks(request_id, num_tokens=4)
+    assert len(new_blocks) == 1
+    assert len(manager.req_to_blocks[request_id]) == 2


 def test_chunked_local_attention_get_num_blocks_to_allocate():
@@ -359,8 +411,10 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
    ]

    assert (
-        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
+        manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
+        == 20
    )
    assert (
-        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
+        manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
+        == 15
    )
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -142,7 +142,13 @@ def create_scheduler(
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
-                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
+                ["layer"],
+                FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=1,
+                    head_size=1,
+                    dtype=torch.float32,
+                ),
            )
        ],
    )

--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -49,7 +49,10 @@ def _create_vllm_config(
        mock_config.lora_config = None
    # Mimic the behavior of VllmConfig.__post_init__()
    if compilation_config.mode == CompilationMode.VLLM_COMPILE:
-        compilation_config.set_splitting_ops_for_v1()
+        compilation_config.set_splitting_ops_for_v1(
+            all2all_backend=mock_config.parallel_config.all2all_backend,
+            data_parallel_size=mock_config.parallel_config.data_parallel_size,
+        )

    # mimic VllmConfig.__post_init__
    if compilation_config.cudagraph_capture_sizes:

--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import contextlib
-import os
 import weakref
 from contextlib import ExitStack

@@ -13,26 +11,6 @@ from vllm import LLM
 from vllm.config import CompilationConfig, CompilationMode
 from vllm.platforms import current_platform

-
-@contextlib.contextmanager
-def temporary_environ(env_vars):
-    """
-    Temporarily set environment variables and restore them afterward.
-    We have to do this vs monkeypatch because monkeypatch doesn't work
-    with "module" scoped fixtures.
-    """
-    original_env = {k: os.environ.get(k) for k in env_vars}
-    try:
-        os.environ.update(env_vars)
-        yield
-    finally:
-        for k, v in original_env.items():
-            if v is None:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v
-
-
 # test attention backend and cudagraph_mode combo
 # (backend_name, cudagraph_mode, supported)
 if current_platform.is_rocm():
@@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
    ):
        pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")

-    env_vars = backend_configs[backend_name].env_vars
+    attention_config = backend_config.attention_config

-    with temporary_environ(env_vars), ExitStack() as stack:
+    with ExitStack() as stack:
        if not supported:
            stack.enter_context(pytest.raises(Exception))

@@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
            trust_remote_code=True,
            gpu_memory_utilization=0.45,
            max_model_len=1024,
+            attention_config=attention_config,
            compilation_config=CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
            ),
@@ -122,9 +101,10 @@ combo_cases_2 = [
 def test_cudagraph_compilation_combo(
    backend_name, cudagraph_mode, compilation_mode, supported
 ):
-    env_vars = backend_configs[backend_name].env_vars
+    backend_config = backend_configs[backend_name]
+    attention_config = backend_config.attention_config

-    with temporary_environ(env_vars), ExitStack() as stack:
+    with ExitStack() as stack:
        if not supported:
            stack.enter_context(pytest.raises(Exception))

@@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo(
            trust_remote_code=True,
            gpu_memory_utilization=0.45,
            max_model_len=1024,
+            attention_config=attention_config,
            compilation_config=CompilationConfig(
                mode=compilation_mode, cudagraph_mode=cudagraph_mode
            ),

--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
    BACKENDS,
 )
 def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
    """
    Ensures that the same request (the 'needle' prompt) yields identical output
@@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
    random.seed(seed)

-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+    attention_config = {"backend": backend}
    # Allow overrides from environment (useful for CI tuning)
    # "facebook/opt-125m" is too small, doesn't reliably test determinism
    model = resolve_model_name(backend)
@@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
            max_num_seqs=max_batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
+            attention_config=attention_config,
        )

        # Baseline generation for the needle prompt alone.
@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
            max_num_seqs=max_batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
+            attention_config=attention_config,
        )

        mismatches = 0
@@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
    BACKENDS,
 )
 def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
    random.seed(seed)
    model_name = resolve_model_name(backend)
@@ -188,12 +188,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
    llm = LLM(
        model=model_name,
        tensor_parallel_size=tp_size,
-        # enable_prefix_caching=False,
        max_num_seqs=32,
        max_model_len=8192,
        dtype="bfloat16",  # not everything is supported
        gpu_memory_utilization=0.9,
        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
    )

    # Use more realistic prompts for better token generation
@@ -382,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
    "backend",
    BACKENDS,
 )
-def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
+def test_simple_generation(backend):
    """
    Simple test that runs the model with a basic prompt and prints the output.
    Useful for quick smoke testing and debugging.
    """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
    model = resolve_model_name(backend)

    llm = LLM(
@@ -399,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
        dtype="bfloat16",
        enable_prefix_caching=False,
        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
    )

    prompt = "the capital of france is"
@@ -445,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail(
    The test will PASS if we detect differences (proving batch invariance matters).
    The test will FAIL if everything matches (suggesting batch invariance isn't needed).
    """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
    # CRITICAL: Disable batch invariance for this test
    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
    monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False)
@@ -466,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
        max_model_len=8192,
        dtype="bfloat16",
        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
    )

    # build ragged prompts to change shapes significantly across BS=1 vs BS=N
@@ -650,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail(
 @skip_unsupported
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 def test_decode_logprobs_match_prefill_logprobs(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
    """
    Test that verifies decode logprobs match prefill logprobs.
@@ -665,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs(
    This ensures that the logprobs from decode are consistent with what
    we would get if we ran prefill on each prefix.
    """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
    random.seed(seed)
    model_name = resolve_model_name(backend)
@@ -690,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs(
        max_model_len=8192,
        dtype="bfloat16",
        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
    )

    # Use a few test prompts
@@ -921,6 +919,7 @@ def LLM_with_max_seqs(
    max_num_seqs: int,
    gpu_memory_utilization: float,
    max_model_len: int,
+    attention_config: dict | None = None,
 ) -> LLM:
    """
    Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
@@ -935,6 +934,7 @@ def LLM_with_max_seqs(
        tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
        enable_prefix_caching=False,
        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config=attention_config,
        # Enable for MOE models
        # enable_expert_parallel=True,
    )
--- a/tests/v1/determinism/test_online_batch_invariance.py
+++ b/tests/v1/determinism/test_online_batch_invariance.py
@@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process(
 @skip_unsupported
 @pytest.mark.parametrize("backend", BACKENDS)
 def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
-    backend: str, monkeypatch: pytest.MonkeyPatch
+    backend: str,
 ) -> None:
    random.seed(int(os.getenv("VLLM_TEST_SEED", "12345")))
-    # Override backend for this test (and the RemoteOpenAIServer child process).
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
    model_name = resolve_model_name(backend)
    prompts_all = [_random_prompt(10, 50) for _ in range(32)]

@@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
    server_args: list[str] = [
        "--max-model-len=8192",
        "--max-num-seqs=32",
+        f"--attention-backend={backend}",
    ]
    if tp_size:
        server_args += ["-tp", tp_size]

--- a/tests/v1/determinism/test_rms_norm_batch_invariant.py
+++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py
@@ -21,7 +21,11 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("eps", [1e-6, 1e-5])
 def test_rms_norm_batch_invariant_vs_standard(
-    batch_size: int, hidden_size: int, dtype: torch.dtype, eps: float
+    default_vllm_config,
+    batch_size: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    eps: float,
 ):
    """
    Compare batch-invariant Triton RMS norm against standard CUDA implementation.
@@ -68,7 +72,9 @@ def test_rms_norm_batch_invariant_vs_standard(
 @pytest.mark.parametrize("batch_size", [1, 16, 128])
 @pytest.mark.parametrize("seq_len", [1, 32, 512])
 @pytest.mark.parametrize("hidden_size", [2048, 4096])
-def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):
+def test_rms_norm_3d_input(
+    default_vllm_config, batch_size: int, seq_len: int, hidden_size: int
+):
    """
    Test RMS norm with 3D input tensors (batch, seq_len, hidden_size).

@@ -107,7 +113,7 @@ def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):


 @skip_unsupported
-def test_rms_norm_numerical_stability():
+def test_rms_norm_numerical_stability(default_vllm_config):
    """
    Test RMS norm numerical stability with extreme values.

@@ -167,7 +173,7 @@ def test_rms_norm_numerical_stability():


 @skip_unsupported
-def test_rms_norm_formula():
+def test_rms_norm_formula(default_vllm_config):
    """
    Test that RMS norm follows the correct mathematical formula.

@@ -201,7 +207,7 @@ def test_rms_norm_formula():

 @skip_unsupported
 @pytest.mark.parametrize("hidden_size", [128, 1024, 4096, 16384])
-def test_rms_norm_different_hidden_sizes(hidden_size: int):
+def test_rms_norm_different_hidden_sizes(default_vllm_config, hidden_size: int):
    """
    Test RMS norm with various hidden sizes to ensure block size handling.

@@ -238,7 +244,7 @@ def test_rms_norm_different_hidden_sizes(hidden_size: int):


 @skip_unsupported
-def test_rms_norm_determinism():
+def test_rms_norm_determinism(default_vllm_config):
    """
    Test that batch-invariant RMS norm produces deterministic results.


--- a/tests/v1/determinism/utils.py
+++ b/tests/v1/determinism/utils.py
@@ -6,9 +6,9 @@ import random
 import pytest
 import torch

-from vllm.attention.utils.fa_utils import flash_attn_supports_mla
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla

 skip_unsupported = pytest.mark.skipif(
    not (current_platform.is_cuda() and current_platform.has_device_capability(80)),

--- a/tests/v1/distributed/test_eagle_dp.py
+++ b/tests/v1/distributed/test_eagle_dp.py
@@ -16,7 +16,12 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))


 @pytest.mark.asyncio
-async def test_run_eagle_dp():
+async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch):
+    # This test checks that running a model with and without eagle
+    # leads to identical tokens. This is only true in batch invariant mode
+    # (because the target model verifies all draft tokens in one big forward pass)
+    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1")
+
    target_model = "meta-llama/Llama-3.1-8B-Instruct"
    draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"

@@ -29,6 +34,7 @@ async def test_run_eagle_dp():
        data_parallel_backend="mp",  # ray takes more time
        trust_remote_code=True,
        max_model_len=16384,
+        attention_config={"backend": "FLASH_ATTN"},
    )

    eagle_engine_args = replace(
@@ -41,9 +47,10 @@ async def test_run_eagle_dp():
    )

    prompt = "This is a test of data parallel with eagle"
-    num_expected_tokens = 100
+    # This test might be flaky, see
+    # https://github.com/vllm-project/vllm/issues/31913
+    num_expected_tokens = 20
    sampling_params = SamplingParams(
-        min_tokens=num_expected_tokens,
        max_tokens=num_expected_tokens,
        ignore_eos=True,
        output_kind=RequestOutputKind.FINAL_ONLY,