Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/tests/compile/passes/test_rope_kvcache_fusion.py
+++ b/tests/compile/passes/test_rope_kvcache_fusion.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+import vllm.config
+from tests.compile.backend import TestBackend
+from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
+from vllm._aiter_ops import is_aiter_found_and_supported, rocm_aiter_ops
+from vllm.compilation.passes.fusion.matcher_utils import ROTARY_OP
+from vllm.compilation.passes.fusion.rope_kvcache_fusion import RopeKVCacheFusionPass
+from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
+from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
+from vllm.compilation.passes.utility.scatter_split_replace import (
+    ScatterSplitReplacementPass,
+)
+from vllm.compilation.passes.utility.split_coalescing import SplitCoalescingPass
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    CompilationMode,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+)
+from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.platforms import current_platform
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.kv_cache_interface import AttentionSpec
+
+INDEX_SELECT_OP = torch.ops.aten.index.Tensor
+VLLM_UNIFIED_KV_CACHE_UPDATE_OP = torch.ops.vllm.unified_kv_cache_update
+FP8_DTYPE = current_platform.fp8_dtype()
+
+
+class QKRoPEKVCacheTestModel(torch.nn.Module):
+    """Test module that applies RoPE to q/k and then updates the KV cache.
+
+    ``forward`` splits a packed qkv tensor, runs ``RotaryEmbedding`` on q and
+    k, and calls ``torch.ops.vllm.unified_kv_cache_update`` in place of a full
+    attention forward pass.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        attn_backend: AttentionBackendEnum,
+        num_heads: int,
+        num_kv_heads: int,
+        head_size: int,
+        is_neox: bool,
+        dtype: torch.dtype,
+        device: torch.device,
+        prefix: str = "model.layers.0.self_attn.attn",
+    ):
+        """Create the rotary embedding, Attention layer and metadata builder.
+
+        ``prefix`` is used both as the Attention layer prefix and as the
+        layer name passed to the KV cache update op in ``forward``.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_size = head_size
+        self.block_size = vllm_config.cache_config.block_size
+        # Widths of the q and k/v slices inside the packed qkv tensor.
+        self.q_size = num_heads * head_size
+        self.kv_size = num_kv_heads * head_size
+        self.is_neox = is_neox
+        self.dtype = dtype
+        self.device = device
+        self.layer_name = prefix
+
+        self.rotary_emb = RotaryEmbedding(
+            head_size,
+            rotary_dim=head_size,
+            max_position_embeddings=4096,
+            base=10000,
+            is_neox_style=is_neox,
+            dtype=self.dtype,
+        )
+
+        # Whether to check for the RoPE custom op or component index_select
+        self.enable_rope_custom_op = self.rotary_emb.enabled()
+
+        # Register layer metadata for the fusion pass via Attention.
+        self.attn = Attention(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=1.0 / head_size**0.5,
+            num_kv_heads=num_kv_heads,
+            cache_config=vllm_config.cache_config,
+            quant_config=vllm_config.quant_config,
+            prefix=prefix,
+            attn_backend=attn_backend.get_class(),
+        )
+        self.attn_backend: type[AttentionBackend] = self.attn.get_attn_backend()
+        # Reject backends whose forward already includes the KV cache update.
+        assert not self.attn_backend.forward_includes_kv_cache_update, (
+            f"Attention backend {self.attn_backend} does not support fuse_rope_kvcache."
+        )
+        # Move the k/v scale tensors onto the target device.
+        self.attn._k_scale = self.attn._k_scale.to(device)
+        self.attn._v_scale = self.attn._v_scale.to(device)
+
+        # Any cache dtype string starting with "fp8" maps to the platform FP8
+        # dtype; every other value makes the cache use the model dtype.
+        kv_cache_dtype_str = vllm_config.cache_config.cache_dtype
+        self.kv_cache_dtype = (
+            FP8_DTYPE if kv_cache_dtype_str.startswith("fp8") else self.dtype
+        )
+
+        # Initialize attn MetadataBuilder
+        self.builder = self.attn.attn_backend.get_builder_cls()(
+            kv_cache_spec=AttentionSpec(
+                block_size=self.block_size,
+                num_kv_heads=self.num_kv_heads,
+                head_size=head_size,
+                dtype=self.kv_cache_dtype,
+            ),
+            layer_names=[self.attn.layer_name],
+            vllm_config=vllm_config,
+            device=device,
+        )
+
+    def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
+        """Build attention metadata and install a zero-filled KV cache.
+
+        The batch has ``batch_size`` requests, each with sequence length 1
+        and query length 1. A fresh KV cache tensor is assigned to
+        ``self.attn.kv_cache`` on every call.
+        """
+        # Create common attn metadata
+        batch_spec = BatchSpec(seq_lens=[1] * batch_size, query_lens=[1] * batch_size)
+        common_attn_metadata = create_common_attn_metadata(
+            batch_spec, self.block_size, self.device, arange_block_indices=True
+        )
+
+        # Ceil-divide the longest sequence by the block size to get the number
+        # of blocks per request, then scale by the number of requests.
+        max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
+        num_blocks = batch_size * max_blocks
+
+        # Fetch the attention backend and kv cache shape and stride order
+        attn_backend = self.attn.attn_backend
+        kv_cache_shape = attn_backend.get_kv_cache_shape(
+            num_blocks, self.block_size, self.num_kv_heads, self.head_size
+        )
+        try:
+            kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
+        except (AttributeError, NotImplementedError):
+            # Backend provides no stride order: use the identity order.
+            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))
+
+        # Reorder the shape into stride order and compute the inverse
+        # permutation used below to restore the original dimension order.
+        kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
+        inv_order = [
+            kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order))
+        ]
+
+        # Create dummy KV cache
+        # NOTE(review): the leading factor 2 presumably covers the key and
+        # value planes — confirm against the backend's get_kv_cache_shape.
+        raw_tensor = torch.zeros(
+            2 * num_blocks * self.block_size * self.num_kv_heads * self.head_size,
+            dtype=self.kv_cache_dtype,
+            device=self.device,
+        )
+        # View the flat buffer in stride order, then permute with the inverse
+        # order so the dimensions match what get_kv_cache_shape returned.
+        raw_tensor = raw_tensor.view(kv_cache_shape)
+        kv_cache = raw_tensor.permute(*inv_order)
+
+        self.attn.kv_cache = [kv_cache]
+
+        # Build attn metadata
+        attn_metadata = self.builder.build(
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata
+        )
+
+        return attn_metadata
+
+    def forward(
+        self, qkv: torch.Tensor, positions: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Apply RoPE to q/k and issue the KV cache update for k/v.
+
+        Returns ``(q, k, v, kv_cache_dummy_dep)``, with q/k/v viewed as
+        ``(-1, heads, head_size)`` and the last element being the value
+        returned by ``unified_kv_cache_update``.
+        """
+        # Create copy so inplace ops do not modify the original tensors
+        qkv = qkv.clone()
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+
+        # Instead of a full forward pass, match only the KV cache update op here
+        q = q.view(-1, self.num_heads, self.head_size)
+        k = k.view(-1, self.num_kv_heads, self.head_size)
+        v = v.view(-1, self.num_kv_heads, self.head_size)
+        kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
+            k, v, self.layer_name
+        )
+        return q, k, v, kv_cache_dummy_dep
+
+    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
+        """Return the RoPE-related op selected by the current configuration,
+        followed by the KV cache update op."""
+        ops = []
+        if self.enable_rope_custom_op:
+            # Custom RoPE op: AITER Triton variant when enabled, else ROTARY_OP.
+            if rocm_aiter_ops.is_triton_rotary_embed_enabled():
+                ops.append(torch.ops.vllm.rocm_aiter_triton_rotary_embedding.default)
+            else:
+                ops.append(ROTARY_OP)
+        else:
+            # Without the custom op, look for the index-select op instead.
+            ops.append(INDEX_SELECT_OP)
+        ops.append(torch.ops.vllm.unified_kv_cache_update.default)
+        return ops
+
+    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
+        """Return the fused RoPE + KV cache update op."""
+        return [torch.ops.vllm.fused_rope_and_unified_kv_cache_update.default]
+
+
+@pytest.mark.parametrize(
+    "attn_backend",
+    [
+        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
+        AttentionBackendEnum.TRITON_ATTN,
+        AttentionBackendEnum.ROCM_ATTN,
+        AttentionBackendEnum.ROCM_AITER_FA,
+    ],
+)
+@pytest.mark.parametrize("enable_rope_custom_op", [True])  # [True, False])
+@pytest.mark.parametrize("enable_aiter_triton_rope", [True, False])
+@pytest.mark.parametrize("num_heads", [64])
+@pytest.mark.parametrize("num_kv_heads", [8])
+@pytest.mark.parametrize("head_size", [64])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("is_neox", [True, False])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
+@pytest.mark.skipif(
+    not is_aiter_found_and_supported(),
+    reason="Only test on ROCm with AITER installed and supported",
+)
+def test_rope_kvcache_fusion(
+    attn_backend: AttentionBackendEnum,
+    enable_rope_custom_op: bool,
+    enable_aiter_triton_rope: bool,
+    num_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    block_size: int,
+    is_neox: bool,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Run the model eagerly and compiled with RopeKVCacheFusionPass, then
+    check the pass matched once and that q, k, v and the KV cache agree."""
+    torch.set_default_device("cuda")
+    torch.set_default_dtype(dtype)
+    torch.manual_seed(0)
+
+    custom_ops: list[str] = []
+    if enable_rope_custom_op:
+        custom_ops.append("+rotary_embedding")
+
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(dtype=dtype),
+        cache_config=CacheConfig(
+            block_size=block_size,
+            cache_dtype=kv_cache_dtype,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=custom_ops,
+            pass_config=PassConfig(
+                fuse_rope_kvcache=True,
+                eliminate_noops=True,
+            ),
+        ),
+    )
+
+    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
+        # Set the AITER env vars for this test only, then ask rocm_aiter_ops
+        # to re-read them.
+        m.setenv("VLLM_ROCM_USE_AITER", "1")
+        m.setenv(
+            "VLLM_ROCM_USE_AITER_TRITON_ROPE", "1" if enable_aiter_triton_rope else "0"
+        )
+        rocm_aiter_ops.refresh_env_variables()
+
+        model = QKRoPEKVCacheTestModel(
+            vllm_config=vllm_config,
+            attn_backend=attn_backend,
+            num_heads=num_heads,
+            num_kv_heads=num_kv_heads,
+            head_size=head_size,
+            is_neox=is_neox,
+            dtype=dtype,
+            device=torch.get_default_device(),
+        )
+
+        # Pass pipeline under test; fusion_pass is kept by name so its
+        # matched_count can be inspected afterwards.
+        fusion_pass = RopeKVCacheFusionPass(vllm_config)
+        passes = [
+            NoOpEliminationPass(vllm_config),
+            SplitCoalescingPass(vllm_config),
+            ScatterSplitReplacementPass(vllm_config),
+            fusion_pass,
+            PostCleanupPass(vllm_config),
+        ]
+        backend = TestBackend(*passes)
+
+        # Number of tokens in the test batch.
+        T = 5
+
+        qkv = torch.randn(
+            T, num_heads * head_size + 2 * num_kv_heads * head_size, dtype=dtype
+        )
+        pos = torch.arange(T, dtype=torch.long)
+
+        # Separate copies for the eager (unfused) reference run.
+        qkv_unfused = qkv.clone()
+        pos_unfused = pos.clone()
+
+        # Reference run: eager model, no compilation.
+        with set_forward_context(None, vllm_config):
+            forward_context = get_forward_context()
+            attn_metadata = model.build_attn_metadata(T)
+            forward_context.slot_mapping = {
+                model.layer_name: attn_metadata.slot_mapping
+            }
+            q_unfused, k_unfused, v_unfused, dummy = model(qkv_unfused, pos_unfused)
+            attn_layer = forward_context.no_compile_layers[model.layer_name]
+            kv_cache_unfused = attn_layer.kv_cache[forward_context.virtual_engine]
+        del dummy
+
+        # Compiled run: mark dim 0 dynamic and compile through the test
+        # backend. build_attn_metadata installs a fresh KV cache, so the
+        # unfused cache captured above is not overwritten.
+        torch._dynamo.mark_dynamic(qkv, 0)
+        torch._dynamo.mark_dynamic(pos, 0)
+        with set_forward_context(None, vllm_config):
+            model_fused = torch.compile(model, backend=backend)
+            forward_context = get_forward_context()
+            attn_metadata = model_fused.build_attn_metadata(T)
+            forward_context.slot_mapping = {
+                model.layer_name: attn_metadata.slot_mapping
+            }
+            q_fused, k_fused, v_fused, dummy = model_fused(qkv, pos)
+            attn_layer = forward_context.no_compile_layers[model.layer_name]
+            kv_cache_fused = attn_layer.kv_cache[forward_context.virtual_engine]
+        del dummy
+
+        # The model contains a single RoPE + KV cache update site.
+        assert fusion_pass.matched_count == 1
+
+        backend.check_before_ops(model.ops_in_model_before())
+        backend.check_after_ops(model.ops_in_model_after())
+
+        # Tolerances: tighter for float16, looser for any other dtype.
+        if dtype == torch.float16:
+            ATOL, RTOL = (2e-3, 2e-3)
+        else:
+            ATOL, RTOL = (1e-2, 1e-2)
+
+        torch.testing.assert_close(q_unfused, q_fused, atol=ATOL, rtol=RTOL)
+        torch.testing.assert_close(k_unfused, k_fused, atol=ATOL, rtol=RTOL)
+        torch.testing.assert_close(v_unfused, v_fused, atol=ATOL, rtol=RTOL)
+        # Cannot compare fp8_* directly here; view(dtype) reinterprets the
+        # cache bytes as the model dtype (it does not convert values).
+        torch.testing.assert_close(
+            kv_cache_unfused.view(dtype),
+            kv_cache_fused.view(dtype),
+            atol=ATOL,
+            rtol=RTOL,
+        )
--- a/tests/compile/passes/test_scatter_split_replace.py
+++ b/tests/compile/passes/test_scatter_split_replace.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+import torch.nn as nn
+
+import vllm
+from tests.compile.backend import TestBackend
+from vllm.compilation.passes.utility.scatter_split_replace import (
+    ScatterSplitReplacementPass,
+)
+from vllm.compilation.passes.utility.split_coalescing import SplitCoalescingPass
+from vllm.config import CompilationConfig, CompilationMode, VllmConfig
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+
+
+class ScatterSplitReplacementModel(nn.Module):
+    """Model with a rope+getitem+slice_scatter+split_with_sizes sequence."""
+
+    def __init__(
+        self,
+        num_heads: int,
+        num_kv_heads: int,
+        head_size: int,
+        dtype: torch.dtype,
+    ):
+        """Store the q and k/v slice widths and build the rotary embedding."""
+        super().__init__()
+        # Widths of the q and k/v slices inside the packed qkv tensor.
+        self.q_size = num_heads * head_size
+        self.kv_size = num_kv_heads * head_size
+
+        self.rotary_emb = RotaryEmbedding(
+            head_size,
+            rotary_dim=head_size,
+            max_position_embeddings=4096,
+            base=10000,
+            is_neox_style=True,
+            dtype=dtype,
+        )
+
+    def forward(self, qkv: torch.Tensor, positions: torch.Tensor):
+        """Split qkv, apply RoPE to q and k, then add a distinct constant to
+        each of q, k and v and return them as a tuple."""
+        # Create copy so inplace ops do not modify the original tensors
+        qkv = qkv.clone()
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        q = q + 1
+        k = k + 2
+        v = v + 3
+        return q, k, v
+
+    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
+        """Return slice_scatter, split_with_sizes and getitem ops.
+
+        NOTE(review): not called by the test visible in this file, which
+        checks op counts directly instead.
+        """
+        return [
+            torch.ops.aten.slice_scatter.default,
+            torch.ops.aten.split_with_sizes.default,
+            torch.ops.aten.getitem.default,
+        ]
+
+    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
+        """Return the getitem op only.
+
+        NOTE(review): not called by the test visible in this file.
+        """
+        return [torch.ops.aten.getitem.default]
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_scatter_split_replace(dtype):
+    """Compile the model with SplitCoalescingPass + ScatterSplitReplacementPass
+    and check the outputs match eager and the expected op counts hold."""
+    torch.set_default_device("cuda")
+    torch.set_default_dtype(dtype)
+    torch.manual_seed(0)
+
+    num_heads = 8
+    num_kv_heads = 4
+    head_size = 64
+
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            custom_ops=["+rotary_embedding"],
+        ),
+    )
+    with vllm.config.set_current_vllm_config(vllm_config):
+        # ScatterSplitReplacementPass requires SplitCoalescingPass to be run before it
+        coalesce_pass = SplitCoalescingPass(vllm_config)
+        replace_pass = ScatterSplitReplacementPass(vllm_config)
+        passes = [coalesce_pass, replace_pass]
+        backend = TestBackend(*passes)
+
+        model = ScatterSplitReplacementModel(num_heads, num_kv_heads, head_size, dtype)
+
+        # Number of tokens in the test batch.
+        T = 5
+        qkv = torch.randn(
+            T, num_heads * head_size + 2 * num_kv_heads * head_size, dtype=dtype
+        )
+        pos = torch.arange(T, dtype=torch.long)
+
+        # Eager reference run on separate copies of the inputs.
+        qkv_eager = qkv.clone()
+        pos_eager = pos.clone()
+        result_eager = model(qkv_eager, pos_eager)
+
+        # Mark dim 0 of both inputs dynamic before compiling.
+        torch._dynamo.mark_dynamic(qkv, 0)
+        torch._dynamo.mark_dynamic(pos, 0)
+
+        model_compiled = torch.compile(model, backend=backend)
+        result_compiled = model_compiled(qkv, pos)
+
+        for eager, compiled in zip(result_eager, result_compiled):
+            torch.testing.assert_close(eager, compiled)
+
+        # Expect no slice_scatter and exactly one split_with_sizes.
+        # NOTE(review): presumably counted in the graph after the passes ran —
+        # confirm against TestBackend.op_count.
+        assert backend.op_count(torch.ops.aten.slice_scatter.default) == 0
+        assert backend.op_count(torch.ops.aten.split_with_sizes.default) == 1
--- a/tests/compile/passes/test_silu_mul_quant_fusion.py
+++ b/tests/compile/passes/test_silu_mul_quant_fusion.py
@@ -26,22 +26,14 @@ from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
 )
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
+from vllm.model_executor.kernels.linear import (
    CutlassFP8ScaledMMLinearKernel,
-)
-from vllm.model_executor.layers.quantization.kernels.scaled_mm.flashinfer import (
    FlashInferFP8ScaledMMLinearKernel,
-)
-from vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch import (
+    FP8ScaledMMLinearKernel,
    PerTensorTorchFP8ScaledMMLinearKernel,
-)
-from vllm.model_executor.layers.quantization.kernels.scaled_mm.rocm import (
    ROCmFP8ScaledMMLinearKernel,
 )
-from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
-    FP8ScaledMMLinearKernel,
-)
+from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape,
@@ -190,8 +182,24 @@ TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
    "model_class, enable_quant_fp8_custom_op, force_kernel",
    list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS))
    + [
-        (TestSiluMulNvfp4QuantModel, False, None),
-        (TestSiluMulGroupFp8QuantModel, False, None),
+        pytest.param(
+            TestSiluMulNvfp4QuantModel,
+            False,
+            None,
+            marks=pytest.mark.skipif(
+                not current_platform.is_cuda(), reason="CUDA only"
+            ),
+        ),
+        # GroupFP8Quant fusion only works with AITER on ROCm,
+        # and enable_quant_fp8_custom_op must be True.
+        pytest.param(
+            TestSiluMulGroupFp8QuantModel,
+            True,
+            None,
+            marks=pytest.mark.skipif(
+                not current_platform.is_rocm(), reason="ROCm only"
+            ),
+        ),
    ],
 )
 @pytest.mark.skipif(
@@ -209,6 +217,7 @@ def test_fusion_silu_and_mul_quant(
    enable_silu_mul_custom_op: bool,
    enable_quant_fp8_custom_op: bool,
    force_kernel: FP8ScaledMMLinearKernel | None,
+    monkeypatch: pytest.MonkeyPatch,
 ):
    if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported():
        pytest.skip("NVFP4 is not supported on this GPU.")
@@ -235,13 +244,16 @@ def test_fusion_silu_and_mul_quant(
        ),
    )

-    with set_current_vllm_config(config):
+    with set_current_vllm_config(config), monkeypatch.context() as m:
        fusion_passes = [ActivationQuantFusionPass(config)]
-        if IS_AITER_FOUND:
+        if IS_AITER_FOUND and model_class is TestSiluMulGroupFp8QuantModel:
+            from vllm._aiter_ops import rocm_aiter_ops
            from vllm.compilation.passes.fusion.rocm_aiter_fusion import (
                RocmAiterSiluMulFp8GroupQuantFusionPass,
            )

+            m.setenv("VLLM_ROCM_USE_AITER", "1")
+            rocm_aiter_ops.refresh_env_variables()
            fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)]

        passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)]

--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -4,6 +4,7 @@
 import functools
 import hashlib
 import multiprocessing
+import os
 import pickle
 import tempfile
 from contextlib import contextmanager
@@ -14,9 +15,12 @@ import pytest
 import torch

 import vllm.model_executor.layers.activation
+from vllm.compilation.backends import VllmBackend
 from vllm.compilation.caching import (
    StandaloneCompiledArtifacts,
+    VllmSerializableFunction,
 )
+from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
    CompilationConfig,
@@ -156,6 +160,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
            assert torch.allclose(ret, expected)


+@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
+    def foo(x: torch.Tensor):
+        return x[slice(0, x.shape[0])]  # slice bound comes from the input shape
+
+    vllm_config = make_vllm_config()
+
+    example_input = torch.randn(10, 10)
+    torch._dynamo.mark_dynamic(example_input, 0)
+    gm = torch.fx.symbolic_trace(foo)
+    assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code  # slice is in the graph
+    with use_vllm_config(vllm_config):
+        payload = VllmSerializableFunction.serialize_compile_artifacts(
+            VllmSerializableFunction(gm, (example_input,), "", foo)
+        )
+        fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
+
+    assert gm.code == fn.graph_module.code  # round trip keeps the generated code
+
+
 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
 def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
    """
@@ -700,3 +724,156 @@ class TestStandaloneCompiledArtifactsIntegration:
            ("mod3", "shape3"),
        ]:
            assert cache.get(submod, shape) == shared_data
+
+    def test_functorch_config(self):
+        vllm_config = make_vllm_config()
+        example_inputs = (torch.randn(10, 10),)
+
+        def add_1(x: torch.Tensor):
+            return x + 1
+
+        gm = torch._dynamo.functional_export.dynamo_graph_capture_for_export(add_1)(
+            *example_inputs
+        )
+
+        # NOTE(review): presumably reset so gm can be serialized — confirm
+        gm.graph._codegen = torch.fx.graph.CodeGen()
+        gm._dynamo_bytecode_flatten = None
+        gm._dynamo_bytecode_unflatten = None
+
+        with (
+            torch._functorch.config.patch(bundled_autograd_cache=False),  # ambient
+            set_current_vllm_config(vllm_config),
+        ):
+            with torch._functorch.config.patch(bundled_autograd_cache=True):
+                fn = VllmSerializableFunction(gm, example_inputs, "", add_1)
+
+            payload = VllmSerializableFunction.serialize_compile_artifacts(fn)
+
+            config = None  # filled in by the patched backend below
+
+            def backend(*args, **kwargs) -> VllmSerializableFunction:
+                nonlocal config
+                # bundled_autograd_cache should be True even when the compiler
+                # backend runs with bundled_autograd_cache=False in ambient context.
+                config = torch._functorch.config.save_config_portable()
+                return fn
+
+            loaded_fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
+            with patch.object(VllmBackend, "__call__", backend):
+                loaded_fn(*example_inputs)
+
+        assert isinstance(config, dict)  # the patched backend must have been called
+        assert "bundled_autograd_cache" in config
+        assert config["bundled_autograd_cache"] is True
+
+
+@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+def test_disable_compile_cache_skips_aot_save(
+    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
+):
+    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be saved."""
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
+    disable_envs_cache()  # NOTE(review): presumably makes the setenv calls visible
+
+    args = (torch.randn(10, 10),)
+    expected = reference_fn(*args)
+    vllm_config = make_vllm_config()
+
+    with (
+        use_vllm_config(vllm_config),
+        compilation_counter.expect(  # one compile, nothing saved or loaded
+            num_aot_compiles=1,
+            num_aot_artifacts_saved=0,
+            num_aot_artifacts_loaded=0,
+        ),
+    ):
+        mod = CompiledMod(vllm_config=vllm_config)
+        actual = mod(*args)
+
+    assert torch.allclose(actual, expected)
+
+    # No cached artifact should exist on disk
+    aot_dir = os.path.join(fresh_vllm_cache, "torch_compile_cache", "torch_aot_compile")
+    if os.path.isdir(aot_dir):
+        for root, _dirs, files in os.walk(aot_dir):
+            for f in files:
+                assert f != "model", (  # a file named "model" is a saved artifact
+                    f"AOT artifact unexpectedly saved at {os.path.join(root, f)}"
+                )
+
+
+@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+def test_disable_compile_cache_skips_aot_load(
+    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
+):
+    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be loaded."""
+    # Phase 1: compile and save with cache enabled
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
+    disable_envs_cache()  # NOTE(review): presumably makes the setenv call visible
+
+    args = (torch.randn(10, 10),)
+    vllm_config = make_vllm_config()
+
+    with (
+        use_vllm_config(vllm_config),
+        compilation_counter.expect(num_aot_artifacts_saved=1),
+    ):
+        CompiledMod(vllm_config=vllm_config)(*args)
+
+    # Phase 2: disable cache, compile again — should NOT load from disk
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+    disable_envs_cache()
+    torch._dynamo.reset()  # drop dynamo state left over from phase 1
+
+    vllm_config = make_vllm_config()
+    with (
+        use_vllm_config(vllm_config),
+        compilation_counter.expect(  # fresh compile, nothing saved or loaded
+            num_aot_compiles=1,
+            num_aot_artifacts_saved=0,
+            num_aot_artifacts_loaded=0,
+        ),
+    ):
+        mod = CompiledMod(vllm_config=vllm_config)
+        mod(*args)
+
+    assert not mod.was_aot_compile_fn_loaded_from_disk  # phase 1 artifact unused
+
+
+@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+def test_aot_counters_on_save_and_load(
+    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
+):
+    """Verify AOT counters are incremented correctly on save and load."""
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
+    disable_envs_cache()  # NOTE(review): presumably makes the setenv call visible
+
+    args = (torch.randn(10, 10),)
+
+    # Phase 1: fresh compile + save
+    vllm_config = make_vllm_config()
+    with (
+        use_vllm_config(vllm_config),
+        compilation_counter.expect(  # one compile and one save, no load
+            num_aot_compiles=1,
+            num_aot_artifacts_saved=1,
+            num_aot_artifacts_loaded=0,
+        ),
+    ):
+        CompiledMod(vllm_config=vllm_config)(*args)
+
+    # Phase 2: load from cache
+    monkeypatch.setenv("VLLM_FORCE_AOT_LOAD", "1")
+    disable_envs_cache()
+
+    vllm_config = make_vllm_config()
+    with (
+        use_vllm_config(vllm_config),
+        compilation_counter.expect(  # one load, no compile and no save
+            num_aot_compiles=0,
+            num_aot_artifacts_saved=0,
+            num_aot_artifacts_loaded=1,
+        ),
+    ):
+        CompiledMod(vllm_config=vllm_config)(*args)
--- a/tests/compile/test_cold_start.py
+++ b/tests/compile/test_cold_start.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from torch._dynamo.utils import counters
-
-from vllm import LLM
-from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
-
-
-def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
-    # Run in same process so we can access PyTorch's internal counters
-    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
-
-    # I'm not sure if this is going to affect the numbers
-    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "0")
-
-    # Force cold compilation
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-    compilation_config = CompilationConfig(
-        mode=CompilationMode.VLLM_COMPILE,
-        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
-    )
-
-    counters.clear()
-
-    _ = LLM(
-        model="microsoft/Phi-tiny-MoE-instruct",
-        max_model_len=256,
-        load_format="dummy",  # make the model loading faster
-        compilation_config=compilation_config,
-        num_gpu_blocks_override=8,  # make the model loading faster
-    )
-
-    # vLLM-compile cold start is special. By default, we do
-    # one full dynamo capture of the entire forward pass.
-    # The forward pass consists of 32 transformer layers.
-    # Then, we split on the attention operation. This results in
-    # 33 subgraphs (not including the attention operation).
-    # We then generate compiled artifacts for the unique subgraphs.
-    #
-    # There are actually only 3 unique subgraphs for this model
-    # (all of its transformer layers are the same modulo weights);
-    # this is true for most vLLM models.
-    # So we test that during cold start, we are only compling
-    # for 3 unique subgraphs.
-    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
-    assert counters["aot_autograd"]["autograd_cache_hit"] == 0
--- a/tests/compile/test_compile_ranges.py
+++ b/tests/compile/test_compile_ranges.py
@@ -73,6 +73,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
            Range(start=16, end=16),
            Range(start=9, end=32),
            Range(start=64, end=64),
+            Range(start=128, end=128),
            Range(start=33, end=8192),
        ]
    )
@@ -85,7 +86,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
-            compile_ranges_split_points=[8, 32],
+            compile_ranges_endpoints=[8, 32],
            compile_sizes=[16, 64, 128],
            inductor_compile_config={
                "post_grad_custom_post_pass": post_grad_range_checker,
@@ -95,21 +96,21 @@ def test_compile_ranges(use_fresh_inductor_cache):

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval()
-        # Number of compilations: 3 for each compile range + 2 compile sizes
+        # Number of compilations: 3 compile ranges + 3 compile sizes
        batch_sizes = [1, 4, 16, 24, 48, 64, 8192]

        with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=1,
-            num_backend_compilations=5,
+            num_backend_compilations=6,
        ):
            run_model(vllm_config, model, batch_sizes)
-        assert post_grad_range_checker.num_calls == 5
+        assert post_grad_range_checker.num_calls == 6


 def test_compile_config_get_compile_ranges():
    compilation_config = CompilationConfig(
-        compile_ranges_split_points=[8, 32],
+        compile_ranges_endpoints=[8, 32],
    )
    VllmConfig(
        scheduler_config=SchedulerConfig(
@@ -126,6 +127,88 @@ def test_compile_config_get_compile_ranges():
    ]


+class PostGradStaticShapeChecker(InductorPass):
+    """Asserts that compile_sizes entries produce graphs with fully concrete
+    (non-symbolic) shapes, and compile_ranges entries have symbolic shapes.
+
+    The pass only inspects the graph; it tallies how many graphs of each kind
+    it saw in ``num_static_calls`` and ``num_dynamic_calls``.
+    """
+
+    def __init__(self):
+        # Graphs compiled for a single size whose tensors were all concrete.
+        self.num_static_calls = 0
+        # Graphs compiled for a range that had at least one symbolic dim.
+        self.num_dynamic_calls = 0
+
+    def __call__(self, graph: fx.Graph):
+        """Check the tensor shapes recorded in the node metadata of ``graph``.
+
+        Raises:
+            AssertionError: if the current compile range is a single size
+                and any tensor in the graph has a symbolic dimension.
+        """
+        from torch.fx.experimental.symbolic_shapes import is_symbolic
+
+        compile_range = get_pass_context().compile_range
+        is_single = compile_range.is_single_size()
+
+        for node in graph.nodes:
+            # Prefer "val" and fall back to "example_value" when it is absent.
+            val = node.meta.get("val")
+            if val is None:
+                val = node.meta.get("example_value")
+            if isinstance(val, torch.Tensor):
+                has_symbolic = any(is_symbolic(d) for d in val.shape)
+                if is_single:
+                    assert not has_symbolic, (
+                        f"compile_sizes entry {compile_range}: "
+                        f"node '{node.name}' has symbolic shape "
+                        f"{val.shape}"
+                    )
+                else:
+                    # compile_ranges should have at least some
+                    # symbolic shapes (the batch dimension)
+                    if has_symbolic:
+                        # One symbolic tensor is enough: count this graph
+                        # once and stop scanning.
+                        self.num_dynamic_calls += 1
+                        return
+
+        # Reached only when no early return happened. A range graph without
+        # any symbolic tensor shape is counted in neither tally.
+        if is_single:
+            self.num_static_calls += 1
+
+    def uuid(self) -> str:
+        """Return the hash of an empty state dict via InductorPass.hash_dict."""
+        state: dict[str, Any] = {}
+        return InductorPass.hash_dict(state)
+
+
+def test_compile_sizes_produce_static_shapes(use_fresh_inductor_cache):
+    """Verify that compile_sizes entries are compiled with fully concrete
+    shapes (no SymInts), while compile_ranges entries retain dynamic shapes."""
+    checker = PostGradStaticShapeChecker()
+    # NOTE(review): this changes the process-wide default device and is not
+    # restored in this function — confirm that is intended.
+    torch.set_default_device("cuda")
+    vllm_config = VllmConfig(
+        scheduler_config=SchedulerConfig(
+            max_num_batched_tokens=8192,
+            max_model_len=8192,
+            is_encoder_decoder=False,
+        ),
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            compile_ranges_endpoints=[8],
+            compile_sizes=[16],
+            inductor_compile_config={
+                # The checker runs as Inductor's post-grad custom pass.
+                "post_grad_custom_post_pass": checker,
+            },
+        ),
+    )
+
+    with set_current_vllm_config(vllm_config):
+        model = TestModel(vllm_config=vllm_config, prefix="").eval()
+        # 3 compilations: Range(1,8), Range(9,8192), single-size 16
+        with compilation_counter.expect(
+            num_graphs_seen=1,
+            num_piecewise_graphs_seen=1,
+            num_backend_compilations=3,
+        ):
+            # Batch sizes 1, 16 and 64 presumably hit Range(1,8), the single
+            # size 16 and Range(9,8192) respectively — TODO confirm.
+            run_model(vllm_config, model, [1, 16, 64])
+
+    # compile_sizes=16 should produce static shapes
+    assert checker.num_static_calls == 1, (
+        f"Expected 1 static compilation, got {checker.num_static_calls}"
+    )
+    # compile_ranges should produce dynamic shapes
+    assert checker.num_dynamic_calls == 2, (
+        f"Expected 2 dynamic compilations, got {checker.num_dynamic_calls}"
+    )
+
+
 def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
    # To force multiple compilations, we disable the compile cache
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
@@ -148,7 +231,7 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
            scheduler_config=scheduler_config,
            compilation_config=CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
-                compile_ranges_split_points=[8],
+                compile_ranges_endpoints=[8],
                inductor_compile_config={
                    "post_grad_custom_post_pass": post_grad_range_checker,
                },

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init(
                fuse_norm_quant=True,
                fuse_act_quant=True,
                eliminate_noops=True,
+                sp_min_token_num=512 if enable_sp else None,
            ),
            cudagraph_mode=cudagraph_mode,
        )
@@ -569,3 +570,45 @@ def test_compile_sizes_padding_validation():
    assert sorted(config.compile_sizes) == [3, 5, 7]
    dispatcher = CudagraphDispatcher(_create_vllm_config_for_validation(config))
    dispatcher.initialize_cudagraph_keys(CUDAGraphMode.NONE)  # Should not raise
+
+
+@pytest.mark.parametrize(
+    "capture_sizes, max_size, num_blocks, expected_sizes, expected_max",
+    [
+        # Normal capping: sizes filtered to <= num_blocks
+        (
+            [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
+            512,
+            200,
+            [1, 2, 4, 8, 16, 32, 64, 128],
+            128,
+        ),
+        # No capping needed: num_blocks >= max
+        ([1, 2, 4, 8, 16], 16, 1000, [1, 2, 4, 8, 16], 16),
+        # Exact boundary: num_blocks == max (no capping)
+        ([1, 2, 4, 8, 16, 32], 32, 32, [1, 2, 4, 8, 16, 32], 32),
+        # All sizes capped: num_blocks < smallest size
+        ([8, 16, 32], 32, 4, [], 0),
+        # num_blocks <= 0: early return, no change
+        ([1, 2, 4], 4, 0, [1, 2, 4], 4),
+    ],
+)
+def test_adjust_cudagraph_sizes_for_mamba_cache(
+    capture_sizes, max_size, num_blocks, expected_sizes, expected_max
+):
+    """Test that cudagraph capture sizes are correctly capped to fit
+    available Mamba cache blocks.
+
+    Each case builds a CompilationConfig from ``capture_sizes`` and
+    ``max_size``, calls ``adjust_cudagraph_sizes_for_mamba_cache(num_blocks)``
+    and compares the resulting ``cudagraph_capture_sizes`` and
+    ``max_cudagraph_capture_size`` with ``expected_sizes`` and
+    ``expected_max``.
+
+    See: https://github.com/vllm-project/vllm/issues/34094
+    """
+    config = CompilationConfig(
+        cudagraph_capture_sizes=capture_sizes,
+        max_cudagraph_capture_size=max_size,
+        cudagraph_mode=CUDAGraphMode.NONE,
+    )
+    config.adjust_cudagraph_sizes_for_mamba_cache(num_blocks)
+    assert config.cudagraph_capture_sizes == expected_sizes
+    assert config.max_cudagraph_capture_size == expected_max
+    # Invariant: last element == max_cudagraph_capture_size
+    # (only checked when the expected list is non-empty).
+    if expected_sizes:
+        assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size
--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
        expected_num_backend_compilations = 4

    # A has support_torch_compile but enable_if fn returns False
-    # enalbe_if will be True for B, so we expect mod1 and mod2
+    # enable_if will be True for B, so we expect mod1 and mod2
    # to be compiled
    with compilation_counter.expect(
        num_graphs_seen=2,

--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -99,8 +99,8 @@ def test_dynamic_shapes_compilation(
    # Clean up GPU memory
    del model
    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.synchronize()
+    torch.accelerator.empty_cache()
+    torch.accelerator.synchronize()
    print("GPU memory cleared")



--- a/tests/compile/test_graph_partition.py
+++ b/tests/compile/test_graph_partition.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 from torch.fx.experimental.proxy_tensor import make_fx

-from vllm.compilation.backends import split_graph
+from vllm.compilation.backends import _is_empty_allocation_node, split_graph
 from vllm.compilation.passes.fx_utils import find_op_nodes

 # This import automatically registers `torch.ops.silly.attention`
@@ -184,3 +184,146 @@ def test_consecutive_ops_in_split():
    assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
        "call_function"
    ] + ["output"]
+
+
+def _get_empty_nodes(split_item):
+    """Return the nodes of ``split_item.graph.graph`` for which
+    ``_is_empty_allocation_node`` is true."""
+    return [
+        node for node in split_item.graph.graph.nodes if _is_empty_allocation_node(node)
+    ]
+
+
+def _subgraphs_with_empty_nodes(split_items, *, is_splitting_graph):
+    """Return the split items whose ``is_splitting_graph`` flag equals the
+    given value and which contain at least one empty allocation node."""
+    return [
+        split_item
+        for split_item in split_items
+        if split_item.is_splitting_graph == is_splitting_graph
+        and _get_empty_nodes(split_item)
+    ]
+
+
+def test_empty_only_partition_stays_separate_after_splitting_predecessor():
+    """
+    Empty-only subgraphs should not be merged when the only predecessor is
+    a splitting-op subgraph.
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        y = torch.sin(x)
+        out = torch.empty_like(y)
+        torch.ops.aten.cos.out(y, out=out)
+        return out
+
+    x = torch.randn(4, 3)
+    gm = make_fx(model_fn)(x)
+
+    # Both sin and cos.out are splitting ops, so the empty_like between them
+    # has no non-splitting subgraph before it.
+    split_ops = ["aten::sin", "aten::cos.out"]
+    split_gm, split_items = split_graph(gm, split_ops)
+
+    # Graph partitioning for this pattern is:
+    # [sin], [empty_like], [cos.out].
+    assert len(split_items) == 3, (
+        "Empty-only partition should not merge into splitting-op subgraph"
+    )
+
+    splitting_with_empty = _subgraphs_with_empty_nodes(
+        split_items, is_splitting_graph=True
+    )
+    assert len(splitting_with_empty) == 0, (
+        "Splitting-op subgraphs should not contain empty allocation nodes: "
+        f"{[item.submod_name for item in splitting_with_empty]}"
+    )
+
+    # Splitting must not change the computed result.
+    output_original = gm(x)
+    output_split = split_gm(x)
+    assert torch.allclose(output_original, output_split), "Output mismatch after split"
+
+
+def test_empty_only_partition_is_merged():
+    """
+    Empty-only subgraphs should still be merged when a non-splitting predecessor
+    exists. The merged empty node must remain outside splitting-op subgraphs.
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        # `base` comes from add, which is not in the split-op list below, so
+        # the empty_like has a non-splitting subgraph to merge into.
+        base = x + 1
+        y = torch.sin(base)
+        out = torch.empty_like(base)
+        torch.ops.aten.cos.out(base, out=out)
+        return out + y
+
+    x = torch.randn(4, 3)
+    gm = make_fx(model_fn)(x)
+    split_gm, split_items = split_graph(gm, ["aten::sin", "aten::cos.out"])
+
+    # Partitioning should be:
+    # [add, empty_like], [sin], [cos.out], [add].
+    assert len(split_items) == 4, (
+        "Empty-only partition should be merged into non-splitting predecessor"
+    )
+
+    splitting_with_empty = _subgraphs_with_empty_nodes(
+        split_items, is_splitting_graph=True
+    )
+    assert len(splitting_with_empty) == 0, (
+        "Splitting-op subgraphs should not contain empty allocation nodes: "
+        f"{[item.submod_name for item in splitting_with_empty]}"
+    )
+
+    non_splitting_with_empty = _subgraphs_with_empty_nodes(
+        split_items, is_splitting_graph=False
+    )
+    assert len(non_splitting_with_empty) == 1, (
+        "Exactly one non-splitting subgraph should contain the merged empty node"
+    )
+    assert len(_get_empty_nodes(non_splitting_with_empty[0])) == 1, (
+        "Expected exactly one empty allocation node in merged subgraph"
+    )
+
+    # Splitting must not change the computed result.
+    output_original = gm(x)
+    output_split = split_gm(x)
+    assert torch.allclose(output_original, output_split), "Output mismatch after split"
+
+
+def test_builtin_empty_only_partition_is_merged():
+    """
+    In Dynamo graphs, torch.empty/empty_like may appear as builtin call targets
+    (not aten OpOverload). Ensure empty-only partitions are still merged.
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        hidden = x + 1
+        out1 = torch.empty_like(hidden)
+        torch.ops.silly.attention(hidden, hidden, hidden, out1)
+        out2 = torch.empty_like(hidden)
+        torch.ops.silly.attention(out1, out1, hidden, out2)
+        return out2 + hidden
+
+    # Traced with symbolic_trace rather than make_fx, presumably so that
+    # empty_like shows up as the builtin target described in the docstring
+    # — TODO confirm.
+    gm = torch.fx.symbolic_trace(model_fn)
+    split_gm, split_items = split_graph(gm, ["silly::attention"])
+
+    # Without empty-only merge, this graph would split into:
+    # [add, empty_like], [attention], [empty_like], [attention], [add].
+    assert len(split_items) == 4, "Builtin empty-only partition should be merged"
+
+    splitting_with_empty = _subgraphs_with_empty_nodes(
+        split_items, is_splitting_graph=True
+    )
+    assert len(splitting_with_empty) == 0, (
+        "Splitting-op subgraphs should not contain empty allocation nodes: "
+        f"{[item.submod_name for item in splitting_with_empty]}"
+    )
+
+    non_splitting_with_empty = _subgraphs_with_empty_nodes(
+        split_items, is_splitting_graph=False
+    )
+    assert len(non_splitting_with_empty) == 1, (
+        "Exactly one non-splitting subgraph should contain merged empty nodes"
+    )
+    assert len(_get_empty_nodes(non_splitting_with_empty[0])) == 2, (
+        "Expected two builtin empty_like nodes in merged non-splitting subgraph"
+    )
+
+    # NOTE(review): the input is created on CUDA here, so this test needs a
+    # GPU — presumably required by silly.attention; confirm.
+    x = torch.randn(2, 3, device="cuda")
+    output_original = gm(x)
+    output_split = split_gm(x)
+    assert torch.allclose(output_original, output_split), "Output mismatch after split"
--- a/tests/compile/test_sequence_parallelism_threshold.py
+++ b/tests/compile/test_sequence_parallelism_threshold.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.compilation.passes.fusion.sequence_parallelism import (
+    SP_MIN_HIDDEN_SIZE,
+    SP_MIN_PER_GPU_SIZE_MB,
+    get_sequence_parallelism_threshold,
+)
+
+
+class TestGetSequenceParallelismThreshold:
+    """Tests for get_sequence_parallelism_threshold function.
+
+    Every test takes a ``mock_cuda_platform`` fixture that is not defined in
+    this module (presumably provided by a conftest — TODO confirm) and uses
+    it as a context manager accepting ``is_cuda`` / ``capability`` keywords.
+    """
+
+    def test_non_cuda_returns_none(self, mock_cuda_platform):
+        """Non-CUDA platforms should return None."""
+        with mock_cuda_platform(is_cuda=False):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=8192, tp_size=2, element_size=2
+            )
+        assert result is None
+
+    def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
+        """Unsupported device capabilities (e.g., sm80) should return None."""
+        with mock_cuda_platform(capability=(8, 0)):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=8192, tp_size=2, element_size=2
+            )
+        assert result is None
+
+    def test_small_hidden_size_returns_none(self, mock_cuda_platform):
+        """H100 with hidden_size below threshold should return None."""
+        with mock_cuda_platform(capability=(9, 0)):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=4096,  # 4096 < 8192
+                tp_size=2,
+                element_size=2,
+            )
+        assert result is None
+
+    def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
+        """H100 with large enough hidden_size should return calculated threshold."""
+        with mock_cuda_platform(capability=(9, 0)):
+            hidden_size = 8192
+            tp_size = 2
+            element_size = 2  # float16/bfloat16
+
+            result = get_sequence_parallelism_threshold(
+                hidden_size=hidden_size,
+                tp_size=tp_size,
+                element_size=element_size,
+            )
+
+            # Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
+            MiB = 1024 * 1024
+            expected = int(
+                (SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * MiB)
+                // (hidden_size * element_size)
+            )
+            # Checked against both the formula and the hard-coded value.
+            assert result == expected
+            assert result == 1024
+
+    @pytest.mark.parametrize(
+        "hidden_size,tp_size,element_size,expected",
+        [
+            # Boundary: exactly at min hidden size threshold, tp_size=1
+            # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
+            (8192, 1, 2, 512),
+            # Larger hidden size reduces token threshold
+            # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
+            (16384, 1, 2, 256),
+            # Larger tp_size increases token threshold
+            # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
+            (8192, 4, 2, 2048),
+            # Larger element_size (fp32) reduces token threshold
+            # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
+            (8192, 2, 4, 512),
+        ],
+    )
+    def test_threshold_calculation_variations(
+        self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
+    ):
+        """Test threshold calculation with various parameter combinations."""
+        with mock_cuda_platform(capability=(9, 0)):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=hidden_size,
+                tp_size=tp_size,
+                element_size=element_size,
+            )
+            assert result == expected
+
+    def test_hidden_size_boundary(self, mock_cuda_platform):
+        """Test behavior at the exact hidden_size boundary."""
+        with mock_cuda_platform(capability=(9, 0)):
+            # Just below threshold
+            result = get_sequence_parallelism_threshold(
+                hidden_size=SP_MIN_HIDDEN_SIZE[90] - 1,
+                tp_size=2,
+                element_size=2,
+            )
+            assert result is None
+
+            # Exactly at threshold
+            result = get_sequence_parallelism_threshold(
+                hidden_size=SP_MIN_HIDDEN_SIZE[90],
+                tp_size=2,
+                element_size=2,
+            )
+            assert result is not None
--- a/tests/compile/test_startup.py
+++ b/tests/compile/test_startup.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cold start and warm start tests for vLLM-compile.
+
+Cold start runs in a forked child (must fork before CUDA init) which
+populates on-disk caches and asserts cold-start counters.  Warm start
+then runs in the parent with clean in-memory state but populated caches.
+"""
+
+import multiprocessing as mp
+
+from torch._dynamo.utils import counters
+
+from vllm.compilation.counter import compilation_counter
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+MODEL = "microsoft/Phi-tiny-MoE-instruct"
+
+
+def _run_vllm(vllm_runner):
+    """Enter and immediately leave a ``vllm_runner`` context for MODEL.
+
+    The runner is created with ``load_format="dummy"``, VLLM_COMPILE mode and
+    ``CUDAGraphMode.NONE``; no prompts are run inside the context.
+    """
+    with vllm_runner(
+        MODEL,
+        trust_remote_code=False,
+        max_model_len=256,
+        max_num_batched_tokens=1024,
+        load_format="dummy",
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            cudagraph_mode=CUDAGraphMode.NONE,
+        ),
+        num_gpu_blocks_override=8,
+    ):
+        pass
+
+
+def _cold_start(vllm_runner):
+    """Run the model once and assert the cold-start counters.
+
+    Expects 3 compiled artifacts saved and none loaded, 33 AOT autograd
+    calls in total, 3 AOT autograd cache misses and no hits.
+    """
+    # Reset the torch._dynamo counters so the assertions below see only
+    # this run.
+    counters.clear()
+    with compilation_counter.expect(
+        num_compiled_artifacts_saved=3,
+        num_compiled_artifacts_loaded=0,
+    ):
+        _run_vllm(vllm_runner)
+    assert counters["aot_autograd"]["total"] == 33
+    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
+    assert counters["aot_autograd"]["autograd_cache_hit"] == 0
+
+
+def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
+    """Cold-start in a forked child, then warm-start in this process, and
+    check the artifact and AOT autograd counters for each."""
+    # Presumably keeps the engine in-process so the counters are visible to
+    # the assertions — TODO confirm.
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    # Cold start in a forked child (must fork before CUDA init).
+    # This model has 32 identical transformer layers which produce
+    # 33 subgraphs after splitting on attention — only 3 are unique.
+    ctx = mp.get_context("fork")
+    p = ctx.Process(target=_cold_start, args=(vllm_runner,))
+    p.start()
+    p.join()
+    # A failed assertion in the child surfaces here as a non-zero exit code.
+    assert p.exitcode == 0, "Cold-start child failed"
+
+    # Warm start — compiled artifacts loaded from disk cache.
+    counters.clear()
+    with compilation_counter.expect(
+        num_compiled_artifacts_loaded=3,
+        num_compiled_artifacts_saved=0,
+    ):
+        _run_vllm(vllm_runner)
+    assert counters["aot_autograd"]["total"] == 30
+    # NOTE(review): both miss and hit are expected to be 0 on warm start —
+    # presumably the AOT autograd cache is not consulted at all; confirm.
+    assert counters["aot_autograd"]["autograd_cache_miss"] == 0
+    assert (
+        counters["aot_autograd"]["autograd_cache_hit"] == 0
+    )  # No miss at aot_autograd level causing disk I/O.
--- a/tests/compile/test_structured_logging.py
+++ b/tests/compile/test_structured_logging.py
@@ -109,9 +109,9 @@ def test_vllm_structured_logging_artifacts(use_fresh_inductor_cache):
        f"got {len(vllm_piecewise_split_graph)}"
    )
    compile_start_artifacts = capture.get("artifact", "vllm_piecewise_compile_start")
-    assert len(compile_start_artifacts) == 2, (
-        "Expected 2 vllm_piecewise_compile_start "
-        "(one for dynamic ranges, one for compile size), "
+    assert len(compile_start_artifacts) == 4, (
+        "Expected 4 vllm_piecewise_compile_start "
+        "(2 subgraphs x 2 ranges each: dynamic + compile size), "
        f"got {len(compile_start_artifacts)}"
    )
    submod_dumps = capture.get("graph_dump", r"vllm_submod_.*")

--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
            f"Expected {expected1}, got {result1}"
        )

-        # Second call should triger another compilation
+        # Second call should trigger another compilation
        x2 = torch.tensor([1, 2, 3])
        result2 = wrapper(x2)
        expected2 = torch.tensor([100, 200, 300])

--- a/tests/config/test_config_generation.py
+++ b/tests/config/test_config_generation.py
@@ -78,3 +78,34 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
    )

    ray.shutdown()
+
+
+def test_unrecognized_env(monkeypatch):
+    """Check that ``fail_on_environ_validation`` controls whether an
+    unrecognized ``VLLM_*`` environment variable makes
+    ``create_engine_config()`` raise ValueError."""
+    import os
+
+    from vllm.envs import environment_variables
+
+    # Remove any existing unrecognized VLLM env vars that might interfere
+    # (iterate over a copy because the environment is modified in the loop).
+    for env in list(os.environ):
+        if env.startswith("VLLM_") and env not in environment_variables:
+            monkeypatch.delenv(env, raising=False)
+
+    # Test that if fail_on_environ_validation is True, then an error
+    # is raised when an unrecognized vLLM environment variable is set
+    monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value")
+    engine_args = EngineArgs(
+        fail_on_environ_validation=True,
+    )
+    with pytest.raises(ValueError, match="Unknown vLLM environment variable detected"):
+        engine_args.create_engine_config()
+
+    # Test that if fail_on_environ_validation is False, then no error is raised
+    # (EngineArgs() is built without the flag here, so this relies on its
+    # default).
+    engine_args = EngineArgs()
+    engine_args.create_engine_config()
+
+    # Test that when the unrecognized env var is removed, no error is raised
+    monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR")
+    engine_args = EngineArgs(
+        fail_on_environ_validation=True,
+    )
+    engine_args.create_engine_config()
--- a/tests/config/test_multimodal_config.py
+++ b/tests/config/test_multimodal_config.py
@@ -3,6 +3,7 @@

 import pytest

+from vllm.config.model import ModelConfig
 from vllm.config.multimodal import MultiModalConfig
 from vllm.v1.attention.backends.registry import AttentionBackendEnum

@@ -23,3 +24,20 @@ def test_mm_encoder_attn_backend_hash_updates():
        mm_encoder_attn_backend=AttentionBackendEnum.FLASH_ATTN
    ).compute_hash()
    assert base_hash != overridden_hash
+
+
+def test_language_model_only_does_not_affect_mm_hash():
+    """language_model_only does not affect the ViT computation graph,
+    so it should not change the multimodal config hash."""
+    # Both configs are otherwise default, so only language_model_only differs.
+    base_hash = MultiModalConfig().compute_hash()
+    lm_only_hash = MultiModalConfig(language_model_only=True).compute_hash()
+    assert base_hash == lm_only_hash
+
+
+def test_language_model_only_affects_model_hash():
+    """language_model_only affects the LM computation graph,
+    so it should change the model config hash."""
+    model = "llava-hf/llava-1.5-7b-hf"
+    # Same model for both configs, so only language_model_only differs.
+    base_hash = ModelConfig(model).compute_hash()
+    lm_only_hash = ModelConfig(model, language_model_only=True).compute_hash()
+    assert base_hash != lm_only_hash
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -176,16 +176,20 @@ def init_test_http_connection():

 @pytest.fixture
 def dist_init():
+    from tests.utils import ensure_current_vllm_config
+
    temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend="nccl",
-    )
-    initialize_model_parallel(1, 1)
-    yield
+
+    with ensure_current_vllm_config():
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            local_rank=0,
+            backend="nccl",
+        )
+        initialize_model_parallel(1, 1)
+        yield
    cleanup_dist_env_and_memory()


@@ -419,18 +423,16 @@ class HfRunner:
            self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = (
                AutoTokenizer.from_pretrained(
                    model_name,
-                    dtype=dtype,
                    trust_remote_code=trust_remote_code,
                )
            )

        # don't put this import at the top level
-        # it will call torch.cuda.device_count()
+        # it will call torch.accelerator.device_count()
        from transformers import AutoProcessor

        self.processor = AutoProcessor.from_pretrained(
            model_name,
-            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
        if skip_tokenizer_init:
@@ -792,7 +794,6 @@ class VllmRunner:
        tensor_parallel_size: int = 1,
        block_size: int = 16 if not torch.xpu.is_available() else 64,
        enable_chunked_prefill: bool | None = False,
-        swap_space: int = 4,
        enforce_eager: bool | None = False,
        # Set this to avoid hanging issue
        default_torch_num_threads: int | None = None,
@@ -829,7 +830,6 @@ class VllmRunner:
                trust_remote_code=trust_remote_code,
                dtype=dtype,
                seed=seed,
-                swap_space=swap_space,
                enforce_eager=enforce_eager,
                disable_log_stats=disable_log_stats,
                tensor_parallel_size=tensor_parallel_size,
@@ -841,7 +841,10 @@ class VllmRunner:

    def get_inputs(
        self,
-        prompts: list[str] | list[torch.Tensor] | list[list[int]],
+        prompts: list[str]
+        | list[torch.Tensor]
+        | list[list[int]]
+        | list[dict[str, Any]],
        images: PromptImageInput | None = None,
        videos: PromptVideoInput | None = None,
        audios: PromptAudioInput | None = None,
@@ -855,26 +858,32 @@ class VllmRunner:

        inputs = list[dict[str, Any]]()
        for i, prompt in enumerate(prompts):
-            prompt_dict = dict[str, Any]()
-            if isinstance(prompt, str):
-                prompt_dict["prompt"] = prompt
-            elif isinstance(prompt, list):
-                prompt_dict["prompt_token_ids"] = prompt
+            # If we're passing an encoder/decoder prompt, we assume it
+            # already contains the multimodal data in the prompt
+            if isinstance(prompt, dict):
+                assert images is None and audios is None and videos is None
+                inputs.append(prompt.copy())
            else:
-                prompt_dict["prompt_embeds"] = prompt
-
-            multi_modal_data = dict[str, Any]()
-            if images is not None and (image := images[i]) is not None:
-                multi_modal_data["image"] = image
-            if videos is not None and (video := videos[i]) is not None:
-                multi_modal_data["video"] = video
-            if audios is not None and (audio := audios[i]) is not None:
-                multi_modal_data["audio"] = audio
+                prompt_dict = dict[str, Any]()
+                if isinstance(prompt, str):
+                    prompt_dict["prompt"] = prompt
+                elif isinstance(prompt, list):
+                    prompt_dict["prompt_token_ids"] = prompt
+                else:
+                    prompt_dict["prompt_embeds"] = prompt
+
+                multi_modal_data = dict[str, Any]()
+                if images is not None and (image := images[i]) is not None:
+                    multi_modal_data["image"] = image
+                if videos is not None and (video := videos[i]) is not None:
+                    multi_modal_data["video"] = video
+                if audios is not None and (audio := audios[i]) is not None:
+                    multi_modal_data["audio"] = audio

-            if multi_modal_data:
-                prompt_dict["multi_modal_data"] = multi_modal_data
+                if multi_modal_data:
+                    prompt_dict["multi_modal_data"] = multi_modal_data

-            inputs.append(prompt_dict)
+                inputs.append(prompt_dict)

        return inputs

@@ -1138,6 +1147,15 @@ class VllmRunner:
        return self

    def __exit__(self, exc_type, exc_value, traceback):
+        # Explicitly shutdown the engine core to release GPU resources
+        # This is needed because when executing consecutive tests, the GC
+        # might not be fast enough in shutting down the llm engine. This can lead to OOMs
+        # because when the next test starts some GPU memory is still in use.
+        try:
+            self.llm.llm_engine.engine_core.shutdown()
+        except Exception:
+            # Ignore shutdown errors as cleanup will still proceed
+            pass
        del self.llm
        cleanup_dist_env_and_memory()

@@ -1517,7 +1535,7 @@ def clean_gpu_memory_between_tests():

    from tests.utils import wait_for_gpu_memory_to_clear

-    num_gpus = torch.cuda.device_count()
+    num_gpus = torch.accelerator.device_count()
    if num_gpus > 0:
        try:
            wait_for_gpu_memory_to_clear(
@@ -1531,7 +1549,7 @@ def clean_gpu_memory_between_tests():

    # Clean up GPU memory after the test
    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+        torch.accelerator.empty_cache()
        gc.collect()


@@ -1546,6 +1564,14 @@ def use_fresh_inductor_cache():
        yield


+@pytest.fixture
+def fresh_vllm_cache(monkeypatch, use_fresh_inductor_cache):
+    """Temporary VLLM_CACHE_ROOT combined with a fresh inductor cache.
+
+    Yields the path of the temporary directory.
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        monkeypatch.setenv("VLLM_CACHE_ROOT", tmp_dir)
+        # The fixture resumes after the test, leaving the with-block and
+        # removing the directory.
+        yield tmp_dir
+
+
 @pytest.fixture(scope="function")
 def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""

--- a/tests/cuda/scripts/check_device_count_respects_env.py
+++ b/tests/cuda/scripts/check_device_count_respects_env.py
@@ -14,7 +14,7 @@ import torch  # noqa: E402
 from vllm.platforms import current_platform  # noqa: F401, E402

 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-count = torch.cuda.device_count()
+count = torch.accelerator.device_count()

 if count == 0:
    sys.exit(0)  # Skip: no GPUs available

--- a/tests/cuda/test_cuda_compatibility_path.py
+++ b/tests/cuda/test_cuda_compatibility_path.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for CUDA forward compatibility path logic in env_override.py.
+
+Verifies the opt-in LD_LIBRARY_PATH manipulation for CUDA compat libs,
+including env var parsing, path detection, and deduplication.
+"""
+
+import os
+from unittest.mock import patch
+
+import pytest
+
+# Import the functions directly (they're module-level in env_override)
+# We must import them without triggering the module-level side effects,
+# so we import the functions by name after the module is already loaded.
+from vllm.env_override import (
+    _get_torch_cuda_version,
+    _maybe_set_cuda_compatibility_path,
+)
+
+
+class TestCudaCompatibilityEnvParsing:
+    """Parsing of the VLLM_ENABLE_CUDA_COMPATIBILITY opt-in flag."""
+
+    def test_disabled_by_default(self, monkeypatch):
+        """Without the flag, LD_LIBRARY_PATH stays unset or empty."""
+        for name in ("VLLM_ENABLE_CUDA_COMPATIBILITY", "LD_LIBRARY_PATH"):
+            monkeypatch.delenv(name, raising=False)
+        _maybe_set_cuda_compatibility_path()
+        # An absent variable and an empty one are both acceptable here.
+        assert not os.environ.get("LD_LIBRARY_PATH", "")
+
+    @pytest.mark.parametrize("value", ["0", "false", "False", "no", ""])
+    def test_disabled_values(self, monkeypatch, value):
+        """Falsy flag values must keep "compat" out of LD_LIBRARY_PATH."""
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
+        _maybe_set_cuda_compatibility_path()
+        # The variable may be absent entirely; treat that like an empty value.
+        assert "compat" not in os.environ.get("LD_LIBRARY_PATH", "")
+
+    @pytest.mark.parametrize("value", ["1", "true", "True", " 1 ", " TRUE "])
+    def test_enabled_values_with_valid_path(self, monkeypatch, tmp_path, value):
+        """Truthy flag values add an existing compat directory."""
+        compat_path = tmp_path / "compat"
+        compat_path.mkdir()
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_path))
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
+        _maybe_set_cuda_compatibility_path()
+        assert str(compat_path) in os.environ.get("LD_LIBRARY_PATH", "")
+
+
+class TestCudaCompatibilityPathDetection:
+    """Test path detection: custom override, conda, default."""
+
+    def test_custom_path_override(self, monkeypatch, tmp_path):
+        """VLLM_CUDA_COMPATIBILITY_PATH takes highest priority."""
+        custom_dir = tmp_path / "my-compat"
+        custom_dir.mkdir()
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(custom_dir))
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert ld_path.startswith(str(custom_dir))
+
+    def test_conda_prefix_fallback(self, monkeypatch, tmp_path):
+        """Falls back to $CONDA_PREFIX/cuda-compat if custom not set."""
+        conda_dir = tmp_path / "conda-env"
+        compat_dir = conda_dir / "cuda-compat"
+        compat_dir.mkdir(parents=True)
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
+        monkeypatch.setenv("CONDA_PREFIX", str(conda_dir))
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert str(compat_dir) in ld_path
+
+    def test_no_valid_path_does_nothing(self, monkeypatch):
+        """When enabled but no valid path exists, LD_LIBRARY_PATH unchanged."""
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", "/nonexistent/path")
+        monkeypatch.delenv("CONDA_PREFIX", raising=False)
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        with patch("vllm.env_override._get_torch_cuda_version", return_value=None):
+            _maybe_set_cuda_compatibility_path()
+        assert os.environ.get("LD_LIBRARY_PATH", "") == ""
+
+    def test_default_cuda_path_fallback(self, monkeypatch, tmp_path):
+        """Falls back to /usr/local/cuda-{ver}/compat via torch version."""
+        default_compat = "/usr/local/cuda-12.8/compat"
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
+        monkeypatch.delenv("CONDA_PREFIX", raising=False)
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        # Bind the real isdir before patching. The patch target resolves
+        # through the shared os.path module, so looking os.path.isdir up
+        # inside the side effect would return the mock itself and recurse
+        # without bound for any path other than the default compat dir.
+        real_isdir = os.path.isdir
+        with (
+            patch("vllm.env_override._get_torch_cuda_version", return_value="12.8"),
+            patch(
+                "vllm.env_override.os.path.isdir",
+                # Pretend only the default compat dir exists; defer the rest
+                # to the genuine filesystem check.
+                side_effect=lambda p: p == default_compat or real_isdir(p),
+            ),
+        ):
+            _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert default_compat in ld_path
+
+
+class TestCudaCompatibilityLdPathManipulation:
+    """How the compat dir is merged into LD_LIBRARY_PATH (prepend, dedupe)."""
+
+    def test_prepends_to_empty_ld_path(self, monkeypatch, tmp_path):
+        """With no LD_LIBRARY_PATH, the compat dir becomes its whole value."""
+        compat_path = tmp_path / "compat"
+        compat_path.mkdir()
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_path))
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        _maybe_set_cuda_compatibility_path()
+        assert os.environ["LD_LIBRARY_PATH"] == str(compat_path)
+
+    def test_prepends_to_existing_ld_path(self, monkeypatch, tmp_path):
+        """Existing entries are kept, with the compat dir placed first."""
+        compat_path = tmp_path / "compat"
+        compat_path.mkdir()
+        monkeypatch.setenv("LD_LIBRARY_PATH", "/usr/lib:/other/lib")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_path))
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        _maybe_set_cuda_compatibility_path()
+        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
+        assert entries[0] == str(compat_path)
+        for expected in ("/usr/lib", "/other/lib"):
+            assert expected in entries
+
+    def test_deduplicates_existing_compat_path(self, monkeypatch, tmp_path):
+        """A compat dir already listed ends up first and appears only once."""
+        compat_path = tmp_path / "compat"
+        compat_path.mkdir()
+        initial = f"/usr/lib:{compat_path}:/other/lib"
+        monkeypatch.setenv("LD_LIBRARY_PATH", initial)
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_path))
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        _maybe_set_cuda_compatibility_path()
+        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
+        assert entries[0] == str(compat_path)
+        assert entries.count(str(compat_path)) == 1
+
+    def test_already_at_front_is_noop(self, monkeypatch, tmp_path):
+        """A compat dir that is already first leaves the value untouched."""
+        compat_path = tmp_path / "compat"
+        compat_path.mkdir()
+        initial = f"{compat_path}:/usr/lib"
+        monkeypatch.setenv("LD_LIBRARY_PATH", initial)
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_path))
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        _maybe_set_cuda_compatibility_path()
+        assert os.environ["LD_LIBRARY_PATH"] == initial
+
+
+class TestGetTorchCudaVersion:
+    """Checks for the _get_torch_cuda_version() helper."""
+
+    def test_returns_string_when_torch_available(self):
+        """The helper yields either None or a version string (e.g. '12.8')."""
+        result = _get_torch_cuda_version()
+        # None is tolerated alongside a string result.
+        assert isinstance(result, (str, type(None)))
+
+    def test_returns_none_when_torch_missing(self):
+        """With find_spec patched to report no module, the helper gives None."""
+        target = "vllm.env_override.importlib.util.find_spec"
+        with patch(target, return_value=None):
+            assert _get_torch_cuda_version() is None
--- a/tests/detokenizer/test_disable_detokenization.py
+++ b/tests/detokenizer/test_disable_detokenization.py
@@ -7,7 +7,6 @@ from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams


-@pytest.mark.skip_v1
 @pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
    # This test checks if the engine generates completions both with and