Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -4,9 +4,9 @@
 Test (piecewise) compilation with a simple model where multiple submodules
 are compiled and graph captured separately.
 """
 import torch
 from torch import nn
-from torch.library import Library
 from vllm.compilation.backends import set_model_tag
 from vllm.compilation.counter import compilation_counter
@@ -15,10 +15,9 @@ from vllm.compilation.decorators import (ignore_torch_compile,
 from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                         VllmConfig, set_current_vllm_config)
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
-# create a library to hold the custom op
+# This import automatically registers `torch.ops.silly.attention`
-silly_lib = Library("silly", "FRAGMENT")  # noqa
+from .. import silly_attention  # noqa: F401
 BATCH_SIZE = 32
 MLP_SIZE = 128
@@ -26,27 +25,6 @@ HIDDEN_SIZE = 1024
 RANDOM_SEED = 0
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    out.copy_(q)
-    out += k
-    out += v
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-direct_register_custom_op(
-    op_name="attention",
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
 @support_torch_compile
 class ParentModel(nn.Module):
@@ -134,7 +112,7 @@ class SimpleModelWithTwoGraphs(ParentModel):
        # Test will fail without set_model_tag here with error:
        # "ValueError: too many values to unpack (expected 3)"
        # This is because CompiledAttention and CompiledAttentionTwo
-        # have different implmentations but the same torch.compile
+        # have different implementations but the same torch.compile
        # cache dir will be used as default prefix is 'model_tag'
        with set_model_tag("attn_one"):
            self.attn_one = CompiledAttention(

--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -4,10 +4,10 @@
 Test the piecewise compilation with a simple model so that we
 can exactly calculate the expected output and side effects.
 """
 import pytest
 import torch
 from torch import nn
-from torch.library import Library
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
@@ -15,35 +15,9 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                         VllmConfig, set_current_vllm_config)
 from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
-global_counter = 0
-# create a library to hold the custom op
-silly_lib = Library("silly", "FRAGMENT")  # noqa
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    global global_counter
-    global_counter += 1
-    print(f"{global_counter=}")
-    out.copy_(q)
-    out[0] += 1
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-direct_register_custom_op(
+# This import automatically registers `torch.ops.silly.attention`
-    op_name="attention",
+from ..silly_attention import get_global_counter, reset_global_counter
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
 @support_torch_compile
@@ -59,8 +33,7 @@ class SillyModel(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Overall effect:
-        x += 1
+        x = 3 * x + 19
-        x[0] += 2
        global_counter += 2
        """
        x = x + 1
@@ -78,6 +51,7 @@ class SillyModel(nn.Module):
 @pytest.mark.parametrize("use_inductor", [True, False])
+@torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
    assert VLLM_USE_V1
@@ -121,13 +95,12 @@ def test_simple_piecewise_compile(use_inductor):
            model(torch.randn(1).cuda())
        input = torch.zeros(2).cuda()
-        global global_counter
+        reset_global_counter()
-        global_counter = 0
        with set_forward_context(
                None,
                vllm_config=vllm_config,
                cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
                batch_descriptor=BatchDescriptor(num_tokens=2, )):
            output = model(input)
-        assert global_counter == 2
+        assert get_global_counter() == 2
-        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
+        assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -14,38 +14,15 @@ from typing import Any, Optional
 import pytest
 import torch
 from torch import nn
-from torch.library import Library
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                         VllmConfig, set_current_vllm_config)
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
-# create a library to hold the custom op
+# This import automatically registers `torch.ops.silly.attention`
-silly_lib = Library("silly", "FRAGMENT")  # noqa
+from .. import silly_attention  # noqa: F401
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    out.copy_(q)
-    out += k
-    out += v
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-direct_register_custom_op(
-    op_name="attention",
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
 @dataclass

--- a/tests/compile/silly_attention.py
+++ b/tests/compile/silly_attention.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Shared PyTorch custom silly attention for compilation tests.
+Centralizes custom operation definitions to avoid duplicate registrations.
+"""
+import torch
+from torch.library import Library
+from vllm.utils import direct_register_custom_op
+# Shared library for all compilation test operations
+# Using "silly" namespace to match existing test expectations
+# Importing this file automatically registers the
+# torch ops used for testing (like silly.attention)
+silly_lib = Library("silly", "FRAGMENT")
+# Global counter that counts the number of times attention is invoked
+_global_counter = 0
+def get_global_counter():
+    """Get the current global counter value"""
+    return _global_counter
+def reset_global_counter():
+    """Reset the global counter to 0"""
+    global _global_counter
+    _global_counter = 0
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    out: torch.Tensor) -> None:
+    """
+    Unified attention implementation that depends on
+    all inputs and affects the output.
+    Always increments a global counter that tests can use or ignore.
+    """
+    global _global_counter
+    # Always increment the global counter
+    _global_counter += 1
+    # Unified implementation that depends on all inputs
+    out.copy_(q + k + v)
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
+    """Fake implementation for testing"""
+    return
+# Register the unified attention operation
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -23,7 +23,7 @@ class TestSetting:
    fullgraph: bool
-# we cannot afford testing the full Catesian product
+# we cannot afford testing the full Cartesian product
 # of all models and all levels
 @pytest.mark.parametrize(
    "test_setting",
@@ -62,8 +62,12 @@ class TestSetting:
        TestSetting(
            model="BAAI/bge-multilingual-gemma2",
            model_args=[
-                "--runner", "pooling", "--dtype", "bfloat16",
+                "--runner",
-                "--max-model-len", "2048"
+                "pooling",
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
            ],
            pp_size=1,
            tp_size=1,
@@ -71,17 +75,15 @@ class TestSetting:
            method="encode",
            fullgraph=True,
        ),
-        # TODO: bert models are not supported in V1 yet
+        TestSetting(
-        # # encoder-based embedding model (BERT)
+            model="BAAI/bge-base-en-v1.5",
-        # TestSetting(
+            model_args=["--runner", "pooling"],
-        #     model="BAAI/bge-base-en-v1.5",
+            pp_size=1,
-        #     model_args=["--runner", "pooling"],
+            tp_size=1,
-        #     pp_size=1,
+            attn_backend="FLASH_ATTN",
-        #     tp_size=1,
+            method="encode",
-        #     attn_backend="XFORMERS",
+            fullgraph=True,
-        #     method="encode",
+        ),
-        #     fullgraph=True,
-        # ),
        # vision language model
        TestSetting(
            model="microsoft/Phi-3.5-vision-instruct",
@@ -92,7 +94,8 @@ class TestSetting:
            method="generate_with_image",
            fullgraph=False,
        ),
-    ])
+    ],
+)
 def test_compile_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_setting: TestSetting,

--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 from torch import nn
-from torch.library import Library
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import (ignore_torch_compile,
@@ -10,36 +9,14 @@ from vllm.compilation.decorators import (ignore_torch_compile,
 from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
                         CUDAGraphMode, VllmConfig, set_current_vllm_config)
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
-# create a library to hold the custom op
+# This import automatically registers `torch.ops.silly.attention`
-silly_lib = Library("silly", "FRAGMENT")  # noqa
+from . import silly_attention  # noqa: F401
 BATCH_SIZE = 32
 MLP_SIZE = 128
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    out.copy_(q)
-    out += k
-    out += v
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-direct_register_custom_op(
-    op_name="attention",
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
 @torch.inference_mode
 def run_model(vllm_config: VllmConfig, model: nn.Module,
              cudagraph_runtime_mode: CUDAGraphMode):
@@ -151,7 +128,7 @@ def test_ignore_torch_compile_decorator():
        run_model(vllm_config, mod_C, cudagraph_runtime_mode)
-# Only enable torch.compile if
+# Only enable torch.compile if
 # vllm_config.cache_config.kv_sharing_fast_prefill=True
 @support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
                       kv_sharing_fast_prefill)
@@ -173,7 +150,7 @@ class B(nn.Module):
        return x
-# Only enable torch.compile if
+# Only enable torch.compile if
 # vllm_config.cache_config.kv_sharing_fast_prefill=False
 @support_torch_compile(enable_if=lambda vllm_config: not vllm_config.
                       cache_config.kv_sharing_fast_prefill)

--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -15,9 +15,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape, QuantKey, ScaleDesc)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, maybe_create_device_identity)
+    Fp8LinearOp, cutlass_fp8_supported, maybe_create_device_identity)
 from vllm.platforms import current_platform
+from ..utils import override_cutlass_fp8_supported
 from .backend import TestBackend
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -26,9 +27,9 @@ FP8_DTYPE = current_platform.fp8_dtype()
 class TestModel(torch.nn.Module):
    def __init__(self, hidden_size: int, eps: float, static: bool,
-                 force_fp8_e4m3fnuz: bool, *args, **kwargs):
+                 cuda_force_torch: bool, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz
+        self.cuda_force_torch = cuda_force_torch
        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
        group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
@@ -42,11 +43,12 @@ class TestModel(torch.nn.Module):
            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
            for _ in range(2)
        ]
-        self.fp8_linear = Fp8LinearOp(
-            force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
+        with override_cutlass_fp8_supported(not cuda_force_torch):
-            act_quant_static=static,
+            self.fp8_linear = Fp8LinearOp(
-            act_quant_group_shape=group_shape,
+                act_quant_static=static,
-        )
+                act_quant_group_shape=group_shape,
+            )
    def forward(self, x):
        resid = torch.sqrt(x)
@@ -81,11 +83,14 @@ class TestModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("static", [True, False])
-@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
+# cuda_force_torch is used to test the torch code path on platforms where
+# cutlass_fp8_supported() == True.
+@pytest.mark.parametrize("cuda_force_torch",
+                         [True, False] if cutlass_fp8_supported() else [True])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
                    reason="Only test on CUDA and ROCm")
 def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
-                              force_fp8_e4m3fnuz):
+                              cuda_force_torch):
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(1)
@@ -102,7 +107,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
        fusion_pass = FusionPass.instance(vllm_config)
        backend = TestBackend(noop_pass, fusion_pass)
-        model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz)
+        model = TestModel(hidden_size, eps, static, cuda_force_torch)
        # First dimension dynamic
        x = torch.rand(num_tokens, hidden_size)

--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -40,13 +40,12 @@ backend_unfused: Optional[TestBackend] = None
 @pytest.mark.parametrize(
    "model, quant_key",
    [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)])
-@pytest.mark.parametrize(
+@pytest.mark.parametrize("use_triton_fa", [True, False])
-    "use_triton_fa", [True, False] if current_platform.is_rocm() else [False])
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="Only test CUDA and ROCm")
+                    reason="V0 attn quant fusion only on ROCm")
-def test_attention_fusion(example_prompts, monkeypatch, model: str,
+def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
-                          quant_key: QuantKey, use_triton_fa: bool):
+                             quant_key: QuantKey, use_triton_fa: bool):
    # Clean Dynamo cache to avoid reusing other test cases
    # (for some reason the reset at the end is not enough)
    torch._dynamo.reset()
@@ -69,13 +68,17 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
        backend="tests.compile.test_fusion_attn.backend_unfused",
        custom_ops=["+quant_fp8"],
    )
-    vllm_config = VllmConfig(compilation_config=compile_config)
+    vllm_config = VllmConfig(compilation_config=compile_config,
+                             model_config=ModelConfig(
+                                 model=model,
+                                 dtype=torch.bfloat16,
+                             ))
    backend_unfused = TestBackend(NoOpEliminationPass(vllm_config))
    llm = LLM(model,
              enforce_eager=True,
              compilation_config=compile_config,
-              gpu_memory_utilization=0.9,
+              gpu_memory_utilization=0.5,
              max_model_len=2048)
    sampling_params = SamplingParams(temperature=0.0,
@@ -93,7 +96,11 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
        backend="tests.compile.test_fusion_attn.backend",
        custom_ops=["+quant_fp8"],
    )
-    vllm_config = VllmConfig(compilation_config=compile_config)
+    vllm_config = VllmConfig(compilation_config=compile_config,
+                             model_config=ModelConfig(
+                                 model=model,
+                                 dtype=torch.bfloat16,
+                             ))
    # AttnFusionPass needs attention layers to be registered in config upon init
    # so we initialize it during compilation.
@@ -102,7 +109,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
    llm2 = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
-               gpu_memory_utilization=0.9,
+               gpu_memory_utilization=0.5,
               max_model_len=2048)
    # check support
@@ -171,6 +178,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
            cache_config=vllm_config.cache_config,
            prefix="model.layers.0.self_attn.attn",
        )
+        self.attn._k_scale = self.attn._k_scale.to(device)
+        self.attn._v_scale = self.attn._v_scale.to(device)
        self.block_size = 16
@@ -188,7 +197,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
            device=self.device,
        )
-    def build_attn_metadata(self, batch_size: int):
+    def build_attn_metadata(self, batch_size: int, use_hnd: bool):
        """Initialize attention metadata."""
        # Create common attn metadata
@@ -205,10 +214,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
        num_blocks = batch_size * max_blocks
        # Create dummy KV cache for FlashInfer TRTLLM
-        #   - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
+        #   - NHD: [num_blocks, block_size, num_kv_heads, head_size]
-        #   - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
+        #   - HND: [num_blocks, num_kv_heads, block_size, head_size]
-        # Create kv_cache in HND layout and permute to NHD layout
-        # (later will be permuted back to HND layout in forward pass)
        kv_cache = torch.zeros(num_blocks,
                               2,
                               self.num_kv_heads,
@@ -216,7 +223,17 @@ class AttentionQuantPatternModel(torch.nn.Module):
                               self.head_size,
                               dtype=self.kv_cache_dtype,
                               device=self.device)
-        kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
+        if current_platform.is_rocm():
+            # k/v as 1st dimension
+            if use_hnd:
+                kv_cache = kv_cache.permute(1, 0, 2, 3, 4)
+            else:
+                kv_cache = kv_cache.permute(1, 0, 3, 2, 4)
+        else:
+            # k/v as 2nd dimension
+            # Create kv_cache in HND layout and permute to NHD layout
+            # (later will be permuted back to HND layout in forward pass)
+            kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
        self.attn.kv_cache = [kv_cache]
        # Build attn metadata
@@ -296,28 +313,51 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
                                     out_dtype=attn_output.dtype)
-@pytest.mark.parametrize("num_qo_heads, num_kv_heads", [(64, 8), (40, 8)])
+if current_platform.is_cuda():
+    MODELS = [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+               TestAttentionFp8StaticQuantPatternModel),
+              ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+               TestAttentionNvfp4QuantPatternModel)]
+    HEADS = [(64, 8), (40, 8)]
+elif current_platform.is_rocm():
+    MODELS = [("amd/Llama-3.1-8B-Instruct-FP8-KV",
+               TestAttentionFp8StaticQuantPatternModel)]
+    HEADS = [(32, 8), (40, 8)]
+else:
+    MODELS = []
+    HEADS = []
+@pytest.mark.parametrize("num_qo_heads, num_kv_heads", HEADS)
 @pytest.mark.parametrize("head_size", [128])
-@pytest.mark.parametrize("batch_size", [7, 256, 533])
+@pytest.mark.parametrize("batch_size",
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
+                         [7, 256, 533] if current_platform.is_cuda() else [8])
-@pytest.mark.parametrize("model_name, model_class",
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
-                         [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+@pytest.mark.parametrize("model_name, model_class", MODELS)
-                           TestAttentionFp8StaticQuantPatternModel),
+@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
-                          ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+                         current_platform.is_cuda() else [_Backend.ROCM_FLASH])
-                           TestAttentionNvfp4QuantPatternModel)])
+@pytest.mark.parametrize(
-@pytest.mark.parametrize("backend", [_Backend.FLASHINFER])
+    "split_attention",
-@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
+    [False, True] if current_platform.is_rocm() else [False])
+@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+                    reason="Only test ROCm or CUDA")
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
-@pytest.mark.skipif(not current_platform.is_device_capability((10, 0)),
+@pytest.mark.skipif(current_platform.is_cuda()
-                    reason="Only test on SM100(Blackwell)")
+                    and not current_platform.is_device_capability((10, 0)),
+                    reason="On CUDA only test on SM100(Blackwell)")
+@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+                    reason="Only test ROCm or CUDA")
 def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                 head_size: int, batch_size: int,
                                 dtype: torch.dtype, model_name: str,
                                 model_class: type[AttentionQuantPatternModel],
-                                 backend: _Backend, monkeypatch, dist_init):
+                                 backend: _Backend, split_attention: bool,
+                                 monkeypatch, dist_init):
    """Test AttentionStaticQuantPattern fusion pass"""
    monkeypatch.setenv("VLLM_USE_V1", "1")
+    if split_attention:
+        monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
    device = torch.device("cuda:0")
    torch.manual_seed(42)
@@ -326,6 +366,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
        model_config=ModelConfig(
            model=model_name,
            max_model_len=2048,
+            dtype=dtype,
        ),
        scheduler_config=SchedulerConfig(max_num_seqs=1024),
        compilation_config=CompilationConfig(
@@ -368,7 +409,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
        forward_ctx = get_forward_context()
        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
-            batch_size)
+            batch_size, use_hnd=split_attention)
        # Run model directly without compilation and fusion
        result_unfused = model_unfused(q, k, v)
@@ -389,7 +430,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
        model_fused = model_fused.to(device)
        forward_ctx = get_forward_context()
-        forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size)
+        forward_ctx.attn_metadata = model_fused.build_attn_metadata(
+            batch_size, use_hnd=split_attention)
        # Create test backend with fusion passes enabled
        noop_pass = NoOpEliminationPass(vllm_config)
@@ -404,12 +446,19 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
        assert model_compiled.attn._o_scale_float is None
        result_fused_1 = model_compiled(q, k, v)
-        # After the 1st round of the forward pass, output quant scale should be
+        if backend == _Backend.FLASHINFER:
-        # loaded into the attn layer's _o_scale_float, the 2nd round should
+            # With the Flashinfer backend after the 1st round of the forward
-        # reuse the loaded _o_scale_float
+            # pass, output quant scale should be loaded into the attn layer's
-        assert model_compiled.attn._o_scale_float is not None
+            # _o_scale_float, the 2nd round should reuse the loaded
-        result_fused_2 = model_compiled(q, k, v)
+            # _o_scale_float
-        assert model_compiled.attn._o_scale_float is not None
+            assert model_compiled.attn._o_scale_float is not None
+            result_fused_2 = model_compiled(q, k, v)
+            assert model_compiled.attn._o_scale_float is not None
+            torch.testing.assert_close(result_unfused,
+                                       result_fused_2,
+                                       atol=1e-2,
+                                       rtol=1e-2)
    # Check attn fusion support
    quant_key = model_class.quant_key
@@ -444,12 +493,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
        assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, \
            "Attention should have output_block_scale after FP4 fusion"  # noqa: E501
-    # Check that results are closed
+    # Check that results are close
    torch.testing.assert_close(result_unfused,
                               result_fused_1,
                               atol=1e-2,
                               rtol=1e-2)
-    torch.testing.assert_close(result_unfused,
-                               result_fused_2,
-                               atol=1e-2,
-                               rtol=1e-2)
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import cast
 import pytest
 import torch
 import vllm.envs as envs
+from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -17,9 +20,10 @@ from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape, kFp8StaticTensorSym, kNvfp4Quant)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp)
+    Fp8LinearOp, cutlass_fp8_supported)
 from vllm.platforms import current_platform
+from ..utils import override_cutlass_fp8_supported
 from .backend import TestBackend
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -32,7 +36,7 @@ def is_nvfp4_supported():
 class TestSiluMulFp8QuantModel(torch.nn.Module):
-    def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs):
+    def __init__(self, hidden_size: int, cuda_force_torch: bool, **kwargs):
        super().__init__()
        self.silu_and_mul = SiluAndMul()
        self.wscale = torch.rand(1, dtype=torch.float32)
@@ -40,11 +44,11 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
        self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
-        self.fp8_linear = Fp8LinearOp(
+        with override_cutlass_fp8_supported(not cuda_force_torch):
-            force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
+            self.fp8_linear = Fp8LinearOp(
-            act_quant_static=True,
+                act_quant_static=True,
-            act_quant_group_shape=GroupShape.PER_TENSOR,
+                act_quant_group_shape=GroupShape.PER_TENSOR,
-        )
+            )
    def forward(self, x):
        y = self.silu_and_mul(x)
@@ -63,24 +67,27 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
 class TestSiluMulNvfp4QuantModel(torch.nn.Module):
-    def __init__(self, hidden_size: int, **kwargs):
+    def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs):
        super().__init__()
        self.silu_and_mul = SiluAndMul()
-        self.w = torch.randint(256, (hidden_size, hidden_size // 2),
-                               dtype=FP4_DTYPE)
+        # create nvfp4 weight
-        self.wscale = torch.randn(hidden_size,
+        w = torch.rand((hidden_size, hidden_size))
-                                  hidden_size // 16).to(dtype=FP8_DTYPE)
+        self.w, self.w_block_scale, self.w_global_scale = quant_nvfp4_tensor(w)
-        self.wscale2 = torch.rand(1, dtype=torch.float32)
-        self.scale = torch.rand(1, dtype=torch.float32)
+        # get global scale offline
+        _, _, self.y_global_scale = quant_nvfp4_tensor(self.silu_and_mul(x))
+        self.alpha = 1.0 / (self.w_global_scale * self.y_global_scale)
    def forward(self, x):
        y = self.silu_and_mul(x)
-        y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale)
+        y_quant, y_block_scale = scaled_fp4_quant(y, self.y_global_scale)
        out = cutlass_scaled_fp4_mm(a=y_quant,
                                    b=self.w,
                                    block_scale_a=y_block_scale,
-                                    block_scale_b=self.wscale,
+                                    block_scale_b=self.w_block_scale,
-                                    alpha=self.scale * self.wscale2,
+                                    alpha=self.alpha,
                                    out_dtype=y.dtype)
        return out
@@ -94,19 +101,25 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [64])
 @pytest.mark.parametrize("hidden_size", [128])
 @pytest.mark.parametrize(
-    "model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
+    "model_class",
-    if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])
+    cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
-@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
+         if is_nvfp4_supported() else [TestSiluMulFp8QuantModel]))
+# cuda_force_torch is used to test the torch code path on platforms where
+# cutlass_fp8_supported() == True.
+@pytest.mark.parametrize("cuda_force_torch",
+                         [True, False] if cutlass_fp8_supported() else [True])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
                    reason="Only test on CUDA and ROCm")
 def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
-                                   force_fp8_e4m3fnuz):
+                                   cuda_force_torch):
-    if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz:
+    if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch:
        pytest.skip("Duplicate tests for NVFP4")
    torch.set_default_device("cuda")
    torch.set_default_dtype(torch.float16)
+    x = torch.rand(num_tokens, hidden_size * 2)
    # Reshape pass is needed for the fusion pass to work
    config = VllmConfig()
    config.compilation_config = CompilationConfig(
@@ -115,10 +128,10 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
    backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
    model = model_class(hidden_size=hidden_size,
-                        force_fp8_e4m3fnuz=force_fp8_e4m3fnuz)
+                        cuda_force_torch=cuda_force_torch,
+                        x=x)
    # First dimension dynamic
-    x = torch.rand(num_tokens, hidden_size * 2)
    torch._dynamo.mark_dynamic(x, 0)
    result = model(x)
@@ -127,10 +140,15 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
    result2 = model2(x)
    # Check that it gives the same answer
+    if model_class == TestSiluMulFp8QuantModel:
+        atol, rtol = 1e-3, 1e-3
+    elif model_class == TestSiluMulNvfp4QuantModel:
+        atol, rtol = 1e-1, 1e-1
    torch.testing.assert_close(result[0].to(dtype=torch.float16),
                               result2[0].to(dtype=torch.float16),
-                               atol=1e-3,
+                               atol=atol,
-                               rtol=1e-3)
+                               rtol=rtol)
    # In pre-nodes, quant op should be present and fused kernels should not
    backend.check_before_ops(model.ops_in_model_before())

--- a/tests/conftest.py
+++ b/tests/conftest.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+from tblib import pickling_support
+# Install support for pickling exceptions so that we can nicely propagate
+# failures from tests running in a subprocess.
+# This should be run before any custom exception subclasses are defined.
+pickling_support.install()
+import http.server
 import json
 import math
+import mimetypes
 import os
+import socket
 import tempfile
+import threading
+from collections.abc import Generator
 from enum import Enum
 from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
@@ -32,6 +47,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
+from vllm.multimodal.utils import fetch_image
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.sequence import Logprob
@@ -1253,3 +1269,119 @@ def cli_config_file():
 def cli_config_file_with_model():
    """Return the path to the CLI config file with model."""
    return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml")
+class AssetHandler(http.server.BaseHTTPRequestHandler):
+    """HTTP request handler that serves image assets by file name.
+    A GET for ``/<name>.<ext>`` (``ext`` is ``jpg`` or ``png``, matched
+    case-insensitively) is answered with the bytes returned by
+    ``ImageAsset(name).read_bytes(ext=ext)``. Malformed or unsupported
+    paths get a 404; a failure while loading the asset gets a 500.
+    """
+    # _IMAGE_CACHE : Dict[str, bytes] = {}
+    def log_message(self, *args, **kwargs):
+        """Discard the base handler's per-request log output."""
+        pass
+    def do_GET(self):
+        """Serve the asset named by the request path."""
+        # Accepts paths like: /1280px-Venn_diagram_rgb.jpg
+        filename = self.path.lstrip("/")
+        if not filename or "." not in filename:
+            self.send_error(404, "Missing filename (expected /<name>.<ext>)")
+            return
+        # Split on the last dot only, so asset names may contain dots.
+        base, ext = filename.rsplit(".", 1)
+        ext = ext.lower()
+        if ext not in ["jpg", "png"]:
+            self.send_error(404, f"Unsupported extension: .{ext}")
+            return
+        try:
+            data = ImageAsset(base).read_bytes(ext=ext)
+        except Exception as e:
+            self.send_error(500, f"Failed to load asset: {ext} {base} {e} ")
+            return
+        ctype, _ = mimetypes.guess_type(filename)
+        if ctype is None:
+            # Fallback when the platform's MIME table has no entry.
+            # "image/jpeg" is the registered media type ("image/jpg" is not).
+            ctype = {"jpg": "image/jpeg", "png": "image/png"}[ext]
+        self.send_response(200)
+        self.send_header("Content-Type", ctype)
+        self.send_header("Content-Length", str(len(data)))
+        self.end_headers()
+        self.wfile.write(data)
+def _find_free_port() -> int:
+    """Return a TCP port number that was unused on 127.0.0.1 when probed.
+    The probing socket is closed before returning, so the port is not
+    reserved for the caller.
+    """
+    with socket.socket() as s:
+        # Binding to port 0 lets the OS pick an unused port.
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+class LocalAssetServer:
+    """Context manager that runs an ``AssetHandler`` HTTP server in a thread.
+    Entering picks a free port, starts ``serve_forever`` in a daemon thread
+    and returns ``self``. Exiting stops the serving loop, closes the
+    listening socket and joins the thread.
+    """
+    address: str
+    port: int
+    server: Optional[http.server.ThreadingHTTPServer]
+    thread: Optional[threading.Thread]
+    def __init__(self, address: str = "127.0.0.1") -> None:
+        self.address = address
+        # -1 marks "not started"; a real port is assigned in __enter__.
+        self.port = -1
+        self.server = None
+        self.thread = None
+    def __enter__(self):
+        self.port = _find_free_port()
+        self.server = http.server.ThreadingHTTPServer(
+            (self.address, self.port), AssetHandler)
+        self.thread = threading.Thread(target=self.server.serve_forever,
+                                       daemon=True)
+        self.thread.start()
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self.server is not None:
+            # shutdown() only stops the serve_forever() loop; server_close()
+            # is what releases the listening socket.
+            self.server.shutdown()
+            self.server.server_close()
+            # Reset rather than `del` so the declared attribute still exists
+            # and a repeated __exit__ is harmless.
+            self.server = None
+        if self.thread is not None:
+            self.thread.join()
+            self.thread = None
+        # Never suppress an exception raised inside the `with` body.
+        return False
+    @property
+    def base_url(self) -> str:
+        """Root URL of the server; only valid once the server was started."""
+        # The "not started" sentinel is -1, so compare against that rather
+        # than None.
+        assert self.port > 0, "LocalAssetServer has not been started"
+        return f"http://{self.address}:{self.port}"
+    def url_for(self, name: str) -> str:
+        """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
+        return f"{self.base_url}/{name}"
+    def get_image_asset(self, name: str) -> Image.Image:
+        """Fetch ``name`` from this server via ``fetch_image``."""
+        return fetch_image(self.url_for(name))
+@pytest.fixture(scope="session")
+def local_asset_server() -> Generator[LocalAssetServer, None, None]:
+    """
+    Starts a thread-based HTTP server bound to 127.0.0.1 on a random free port.
+    The server currently serves images at:
+    http://127.0.0.1:<port>/<name>.<ext>
+    """
+    with LocalAssetServer() as srv:
+        yield srv
+@pytest.fixture
+def image_url(request, local_asset_server) -> str:
+    """Indirect fixture: takes one asset file name, returns its full URL."""
+    # request.param is one of the IMAGE_ASSETS filenames
+    name = request.param
+    return local_asset_server.url_for(name)
+@pytest.fixture
+def image_urls(request, local_asset_server) -> list[str]:
+    """Indirect fixture: takes a list of names, returns list of full URLs."""
+    # request.param carries the names supplied by the parametrization.
+    names: list[str] = request.param
+    return [local_asset_server.url_for(name) for name in names]
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 @pytest.mark.parametrize("seed", [1])
 def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
                                                  test_llm_generator):
-    """Verify block manager v2 with auto prefix caching could works normal
+    """Verify block manager v2 with auto prefix caching could work normally
    even when eviction started.
    With APC enabled, all blocks are held by native block at the beginning.
-    Then blocks are managed by evictor instead. If cache hit at the evitor's
+    Then blocks are managed by evictor instead. If cache hit at the evictor's
    block, then it could be reused, or we need to recompute its kv cache.
    """
    output_len = 10

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -10,7 +10,8 @@ import pytest  # noqa
 import torch
 from torch import Use  # noqa
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
@@ -641,7 +642,7 @@ def test_schedule_decode_blocks_to_copy_update():
    # Nothing is preempted.
    assert output.blocks_to_swap_out == []
    # Since append_slot returns the source -> dist mapping, it should
-    # applied.
+    # be applied.
    assert output.blocks_to_copy == [(2, 3)]

--- a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
+++ b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from vllm.sampling_params import SamplingParams
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine.detokenizer import BaseIncrementalDetokenizer
+@pytest.fixture(params=[True, False])
+def include_stop_str_in_output(request):
+    """Parametrized fixture yielding both values of the flag."""
+    return request.param
+class _DummyDetokenizer(BaseIncrementalDetokenizer):
+    """Detokenizer stub whose token ids decode to single characters."""
+    def __init__(self, request: EngineCoreRequest):
+        super().__init__(request)
+    def decode_next(self, next_token_id: int) -> str:
+        # Map token id to single ASCII character for deterministic testing.
+        return chr(next_token_id)
+def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
+    """Build a minimal ``EngineCoreRequest`` carrying the given stop settings.
+    ``stop``, ``include_stop_str_in_output`` and ``min_tokens`` are forwarded
+    to ``SamplingParams``; every other request field is a fixed placeholder.
+    """
+    params = SamplingParams(
+        stop=stop,
+        include_stop_str_in_output=include_stop_str_in_output,
+        min_tokens=min_tokens)
+    # Keep other fields minimal for unit test purposes.
+    req = EngineCoreRequest(
+        request_id="test",
+        prompt_token_ids=[],
+        mm_features=None,
+        sampling_params=params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
+        cache_salt=None,
+        data_parallel_rank=None,
+    )
+    return req
+def test_stop_string_while_stop_token_terminates(
+        include_stop_str_in_output: bool):
+    """
+    This test verifies that the detokenizer correctly handles the case where
+    the generated token sequence contains both:
+    - a stop string
+    - an <eos> token
+    The detokenizer should respect the stop string and truncate the output
+    accordingly.
+    Imagine the following sequence:
+    - "abcdeZ" is generated, where "Z" is the <eos> token.
+    - "cd" is the stop string.
+    If include_stop_str_in_output=False, the detokenizer should truncate the
+    output to "ab" because the stop string "cd" is excluded.
+    If include_stop_str_in_output=True, the detokenizer should include the stop
+    string "cd" in the output, resulting in "abcd".
+    This verifies the behavioral change introduced in BaseIncrementalDetokenizer
+    where stop-string evaluation occurs before the early-return on
+    stop_terminated.
+    """
+    # Generate text "abcdeZ" and tokenize it.
+    generated_text = "abcde"
+    eos_token = "Z"
+    stop_string = "cd"
+    generated_text = generated_text + eos_token
+    # Inverse of _DummyDetokenizer.decode_next, which applies chr().
+    token_ids = [ord(c) for c in generated_text]
+    # Create a request with the stop string and initialize the detokenizer.
+    req = _make_request(stop=[stop_string],
+                        include_stop_str_in_output=include_stop_str_in_output)
+    detok = _DummyDetokenizer(req)
+    # Simulate that the last token ('Z') is a stop token (stop_terminated=True).
+    result = detok.update(new_token_ids=token_ids, stop_terminated=True)
+    # The update should report the matched stop string even though the
+    # sequence was also terminated by a stop token.
+    assert result == stop_string
+    # Output text should reflect stop-string handling:
+    # - include_stop_str_in_output=False => exclude "cd" => "ab"
+    # - include_stop_str_in_output=True  => include "cd" => "abcd"
+    expected_text = "abcd" if include_stop_str_in_output else "ab"
+    assert detok.output_text == expected_text
+    # The skipped final token should still be recorded in token_ids.
+    assert detok.output_token_ids == token_ids
+    # get_next_output_text should return the full text when finished=True.
+    # (Buffering only applies during streaming when finished=False.)
+    assert detok.get_next_output_text(finished=True,
+                                      delta=False) == expected_text
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -8,7 +8,7 @@ import msgspec.msgpack
 import pytest
 import zmq
-from vllm.config import KVEventsConfig
+from vllm.config.kv_events import KVEventsConfig
 from vllm.distributed.kv_events import EventPublisherFactory
 from .test_events import SampleBatch

--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+WARNING: This test runs in both single-node (4 GPUs) and multi-node
+ (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
+ important to set the distributed backend to "mp" to avoid Ray scheduling
+ all workers in a node other than the head node, which can cause the test
+ to fail.
+"""
+import json
+import os
+from dataclasses import dataclass
+from typing import Literal, NamedTuple, Optional
+import pytest
+from vllm.config import RunnerOption
+from vllm.logger import init_logger
+from ..models.registry import HF_EXAMPLE_MODELS
+from ..utils import compare_two_settings, create_new_process_for_each_test
+logger = init_logger("test_context_parallel")
+VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+class ParallelSetup(NamedTuple):
+    """One parallelism configuration exercised by the comparison test."""
+    # Passed as --tensor-parallel-size.
+    tp_size: int
+    # Passed as --pipeline-parallel-size.
+    pp_size: int
+    # Passed as --decode-context-parallel-size (context-parallel run only).
+    dcp_size: int
+    # When true, --enforce-eager is added.
+    eager_mode: bool
+    # When true, --enable-chunked-prefill is added.
+    chunked_prefill: bool
+class CPTestOptions(NamedTuple):
+    """Options that are independent of the parallel setup."""
+    # When true, the test is skipped unless VLLM_MULTI_NODE is set.
+    multi_node_only: bool
+    # Passed as --load-format when set; "dummy" also shrinks the model config.
+    load_format: Optional[str] = None
+@dataclass
+class CPTestSettings:
+    """Bundle of parallel setups, backends and options for one model."""
+    parallel_setups: list[ParallelSetup]
+    # NOTE: the length of distributed_backends and
+    # vllm_major_versions should be the same, and they
+    # are first zipped together to iterate over all
+    # test settings.
+    distributed_backends: list[str]
+    # vllm major version: "0" for V0, "1" for V1
+    vllm_major_versions: list[str]
+    runner: RunnerOption
+    test_options: CPTestOptions
+    def __post_init__(self):
+        """Reject settings whose backend and version lists differ in length."""
+        if len(self.distributed_backends) != len(self.vllm_major_versions):
+            raise ValueError(
+                f"Length mismatch: distributed_backends "
+                f"({len(self.distributed_backends)}) != "
+                f"vllm_major_versions ({len(self.vllm_major_versions)})")
+    @staticmethod
+    def detailed(
+        *,
+        tp_base: int = 4,
+        pp_base: int = 1,
+        dcp_base: int = 1,
+        multi_node_only: bool = False,
+        runner: RunnerOption = "auto",
+        load_format: Optional[str] = None,
+    ):
+        """Build settings with one setup per DCP multiplier (2 and 4).
+        Every generated setup uses ``tp_base`` as the TP size, ``pp_base`` as
+        the PP size, non-eager mode and chunked prefill, with the "mp"
+        backend on the V1 engine.
+        """
+        parallel_setups = []
+        # The single-element lists keep the sweep shape easy to extend.
+        for eager_mode_val in [False]:
+            for pp_multiplier in [1]:
+                for dcp_multiplier in [2, 4]:
+                    for chunked_prefill_val in [True]:
+                        parallel_setups.append(
+                            ParallelSetup(tp_size=tp_base,
+                                          pp_size=pp_multiplier * pp_base,
+                                          dcp_size=dcp_multiplier * dcp_base,
+                                          eager_mode=eager_mode_val,
+                                          chunked_prefill=chunked_prefill_val))
+        return CPTestSettings(
+            parallel_setups=parallel_setups,
+            distributed_backends=["mp"],
+            vllm_major_versions=["1"],
+            runner=runner,
+            test_options=CPTestOptions(multi_node_only=multi_node_only,
+                                       load_format=load_format),
+        )
+    def iter_params(self, model_id: str):
+        """Yield one pytest parameter tuple per (setup, backend/version) pair."""
+        opts = self.test_options
+        for parallel_setup in self.parallel_setups:
+            for backend, vllm_major_version in zip(self.distributed_backends,
+                                                   self.vllm_major_versions):
+                yield (model_id, parallel_setup, backend, vllm_major_version,
+                       self.runner, opts)
+def _compare_cp_with_tp(
+    model_id: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    vllm_major_version: str,
+    runner: RunnerOption,
+    test_options: CPTestOptions,
+    num_gpus_available: int,
+    *,
+    method: Literal["generate"],
+    is_multimodal: bool,
+):
+    """Compare a decode-context-parallel run against a plain TP/PP run.
+    Both runs share the same model, common CLI arguments and environment;
+    the context-parallel run additionally passes
+    ``--decode-context-parallel-size``. The comparison itself is delegated
+    to ``compare_two_settings``. The test is skipped when the model, GPU
+    count or node topology does not fit the requested setup.
+    """
+    (
+        tp_size,
+        pp_size,
+        dcp_size,
+        eager_mode,
+        chunked_prefill,
+    ) = parallel_setup
+    multi_node_only, load_format = test_options
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_transformers_version(on_fail="skip")
+    trust_remote_code = model_info.trust_remote_code
+    tokenizer_mode = model_info.tokenizer_mode
+    hf_overrides = model_info.hf_overrides
+    if load_format == "dummy":
+        # Avoid OOM
+        text_overrides = {
+            "num_hidden_layers": 4,
+            "hidden_size": 512,
+            "intermediate_size": 800,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 1,
+        }
+        # NOTE(review): update() mutates the object held by model_info in
+        # place; confirm it is not shared with other tests.
+        if is_multimodal:
+            hf_overrides.update({"text_config": text_overrides})
+        else:
+            hf_overrides.update(text_overrides)
+    else:
+        model_info.check_available_online(on_fail="skip")
+    if num_gpus_available < tp_size * pp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
+    # NOTE(review): the skip message below says "pipeline parallel" although
+    # this is the context-parallel test.
+    if VLLM_MULTI_NODE and distributed_backend == "mp":
+        pytest.skip("Skipping multi-node pipeline parallel test for "
+                    "multiprocessing distributed backend")
+    if multi_node_only and not VLLM_MULTI_NODE:
+        pytest.skip("Not in multi-node setting")
+    # Arguments shared by the context-parallel and the baseline run.
+    common_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+    ]
+    if chunked_prefill:
+        common_args.append("--enable-chunked-prefill")
+    if eager_mode:
+        common_args.append("--enforce-eager")
+    if runner != "auto":
+        common_args.extend(["--runner", runner])
+    if trust_remote_code:
+        common_args.append("--trust-remote-code")
+    if tokenizer_mode:
+        common_args.extend(["--tokenizer-mode", tokenizer_mode])
+    if load_format:
+        common_args.extend(["--load-format", load_format])
+    if hf_overrides:
+        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+    # Both names are bound to the same dict object.
+    cp_env = tp_env = {
+        "VLLM_USE_V1":
+        vllm_major_version,  # Note(hc): DCP supports the V1 engine only
+    }
+    cp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--pipeline-parallel-size",
+        str(pp_size),
+        "--decode-context-parallel-size",
+        str(dcp_size),
+        "--distributed-executor-backend",
+        distributed_backend,
+    ]
+    # Baseline: identical to cp_args minus the decode-context-parallel size.
+    tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--pipeline-parallel-size",
+        str(pp_size),
+        "--distributed-executor-backend",
+        distributed_backend,
+    ]
+    try:
+        compare_two_settings(model_id,
+                             cp_args,
+                             tp_args,
+                             cp_env,
+                             tp_env,
+                             method=method,
+                             max_wait_seconds=720)
+    except Exception:
+        # cp_env is always a dict at this point, so this flag is always True
+        # and only the version check below decides whether to re-raise.
+        testing_ray_compiled_graph = cp_env is not None
+        if testing_ray_compiled_graph and vllm_major_version == "0":
+            # Ray Compiled Graph tests are flaky for V0,
+            # so we don't want to fail the test
+            logger.exception("Ray Compiled Graph tests failed")
+        else:
+            raise
+CP_TEXT_GENERATION_MODELS = {
+    # [MLA attention only]
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
+}
+CP_TEST_MODELS = [
+    # TODO support other models
+    # [LANGUAGE GENERATION]
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
+]
+@pytest.mark.parametrize(
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "runner", "test_options"),
+    [
+        # Only models that are also listed in CP_TEST_MODELS are expanded.
+        params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in CP_TEST_MODELS
+    ],
+)
+@create_new_process_for_each_test()
+def test_cp_generation(
+    model_id: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    vllm_major_version: str,
+    runner: RunnerOption,
+    test_options: CPTestOptions,
+    num_gpus_available,
+):
+    """Run the context-parallel vs. TP comparison for text generation."""
+    _compare_cp_with_tp(model_id,
+                        parallel_setup,
+                        distributed_backend,
+                        vllm_major_version,
+                        runner,
+                        test_options,
+                        num_gpus_available,
+                        method="generate",
+                        is_multimodal=False)
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -298,6 +298,8 @@ def _compare_tp(
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides
    hf_config = get_config(model_id, trust_remote_code)
+    skip_tokenizer_init = model_info.skip_tokenizer_init
+    max_num_seqs = model_info.max_num_seqs
    dtype = "float16"
    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
@@ -351,6 +353,10 @@ def _compare_tp(
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+    if skip_tokenizer_init:
+        common_args.append("--skip-tokenizer-init")
+    if max_num_seqs:
+        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])
    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
    testing_ray_compiled_graph = False

--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -178,6 +178,7 @@ def _compare_sp(
    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides
+    skip_tokenizer_init = model_info.skip_tokenizer_init
    if load_format == "dummy":
        # Avoid OOM
@@ -227,6 +228,8 @@ def _compare_sp(
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+    if skip_tokenizer_init:
+        common_args.append("--skip-tokenizer-init")
    compilation_config = {
        'level': 3,

--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -63,6 +63,7 @@ def clear_cache():
    current_platform.is_cpu(),
    reason="CPU backend is not currently supported with encoder/decoder models"
 )
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_encoder_decoder_e2e(
    hf_runner,
    vllm_runner,

--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -167,7 +167,7 @@ def test_get_kwargs():
    # dict should have json tip in help
    json_tip = "Should either be a valid JSON string or JSON keys"
    assert json_tip in kwargs["json_tip"]["help"]
-    # nested config should should construct the nested config
+    # nested config should construct the nested config
    assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)
@@ -287,15 +287,6 @@ def test_prefix_cache_default():
        },
        "mm-processor-kwargs"
    ),
-    (
-        '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
-        {
-            "cast_logits_dtype": "bfloat16",
-            "sequence_parallel_norm": True,
-            "sequence_parallel_norm_threshold": 2048,
-        },
-        "override-neuron-config"
-    ),
 ])
 # yapf: enable
 def test_composite_arg_parser(arg, expected, option):

--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict] = None) -> list[Any]:
-        # Drop marker to show that this was ran
+        # Drop marker to show that this was run
        with open(".marker", "w"):
            ...
        return super().collective_rpc(method, timeout, args, kwargs)