Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

a99300bd · zhuwenwen · cc3e01c7 · 5438967f · a99300bd · a99300bd
Commit a99300bd authored Sep 09, 2025 by zhuwenwen
20 changed files
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -21,7 +21,6 @@ runai-model-streamer-s3==0.11.0
 # conch-triton-kernels==1.2.1 # numpy>=1.26.4
 numa
-python-multipart
 pytrie
 setuptools_scm>=8
 cmake==3.29
@@ -30,4 +29,6 @@ torch == 2.5.1
 triton == 3.0.0
 flash_attn == 2.6.1
 flash_mla == 1.0.0
-lmslim == 0.3.1
+lightop == 0.5.0
\ No newline at end of file
+lmslim == 0.3.1
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.7.1
+torch==2.8.0
-torchaudio==2.7.1
+torchaudio==2.8.0
-torchvision==0.22.1
+torchvision==0.23.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.2 # required for voxtral test
@@ -32,9 +32,10 @@ num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api]==0.4.8 # required for model evaluation test
+# TODO: Use lm-eval[api]==0.4.10 once released
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb[bm25s]>=1.38.11, <2 # required for mteb test
-transformers==4.55.0
+transformers==4.55.2
 tokenizers==0.21.1
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
@@ -53,3 +54,4 @@ runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
 pydantic>=2.10 # 2.9 leads to error on python 3.10
 terratorch==1.1rc2 # required for PrithviMAE test
+decord==0.6.0
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -156,6 +156,8 @@ datasets==3.0.2
    #   mteb
 decorator==5.1.1
    # via librosa
+decord==0.6.0
+    # via -r requirements/test.in
 dill==0.3.8
    # via
    #   datasets
@@ -408,7 +410,7 @@ lightning-utilities==0.14.3
    #   torchmetrics
 llvmlite==0.44.0
    # via numba
-lm-eval==0.4.8
+lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
    # via -r requirements/test.in
 lxml==5.3.0
    # via
@@ -493,6 +495,7 @@ numpy==1.26.4
    #   contourpy
    #   cupy-cuda12x
    #   datasets
+    #   decord
    #   einx
    #   encodec
    #   evaluate
@@ -538,42 +541,42 @@ numpy==1.26.4
    #   tritonclient
    #   vocos
    #   xarray
-nvidia-cublas-cu12==12.8.3.14
+nvidia-cublas-cu12==12.8.4.1
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
-nvidia-cuda-cupti-cu12==12.8.57
+nvidia-cuda-cupti-cu12==12.8.90
    # via torch
-nvidia-cuda-nvrtc-cu12==12.8.61
+nvidia-cuda-nvrtc-cu12==12.8.93
    # via torch
-nvidia-cuda-runtime-cu12==12.8.57
+nvidia-cuda-runtime-cu12==12.8.90
    # via torch
-nvidia-cudnn-cu12==9.7.1.26
+nvidia-cudnn-cu12==9.10.2.21
    # via torch
-nvidia-cufft-cu12==11.3.3.41
+nvidia-cufft-cu12==11.3.3.83
    # via torch
-nvidia-cufile-cu12==1.13.0.11
+nvidia-cufile-cu12==1.13.1.3
    # via torch
-nvidia-curand-cu12==10.3.9.55
+nvidia-curand-cu12==10.3.9.90
    # via torch
-nvidia-cusolver-cu12==11.7.2.55
+nvidia-cusolver-cu12==11.7.3.90
    # via torch
-nvidia-cusparse-cu12==12.5.7.53
+nvidia-cusparse-cu12==12.5.8.93
    # via
    #   nvidia-cusolver-cu12
    #   torch
-nvidia-cusparselt-cu12==0.6.3
+nvidia-cusparselt-cu12==0.7.1
    # via torch
-nvidia-nccl-cu12==2.26.2
+nvidia-nccl-cu12==2.27.3
    # via torch
-nvidia-nvjitlink-cu12==12.8.61
+nvidia-nvjitlink-cu12==12.8.93
    # via
    #   nvidia-cufft-cu12
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
-nvidia-nvtx-cu12==12.8.55
+nvidia-nvtx-cu12==12.8.90
    # via torch
 omegaconf==2.3.0
    # via
@@ -742,7 +745,7 @@ pycparser==2.22
    # via cffi
 pycryptodomex==3.22.0
    # via blobfile
-pydantic==2.11.5
+pydantic==2.11.7
    # via
    #   -r requirements/test.in
    #   albumentations
@@ -1066,7 +1069,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.7.1+cu128
+torch==2.8.0+cu128
    # via
    #   -r requirements/test.in
    #   accelerate
@@ -1095,7 +1098,7 @@ torch==2.7.1+cu128
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.7.1+cu128
+torchaudio==2.8.0+cu128
    # via
    #   -r requirements/test.in
    #   encodec
@@ -1108,7 +1111,7 @@ torchmetrics==1.7.4
    #   pytorch-lightning
    #   terratorch
    #   torchgeo
-torchvision==0.22.1+cu128
+torchvision==0.23.0+cu128
    # via
    #   -r requirements/test.in
    #   lightly
@@ -1139,7 +1142,7 @@ tqdm==4.66.6
    #   transformers
 tqdm-multiprocess==0.0.11
    # via lm-eval
-transformers==4.55.0
+transformers==4.55.2
    # via
    #   -r requirements/test.in
    #   genai-perf
@@ -1149,7 +1152,7 @@ transformers==4.55.0
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.3.1
+triton==3.4.0
    # via torch
 tritonclient==2.51.0
    # via

--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -11,6 +11,7 @@ ray[default]
 ray[data]
 setuptools==78.1.0
 nixl==0.3.0
+tpu_info==0.4.0
 # Install torch_xla
 --pre

--- a/setup.py
+++ b/setup.py
@@ -550,8 +550,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
    new_version_content = f"""
 try:
-    __version__ = "0.10.1.1"
+    __version__ = "0.10.2.rc1"
-    __version_tuple__ = (0, 10, 1, 1)
+    __version_tuple__ = (0, 10, 2, "rc1")
    __hcu_version__ = f'0.10.1.1+{version}' 
    from vllm.version import __version__, __version_tuple__, __hcu_version__
@@ -765,16 +765,25 @@ if envs.VLLM_USE_PRECOMPILED:
    if wheel_location is not None:
        wheel_url = wheel_location
    else:
+        import platform
+        arch = platform.machine()
+        if arch == "x86_64":
+            wheel_tag = "manylinux1_x86_64"
+        elif arch == "aarch64":
+            wheel_tag = "manylinux2014_aarch64"
+        else:
+            raise ValueError(f"Unsupported architecture: {arch}")
        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
-        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        nightly_wheel_url = f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
        from urllib.request import urlopen
        try:
            with urlopen(wheel_url) as resp:
                if resp.status != 200:
-                    wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+                    wheel_url = nightly_wheel_url
        except Exception as e:
            print(f"[warn] Falling back to nightly wheel: {e}")
-            wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+            wheel_url = nightly_wheel_url
    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
        wheel_url)
@@ -807,7 +816,9 @@ setup(
                  "mistral_common[audio]"],  # Required for audio processing
        "video": [],  # Kept for backwards compatibility
        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.11"],
+        "flashinfer": ["flashinfer-python==0.2.14.post1"],
+        # Optional deps for AMD FP4 quantization support
+        "petit-kernel": ["petit-kernel"],
    },
    cmdclass=cmdclass,
    package_data=package_data,

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -12,7 +12,6 @@ import pytest
 import torch
 from vllm import LLM, envs
-from vllm.platforms import current_platform
 from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
 from ..conftest import HfRunner, VllmRunner
@@ -83,11 +82,7 @@ def test_models(
            "VLLM_USE_V1") and envs.VLLM_USE_V1:
        pytest.skip("enable_prompt_embeds is not supported in v1.")
-    if backend == "FLASHINFER" and current_platform.is_rocm():
+    if backend == "XFORMERS" and model == os.path.join(models_path_prefix, "google/gemma-2-2b-it"):
-        pytest.skip("Flashinfer does not support ROCm/HIP.")
-    if backend in ("XFORMERS",
-                   "FLASHINFER") and model == os.path.join(models_path_prefix, "google/gemma-2-2b-it"):
        pytest.skip(
            f"{backend} does not support gemma2 with full context length.")
@@ -146,6 +141,7 @@ def test_models(
        )
 # @multi_gpu_test(num_gpus=2)
 # @pytest.mark.parametrize(
 #     "model, distributed_executor_backend, attention_backend, "
@@ -162,8 +158,6 @@ def test_models(
 #         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
 #         ("distilbert/distilgpt2", "ray", "", "A100", {}),
 #         ("distilbert/distilgpt2", "mp", "", "A100", {}),
-#         ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100", {}),
-#         ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", {}),
 #     ])
 # @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 # def test_models_distributed(

--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -176,4 +176,35 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
        output3 = llm.generate(prompt, sampling_params)
        # cmp output
        assert output[0].outputs[0].text == output3[0].outputs[0].text
\ No newline at end of file
+@create_new_process_for_each_test()
+def test_deep_sleep():
+    """Check GPU memory use and output stability across a level-2 sleep.
+    The engine generates once, is put to sleep with ``level=2``, is woken up
+    in two tagged stages ("weights", then "kv_cache"), and generates again.
+    Memory used on top of the pre-test baseline is bounded after the sleep
+    and after the weights stage; the two generations must produce equal text.
+    """
+    model = "Qwen/Qwen3-0.6B"
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    # temperature=0 so the before/after outputs can be compared for equality
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+    # Put the engine to deep sleep
+    llm.sleep(level=2)
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    # Usage attributable to this test: current usage minus the baseline
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    assert used_bytes < 3 * GiB_bytes
+    # First wake-up stage: only the "weights" tag, followed by a reload RPC
+    llm.wake_up(tags=["weights"])
+    llm.collective_rpc("reload_weights")
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+    assert used_bytes < 4 * GiB_bytes
+    # now allocate kv cache and cuda graph memory
+    llm.wake_up(tags=["kv_cache"])
+    output2 = llm.generate(prompt, sampling_params)
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
--- a/tests/benchmarks/test_random_dataset.py
+++ b/tests/benchmarks/test_random_dataset.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+from typing import Any, NamedTuple, Optional, cast
+import numpy as np
+import pytest
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+from vllm.benchmarks.datasets import (RandomDataset, RandomMultiModalDataset,
+                                      SampleRequest)
+@pytest.fixture(scope="session")
+def hf_tokenizer() -> PreTrainedTokenizerBase:
+    """Session-scoped GPT-2 tokenizer shared by the tests in this module."""
+    # gpt2 is a small tokenizer that is commonly available
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    return tokenizer
+class Params(NamedTuple):
+    """Bundle of sampling arguments used by the RandomDataset seed tests.
+    Each field is forwarded under the same keyword name to
+    ``_collect_samples`` by the tests that consume this tuple.
+    """
+    # Field order is part of the tuple's interface; keep it stable.
+    num_requests: int
+    prefix_len: int
+    range_ratio: float
+    input_len: int
+    output_len: int
+@pytest.fixture(scope="session")
+def random_dataset_params() -> Params:
+    """Session-scoped default sampling arguments for RandomDataset tests."""
+    defaults = Params(
+        num_requests=16,
+        prefix_len=7,
+        range_ratio=0.3,
+        input_len=50,
+        output_len=20,
+    )
+    return defaults
+def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
+    """Reduce a SampleRequest to the three fields the seed tests compare."""
+    prompt = req.prompt
+    prompt_len = req.prompt_len
+    expected_output_len = req.expected_output_len
+    return prompt, prompt_len, expected_output_len
+def _collect_samples(dataset: RandomDataset,
+                     tokenizer: PreTrainedTokenizerBase,
+                     num_requests: int = 16,
+                     prefix_len: int = 7,
+                     range_ratio: float = 0.3,
+                     input_len: int = 50,
+                     output_len: int = 20) -> list[tuple[str, int, int]]:
+    """Sample from ``dataset`` and return one fingerprint per request.
+    All sizing arguments are passed straight through to ``dataset.sample``;
+    each returned request is projected with ``_fingerprint_sample``.
+    """
+    sample_kwargs = dict(
+        num_requests=num_requests,
+        prefix_len=prefix_len,
+        range_ratio=range_ratio,
+        input_len=input_len,
+        output_len=output_len,
+    )
+    requests = dataset.sample(tokenizer=tokenizer, **sample_kwargs)
+    fingerprints = []
+    for request in requests:
+        fingerprints.append(_fingerprint_sample(request))
+    return fingerprints
+@pytest.mark.benchmark
+def test_random_dataset_same_seed(
+        hf_tokenizer: PreTrainedTokenizerBase,
+        random_dataset_params: Params) -> None:
+    """Two datasets built with one seed must produce identical samples.
+    The global ``random`` and ``np.random`` generators are reseeded and
+    advanced between the two sampling calls, so the comparison fails if
+    RandomDataset draws from either global generator.
+    """
+    # Params field names match the keyword names of _collect_samples
+    sample_kwargs = random_dataset_params._asdict()
+    shared_seed = 123
+    first_dataset = RandomDataset(random_seed=shared_seed)
+    second_dataset = RandomDataset(random_seed=shared_seed)
+    first = _collect_samples(first_dataset, hf_tokenizer, **sample_kwargs)
+    # Disturb both global generators before drawing the second batch
+    random.seed(999)
+    for _ in range(100):
+        random.random()
+    np.random.seed(888)
+    for _ in range(100):
+        np.random.random()
+    second = _collect_samples(second_dataset, hf_tokenizer, **sample_kwargs)
+    assert first == second
+@pytest.mark.benchmark
+def test_random_dataset_different_seeds(
+        hf_tokenizer: PreTrainedTokenizerBase,
+        random_dataset_params: Params) -> None:
+    """Datasets built with different seeds are expected to sample differently.
+    Before the second draw the global generators are seeded with the first
+    dataset's seed, so a dataset that relied on global state would reproduce
+    the first batch and fail the inequality check.
+    """
+    # Params field names match the keyword names of _collect_samples
+    sample_kwargs = random_dataset_params._asdict()
+    first_seed = 0
+    first_dataset = RandomDataset(random_seed=first_seed)
+    first = _collect_samples(first_dataset, hf_tokenizer, **sample_kwargs)
+    second_seed = 999
+    second_dataset = RandomDataset(random_seed=second_seed)
+    # Seed the global generators like the first dataset to check isolation
+    random.seed(first_seed)
+    np.random.seed(first_seed)
+    second = _collect_samples(second_dataset, hf_tokenizer, **sample_kwargs)
+    assert first != second
+# -----------------------------
+# RandomMultiModalDataset tests
+# -----------------------------
+def _mm_fingerprint_sample(
+    req: SampleRequest,
+) -> tuple[str, int, int, int, list[str]]:
+    """Build a small comparable summary of a multimodal sample.
+    The result holds the prompt, prompt_len, expected_output_len, the number
+    of multimodal items, and one tag per item: ``image:`` or ``video:``
+    followed by the first 22 characters of the item's URL, or ``unknown:``
+    for anything that is not an image_url/video_url dict.
+    """
+    mm_items = req.multi_modal_data or []
+    tags: list[str] = []
+    for entry in mm_items:
+        if not isinstance(entry, dict):
+            tags.append("unknown:")
+            continue
+        entry_type = entry.get("type")
+        if entry_type == "image_url":
+            url = entry.get("image_url", {}).get("url", "")
+            # Truncate so large inline payloads do not bloat the fingerprint
+            tags.append(f"image:{url[:22]}")
+        elif entry_type == "video_url":
+            url = entry.get("video_url", {}).get("url", "")
+            tags.append(f"video:{url[:22]}")
+        else:
+            tags.append("unknown:")
+    return (req.prompt, req.prompt_len, req.expected_output_len,
+            len(mm_items), tags)
+def _collect_mm_samples(
+    dataset: RandomMultiModalDataset,
+    tokenizer: PreTrainedTokenizerBase,
+    *,
+    num_requests: int = 8,
+    prefix_len: int = 3,
+    range_ratio: float = 0.0,
+    input_len: int = 20,
+    output_len: int = 5,
+    base_items_per_request: int = 2,
+    num_mm_items_range_ratio: float = 0.0,
+    limit_mm_per_prompt: Optional[dict[str, int]] = None,
+    bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
+    enable_multimodal_chat: bool = False,
+) -> list[SampleRequest]:
+    """Call ``dataset.sample`` with test-friendly defaults.
+    When ``limit_mm_per_prompt`` or ``bucket_config`` is None, a fresh default
+    dict is created for this call; every other argument is forwarded as-is.
+    """
+    limits = ({"image": 5, "video": 0}
+              if limit_mm_per_prompt is None else limit_mm_per_prompt)
+    buckets = ({(32, 32, 1): 0.5, (52, 64, 1): 0.5}
+               if bucket_config is None else bucket_config)
+    sample_kwargs = dict(
+        num_requests=num_requests,
+        prefix_len=prefix_len,
+        range_ratio=range_ratio,
+        input_len=input_len,
+        output_len=output_len,
+        base_items_per_request=base_items_per_request,
+        num_mm_items_range_ratio=num_mm_items_range_ratio,
+        limit_mm_per_prompt=limits,
+        bucket_config=buckets,
+        enable_multimodal_chat=enable_multimodal_chat,
+    )
+    return dataset.sample(tokenizer=tokenizer, **sample_kwargs)
+@pytest.mark.benchmark
+def test_random_mm_same_seed(hf_tokenizer: PreTrainedTokenizerBase) -> None:
+    """Two multimodal datasets with one seed must fingerprint identically."""
+    shared_seed = 42
+    first_ds = RandomMultiModalDataset(random_seed=shared_seed)
+    second_ds = RandomMultiModalDataset(random_seed=shared_seed)
+    first_samples = _collect_mm_samples(first_ds, hf_tokenizer)
+    second_samples = _collect_mm_samples(second_ds, hf_tokenizer)
+    first_fingerprints = list(map(_mm_fingerprint_sample, first_samples))
+    second_fingerprints = list(map(_mm_fingerprint_sample, second_samples))
+    assert first_fingerprints == second_fingerprints
+@pytest.mark.benchmark
+def test_random_mm_different_seeds(
+    hf_tokenizer: PreTrainedTokenizerBase,
+) -> None:
+    """Multimodal datasets seeded 0 and 999 must fingerprint differently."""
+    first_ds = RandomMultiModalDataset(random_seed=0)
+    second_ds = RandomMultiModalDataset(random_seed=999)
+    first_samples = _collect_mm_samples(first_ds, hf_tokenizer)
+    second_samples = _collect_mm_samples(second_ds, hf_tokenizer)
+    first_fingerprints = list(map(_mm_fingerprint_sample, first_samples))
+    second_fingerprints = list(map(_mm_fingerprint_sample, second_samples))
+    assert first_fingerprints != second_fingerprints
+@pytest.mark.benchmark
+def test_random_mm_respects_limits(
+    hf_tokenizer: PreTrainedTokenizerBase,
+) -> None:
+    """Asking for more items than the per-prompt limit raises ValueError."""
+    dataset = RandomMultiModalDataset(random_seed=0)
+    # 3 items requested against an image limit of 1: per the current design
+    # the dataset errors out rather than silently clamping below the baseline.
+    over_limit_kwargs = dict(
+        num_requests=12,
+        base_items_per_request=3,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt={"image": 1, "video": 0},
+        bucket_config={(32, 32, 1): 1.0},
+    )
+    with pytest.raises(ValueError):
+        _collect_mm_samples(dataset, hf_tokenizer, **over_limit_kwargs)
+@pytest.mark.benchmark
+def test_random_mm_zero_prob_entries_are_removed(
+    hf_tokenizer: PreTrainedTokenizerBase,
+) -> None:
+    """Sampling works when one bucket has probability zero.
+    Every sample must carry a list of items, all of type ``image_url``.
+    """
+    dataset = RandomMultiModalDataset(random_seed=0)
+    # The (52, 64, 1) bucket has zero weight and should be ignored after
+    # normalization
+    weighted_buckets = {(32, 32, 1): 1.0, (52, 64, 1): 0.0}
+    samples = _collect_mm_samples(
+        dataset,
+        hf_tokenizer,
+        num_requests=6,
+        base_items_per_request=2,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt={"image": 10, "video": 0},
+        bucket_config=weighted_buckets,
+    )
+    for sample in samples:
+        assert isinstance(sample.multi_modal_data, list)
+        mm_items = cast(list[dict[str, Any]], sample.multi_modal_data)
+        assert all(entry.get("type") == "image_url" for entry in mm_items)
+@pytest.mark.benchmark
+def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
+    """With zero base items per request, every sample has an empty item list."""
+    dataset = RandomMultiModalDataset(random_seed=0)
+    no_item_kwargs = dict(
+        num_requests=5,
+        base_items_per_request=0,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt={"image": 5, "video": 0},
+        bucket_config={(32, 32, 1): 1.0},
+    )
+    samples = _collect_mm_samples(dataset, hf_tokenizer, **no_item_kwargs)
+    for sample in samples:
+        assert sample.multi_modal_data == []
+@pytest.mark.benchmark
+def test_random_mm_num_items_per_prompt(
+    hf_tokenizer: PreTrainedTokenizerBase) -> None:
+    """A range ratio of 0.0 yields exactly the base item count per prompt."""
+    dataset = RandomMultiModalDataset(random_seed=0)
+    # num_mm_items_range_ratio=0.0 pins the number of images per prompt
+    # TODO: modify video values when video sampling is implemented
+    fixed_count_kwargs = dict(
+        num_requests=5,
+        base_items_per_request=3,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt={"image": 3, "video": 0},
+        bucket_config={(32, 32, 1): 1.0},
+    )
+    fixed_count_samples = _collect_mm_samples(dataset, hf_tokenizer,
+                                              **fixed_count_kwargs)
+    # Expect 5 requests, each with exactly 3 image items
+    assert len(fixed_count_samples) == 5
+    for sample in fixed_count_samples:
+        mm_items = cast(list[dict[str, Any]], sample.multi_modal_data)
+        assert len(mm_items) == 3
+        assert all(entry.get("type") == "image_url" for entry in mm_items)
+@pytest.mark.benchmark
+def test_random_mm_bucket_config_not_mutated(
+    hf_tokenizer: PreTrainedTokenizerBase,
+) -> None:
+    """Sampling must leave the caller's bucket_config dict unchanged.
+    The same dataset is then sampled again with a range ratio of 0.5 to check
+    that a varying item count stays between 1 and 4 image items per prompt.
+    """
+    dataset = RandomMultiModalDataset(random_seed=0)
+    # Weights deliberately do not sum to 1, and there are more buckets than
+    # requested images
+    bucket_weights = {(32, 32, 1): 0.2, (52, 64, 1): 6, (25, 64, 1): 3}
+    # Copy taken before sampling, compared against afterwards
+    weights_before = {**bucket_weights}
+    _collect_mm_samples(
+        dataset,
+        hf_tokenizer,
+        num_requests=4,
+        base_items_per_request=1,
+        num_mm_items_range_ratio=0.0,
+        limit_mm_per_prompt={"image": 1, "video": 0},
+        bucket_config=bucket_weights,
+    )
+    assert bucket_weights == weights_before
+    # Second part: let the item count vary (num_mm_items_range_ratio=0.5)
+    varying_kwargs = dict(
+        num_requests=5,
+        base_items_per_request=2,
+        num_mm_items_range_ratio=0.5,
+        limit_mm_per_prompt={"image": 4, "video": 0},
+        bucket_config={(32, 32, 1): 1.0},
+    )
+    varying_samples = _collect_mm_samples(dataset, hf_tokenizer,
+                                          **varying_kwargs)
+    # Expect 5 requests, each holding between 1 and 4 image items
+    assert len(varying_samples) == 5
+    for sample in varying_samples:
+        mm_items = cast(list[dict[str, Any]], sample.multi_modal_data)
+        assert 1 <= len(mm_items) <= 4
+        assert all(entry.get("type") == "image_url" for entry in mm_items)
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -12,10 +12,9 @@ from vllm.compilation.backends import set_model_tag
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import (ignore_torch_compile,
                                         support_torch_compile)
-from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         set_current_vllm_config)
+                         VllmConfig, set_current_vllm_config)
-from vllm.envs import VLLM_USE_V1
+from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.forward_context import set_forward_context
 from vllm.utils import direct_register_custom_op
 # create a library to hold the custom op
@@ -164,104 +163,34 @@ class SimpleModelWithTwoGraphs(ParentModel):
        return x
-def test_ignore_torch_compile_decorator():
-    assert VLLM_USE_V1
-    # piecewise
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=True,
-        splitting_ops=["silly.attention"],
-        cudagraph_capture_sizes=[1, 2],
-    ))
-    @support_torch_compile
-    class A(nn.Module):
-        def __init__(self,
-                     *,
-                     vllm_config: VllmConfig,
-                     prefix: str = '',
-                     **kwargs) -> None:
-            super().__init__()
-        def forward(self, x: torch.Tensor) -> torch.Tensor:
-            x = x + x
-            attn_output = torch.empty_like(x)
-            torch.ops.silly.attention(x, x, x, attn_output)
-            x = attn_output
-            x = x * 3
-            return x
-    @ignore_torch_compile
-    class B(A):
-        ...
-    @support_torch_compile
-    class C(B):
-        ...
-    with set_current_vllm_config(vllm_config):
-        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
-    # A has support_torch_compile
-    with compilation_counter.expect(
-            num_graphs_seen=1,
-            num_piecewise_graphs_seen=3,
-            num_piecewise_capturable_graphs_seen=2,
-            num_backend_compilations=2,
-            num_cudagraph_captured=4,
-            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-    ), set_forward_context({}, vllm_config=vllm_config):
-        # first run is for compile
-        mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        # run cudagraph captured sizes
-        mod_A(torch.randn(2, MLP_SIZE).cuda())
-        mod_A(torch.randn(1, MLP_SIZE).cuda())
-    with set_current_vllm_config(vllm_config):
-        mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()
-    # B's ignore_torch_compile should override A's support_torch_compile
-    with compilation_counter.expect(
-            num_graphs_seen=0,
-            num_piecewise_graphs_seen=0,
-            num_piecewise_capturable_graphs_seen=0,
-            num_backend_compilations=0,
-            num_cudagraph_captured=0,
-    ), set_forward_context({}, vllm_config=vllm_config):
-        mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        mod_B(torch.randn(2, MLP_SIZE).cuda())
-        mod_B(torch.randn(1, MLP_SIZE).cuda())
-    with set_current_vllm_config(vllm_config):
-        mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()
-    # C's support_torch_compile should override B's ignore_torch_compile
-    with compilation_counter.expect(
-            num_graphs_seen=1,
-            num_piecewise_graphs_seen=3,
-            num_piecewise_capturable_graphs_seen=2,
-            num_backend_compilations=2,
-            num_cudagraph_captured=4,
-            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-    ), set_forward_context({}, vllm_config=vllm_config):
-        mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        mod_C(torch.randn(2, MLP_SIZE).cuda())
-        mod_C(torch.randn(1, MLP_SIZE).cuda())
 @torch.inference_mode
-def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor):
+def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor,
+              cudagraph_runtime_mode: CUDAGraphMode):
    with set_forward_context({}, vllm_config=vllm_config):
-        # First run is for compile
+        # warmup for the model with cudagraph_mode NONE
        model(inputs)
-        # Run CUDAGraph captured sizes
+        # simulate cudagraphs capturing
-        model(inputs[:2])
+        with set_forward_context({},
-        model(inputs[:1])
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
-        output = model(inputs[:2])
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            model(inputs[:2])
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=1, )):
+            model(inputs[:1])
+        # simulate cudagraphs replay
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            output = model(inputs[:2])
        output = output.cpu()
        return output.cpu()
@@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))
+    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
    with set_current_vllm_config(vllm_config):
        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
@@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal():
            num_cudagraph_captured=8,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
-        outputs.append(run_model(vllm_config, model, inputs))
+        outputs.append(
+            run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
    # no compile or cudagraph
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.NO_COMPILATION, ))
+    cudagraph_runtime_mode = CUDAGraphMode.NONE
    with set_current_vllm_config(vllm_config):
        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
@@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal():
            num_backend_compilations=0,
            num_cudagraph_captured=0,
    ):
-        outputs.append(run_model(vllm_config, model, inputs))
+        outputs.append(
+            run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
    # piecewise compile without CUDA graph
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
@@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
        use_cudagraph=False,
        splitting_ops=["silly.attention"],
    ))
+    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
    with set_current_vllm_config(vllm_config):
        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
@@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal():
            num_backend_compilations=4,
            num_cudagraph_captured=0,  # no cudagraph captured
    ):
-        outputs.append(run_model(vllm_config, model, inputs))
+        outputs.append(
+            run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
    # Generally don't expect outputs with and without inductor
    # to be bitwise equivalent

--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -30,15 +30,15 @@ class TestSetting:
    "test_setting",
    [
        # basic llama model
-        # TestSetting(
+        TestSetting(
-        #     model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
+            model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-        #     model_args=["--max-model-len", "2048"],
+            model_args=["--max-model-len", "2048"],
-        #     pp_size=2,
+            pp_size=2,
-        #     tp_size=2,
+            tp_size=2,
-        #     attn_backend="FLASHINFER",
+            attn_backend="FLASH_ATTN",
-        #     method="generate",
+            method="generate",
-        #     fullgraph=True,
+            fullgraph=True,
-        # ),
+        ),
        # llama model with quantization
        TestSetting(
            model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),

--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch import nn
+from torch.library import Library
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import (ignore_torch_compile,
+                                         support_torch_compile)
+from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
+                         CUDAGraphMode, VllmConfig, set_current_vllm_config)
+from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils import direct_register_custom_op
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+# Number of rows in the warm-up input that run_model feeds to the model.
+BATCH_SIZE = 32
+# Width (last dimension) of every random input tensor built by run_model.
+MLP_SIZE = 128
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    out: torch.Tensor) -> None:
+    """Overwrite ``out`` in place with ``q``, then add ``k`` and ``v`` to it.
+
+    This is the real implementation behind the ``attention`` op registered
+    in ``silly_lib``; it returns nothing and communicates only via ``out``.
+    """
+    out.copy_(q)
+    out += k
+    out += v
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
+    """No-op counterpart of ``silly_attention`` passed as its ``fake_impl``.
+
+    It touches none of its arguments.
+    NOTE(review): presumably invoked during tracing on fake tensors, where
+    no data needs to be written -- confirm against direct_register_custom_op.
+    """
+    return
+# Expose silly_attention as the op "attention" inside silly_lib and declare
+# that it mutates its "out" argument.
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+@torch.inference_mode
+def run_model(vllm_config: VllmConfig, model: nn.Module,
+              cudagraph_runtime_mode: CUDAGraphMode):
+    """Drive ``model`` through a warm-up pass and three batch-described passes.
+
+    Every input is a fresh random tensor of width ``MLP_SIZE`` moved to CUDA.
+
+    Args:
+        vllm_config: Config handed to every ``set_forward_context`` call.
+        model: Module under test; called with a single tensor argument.
+        cudagraph_runtime_mode: Mode set on the three inner forward contexts
+            (the warm-up pass runs without it).
+
+    Returns:
+        The output of the last pass (two rows of input), moved to the CPU.
+    """
+    with set_forward_context({}, vllm_config=vllm_config):
+        # warmup for the model with cudagraph_mode NONE
+        model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
+        # simulate cudagraphs capturing
+        # One pass per batch size used below: first 2 tokens, then 1 token.
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            model(torch.randn(2, MLP_SIZE).cuda())
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=1, )):
+            model(torch.randn(1, MLP_SIZE).cuda())
+        # simulate cudagraphs replay
+        # Same descriptor (num_tokens=2) as the first pass above.
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            output = model(torch.randn(2, MLP_SIZE).cuda())
+        output = output.cpu()
+        # NOTE(review): output was already moved to the CPU on the previous
+        # line, so this second .cpu() looks redundant -- confirm and simplify.
+        return output.cpu()
+def test_ignore_torch_compile_decorator():
+    """Check that the most-derived compile decorator wins in a hierarchy.
+
+    Builds ``A`` (support_torch_compile), ``B(A)`` (ignore_torch_compile) and
+    ``C(B)`` (support_torch_compile again), runs each through ``run_model``
+    and asserts the compilation counters: A and C are expected to compile,
+    B is expected to trigger no compilation at all.
+    """
+    # piecewise
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_cudagraph=True,
+        splitting_ops=["silly.attention"],
+        cudagraph_capture_sizes=[1, 2],
+    ))
+    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
+    @support_torch_compile
+    class A(nn.Module):
+        def __init__(self,
+                     *,
+                     vllm_config: VllmConfig,
+                     prefix: str = '',
+                     **kwargs) -> None:
+            super().__init__()
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            # One silly.attention call between two elementwise ops; the
+            # expectations below count 3 piecewise graphs for this forward,
+            # 2 of them capturable.
+            x = x + x
+            attn_output = torch.empty_like(x)
+            torch.ops.silly.attention(x, x, x, attn_output)
+            x = attn_output
+            x = x * 3
+            return x
+    @ignore_torch_compile
+    class B(A):
+        ...
+    @support_torch_compile
+    class C(B):
+        ...
+    with set_current_vllm_config(vllm_config):
+        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
+    # A has support_torch_compile
+    with compilation_counter.expect(
+            num_graphs_seen=1,
+            num_piecewise_graphs_seen=3,
+            num_piecewise_capturable_graphs_seen=2,
+            num_backend_compilations=2,
+            num_cudagraph_captured=4,
+            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+        run_model(vllm_config, mod_A, cudagraph_runtime_mode)
+    with set_current_vllm_config(vllm_config):
+        mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()
+    # B's ignore_torch_compile should override A's support_torch_compile
+    with compilation_counter.expect(
+            num_graphs_seen=0,
+            num_piecewise_graphs_seen=0,
+            num_piecewise_capturable_graphs_seen=0,
+            num_backend_compilations=0,
+            num_cudagraph_captured=0,
+    ):
+        run_model(vllm_config, mod_B, cudagraph_runtime_mode)
+    with set_current_vllm_config(vllm_config):
+        mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()
+    # C's support_torch_compile should override B's ignore_torch_compile
+    with compilation_counter.expect(
+            num_graphs_seen=1,
+            num_piecewise_graphs_seen=3,
+            num_piecewise_capturable_graphs_seen=2,
+            num_backend_compilations=2,
+            num_cudagraph_captured=4,
+            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+        run_model(vllm_config, mod_C, cudagraph_runtime_mode)
+# Only enable torch.compile if
+# vllm_config.cache_config.kv_sharing_fast_prefill=True
+@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
+                       kv_sharing_fast_prefill)
+class B(nn.Module):
+    """Leaf module: double, ``silly.attention(x, x, x)``, double again.
+
+    Its ``enable_if`` predicate returns
+    ``vllm_config.cache_config.kv_sharing_fast_prefill``.
+    """
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = '',
+                 **kwargs) -> None:
+        # All arguments are accepted but unused by this constructor.
+        super().__init__()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + x
+        # The custom op writes its result into the pre-allocated buffer.
+        attn_output = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, attn_output)
+        x = attn_output
+        x = x + x
+        return x
+# Only enable torch.compile if
+# vllm_config.cache_config.kv_sharing_fast_prefill=False
+@support_torch_compile(enable_if=lambda vllm_config: not vllm_config.
+                       cache_config.kv_sharing_fast_prefill)
+class A(nn.Module):
+    """Parent module: ``B`` -> ``silly.attention(x, x, x)`` -> ``B``.
+
+    Its ``enable_if`` predicate is the negation of the one used on ``B``,
+    i.e. ``not vllm_config.cache_config.kv_sharing_fast_prefill``.
+    """
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = '',
+                 **kwargs) -> None:
+        super().__init__()
+        # Both children receive the same config, prefix and extra kwargs.
+        self.mod1 = B(vllm_config=vllm_config, prefix=prefix, **kwargs)
+        self.mod2 = B(vllm_config=vllm_config, prefix=prefix, **kwargs)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.mod1(x)
+        # The custom op writes its result into the pre-allocated buffer.
+        attn_output = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, attn_output)
+        x = attn_output
+        x = self.mod2(x)
+        return x
+def test_conditional_compile_enable_if():
+    """Check that ``enable_if`` selects which of ``A`` / ``B`` gets compiled.
+
+    The module-level ``A`` is built and run twice, once with
+    ``kv_sharing_fast_prefill=True`` and once with ``False``; the expected
+    compilation counters differ between the two runs.
+    """
+    vllm_config = VllmConfig(cache_config=CacheConfig(
+        kv_sharing_fast_prefill=True, ),
+                             compilation_config=CompilationConfig(
+                                 level=CompilationLevel.PIECEWISE,
+                                 use_cudagraph=True,
+                                 splitting_ops=["silly.attention"],
+                                 cudagraph_capture_sizes=[1, 2],
+                             ))
+    # The same runtime mode is reused for both runs below.
+    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
+    with set_current_vllm_config(vllm_config):
+        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
+    # A has support_torch_compile but enable_if fn returns False
+    # enable_if will be True for B, so we expect mod1 and mod2
+    # to be compiled
+    with compilation_counter.expect(
+            num_graphs_seen=2,
+            num_piecewise_graphs_seen=6,
+            # 3 piecewise graphs per instance of B()
+            num_piecewise_capturable_graphs_seen=4,
+            num_backend_compilations=4,
+            num_cudagraph_captured=8,
+            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+        run_model(vllm_config, mod_A, cudagraph_runtime_mode)
+    # Set kv_sharing_fast_prefill=False
+    # which will cause A to be compiled and B to not be compiled
+    vllm_config = VllmConfig(cache_config=CacheConfig(
+        kv_sharing_fast_prefill=False, ),
+                             compilation_config=CompilationConfig(
+                                 level=CompilationLevel.PIECEWISE,
+                                 use_cudagraph=True,
+                                 splitting_ops=["silly.attention"],
+                                 cudagraph_capture_sizes=[1, 2],
+                             ))
+    with set_current_vllm_config(vllm_config):
+        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
+    with compilation_counter.expect(
+            num_graphs_seen=1,
+            num_piecewise_graphs_seen=7,
+            # 3 attn ops and 4 non-attn ops
+            num_piecewise_capturable_graphs_seen=4,
+            num_backend_compilations=4,
+            num_cudagraph_captured=8,
+            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+        run_model(vllm_config, mod_A, cudagraph_runtime_mode)
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
                "quantization": "gptq_marlin_24"
            }))
-        if is_quant_method_supported("marlin"):
-            TEST_MODELS.append(
-                ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
-                    "quantization": "marlin"
-                }))
        if not current_platform.is_rocm() and is_quant_method_supported("awq"):
            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
                "quantization": "AWQ"

--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@@ -148,7 +148,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [8])
 @pytest.mark.parametrize("hidden_size", [16])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
                    reason="Only test on CUDA")
 @pytest.mark.skipif(

--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -104,8 +104,7 @@ class TestQuantModel(torch.nn.Module):
        # Initialize weights
        torch.nn.init.normal_(self.gate_proj, std=0.02)
-        self.fp8_linear = Fp8LinearOp(cutlass_fp8_supported=True,
+        self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False)
-                                      use_per_token_if_dynamic=False)
        self.scale = torch.rand(1, dtype=torch.float32)
        # Create a weight that is compatible with torch._scaled_mm,

--- a/tests/compile/untest_functionalization.py
+++ b/tests/compile/untest_functionalization.py
@@ -8,11 +8,12 @@ import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
-from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
+from vllm.compilation.fusion import FUSED_OPS, FusionPass
-                                     kFp8DynamicTokenSym, kFp8StaticTensorSym)
 from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.config import CompilationConfig, PassConfig, VllmConfig
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym)
 from .backend import TestBackend

--- a/tests/compile/untest_fusion.py
+++ b/tests/compile/untest_fusion.py
@@ -7,13 +7,15 @@ import torch
 import vllm.envs as envs
 import vllm.plugins
 from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
-                                     FusionPass, GroupShape, QuantKey)
+                                     FusionPass)
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
                         VllmConfig)
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape, QuantKey, ScaleDesc)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity)
+    Fp8LinearOp, maybe_create_device_identity)
 from vllm.platforms import current_platform
 from .backend import TestBackend
@@ -24,16 +26,14 @@ FP8_DTYPE = current_platform.fp8_dtype()
 class TestModel(torch.nn.Module):
    def __init__(self, hidden_size: int, eps: float, static: bool,
-                 cutlass_fp8_enabled: bool, *args, **kwargs):
+                 force_fp8_e4m3fnuz: bool, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.cutlass_fp8_enabled = cutlass_fp8_enabled
+        self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz
        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
        group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
-        self.key = QuantKey(dtype=FP8_DTYPE,
+        quant_scale = ScaleDesc(torch.float32, static, group_shape)
-                            static=static,
+        self.key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
-                            group_shape=group_shape,
-                            symmetric=True)
        if static:
            self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
        else:
@@ -43,7 +43,7 @@ class TestModel(torch.nn.Module):
            for _ in range(2)
        ]
        self.fp8_linear = Fp8LinearOp(
-            cutlass_fp8_supported=cutlass_fp8_enabled,
+            force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
            act_quant_static=static,
            act_quant_group_shape=group_shape,
        )
@@ -81,12 +81,11 @@ class TestModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("static", [True, False])
-@pytest.mark.parametrize("cutlass_fp8_enabled",
+@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
-                         [True, False] if CUTLASS_FP8_SUPPORTED else [False])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
                    reason="Only test on CUDA and ROCm")
 def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
-                              cutlass_fp8_enabled):
+                              force_fp8_e4m3fnuz):
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(1)
@@ -103,7 +102,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
        fusion_pass = FusionPass.instance(vllm_config)
        backend = TestBackend(noop_pass, fusion_pass)
-        model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)
+        model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz)
        # First dimension dynamic
        x = torch.rand(num_tokens, hidden_size)

--- a/tests/compile/untest_fusion_attn.py
+++ b/tests/compile/untest_fusion_attn.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 from typing import Optional
 import pytest
@@ -7,13 +8,29 @@ import torch._dynamo
 from tests.compile.backend import TestBackend
 from tests.models.utils import check_outputs_equal
+from tests.v1.attention.utils import (BatchSpec, _Backend,
+                                      create_common_attn_metadata)
 from vllm import LLM, SamplingParams
-from vllm.compilation.fusion import QUANT_OPS, QuantKey, kFp8StaticTensorSym
+from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
+from vllm.attention import Attention
+from vllm.attention.selector import global_force_attn_backend_context_manager
+from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
 from vllm.compilation.noop_elimination import NoOpEliminationPass
-from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
+from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
+                         ModelConfig, PassConfig, SchedulerConfig, VllmConfig,
+                         set_current_vllm_config)
+from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey, kFp8StaticTensorSym, kNvfp4Quant)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    Fp8LinearOp)
 from vllm.platforms import current_platform
+from vllm.v1.kv_cache_interface import AttentionSpec
+FP8_DTYPE = current_platform.fp8_dtype()
+FP4_DTYPE = torch.uint8
 # globals needed for string-import custom Dynamo backend field
 backend: Optional[TestBackend] = None
@@ -90,9 +107,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
    # check support
    attn_fusion_supported = [
-        layer.impl.fused_output_quant_supported(quant_key.dtype,
+        layer.impl.fused_output_quant_supported(quant_key)
-                                                quant_key.static,
-                                                quant_key.group_shape)
        for key, layer in compile_config.static_forward_context.items()
    ]
@@ -132,3 +147,309 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
    # Reset backend to make sure llm2 gets released
    backend = None
+class AttentionQuantPatternModel(torch.nn.Module):
+    """Base model for AttentionQuantPattern fusion.
+
+    Owns a single ``Attention`` layer plus the metadata builder of that
+    layer's backend. Subclasses add the quantized matmul applied to the
+    attention output and expose a ``quant_key`` class attribute.
+    """
+    def __init__(self, num_qo_heads: int, num_kv_heads: int, head_size: int,
+                 kv_cache_dtype: torch.dtype, device: torch.device,
+                 vllm_config: VllmConfig, **kwargs):
+        """Create the attention layer and its metadata builder.
+
+        Args:
+            num_qo_heads: Passed to ``Attention`` as ``num_heads``.
+            num_kv_heads: Number of KV heads for the layer and the KV cache.
+            head_size: Per-head size; the attention scale is
+                ``1 / sqrt(head_size)``.
+            kv_cache_dtype: dtype of the KV cache spec and of the dummy cache.
+            device: Device for the builder and the dummy KV cache.
+            vllm_config: Supplies ``cache_config`` and configures the builder.
+            **kwargs: Not used by this base class.
+        """
+        super().__init__()
+        self.num_qo_heads = num_qo_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_size = head_size
+        self.kv_cache_dtype = kv_cache_dtype
+        self.device = device
+        self.vllm_config = vllm_config
+        self.attn = Attention(
+            num_heads=self.num_qo_heads,
+            head_size=self.head_size,
+            scale=1.0 / (self.head_size**0.5),
+            num_kv_heads=self.num_kv_heads,
+            cache_config=vllm_config.cache_config,
+            prefix="model.layers.0.self_attn.attn",
+        )
+        # Fixed block size shared by the cache spec and the dummy KV cache.
+        self.block_size = 16
+        # Initialize attn MetadataBuilder
+        self.builder = self.attn.attn_backend.get_builder_cls()(
+            kv_cache_spec=AttentionSpec(
+                block_size=self.block_size,
+                num_kv_heads=self.num_kv_heads,
+                head_size=self.head_size,
+                dtype=self.kv_cache_dtype,
+                use_mla=False,
+            ),
+            layer_names=[self.attn.layer_name],
+            vllm_config=self.vllm_config,
+            device=self.device,
+        )
+    def build_attn_metadata(self, batch_size: int):
+        """Initialize attention metadata.
+
+        Also allocates a zero-filled KV cache and installs it on
+        ``self.attn.kv_cache``. The built metadata is stored on
+        ``self.attn_metadata`` and returned.
+
+        Args:
+            batch_size: Number of requests; each gets sequence length 1 and
+                query length 1.
+        """
+        # Create common attn metadata
+        batch_spec = BatchSpec(seq_lens=[1] * batch_size,
+                               query_lens=[1] * batch_size)
+        common_attn_metadata = create_common_attn_metadata(
+            batch_spec,
+            self.block_size,
+            self.device,
+            arange_block_indices=True)
+        # Ceiling division: blocks needed to hold the longest sequence.
+        max_blocks = (max(batch_spec.seq_lens) + self.block_size -
+                      1) // self.block_size
+        num_blocks = batch_size * max_blocks
+        # Create dummy KV cache for FlashInfer TRTLLM
+        #   - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
+        #   - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
+        # Create kv_cache in HND layout and permute to NHD layout
+        # (later will be permuted back to HND layout in forward pass)
+        kv_cache = torch.zeros(num_blocks,
+                               2,
+                               self.num_kv_heads,
+                               self.block_size,
+                               self.head_size,
+                               dtype=self.kv_cache_dtype,
+                               device=self.device)
+        # Swap the num_kv_heads and block_size axes.
+        kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
+        self.attn.kv_cache = [kv_cache]
+        # Build attn metadata
+        self.attn_metadata = self.builder.build(
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata)
+        return self.attn_metadata
+class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
+    """Test model for AttentionFp8StaticQuantPattern fusion.
+
+    Feeds the attention output into ``Fp8LinearOp.apply``.
+    """
+    # Quantization key describing the output quantization of this model.
+    quant_key = kFp8StaticTensorSym
+    def __init__(self, *args, **kwargs):
+        """Build the base model, the FP8 linear op and the weight dict.
+
+        An optional ``w`` keyword argument supplies the weight dict (keys
+        ``weight``, ``wscale``, ``scale``); otherwise one is created here.
+        """
+        super().__init__(*args, **kwargs)
+        self.fp8_linear = Fp8LinearOp(
+            act_quant_static=self.quant_key.scale.static,
+            act_quant_group_shape=self.quant_key.scale.group_shape)
+        hidden_size = self.num_qo_heads * self.head_size
+        # NOTE: the default dict below is constructed even when "w" is given,
+        # because it is evaluated as an argument to kwargs.get.
+        self.w = kwargs.get(
+            "w", {
+                "weight":
+                torch.randn(hidden_size, hidden_size).to(
+                    dtype=FP8_DTYPE, device=self.device).t(),
+                "wscale":
+                torch.tensor([1.0], dtype=torch.float32, device=self.device),
+                "scale":
+                torch.tensor([1.0], dtype=torch.float32, device=self.device),
+            })
+    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
+        """Forward pass that creates the pattern to be fused."""
+        attn_output = self.attn(q, k, v)
+        return self.fp8_linear.apply(input=attn_output,
+                                     weight=self.w["weight"],
+                                     weight_scale=self.w["wscale"],
+                                     input_scale=self.w["scale"])
+class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
+    """Test model for AttentionNvfp4QuantPattern fusion.
+
+    Quantizes the attention output with ``scaled_fp4_quant`` and multiplies
+    it with the stored weight via ``cutlass_scaled_fp4_mm``.
+    """
+    # Quantization key describing the output quantization of this model.
+    quant_key = kNvfp4Quant
+    def __init__(self, *args, **kwargs):
+        """Build the base model and the weight dict.
+
+        An optional ``w`` keyword argument supplies the weight dict (keys
+        ``weight``, ``wscale_swizzled``, ``wscale``, ``scale``); otherwise
+        one is created here.
+        """
+        super().__init__(*args, **kwargs)
+        hidden_size = self.num_qo_heads * self.head_size
+        # NOTE: the default dict below is constructed even when "w" is given,
+        # because it is evaluated as an argument to kwargs.get.
+        self.w = kwargs.get(
+            "w", {
+                "weight":
+                torch.randint(256, (hidden_size, hidden_size // 2),
+                              dtype=FP4_DTYPE,
+                              device=self.device),
+                "wscale_swizzled":
+                torch.randn(hidden_size, hidden_size // 16).to(
+                    dtype=FP8_DTYPE, device=self.device),
+                "wscale":
+                torch.tensor([500], dtype=torch.float32, device=self.device),
+                "scale":
+                torch.tensor([0.002], dtype=torch.float32, device=self.device),
+            })
+    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
+        """Forward pass that creates the pattern to be fused."""
+        attn_output = self.attn(q, k, v)
+        # The quantizer receives the reciprocal of the stored "scale".
+        quant_output, output_block_scale = scaled_fp4_quant(
+            attn_output, 1 / self.w["scale"])
+        return cutlass_scaled_fp4_mm(a=quant_output,
+                                     b=self.w["weight"],
+                                     block_scale_a=output_block_scale,
+                                     block_scale_b=self.w["wscale_swizzled"],
+                                     alpha=self.w["scale"] * self.w["wscale"],
+                                     out_dtype=attn_output.dtype)
+@pytest.mark.parametrize("num_qo_heads, num_kv_heads", [(64, 8), (40, 8)])
+@pytest.mark.parametrize("head_size", [128])
+@pytest.mark.parametrize("batch_size", [7, 256, 533])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("model_name, model_class",
+                         [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+                           TestAttentionFp8StaticQuantPatternModel),
+                          ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+                           TestAttentionNvfp4QuantPatternModel)])
+@pytest.mark.parametrize("backend", [_Backend.FLASHINFER])
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
+@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
+@pytest.mark.skipif(not current_platform.is_device_capability((10, 0)),
+                    reason="Only test on SM100(Blackwell)")
+def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
+                                 head_size: int, batch_size: int,
+                                 dtype: torch.dtype, model_name: str,
+                                 model_class: type[AttentionQuantPatternModel],
+                                 backend: _Backend, monkeypatch, dist_init):
+    """Test AttentionStaticQuantPattern fusion pass
+
+    Runs ``model_class`` once eagerly without fusion and twice through
+    ``torch.compile`` with the attention-fusion pass enabled, sharing the
+    same weights, then checks the traced graphs and compares the outputs.
+    """
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+    device = torch.device("cuda:0")
+    torch.manual_seed(42)
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(
+            model=model_name,
+            max_model_len=2048,
+        ),
+        scheduler_config=SchedulerConfig(max_num_seqs=1024),
+        compilation_config=CompilationConfig(
+            level=CompilationLevel.PIECEWISE,
+            custom_ops=["+quant_fp8"],
+        ),
+        cache_config=CacheConfig(cache_dtype="fp8"))
+    # Create test inputs
+    q = torch.randn(batch_size,
+                    num_qo_heads * head_size,
+                    dtype=dtype,
+                    device=device)
+    k = torch.randn(batch_size,
+                    num_kv_heads * head_size,
+                    dtype=dtype,
+                    device=device)
+    v = torch.randn(batch_size,
+                    num_kv_heads * head_size,
+                    dtype=dtype,
+                    device=device)
+    # Mark first dimension as dynamic for realistic testing
+    torch._dynamo.mark_dynamic(q, 0)
+    torch._dynamo.mark_dynamic(k, 0)
+    torch._dynamo.mark_dynamic(v, 0)
+    # Run model directly without compilation and fusion
+    # The unfused run gets its own deep copy of the config, taken before the
+    # pass config is changed for the fused run further down.
+    vllm_config_unfused = copy.deepcopy(vllm_config)
+    with set_current_vllm_config(vllm_config_unfused), set_forward_context(
+            attn_metadata=None, vllm_config=vllm_config_unfused
+    ), global_force_attn_backend_context_manager(backend):
+        model_unfused = model_class(num_qo_heads=num_qo_heads,
+                                    num_kv_heads=num_kv_heads,
+                                    head_size=head_size,
+                                    kv_cache_dtype=FP8_DTYPE,
+                                    device=device,
+                                    vllm_config=vllm_config_unfused)
+        model_unfused = model_unfused.to(device)
+        # The context was entered with attn_metadata=None; fill it in now
+        # that the model (and therefore its metadata builder) exists.
+        forward_ctx = get_forward_context()
+        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
+            batch_size)
+        # Run model directly without compilation and fusion
+        result_unfused = model_unfused(q, k, v)
+    # Run model with attn fusion enabled
+    vllm_config.compilation_config.pass_config = PassConfig(
+        enable_attn_fusion=True, enable_noop=True)
+    with set_current_vllm_config(vllm_config), set_forward_context(
+            attn_metadata=None, vllm_config=vllm_config
+    ), global_force_attn_backend_context_manager(backend):
+        # Reuse the unfused model's weight dict so both models share weights.
+        model_fused = model_class(num_qo_heads=num_qo_heads,
+                                  num_kv_heads=num_kv_heads,
+                                  head_size=head_size,
+                                  kv_cache_dtype=FP8_DTYPE,
+                                  device=device,
+                                  vllm_config=vllm_config,
+                                  w=model_unfused.w)
+        model_fused = model_fused.to(device)
+        forward_ctx = get_forward_context()
+        forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size)
+        # Create test backend with fusion passes enabled
+        noop_pass = NoOpEliminationPass(vllm_config)
+        # The fusion pass object is constructed only when the lambda is
+        # invoked, not here.
+        attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw
+                                                                    )
+        test_backend = TestBackend(noop_pass, attn_pass)
+        # Compile model with fusion enabled
+        model_compiled = torch.compile(model_fused,
+                                       backend=test_backend,
+                                       fullgraph=True)
+        assert model_compiled.attn._o_scale_float is None
+        result_fused_1 = model_compiled(q, k, v)
+        # After the 1st round of the forward pass, output quant scale should be
+        # loaded into the attn layer's _o_scale_float, the 2nd round should
+        # reuse the loaded _o_scale_float
+        assert model_compiled.attn._o_scale_float is not None
+        result_fused_2 = model_compiled(q, k, v)
+        assert model_compiled.attn._o_scale_float is not None
+    # Check attn fusion support
+    quant_key = model_class.quant_key
+    attn_fusion_supported = [
+        layer.impl.fused_output_quant_supported(quant_key) for key, layer in
+        vllm_config.compilation_config.static_forward_context.items()
+    ]
+    if any(attn_fusion_supported):
+        # Check quantization ops in the graph before and after fusion
+        test_backend.check_before_ops([QUANT_OPS[quant_key]],
+                                      fully_replaced=True)
+    # NOTE(review): everything below runs even when no layer reports fusion
+    # support, so the "after fusion" assertions would fail in that case --
+    # confirm this is intended.
+    # Check attention ops in the graph before and after fusion
+    attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass))
+    attn_nodes_post = list(find_op_nodes(ATTN_OP,
+                                         test_backend.graph_post_pass))
+    assert len(attn_nodes_pre) > 0, "Should have attention nodes before fusion"
+    assert len(attn_nodes_pre) == len(attn_nodes_post), \
+        "Should have same number of attention nodes before and after fusion"
+    assert attn_nodes_pre[0].kwargs.get("output_scale") is None, \
+        "Attention should not have output_scale before fusion"
+    assert attn_nodes_post[0].kwargs.get("output_scale") is not None, \
+        "Attention should have output_scale after fusion"
+    assert attn_nodes_pre[0].kwargs.get("output_block_scale") is None, \
+        "Attention should not have output_block_scale before fusion"
+    if quant_key.dtype == FP8_DTYPE:
+        assert attn_nodes_post[0].kwargs.get("output_block_scale") is None, \
+            "Attention should not have output_block_scale after FP8 fusion"
+    elif quant_key.dtype == FP4_DTYPE:
+        assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, \
+            "Attention should have output_block_scale after FP4 fusion"  # noqa: E501
+    # Check that results are close
+    torch.testing.assert_close(result_unfused,
+                               result_fused_1,
+                               atol=1e-2,
+                               rtol=1e-2)
+    torch.testing.assert_close(result_unfused,
+                               result_fused_2,
+                               atol=1e-2,
+                               rtol=1e-2)
--- a/tests/compile/untest_silu_mul_quant_fusion.py
+++ b/tests/compile/untest_silu_mul_quant_fusion.py
@@ -4,35 +4,44 @@ import pytest
 import torch
 import vllm.envs as envs
-from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
+from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.compilation.activation_quant_fusion import (
+    FUSED_OPS, SILU_MUL_OP, ActivationQuantFusionPass)
+# yapf: enable
+from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.config import CompilationConfig, PassConfig, VllmConfig
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape)
+    GroupShape, kFp8StaticTensorSym, kNvfp4Quant)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    CUTLASS_FP8_SUPPORTED, Fp8LinearOp)
+    Fp8LinearOp)
 from vllm.platforms import current_platform
 from .backend import TestBackend
+FP8_DTYPE = current_platform.fp8_dtype()
+FP4_DTYPE = torch.uint8
-class TestModel(torch.nn.Module):
-    def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args,
+def is_nvfp4_supported():
-                 **kwargs):
+    return current_platform.has_device_capability(100)
-        super().__init__(*args, **kwargs)
+class TestSiluMulFp8QuantModel(torch.nn.Module):
+    def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs):
+        super().__init__()
        self.silu_and_mul = SiluAndMul()
        self.wscale = torch.rand(1, dtype=torch.float32)
        self.scale = torch.rand(1, dtype=torch.float32)
-        self.w = (torch.rand(
+        self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
-            hidden_size,
-            hidden_size).to(dtype=current_platform.fp8_dtype()).t())
        self.fp8_linear = Fp8LinearOp(
-            cutlass_fp8_supported=cutlass_fp8_enabled,
+            force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
            act_quant_static=True,
            act_quant_group_shape=GroupShape.PER_TENSOR,
        )
@@ -45,15 +54,56 @@ class TestModel(torch.nn.Module):
                                   input_scale=self.wscale)
        return x2
+    def ops_in_model_before(self):
+        return [SILU_MUL_OP, QUANT_OPS[kFp8StaticTensorSym]]
+    def ops_in_model_after(self):
+        return [FUSED_OPS[kFp8StaticTensorSym]]
+class TestSiluMulNvfp4QuantModel(torch.nn.Module):
+    def __init__(self, hidden_size: int, **kwargs):
+        super().__init__()
+        self.silu_and_mul = SiluAndMul()
+        self.w = torch.randint(256, (hidden_size, hidden_size // 2),
+                               dtype=FP4_DTYPE)
+        self.wscale = torch.randn(hidden_size,
+                                  hidden_size // 16).to(dtype=FP8_DTYPE)
+        self.wscale2 = torch.rand(1, dtype=torch.float32)
+        self.scale = torch.rand(1, dtype=torch.float32)
-@pytest.mark.parametrize("num_tokens", [256])
+    def forward(self, x):
-@pytest.mark.parametrize("hidden_size", [64])
+        y = self.silu_and_mul(x)
-@pytest.mark.parametrize("cutlass_fp8_enabled",
+        y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale)
-                         [True, False] if CUTLASS_FP8_SUPPORTED else [False])
+        out = cutlass_scaled_fp4_mm(a=y_quant,
+                                    b=self.w,
+                                    block_scale_a=y_block_scale,
+                                    block_scale_b=self.wscale,
+                                    alpha=self.scale * self.wscale2,
+                                    out_dtype=y.dtype)
+        return out
+    def ops_in_model_before(self):
+        return [SILU_MUL_OP, QUANT_OPS[kNvfp4Quant]]
+    def ops_in_model_after(self):
+        return [FUSED_OPS[kNvfp4Quant]]
+@pytest.mark.parametrize("num_tokens", [64])
+@pytest.mark.parametrize("hidden_size", [128])
+@pytest.mark.parametrize(
+    "model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
+    if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])
+@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
                    reason="Only test on CUDA and ROCm")
-def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
+def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
-                                   cutlass_fp8_enabled):
+                                   force_fp8_e4m3fnuz):
+    if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz:
+        pytest.skip("Duplicate tests for NVFP4")
    torch.set_default_device("cuda")
    torch.set_default_dtype(torch.float16)
@@ -64,7 +114,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
    fusion_pass = ActivationQuantFusionPass(config)
    backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
-    model = TestModel(hidden_size, cutlass_fp8_enabled)
+    model = model_class(hidden_size=hidden_size,
+                        force_fp8_e4m3fnuz=force_fp8_e4m3fnuz)
    # First dimension dynamic
    x = torch.rand(num_tokens, hidden_size * 2)
@@ -81,17 +132,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
                               atol=1e-3,
                               rtol=1e-3)
-    # Check substitution worked
+    # In pre-nodes, quant op should be present and fused kernels should not
-    pre_nodes = backend.graph_pre_pass.nodes
+    backend.check_before_ops(model.ops_in_model_before())
-    post_nodes = backend.graph_post_pass.nodes
-    silu_and_mul_quant = torch.ops._C.silu_and_mul_quant.default
-    fp8_quant = torch.ops._C.static_scaled_fp8_quant.default
-    # In pre-nodes, fp8 quant should be present and fused kernels should not
-    assert find_auto_fn_maybe(pre_nodes, silu_and_mul_quant) is None
-    find_auto_fn(pre_nodes, fp8_quant)
-    # In post-nodes, fused kernels should be present and fp8 quant should not
+    # In post-nodes, fused kernels should be present and quant op should not
-    find_auto_fn(post_nodes, silu_and_mul_quant)
+    backend.check_after_ops(model.ops_in_model_after())
-    assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
--- a/tests/conftest.py
+++ b/tests/conftest.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
+import math
 import os
 import tempfile
 from enum import Enum
+from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
-from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
 import pytest
 import pytest_html
@@ -37,7 +37,7 @@ from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
+from vllm.sequence import Logprob
 from vllm.transformers_utils.utils import maybe_model_redirect
 from .utils import models_path_prefix
@@ -460,9 +460,16 @@ class HfRunner:
        # output is final logits
        all_inputs = self.get_inputs(prompts)
        outputs = []
+        problem_type = getattr(self.config, "problem_type", "")
        for inputs in all_inputs:
            output = self.model(**self.wrap_device(inputs))
-            logits = output.logits.softmax(dim=-1)[0].tolist()
+            if problem_type == "regression":
+                logits = output.logits[0].tolist()
+            elif problem_type == "multi_label_classification":
+                logits = output.logits.sigmoid()[0].tolist()
+            else:
+                logits = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(logits)
        return outputs
@@ -600,7 +607,7 @@ class HfRunner:
    def _hidden_states_to_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
-        num_logprobs: int,
+        num_logprobs: Optional[int],
    ) -> tuple[list[dict[int, float]], int]:
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)
@@ -628,7 +635,7 @@ class HfRunner:
        self,
        prompts: list[str],
        max_tokens: int,
-        num_logprobs: int,
+        num_logprobs: Optional[int],
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
@@ -675,7 +682,7 @@ class HfRunner:
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
-        num_logprobs: int,
+        num_logprobs: Optional[int],
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
@@ -964,7 +971,7 @@ class VllmRunner:
        self,
        prompts: list[str],
        max_tokens: int,
-        num_logprobs: int,
+        num_logprobs: Optional[int],
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
@@ -989,11 +996,40 @@ class VllmRunner:
                                        videos=videos,
                                        **kwargs)
+    def generate_prompt_perplexity(self, prompts: list[str]) -> list[float]:
+        """
+        Return the perplexity score associated with generating the prompts
+        :param prompts: list of prompts to score
+        :return: perplexity score of each prompt
+        """
+        outputs = self.generate_greedy_logprobs(prompts,
+                                                max_tokens=1,
+                                                num_logprobs=None,
+                                                num_prompt_logprobs=0)
+        perplexities = []
+        for output in outputs:
+            output = cast(TokensTextLogprobsPromptLogprobs, output)
+            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+            assert token_datas[0] is None
+            token_log_probs = []
+            for token_data in token_datas[1:]:
+                assert token_data is not None
+                assert len(token_data) == 1
+                token_log_prob = list(token_data.values())[0].logprob
+                token_log_probs.append(token_log_prob)
+            perplexity = math.exp(-sum(token_log_probs) / len(token_log_probs))
+            perplexities.append(perplexity)
+        return perplexities
    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
-        num_logprobs: int,
+        num_logprobs: Optional[int],
        num_prompt_logprobs: Optional[int] = None,
        skip_special_tokens: bool = True,
    ) -> Union[list[TokensTextLogprobs],
@@ -1020,15 +1056,17 @@ class VllmRunner:
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
+        concurrency_limit: Optional[int] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)
-        outputs = self.llm.beam_search(
+        outputs = self.llm.beam_search(inputs,
-            inputs,
+                                       BeamSearchParams(beam_width=beam_width,
-            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
+                                                        max_tokens=max_tokens),
+                                       concurrency_limit=concurrency_limit)
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
@@ -1086,6 +1124,9 @@ class VllmRunner:
        return self.llm.llm_engine.collective_rpc(_apply_model)
+    def get_llm(self) -> LLM:
+        return self.llm
    def __enter__(self):
        return self

--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -34,7 +34,7 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
 def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
                                  batch_size, seed, backend, monkeypatch):
    """
@@ -45,8 +45,6 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
    Additionally, we compare the results of the v1 and v2 managers.
    """
-    if backend == "FLASHINFER" and current_platform.is_rocm():
-        pytest.skip("Flashinfer does not support ROCm/HIP.")
    if backend == "XFORMERS" and current_platform.is_rocm():
        pytest.skip("Xformers does not support ROCm/HIP.")
@@ -98,7 +96,7 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
 @pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
 def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
                                        backend, monkeypatch):
    """
@@ -109,8 +107,6 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
    The results with and without chunked prefill are not the same due to
    numerical instabilities.
    """
-    if backend == "FLASHINFER" and current_platform.is_rocm():
-        pytest.skip("Flashinfer does not support ROCm/HIP.")
    if backend == "XFORMERS" and current_platform.is_rocm():
        pytest.skip("Xformers does not support ROCm/HIP.")
    override_backend_env_variable(monkeypatch, backend)