Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev

cfabf125 · 王敏 · dbd0bda6 · 645fcfd9 · cfabf125 · cfabf125
Commit cfabf125 authored Aug 27, 2025 by 王敏
20 changed files
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -18,7 +18,7 @@ if not current_platform.is_rocm():
    from xformers import ops as xops
    from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
-    from vllm.attention.backends.xformers import _make_alibi_bias
+from vllm.attention.backends.xformers import _make_alibi_bias
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.

--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -25,7 +25,7 @@ def clear_cache():
    _cached_get_attn_backend.cache_clear()
-@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
+@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"] if not current_platform.is_rocm() else ["cpu", "hip"]) 
 def test_mha_attn_platform(device: str):
    """
    Test the attention selector between different platform and device.

--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -15,7 +15,7 @@ BLOCK_SIZES = [16, 32]
 DTYPES = [torch.float16, torch.bfloat16]
 QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [
-    None, torch.float8_e4m3fnuz
+    None #, torch.float8_e4m3fnuz
 ]
 # one value large enough to test overflow in index calculation.
 # one value small enough to test the schema op check

--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -96,7 +96,7 @@ class BatchedMMTensors:
 @pytest.mark.parametrize("N", [128, 256, 1024])
 @pytest.mark.parametrize(
    "dtype",
-    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
+    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16] if not current_platform.is_rocm() else [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 @pytest.mark.parametrize("per_act_token_quant", [False, True])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
@@ -208,7 +208,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
 @pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16] if not current_platform.is_rocm() else [torch.bfloat16])
 @pytest.mark.parametrize("per_act_token_quant", [False, True])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 @pytest.mark.parametrize("input_scales", [False])

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -353,7 +353,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
                         [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("padding", [True, False])
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
+    "use_rocm_aiter", [True, False] if not current_platform.is_rocm() else [False])
 @torch.inference_mode()
 def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
                     monkeypatch):

--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
--- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
+++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
--- a/tests/kernels/quantization/__init__.py
+++ b/tests/kernels/quantization/__init__.py
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -13,7 +13,7 @@ import vllm._custom_ops as ops
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
 from vllm.platforms import current_platform
-from ..utils import models_path_prefix
+from ...utils import models_path_prefix
 # GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
 # GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")

--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -42,7 +42,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
                (output, input, scale, azp))
-@pytest.mark.skipif(current_platform(),
+@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Currently, there is not supported on ROCm.")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@@ -67,7 +67,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
    opcheck_int8_quant_dynamic(ops_out, x)
-@pytest.mark.skipif(current_platform(),
+@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Currently, there is not supported on ROCm.")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)

--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -4,6 +4,7 @@
 Run `pytest tests/kernels/test_triton_scaled_mm.py`.
 """
+import os
 import importlib
 from typing import Optional
@@ -11,6 +12,7 @@ import pytest
 import torch
 from vllm.platforms import current_platform
+from ...utils import models_path_prefix
 device = "cuda"
@@ -45,7 +47,7 @@ def get_8bit_types():
 # This test is to check regressions for int8 support on ROCm.
 @pytest.mark.parametrize("model_path", [
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
 ])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [10])

--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Integration tests for FlexAttention backend vs default backend"""
+import os
 import random
 import numpy as np
@@ -10,6 +11,7 @@ import torch
 from packaging import version
 from vllm import LLM, SamplingParams
+from ..utils import models_path_prefix
 TORCH_VERSION = version.parse(torch.__version__)
 MINIMUM_TORCH_VERSION = version.parse("2.7.0")
@@ -34,7 +36,7 @@ def test_flex_attention_vs_default_backend(monkeypatch):
    This test compares the outputs from the FlexAttention backend with
    the default backend, ensuring they are identical when using the same seed.
    """
-    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+    model_name = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
    seed = 42
    max_tokens = 32
    prompts = [

--- a/tests/kernels/test_fused_quant_activation.py
+++ b/tests/kernels/test_fused_quant_activation.py
@@ -9,7 +9,7 @@ from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
 DTYPES = [torch.bfloat16, torch.float16]
-QUANT_DTYPES = [current_platform.fp8_dtype()]
+QUANT_DTYPES = [current_platform.fp8_dtype()] if not current_platform.is_rocm() else [None]
 NUM_TOKENS = [1, 17, 86, 1234, 3045]  # Arbitrary values for testing
 HIDDEN_SIZES = [16, 48, 128, 1562, 4096]  # Arbitrary values for testing
 SEEDS = [0]

--- a/tests/kernels/test_triton_flash_attention.py
+++ b/tests/kernels/test_triton_flash_attention.py
@@ -7,8 +7,7 @@ Run `pytest tests/kernels/test_triton_flash_attention.py`.
 import pytest
 import torch
-from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS,
+from vllm.attention.ops.triton_flash_attention import (MetaData,
-                                                       MetaData,
                                                       compute_alibi_tensor,
                                                       scale_fp8,
                                                       triton_attention_rocm)
@@ -60,26 +59,26 @@ class ReferenceAttention:
            ref_out = ref_out.transpose(1, 2).clone()
        return ref_out
-    def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
+    # def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
-        q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
+    #     q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
-            self.dtype)
+    #         self.dtype)
-        k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
+    #     k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
-            self.dtype)
+    #         self.dtype)
-        v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
+    #     v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
-            self.dtype)
+    #         self.dtype)
-        result = self.fwd(q, k, v)
+    #     result = self.fwd(q, k, v)
-        if self.input_metadata.o_scale is not None:
+    #     if self.input_metadata.o_scale is not None:
-            result, _ = scale_fp8(result, self.input_metadata.o_scale)
+    #         result, _ = scale_fp8(result, self.input_metadata.o_scale)
-        return result
+    #     return result
-    def fwd_fp8_kv(self, q, k_quantized, v_quantized):
+    # def fwd_fp8_kv(self, q, k_quantized, v_quantized):
-        k_descale, v_descale = (self.input_metadata.k_descale,
+    #     k_descale, v_descale = (self.input_metadata.k_descale,
-                                self.input_metadata.v_descale)
+    #                             self.input_metadata.v_descale)
-        k_dequantized = (k_quantized.to(torch.float32) *
+    #     k_dequantized = (k_quantized.to(torch.float32) *
-                         k_descale.to(torch.float32)).to(self.dtype)
+    #                      k_descale.to(torch.float32)).to(self.dtype)
-        v_dequantized = (v_quantized.to(torch.float32) *
+    #     v_dequantized = (v_quantized.to(torch.float32) *
-                         v_descale.to(torch.float32)).to(self.dtype)
+    #                      v_descale.to(torch.float32)).to(self.dtype)
-        return self.fwd(q, k_dequantized, v_dequantized)
+    #     return self.fwd(q, k_dequantized, v_dequantized)
    def varlen_fwd(self, q, k, v, is_mqa=False):
        ref_out = torch.empty_like(q)
@@ -145,7 +144,7 @@ def input_helper(
    use_o_scale=False,
    use_bias=False,
 ):
-    assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
+    # assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
    current_platform.seed_everything(0)

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -210,7 +210,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "JambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/AI21-Jamba-1.5-Mini"),
                                        extras={"tiny": os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-dev")}),  # noqa: E501
    "LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"),
-                                        extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B",  # noqa: E501
+                                        extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B"),  # noqa: E501
                                                "hermes": os.path.join(models_path_prefix,"NousResearch/Hermes-3-Llama-3.1-8B"), # noqa: E501
                                                "fp8": os.path.join(models_path_prefix,"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8")}),  # noqa: E501
    "LLaMAForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"decapoda-research/llama-7b-hf"),
@@ -367,12 +367,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),  # noqa: E501
                                                        {"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")}),  # noqa: E501
-    "KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
+    "KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"), # noqa: E501
                                                    trust_remote_code=True),
    "KimiVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"),  # noqa: E501
                                                      extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")},  # noqa: E501
                                                      trust_remote_code=True),
-    "Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct",   # noqa: E501
+    "Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),   # noqa: E501
                                                      max_model_len=10240),
    "LlavaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
                                                     extras={"mistral": os.path.join(models_path_prefix, "mistral-community/pixtral-12b"), # noqa: E501
@@ -407,7 +407,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                        trust_remote_code=True,
                                        max_transformers_version="4.48",
                                        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
-                              extras={"phi3.5": os.path.join(models_path_prefix,"microsoft/Phi-3.5-vision-instruct"})),  # noqa: E501
+                              extras={"phi3.5": os.path.join(models_path_prefix,"microsoft/Phi-3.5-vision-instruct")}),  # noqa: E501
    "Ovis": _HfExamplesInfo(os.path.join(models_path_prefix,"AIDC-AI/Ovis2-1B"), trust_remote_code=True,
                            extras={"1.6-llama": os.path.join(models_path_prefix,"AIDC-AI/Ovis1.6-Llama3.2-3B"),
                                    "1.6-gemma": os.path.join(models_path_prefix,"AIDC-AI/Ovis1.6-Gemma2-9B")}),  # noqa: E501