Commit 31584b45 authored by zhuwenwen's avatar zhuwenwen
Browse files

[fix]fix tests of kernels

parent 15347448
......@@ -18,7 +18,7 @@ if not current_platform.is_rocm():
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from vllm.attention.backends.xformers import _make_alibi_bias
from vllm.attention.backends.xformers import _make_alibi_bias
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
......
......@@ -25,7 +25,7 @@ def clear_cache():
_cached_get_attn_backend.cache_clear()
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"] if not current_platform.is_rocm() else ["cpu", "hip"])
def test_mha_attn_platform(device: str):
"""
Test the attention selector between different platform and device.
......
......@@ -15,7 +15,7 @@ BLOCK_SIZES = [16, 32]
DTYPES = [torch.float16, torch.bfloat16]
QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [
None, torch.float8_e4m3fnuz
None #, torch.float8_e4m3fnuz
]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
......
......@@ -96,7 +96,7 @@ class BatchedMMTensors:
@pytest.mark.parametrize("N", [128, 256, 1024])
@pytest.mark.parametrize(
"dtype",
[torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
[torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16] if not current_platform.is_rocm() else [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
......@@ -208,7 +208,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16] if not current_platform.is_rocm() else [torch.bfloat16])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("input_scales", [False])
......
......@@ -353,7 +353,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
[torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_rocm_aiter", [True, False] if not current_platform.is_rocm() else [False])
@torch.inference_mode()
def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
monkeypatch):
......
......@@ -13,7 +13,7 @@ import vllm._custom_ops as ops
from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
from vllm.platforms import current_platform
from ..utils import models_path_prefix
from ...utils import models_path_prefix
# GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
# GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
......
......@@ -42,7 +42,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
(output, input, scale, azp))
@pytest.mark.skipif(current_platform(),
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Currently, there is not supported on ROCm.")
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
......@@ -67,7 +67,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
opcheck_int8_quant_dynamic(ops_out, x)
@pytest.mark.skipif(current_platform(),
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Currently, there is not supported on ROCm.")
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
......
......@@ -4,6 +4,7 @@
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
"""
import os
import importlib
from typing import Optional
......@@ -11,6 +12,7 @@ import pytest
import torch
from vllm.platforms import current_platform
from ...utils import models_path_prefix
device = "cuda"
......@@ -45,7 +47,7 @@ def get_8bit_types():
# This test is to check regressions for int8 support on ROCm.
@pytest.mark.parametrize("model_path", [
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
......
......@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
import os
import random
import numpy as np
......@@ -10,6 +11,7 @@ import torch
from packaging import version
from vllm import LLM, SamplingParams
from ..utils import models_path_prefix
TORCH_VERSION = version.parse(torch.__version__)
MINIMUM_TORCH_VERSION = version.parse("2.7.0")
......@@ -34,7 +36,7 @@ def test_flex_attention_vs_default_backend(monkeypatch):
This test compares the outputs from the FlexAttention backend with
the default backend, ensuring they are identical when using the same seed.
"""
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model_name = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
seed = 42
max_tokens = 32
prompts = [
......
......@@ -9,7 +9,7 @@ from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform
DTYPES = [torch.bfloat16, torch.float16]
QUANT_DTYPES = [current_platform.fp8_dtype()]
QUANT_DTYPES = [current_platform.fp8_dtype()] if not current_platform.is_rocm() else [None]
NUM_TOKENS = [1, 17, 86, 1234, 3045] # Arbitrary values for testing
HIDDEN_SIZES = [16, 48, 128, 1562, 4096] # Arbitrary values for testing
SEEDS = [0]
......
......@@ -7,8 +7,7 @@ Run `pytest tests/kernels/test_triton_flash_attention.py`.
import pytest
import torch
from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS,
MetaData,
from vllm.attention.ops.triton_flash_attention import (MetaData,
compute_alibi_tensor,
scale_fp8,
triton_attention_rocm)
......@@ -60,26 +59,26 @@ class ReferenceAttention:
ref_out = ref_out.transpose(1, 2).clone()
return ref_out
def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
self.dtype)
k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
self.dtype)
v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
self.dtype)
result = self.fwd(q, k, v)
if self.input_metadata.o_scale is not None:
result, _ = scale_fp8(result, self.input_metadata.o_scale)
return result
def fwd_fp8_kv(self, q, k_quantized, v_quantized):
k_descale, v_descale = (self.input_metadata.k_descale,
self.input_metadata.v_descale)
k_dequantized = (k_quantized.to(torch.float32) *
k_descale.to(torch.float32)).to(self.dtype)
v_dequantized = (v_quantized.to(torch.float32) *
v_descale.to(torch.float32)).to(self.dtype)
return self.fwd(q, k_dequantized, v_dequantized)
# def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
# q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
# self.dtype)
# k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
# self.dtype)
# v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
# self.dtype)
# result = self.fwd(q, k, v)
# if self.input_metadata.o_scale is not None:
# result, _ = scale_fp8(result, self.input_metadata.o_scale)
# return result
# def fwd_fp8_kv(self, q, k_quantized, v_quantized):
# k_descale, v_descale = (self.input_metadata.k_descale,
# self.input_metadata.v_descale)
# k_dequantized = (k_quantized.to(torch.float32) *
# k_descale.to(torch.float32)).to(self.dtype)
# v_dequantized = (v_quantized.to(torch.float32) *
# v_descale.to(torch.float32)).to(self.dtype)
# return self.fwd(q, k_dequantized, v_dequantized)
def varlen_fwd(self, q, k, v, is_mqa=False):
ref_out = torch.empty_like(q)
......@@ -145,7 +144,7 @@ def input_helper(
use_o_scale=False,
use_bias=False,
):
assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
# assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
current_platform.seed_everything(0)
......
......@@ -210,7 +210,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/AI21-Jamba-1.5-Mini"),
extras={"tiny": os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-dev")}), # noqa: E501
"LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"),
extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B", # noqa: E501
extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B"), # noqa: E501
"hermes": os.path.join(models_path_prefix,"NousResearch/Hermes-3-Llama-3.1-8B"), # noqa: E501
"fp8": os.path.join(models_path_prefix,"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8")}), # noqa: E501
"LLaMAForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"decapoda-research/llama-7b-hf"),
......@@ -367,12 +367,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501
{"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")}), # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"), # noqa: E501
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"), # noqa: E501
extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")}, # noqa: E501
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
"Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"), # noqa: E501
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
extras={"mistral": os.path.join(models_path_prefix, "mistral-community/pixtral-12b"), # noqa: E501
......@@ -407,7 +407,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True,
max_transformers_version="4.48",
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
extras={"phi3.5": os.path.join(models_path_prefix,"microsoft/Phi-3.5-vision-instruct"})), # noqa: E501
extras={"phi3.5": os.path.join(models_path_prefix,"microsoft/Phi-3.5-vision-instruct")}), # noqa: E501
"Ovis": _HfExamplesInfo(os.path.join(models_path_prefix,"AIDC-AI/Ovis2-1B"), trust_remote_code=True,
extras={"1.6-llama": os.path.join(models_path_prefix,"AIDC-AI/Ovis1.6-Llama3.2-3B"),
"1.6-gemma": os.path.join(models_path_prefix,"AIDC-AI/Ovis1.6-Gemma2-9B")}), # noqa: E501
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment