Unverified Commit d2353954 authored by Adrian Abeyta's avatar Adrian Abeyta Committed by GitHub
Browse files

Use FLASHINFER MLA backend when testing fp8_kv_scale_compile (#28491)


Signed-off-by: default avataradabeyta <aabeyta@redhat.com>
parent 412e153d
...@@ -10,6 +10,7 @@ import torch ...@@ -10,6 +10,7 @@ import torch
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
...@@ -184,13 +185,24 @@ def test_custom_compile_config( ...@@ -184,13 +185,24 @@ def test_custom_compile_config(
[CompilationMode.NONE, CompilationMode.VLLM_COMPILE], [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", "model, backend",
[ [
"Qwen/Qwen2-0.5B", # Standard attention model ("Qwen/Qwen2-0.5B", None), # Standard attention model
"deepseek-ai/DeepSeek-V2-Lite", # MLA (Multi-head Latent Attention) model (
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.FLASHINFER_MLA,
), # MLA (Multi-head Latent Attention) model
], ],
) )
def test_fp8_kv_scale_compile(compilation_mode: int, model: str): def test_fp8_kv_scale_compile(
monkeypatch: pytest.MonkeyPatch,
compilation_mode: int,
model: str,
backend: AttentionBackendEnum | None,
):
if backend:
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs = { model_kwargs = {
"quantization": "fp8", "quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3", "kv_cache_dtype": "fp8_e4m3",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment