Commit a810671a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

parents 86b5aefe 6a09612b
......@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold: 0.375
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
server_args: "--enforce-eager --max-model-len 4096"
......@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
accuracy_threshold: 0.89
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
server_args: "--enforce-eager --max-model-len 4096"
model_name: "nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
accuracy_threshold: 0.75
num_questions: 1319
num_fewshot: 5
server_args: >-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
env:
VLLM_USE_FLASHINFER_MOE_FP4: "1"
......@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
......@@ -11,14 +11,12 @@ def pytest_addoption(parser):
default="configs/models-small.txt",
help="File containing list of config files to test",
)
parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size")
def pytest_generate_tests(metafunc):
"""Generate test parameters from config files."""
if "config_filename" in metafunc.fixturenames:
config_list_file = metafunc.config.getoption("--config-list-file")
tp_size = metafunc.config.getoption("--tp-size")
# Handle both relative and absolute paths
config_list_path = Path(config_list_file)
......@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
# Generate test parameters
if config_files:
metafunc.parametrize(
["config_filename", "tp_size"],
[(config_file, int(tp_size)) for config_file in config_files],
ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files],
"config_filename",
config_files,
ids=[config_file.stem for config_file in config_files],
)
else:
print("No config files found, test will be skipped")
......@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
Replacement for lm-eval-harness with better performance and control.
Usage:
pytest -s -v test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt \
--tp-size=1
pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt
"""
import shlex
import yaml
from tests.utils import RemoteOpenAIServer
from .gsm8k_eval import evaluate_gsm8k
RTOL = 0.08 # Relative tolerance for accuracy comparison
TOL = 0.08 # Absolute tolerance for accuracy comparison
def launch_gsm8k_eval(eval_config, server_url, tp_size):
"""Launch GSM8K evaluation using our isolated script."""
def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
"""Run GSM8K evaluation using our isolated script."""
# Extract host and port from server URL
if "://" in server_url:
server_url = server_url.split("://")[1]
host_port = server_url.split("/")[0] # Remove path if present
if ":" in host_port:
host, port = host_port.split(":")
port = int(port)
host, p = host_port.split(":")
port = int(p)
else:
host = host_port
port = 8000
......@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
return results
def test_gsm8k_correctness_param(config_filename, tp_size):
def test_gsm8k_correctness(config_filename):
"""Test GSM8K correctness for a given model configuration."""
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
# Server arguments
server_args = [
"--max-model-len",
str(eval_config.get("max_model_len", 4096)),
"--enforce-eager",
"--trust-remote-code",
"--tensor-parallel-size",
str(tp_size),
]
# Parse server arguments from config (use shlex to handle quoted strings)
server_args_str = eval_config.get("server_args", "")
server_args = shlex.split(server_args_str) if server_args_str else []
# Add standard server arguments
server_args.extend(
[
"--trust-remote-code",
]
)
env_dict = eval_config.get("env", None)
print(f"Starting GSM8K evaluation for model: {eval_config['model_name']}")
print(f"Expected metric threshold: {eval_config['accuracy_threshold']}")
print(f"Number of questions: {eval_config['num_questions']}")
print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
print(f"Server args: {' '.join(server_args)}")
# Launch server and run evaluation
with RemoteOpenAIServer(
eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480
eval_config["model_name"],
server_args,
env_dict=env_dict,
max_wait_seconds=600,
) as remote_server:
server_url = remote_server.url_for("v1")
print(f"Server started at: {server_url}")
results = launch_gsm8k_eval(eval_config, server_url, tp_size)
results = run_gsm8k_eval(eval_config, server_url)
# Check accuracy against threshold
measured_accuracy = results["accuracy"]
expected_accuracy = eval_config["accuracy_threshold"]
measured_metric = results["accuracy"]
expected_metric = eval_config["accuracy_threshold"]
print(f"GSM8K Results for {eval_config['model_name']}:")
print(f" Accuracy: {measured_accuracy:.3f}")
print(f" Expected: {expected_accuracy:.3f}")
print(f" Measured metric: {measured_metric:.4f}")
print(f" Expected metric: {expected_metric:.4f}")
print(f" Tolerance: {TOL:.4f}")
print(f" Questions: {results['num_questions']}")
print(f" Invalid rate: {results['invalid_rate']:.3f}")
print(f" Latency: {results['latency']:.1f}s")
print(f" QPS: {results['questions_per_second']:.1f}")
# Verify accuracy is within tolerance
assert measured_accuracy >= expected_accuracy - RTOL, (
f"Accuracy too low: {measured_accuracy:.3f} < "
f"{expected_accuracy:.3f} - {RTOL:.3f}"
# Verify metric is within tolerance
assert measured_metric >= expected_metric - TOL, (
f"GSM8K metric too low: {measured_metric:.4f} < "
f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
)
print(f"✅ GSM8K test passed for {eval_config['model_name']}")
......@@ -9,7 +9,8 @@ import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.attention.layer import Attention
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
......@@ -442,7 +443,7 @@ def ref_multi_query_kv_attention(
return torch.cat(ref_outputs, dim=0)
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
@pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention])
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
head_size = 64
scale = float(1.0 / (head_size**0.5))
......
......@@ -6,7 +6,9 @@ from unittest.mock import patch
import pytest
import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
......@@ -73,18 +75,18 @@ def generate_params():
@pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
def test_env(
def test_backend_selection(
device: str,
name: str,
use_mla: bool,
block_size: int,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with valid device-backend pairs."""
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", name)
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
# Create AttentionConfig with the specified backend
attention_config = AttentionConfig(backend=AttentionBackendEnum[name])
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
if device == "cpu":
with patch("vllm.platforms.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size)
......@@ -217,27 +219,32 @@ def test_env(
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_fp32_fallback(device: str):
"""Test attention backend selection with fp32."""
if device == "cpu":
with patch("vllm.platforms.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "CPU_ATTN"
# Use default config (no backend specified)
vllm_config = VllmConfig()
elif device == "cuda":
with patch("vllm.platforms.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
with set_current_vllm_config(vllm_config):
if device == "cpu":
with patch("vllm.platforms.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "CPU_ATTN"
elif device == "cuda":
with patch("vllm.platforms.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""
pytest.skip(
"Skipping as current backend selector does not "
"handle fallbacks when a backend is set via env var."
"handle fallbacks when a backend is explicitly set."
)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
# Unsupported CUDA arch
monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
backend = get_attn_backend(16, torch.float16, None, 16)
......@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
assert backend.get_name() != "FLASH_ATTN"
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
def test_invalid_backend():
"""Test that invalid attention backend names raise ValueError."""
with (
monkeypatch.context() as m,
patch("vllm.platforms.current_platform", CudaPlatform()),
pytest.raises(ValueError),
):
m.setenv("VLLM_ATTENTION_BACKEND", "INVALID")
# Should raise ValueError for invalid backend
with pytest.raises(ValueError) as exc_info:
get_attn_backend(32, torch.float16, None, 16)
assert "Invalid value 'INVALID'" in str(exc_info.value)
# Invalid backend name should raise ValueError when creating enum
AttentionConfig(backend=AttentionBackendEnum["INVALID"])
......@@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline(
torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol),
f"{torch.max(torch.abs(output - output_trtllm))}",
)
def test_trtllm_attention_rejects_num_kv_heads_1() -> None:
"""Test that TRTLLM attention correctly rejects num_kv_heads=1.
When num_kv_heads=1 (MQA), the KV cache strides become degenerate
(stride_heads == stride_batch), which causes CUDA's cuTensorMapEncodeTiled
to fail because TMA descriptors cannot handle degenerate 4D tensors with
singleton dimensions.
This test verifies that can_use_trtllm_attention returns False for
num_kv_heads=1 configurations.
"""
from vllm.utils.flashinfer import can_use_trtllm_attention
# num_kv_heads=1 should be rejected
assert not can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1), (
"can_use_trtllm_attention should return False for num_kv_heads=1"
)
assert not can_use_trtllm_attention(num_qo_heads=32, num_kv_heads=1), (
"can_use_trtllm_attention should return False for num_kv_heads=1"
)
# num_kv_heads > 1 should be accepted (if platform supports it)
# Note: This may return False on non-Blackwell platforms, which is fine
result_kv8 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=8)
result_kv1 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1)
# Even if platform doesn't support TRTLLM, num_kv_heads=1 should never
# return True when num_kv_heads > 1 returns True
if result_kv8:
assert not result_kv1, (
"If TRTLLM is supported for num_kv_heads=8, "
"it must be rejected for num_kv_heads=1"
)
......@@ -3,16 +3,17 @@
"""
Test:
* Tests for MultiHeadAttention layer
* Tests for MMEncoderAttention layer
"""
import itertools
from unittest.mock import patch
import pytest
import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import MultiHeadAttention
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.attention.selector import _cached_get_attn_backend
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
......@@ -42,35 +43,31 @@ def test_mha_attn_platform(device: str):
if device == "cpu":
with (
patch("vllm.attention.layer.current_platform", CpuPlatform()),
patch("vllm.model_executor.models.vision.current_platform", CpuPlatform()),
):
attn = MultiHeadAttention(16, 64, scale=1)
attn = MMEncoderAttention(16, 64, scale=1)
assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
elif device == "hip":
with (
patch("vllm.attention.layer.current_platform", RocmPlatform()),
patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
):
attn = MultiHeadAttention(16, 64, scale=1)
attn = MMEncoderAttention(16, 64, scale=1)
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
else:
# Test CUDA with head_size=64 (divisible by 32)
# - should use vLLM's FlashAttention
with (
patch("vllm.attention.layer.current_platform", CudaPlatform()),
patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
):
attn = MultiHeadAttention(16, 64, scale=1)
attn = MMEncoderAttention(16, 64, scale=1)
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
# Test CUDA with head_size=72 (not divisible by 32)
# - should use vLLM's FlashAttention
with (
patch("vllm.attention.layer.current_platform", CudaPlatform()),
patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
):
attn = MultiHeadAttention(16, 72, scale=1)
attn = MMEncoderAttention(16, 72, scale=1)
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
......@@ -94,6 +91,10 @@ def ref_attention(
BATCH_SIZES = [1, 16]
SEQ_LENS = [1]
VAR_SEQ_LENS = [
[2, 2],
[2, 3, 4],
]
NUM_HEADS = [1, 16]
NUM_KV_HEADS = [1]
HEAD_SIZES = [64, 80]
......@@ -130,7 +131,7 @@ def test_mha_attn_forward(
k = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
v = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
scale = 1.0 / head_size**0.5
attn = MultiHeadAttention(
attn = MMEncoderAttention(
num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
)
output = attn(q, k, v)
......@@ -151,3 +152,58 @@ def test_mha_attn_forward(
scale=scale,
).reshape(batch_size, seq_len, num_heads * head_size)
torch.testing.assert_close(output, ref_output)
@pytest.mark.parametrize("var_seq_len", VAR_SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_mha_attn_varlen_forward(
var_seq_len: list[int],
num_heads: int,
num_kv_heads: int,
head_size: int,
dtype: torch.dtype,
device: str,
):
current_platform.seed_everything(0)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
q = torch.randn(1, sum(var_seq_len), num_heads, head_size)
k = torch.randn(1, sum(var_seq_len), num_kv_heads, head_size)
v = torch.randn(1, sum(var_seq_len), num_kv_heads, head_size)
cu_seqlens = torch.tensor(
[0] + list(itertools.accumulate(var_seq_len)), dtype=torch.int32
)
scale = 1.0 / head_size**0.5
attn = MMEncoderAttention(
num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
)
output = attn(
q, k, v, cu_seqlens=cu_seqlens, max_seqlen=torch.tensor(max(var_seq_len))
)
assert num_heads % num_kv_heads == 0
num_queries_per_kv = num_heads // num_kv_heads
if num_queries_per_kv > 1:
k = torch.repeat_interleave(k, num_queries_per_kv, dim=2)
v = torch.repeat_interleave(v, num_queries_per_kv, dim=2)
ref_output = []
for q_i, k_i, v_i in zip(
torch.split(q, var_seq_len, dim=1),
torch.split(k, var_seq_len, dim=1),
torch.split(v, var_seq_len, dim=1),
):
output_i = ref_attention(
q_i,
k_i,
v_i,
scale=scale,
)
ref_output.append(output_i)
ref_output = torch.cat(ref_output, dim=1)
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
......@@ -4,7 +4,9 @@
import pytest
import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
from vllm.platforms.rocm import RocmPlatform
......@@ -16,40 +18,56 @@ def clear_cache():
@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
def test_selector(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN")
# Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
# Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
# Test standard ROCm attention
attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN)
vllm_config = VllmConfig(attention_config=attention_config)
# Test standard ROCm attention
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN"
# MLA test for deepseek related
# MLA test for deepseek related
# Change the attention backend to triton MLA
attention_config = AttentionConfig(backend=AttentionBackendEnum.TRITON_MLA)
vllm_config = VllmConfig(attention_config=attention_config)
# change the attention backend to triton MLA
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA")
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
assert backend.get_name() == "TRITON_MLA"
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
m.setenv("VLLM_ATTENTION_BACKEND", "")
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
attention_config = AttentionConfig(backend=None)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
assert backend.get_name() == "TRITON_MLA"
# change the attention backend to AITER MLA
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA")
# Change the attention backend to AITER MLA
attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_AITER_MLA)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
assert backend.get_name() == "ROCM_AITER_MLA"
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
m.setenv("VLLM_ATTENTION_BACKEND", "")
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
with monkeypatch.context() as m:
m.setenv("VLLM_ROCM_USE_AITER", "1")
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
assert backend.get_name() == "ROCM_AITER_MLA"
attention_config = AttentionConfig(backend=None)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(
576, torch.bfloat16, "auto", 1, False, use_mla=True
)
assert backend.get_name() == "ROCM_AITER_MLA"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
from vllm.platforms import current_platform
if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True)
EXPERT_NUM = [
8,
]
HIDDEN_DIM = [128, 2880]
INTERMEDIATE_DIM = [128, 2880]
BATCH_SIZE = [1, 64, 256]
ACT = ["silu", "swigluoai"]
USE_BIAS = [True, False]
ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
DTYPE = [torch.bfloat16]
_CPU_MOE_ACT = {
"silu": SiluAndMul(),
"swigluoai": SwigluOAIAndMul(),
}
def ref_fused_moe(
input: torch.Tensor,
w13: torch.Tensor,
w2: torch.Tensor,
w13_bias: torch.Tensor | None,
w2_bias: torch.Tensor | None,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
activation: str,
) -> torch.Tensor:
len_experts = w13.size(0)
cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts))
cnts.scatter_(1, topk_ids.to(torch.int64), 1)
tokens_per_expert = cnts.sum(dim=0)
idxs = topk_ids.view(-1).argsort()
sorted_tokens = input[idxs // topk_ids.shape[1]]
tokens_per_expert = tokens_per_expert.cpu().numpy()
outputs = []
start_idx = 0
for i, num_tokens in enumerate(tokens_per_expert):
end_idx = start_idx + num_tokens
if num_tokens == 0:
continue
tokens_for_this_expert = sorted_tokens[start_idx:end_idx].float()
curr_w13 = w13[i].float()
curr_w2 = w2[i].float()
curr_w13_bias = None
if w13_bias is not None:
curr_w13_bias = w13_bias[i].float()
curr_w2_bias = None
if w2_bias is not None:
curr_w2_bias = w2_bias[i].float()
gate_up = torch.nn.functional.linear(
tokens_for_this_expert, curr_w13, curr_w13_bias
)
# Note: to simulate the kernel implementation
gate_up = (
_CPU_MOE_ACT[activation]
.forward_native(gate_up)
.to(dtype=input.dtype)
.float()
)
expert_out = torch.nn.functional.linear(gate_up, curr_w2, curr_w2_bias)
outputs.append(expert_out)
start_idx = end_idx
outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
new_x = torch.empty_like(outs)
new_x[idxs] = outs
final_out = (
new_x.view(*topk_ids.shape, -1)
.mul_(topk_weights.unsqueeze(dim=-1))
.sum(dim=1)
.type(input.dtype)
)
return final_out
@pytest.mark.parametrize("batch_size", BATCH_SIZE)
@pytest.mark.parametrize("expert_num", EXPERT_NUM)
@pytest.mark.parametrize("hidden_size", HIDDEN_DIM)
@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_DIM)
@pytest.mark.parametrize("use_bias", USE_BIAS)
@pytest.mark.parametrize("dtype", DTYPE)
@pytest.mark.parametrize("act", ACT)
@pytest.mark.parametrize("isa", ISA)
def test_cpu_fused_moe(
batch_size: int,
expert_num: int,
hidden_size: int,
intermediate_size: int,
use_bias: bool,
dtype: torch.dtype,
act: str,
isa: str,
):
current_platform.seed_everything(0)
topk_num = max(expert_num // 2, 1)
up_dim = 2 * intermediate_size
input = torch.randn((batch_size, hidden_size), dtype=dtype) / (
0.5 * hidden_size**0.5
)
w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
0.5 * hidden_size**0.5
)
w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
0.5 * intermediate_size**0.5
)
router_logits = torch.randn((batch_size, expert_num), dtype=dtype)
w13_bias = None
w2_bias = None
if use_bias:
w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
0.5 * hidden_size**0.5
)
score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
topk_weight, topk_ids = torch.topk(score, topk_num)
topk_ids = topk_ids.to(torch.int32)
ref_output = ref_fused_moe(
input,
w13,
w2,
w13_bias,
w2_bias,
topk_weight,
topk_ids,
act,
)
packed_w13 = cpu_prepack_moe_weight(w13, isa)
packed_w2 = cpu_prepack_moe_weight(w2, isa)
output = cpu_fused_moe(
input,
packed_w13,
packed_w2,
w13_bias,
w2_bias,
topk_weight,
topk_ids,
act,
isa,
)
atol, rtol = get_default_atol(output), get_default_rtol(output)
(
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol),
f"{torch.max(torch.abs(output - ref_output))}",
)
......@@ -9,8 +9,8 @@ import pytest
import torch
from vllm.model_executor.layers.fused_moe.fused_moe import (
GroupedTopk,
fused_grouped_topk,
grouped_topk,
)
from vllm.platforms import current_platform
......@@ -50,15 +50,17 @@ def test_grouped_topk(
with monkeypatch.context() as m:
m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0")
baseline_topk_weights, baseline_topk_ids = grouped_topk(
hidden_states=hidden_states,
gating_output=gating_output,
grouped_topk = GroupedTopk(
topk=topk,
renormalize=renormalize,
num_expert_group=num_expert_group,
topk_group=topk_group,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
)
baseline_topk_weights, baseline_topk_ids = grouped_topk(
hidden_states=hidden_states,
gating_output=gating_output,
e_score_correction_bias=e_score_correction_bias,
)
......
......@@ -40,7 +40,7 @@ def set_seed(seed):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
def test_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
......@@ -57,35 +57,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
) as llm_flex:
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
attention_config={"backend": "FLEX_ATTENTION"},
) as llm_flex:
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
# Run with default backend
with monkeypatch.context() as m:
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
gpu_memory_utilization=0.85,
) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
gpu_memory_utilization=0.85,
) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=output_flex,
......@@ -99,7 +96,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
def test_encoder_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
......@@ -113,30 +110,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_flex:
flex_outputs = llm_flex.embed(prompts)
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
attention_config={"backend": "FLEX_ATTENTION"},
) as llm_flex:
flex_outputs = llm_flex.embed(prompts)
# Run with default backend
with (
monkeypatch.context() as m,
vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default,
):
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default:
default_outputs = llm_default.embed(prompts)
check_embeddings_close(
......
......@@ -76,6 +76,8 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
enable_lora=True,
max_loras=4,
max_lora_rank=8,
max_num_seqs=2,
max_num_batched_tokens=2048,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
),
......@@ -94,8 +96,10 @@ def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
enable_lora=True,
max_loras=2,
max_lora_rank=8,
max_num_seqs=16,
max_num_seqs=2,
max_num_batched_tokens=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.8,
fully_sharded_loras=fully_sharded_loras,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
......
......@@ -76,11 +76,18 @@ def do_sample(
if lora_id
else None,
)
# Print the outputs.
lora_request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
# The output should include correct lora_request info
if lora_request is not None:
assert output.lora_request.lora_name == lora_request.lora_name
assert output.lora_request.lora_int_id == lora_request.lora_int_id
assert output.lora_request.lora_path == lora_request.lora_path
else:
assert output.lora_request is None
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts
......
......@@ -3,7 +3,7 @@
from collections import OrderedDict
from typing import NamedTuple
from unittest.mock import patch
from unittest.mock import MagicMock, patch
import pytest
from huggingface_hub.utils import HfHubHTTPError
......@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
# Hugging Face model identifier with download error
path = "org/repo"
mock_exist.return_value = False
mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
mock_snapshot_download.side_effect = HfHubHTTPError(
"failed to query model info",
response=MagicMock(),
)
assert get_adapter_absolute_path(path) == path
......@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
@pytest.fixture
def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm():
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
return {"backend": "TRITON_ATTN"}
return None
def run_test(
......@@ -53,6 +55,7 @@ def run_test(
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
attention_config: dict | None = None,
):
"""Inference result should be the same between hf and vllm.
......@@ -80,6 +83,7 @@ def run_test(
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
attention_config=attention_config,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [
......@@ -131,6 +135,7 @@ def test_models(
vllm_runner,
model: str,
audio_assets: AudioTestAssets,
granite_speech_attention_config,
dtype: str,
max_model_len: int,
max_tokens: int,
......@@ -157,4 +162,5 @@ def test_models(
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
attention_config=granite_speech_attention_config,
)
......@@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import os
import warnings
import pytest
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
if not current_platform.is_rocm():
return
@pytest.fixture
def siglip_attention_config():
"""Return attention config for SigLIP tests on ROCm.
siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
if siglip_tests:
os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
warnings.warn(
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
UserWarning,
stacklevel=1,
)
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if current_platform.is_rocm():
return {"backend": "FLEX_ATTENTION"}
return None
......@@ -38,6 +38,7 @@ def _run_test(
*,
dtype: str,
tokenization_kwargs: dict[str, Any] | None = None,
attention_config: dict[str, Any] | None = None,
) -> None:
if tokenization_kwargs is None:
tokenization_kwargs = {}
......@@ -49,6 +50,7 @@ def _run_test(
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
attention_config=attention_config,
) as vllm_model:
vllm_outputs = vllm_model.embed(
input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
......@@ -90,6 +92,7 @@ def test_models_text(
hf_runner,
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
......@@ -108,6 +111,7 @@ def test_models_text(
"padding": "max_length",
"max_length": 64,
}, # siglip2 was trained with this padding setting.
attention_config=siglip_attention_config,
)
......@@ -117,6 +121,7 @@ def test_models_image(
hf_runner,
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
......@@ -133,6 +138,7 @@ def test_models_image(
input_images,
model,
dtype=dtype,
attention_config=siglip_attention_config,
)
......@@ -141,6 +147,7 @@ def test_models_image(
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
......@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
attention_config=siglip_attention_config,
) as vllm_model:
with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment