Commit a810671a authored by zhuwenwen
Browse files

Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

parents 86b5aefe 6a09612b
...@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8" ...@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold: 0.375 accuracy_threshold: 0.375
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
...@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4" ...@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
accuracy_threshold: 0.89 accuracy_threshold: 0.89
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
# GSM8K evaluation config for the NVFP4 Qwen3-Next 80B-A3B checkpoint.
model_name: "nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
accuracy_threshold: 0.75
num_questions: 1319
num_fewshot: 5
# Folded block scalar (`>-`): the indented lines below are joined with single
# spaces into one command-line string, with no trailing newline.
server_args: >-
  --enforce-eager
  --max-model-len 4096
  --tensor-parallel-size 2
  --enable-expert-parallel
  --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
# Extra environment variables for the server process.
env:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
...@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml ...@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
...@@ -11,14 +11,12 @@ def pytest_addoption(parser): ...@@ -11,14 +11,12 @@ def pytest_addoption(parser):
default="configs/models-small.txt", default="configs/models-small.txt",
help="File containing list of config files to test", help="File containing list of config files to test",
) )
parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size")
def pytest_generate_tests(metafunc): def pytest_generate_tests(metafunc):
"""Generate test parameters from config files.""" """Generate test parameters from config files."""
if "config_filename" in metafunc.fixturenames: if "config_filename" in metafunc.fixturenames:
config_list_file = metafunc.config.getoption("--config-list-file") config_list_file = metafunc.config.getoption("--config-list-file")
tp_size = metafunc.config.getoption("--tp-size")
# Handle both relative and absolute paths # Handle both relative and absolute paths
config_list_path = Path(config_list_file) config_list_path = Path(config_list_file)
...@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc): ...@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
# Generate test parameters # Generate test parameters
if config_files: if config_files:
metafunc.parametrize( metafunc.parametrize(
["config_filename", "tp_size"], "config_filename",
[(config_file, int(tp_size)) for config_file in config_files], config_files,
ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files], ids=[config_file.stem for config_file in config_files],
) )
else: else:
print("No config files found, test will be skipped") print("No config files found, test will be skipped")
...@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script. ...@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
Replacement for lm-eval-harness with better performance and control. Replacement for lm-eval-harness with better performance and control.
Usage: Usage:
pytest -s -v test_gsm8k_correctness.py \ pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt \ --config-list-file=configs/models-small.txt
--tp-size=1
""" """
import shlex
import yaml import yaml
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from .gsm8k_eval import evaluate_gsm8k from .gsm8k_eval import evaluate_gsm8k
RTOL = 0.08 # Relative tolerance for accuracy comparison TOL = 0.08 # Absolute tolerance for accuracy comparison
def launch_gsm8k_eval(eval_config, server_url, tp_size): def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
"""Launch GSM8K evaluation using our isolated script.""" """Run GSM8K evaluation using our isolated script."""
# Extract host and port from server URL # Extract host and port from server URL
if "://" in server_url: if "://" in server_url:
server_url = server_url.split("://")[1] server_url = server_url.split("://")[1]
host_port = server_url.split("/")[0] # Remove path if present host_port = server_url.split("/")[0] # Remove path if present
if ":" in host_port: if ":" in host_port:
host, port = host_port.split(":") host, p = host_port.split(":")
port = int(port) port = int(p)
else: else:
host = host_port host = host_port
port = 8000 port = 8000
...@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size): ...@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
return results return results
def test_gsm8k_correctness_param(config_filename, tp_size): def test_gsm8k_correctness(config_filename):
"""Test GSM8K correctness for a given model configuration.""" """Test GSM8K correctness for a given model configuration."""
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
# Server arguments # Parse server arguments from config (use shlex to handle quoted strings)
server_args = [ server_args_str = eval_config.get("server_args", "")
"--max-model-len", server_args = shlex.split(server_args_str) if server_args_str else []
str(eval_config.get("max_model_len", 4096)),
"--enforce-eager", # Add standard server arguments
"--trust-remote-code", server_args.extend(
"--tensor-parallel-size", [
str(tp_size), "--trust-remote-code",
] ]
)
env_dict = eval_config.get("env", None) env_dict = eval_config.get("env", None)
print(f"Starting GSM8K evaluation for model: {eval_config['model_name']}")
print(f"Expected metric threshold: {eval_config['accuracy_threshold']}")
print(f"Number of questions: {eval_config['num_questions']}")
print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
print(f"Server args: {' '.join(server_args)}")
# Launch server and run evaluation # Launch server and run evaluation
with RemoteOpenAIServer( with RemoteOpenAIServer(
eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480 eval_config["model_name"],
server_args,
env_dict=env_dict,
max_wait_seconds=600,
) as remote_server: ) as remote_server:
server_url = remote_server.url_for("v1") server_url = remote_server.url_for("v1")
print(f"Server started at: {server_url}")
results = launch_gsm8k_eval(eval_config, server_url, tp_size) results = run_gsm8k_eval(eval_config, server_url)
# Check accuracy against threshold measured_metric = results["accuracy"]
measured_accuracy = results["accuracy"] expected_metric = eval_config["accuracy_threshold"]
expected_accuracy = eval_config["accuracy_threshold"]
print(f"GSM8K Results for {eval_config['model_name']}:") print(f"GSM8K Results for {eval_config['model_name']}:")
print(f" Accuracy: {measured_accuracy:.3f}") print(f" Measured metric: {measured_metric:.4f}")
print(f" Expected: {expected_accuracy:.3f}") print(f" Expected metric: {expected_metric:.4f}")
print(f" Tolerance: {TOL:.4f}")
print(f" Questions: {results['num_questions']}") print(f" Questions: {results['num_questions']}")
print(f" Invalid rate: {results['invalid_rate']:.3f}") print(f" Invalid rate: {results['invalid_rate']:.3f}")
print(f" Latency: {results['latency']:.1f}s") print(f" Latency: {results['latency']:.1f}s")
print(f" QPS: {results['questions_per_second']:.1f}") print(f" QPS: {results['questions_per_second']:.1f}")
# Verify accuracy is within tolerance # Verify metric is within tolerance
assert measured_accuracy >= expected_accuracy - RTOL, ( assert measured_metric >= expected_metric - TOL, (
f"Accuracy too low: {measured_accuracy:.3f} < " f"GSM8K metric too low: {measured_metric:.4f} < "
f"{expected_accuracy:.3f} - {RTOL:.3f}" f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
) )
print(f"✅ GSM8K test passed for {eval_config['model_name']}") print(f"✅ GSM8K test passed for {eval_config['model_name']}")
...@@ -9,7 +9,8 @@ import torch ...@@ -9,7 +9,8 @@ import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.layer import Attention, MultiHeadAttention from vllm.attention.layer import Attention
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes from vllm.utils.mem_utils import get_max_shared_memory_bytes
...@@ -442,7 +443,7 @@ def ref_multi_query_kv_attention( ...@@ -442,7 +443,7 @@ def ref_multi_query_kv_attention(
return torch.cat(ref_outputs, dim=0) return torch.cat(ref_outputs, dim=0)
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention]) @pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention])
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None: def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
head_size = 64 head_size = 64
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
......
...@@ -6,7 +6,9 @@ from unittest.mock import patch ...@@ -6,7 +6,9 @@ from unittest.mock import patch
import pytest import pytest
import torch import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform from vllm.platforms.cuda import CudaPlatform
...@@ -73,18 +75,18 @@ def generate_params(): ...@@ -73,18 +75,18 @@ def generate_params():
@pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
def test_env( def test_backend_selection(
device: str, device: str,
name: str, name: str,
use_mla: bool, use_mla: bool,
block_size: int, block_size: int,
monkeypatch: pytest.MonkeyPatch,
): ):
"""Test attention backend selection with valid device-backend pairs.""" """Test attention backend selection with valid device-backend pairs."""
with monkeypatch.context() as m: # Create AttentionConfig with the specified backend
m.setenv("VLLM_ATTENTION_BACKEND", name) attention_config = AttentionConfig(backend=AttentionBackendEnum[name])
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
if device == "cpu": if device == "cpu":
with patch("vllm.platforms.current_platform", CpuPlatform()): with patch("vllm.platforms.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size) backend = get_attn_backend(16, torch.float16, None, block_size)
...@@ -217,27 +219,32 @@ def test_env( ...@@ -217,27 +219,32 @@ def test_env(
@pytest.mark.parametrize("device", ["cpu", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_fp32_fallback(device: str): def test_fp32_fallback(device: str):
"""Test attention backend selection with fp32.""" """Test attention backend selection with fp32."""
if device == "cpu": # Use default config (no backend specified)
with patch("vllm.platforms.current_platform", CpuPlatform()): vllm_config = VllmConfig()
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "CPU_ATTN"
elif device == "cuda": with set_current_vllm_config(vllm_config):
with patch("vllm.platforms.current_platform", CudaPlatform()): if device == "cpu":
backend = get_attn_backend(16, torch.float32, None, 16) with patch("vllm.platforms.current_platform", CpuPlatform()):
assert backend.get_name() == "FLEX_ATTENTION" backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "CPU_ATTN"
elif device == "cuda":
with patch("vllm.platforms.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
def test_flash_attn(monkeypatch: pytest.MonkeyPatch): def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation.""" """Test FlashAttn validation."""
pytest.skip( pytest.skip(
"Skipping as current backend selector does not " "Skipping as current backend selector does not "
"handle fallbacks when a backend is set via env var." "handle fallbacks when a backend is explicitly set."
) )
with monkeypatch.context() as m: attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN)
m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN") vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
# Unsupported CUDA arch # Unsupported CUDA arch
monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5)) monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
backend = get_attn_backend(16, torch.float16, None, 16) backend = get_attn_backend(16, torch.float16, None, 16)
...@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): ...@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
assert backend.get_name() != "FLASH_ATTN" assert backend.get_name() != "FLASH_ATTN"
def test_invalid_env(monkeypatch: pytest.MonkeyPatch): def test_invalid_backend():
"""Test that invalid attention backend names raise ValueError.""" """Test that invalid attention backend names raise ValueError."""
with ( with (
monkeypatch.context() as m, pytest.raises(ValueError),
patch("vllm.platforms.current_platform", CudaPlatform()),
): ):
m.setenv("VLLM_ATTENTION_BACKEND", "INVALID") # Invalid backend name should raise ValueError when creating enum
AttentionConfig(backend=AttentionBackendEnum["INVALID"])
# Should raise ValueError for invalid backend
with pytest.raises(ValueError) as exc_info:
get_attn_backend(32, torch.float16, None, 16)
assert "Invalid value 'INVALID'" in str(exc_info.value)
...@@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline( ...@@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline(
torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol), torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol),
f"{torch.max(torch.abs(output - output_trtllm))}", f"{torch.max(torch.abs(output - output_trtllm))}",
) )
def test_trtllm_attention_rejects_num_kv_heads_1() -> None:
    """Check that the TRTLLM attention gate refuses num_kv_heads=1 (MQA).

    With a single KV head the KV cache strides become degenerate
    (stride_heads == stride_batch), and CUDA's cuTensorMapEncodeTiled fails
    because TMA descriptors cannot handle degenerate 4D tensors with
    singleton dimensions.

    The test therefore expects can_use_trtllm_attention to return False
    for every num_kv_heads=1 configuration it probes.
    """
    from vllm.utils.flashinfer import can_use_trtllm_attention

    # A single KV head must be refused regardless of the query-head count.
    for qo_heads in (64, 32):
        accepted = can_use_trtllm_attention(num_qo_heads=qo_heads, num_kv_heads=1)
        assert not accepted, (
            "can_use_trtllm_attention should return False for num_kv_heads=1"
        )

    # The num_kv_heads > 1 answer depends on the platform (it may be False on
    # non-Blackwell hardware), so it is only used as a precondition below.
    supported_kv8 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=8)
    supported_kv1 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1)

    if not supported_kv8:
        return

    # Where TRTLLM is usable with 8 KV heads, 1 KV head must still be refused.
    assert not supported_kv1, (
        "If TRTLLM is supported for num_kv_heads=8, "
        "it must be rejected for num_kv_heads=1"
    )
...@@ -3,16 +3,17 @@ ...@@ -3,16 +3,17 @@
""" """
Test: Test:
* Tests for MultiHeadAttention layer * Tests for MMEncoderAttention layer
""" """
import itertools
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
import torch import torch
from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import MultiHeadAttention from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.attention.selector import _cached_get_attn_backend from vllm.attention.selector import _cached_get_attn_backend
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cpu import CpuPlatform
...@@ -42,35 +43,31 @@ def test_mha_attn_platform(device: str): ...@@ -42,35 +43,31 @@ def test_mha_attn_platform(device: str):
if device == "cpu": if device == "cpu":
with ( with (
patch("vllm.attention.layer.current_platform", CpuPlatform()),
patch("vllm.model_executor.models.vision.current_platform", CpuPlatform()), patch("vllm.model_executor.models.vision.current_platform", CpuPlatform()),
): ):
attn = MultiHeadAttention(16, 64, scale=1) attn = MMEncoderAttention(16, 64, scale=1)
assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
elif device == "hip": elif device == "hip":
with ( with (
patch("vllm.attention.layer.current_platform", RocmPlatform()),
patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()), patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
): ):
attn = MultiHeadAttention(16, 64, scale=1) attn = MMEncoderAttention(16, 64, scale=1)
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
else: else:
# Test CUDA with head_size=64 (divisible by 32) # Test CUDA with head_size=64 (divisible by 32)
# - should use vLLM's FlashAttention # - should use vLLM's FlashAttention
with ( with (
patch("vllm.attention.layer.current_platform", CudaPlatform()),
patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
): ):
attn = MultiHeadAttention(16, 64, scale=1) attn = MMEncoderAttention(16, 64, scale=1)
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
# Test CUDA with head_size=72 (not divisible by 32) # Test CUDA with head_size=72 (not divisible by 32)
# - should use vLLM's FlashAttention # - should use vLLM's FlashAttention
with ( with (
patch("vllm.attention.layer.current_platform", CudaPlatform()),
patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
): ):
attn = MultiHeadAttention(16, 72, scale=1) attn = MMEncoderAttention(16, 72, scale=1)
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
...@@ -94,6 +91,10 @@ def ref_attention( ...@@ -94,6 +91,10 @@ def ref_attention(
BATCH_SIZES = [1, 16] BATCH_SIZES = [1, 16]
SEQ_LENS = [1] SEQ_LENS = [1]
VAR_SEQ_LENS = [
[2, 2],
[2, 3, 4],
]
NUM_HEADS = [1, 16] NUM_HEADS = [1, 16]
NUM_KV_HEADS = [1] NUM_KV_HEADS = [1]
HEAD_SIZES = [64, 80] HEAD_SIZES = [64, 80]
...@@ -130,7 +131,7 @@ def test_mha_attn_forward( ...@@ -130,7 +131,7 @@ def test_mha_attn_forward(
k = torch.randn(batch_size, seq_len, num_kv_heads * head_size) k = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
v = torch.randn(batch_size, seq_len, num_kv_heads * head_size) v = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
scale = 1.0 / head_size**0.5 scale = 1.0 / head_size**0.5
attn = MultiHeadAttention( attn = MMEncoderAttention(
num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
) )
output = attn(q, k, v) output = attn(q, k, v)
...@@ -151,3 +152,58 @@ def test_mha_attn_forward( ...@@ -151,3 +152,58 @@ def test_mha_attn_forward(
scale=scale, scale=scale,
).reshape(batch_size, seq_len, num_heads * head_size) ).reshape(batch_size, seq_len, num_heads * head_size)
torch.testing.assert_close(output, ref_output) torch.testing.assert_close(output, ref_output)
@pytest.mark.parametrize("var_seq_len", VAR_SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_mha_attn_varlen_forward(
    var_seq_len: list[int],
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    dtype: torch.dtype,
    device: str,
):
    """Compare MMEncoderAttention on packed variable-length input to a reference.

    All sequences in ``var_seq_len`` are concatenated along dim 1 of a single
    batch entry and passed to the layer together with ``cu_seqlens`` and
    ``max_seqlen``. The expected output is built by running ``ref_attention``
    on each sequence separately and concatenating the results.
    """
    current_platform.seed_everything(0)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    total_len = sum(var_seq_len)
    # Draw order (q, then k, then v) is fixed so the seeded inputs stay the same.
    q = torch.randn(1, total_len, num_heads, head_size)
    k = torch.randn(1, total_len, num_kv_heads, head_size)
    v = torch.randn(1, total_len, num_kv_heads, head_size)

    # Cumulative sequence boundaries, starting at 0.
    boundaries = [0, *itertools.accumulate(var_seq_len)]
    cu_seqlens = torch.tensor(boundaries, dtype=torch.int32)

    scale = 1.0 / head_size**0.5
    attn = MMEncoderAttention(
        num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
    )
    output = attn(
        q, k, v, cu_seqlens=cu_seqlens, max_seqlen=torch.tensor(max(var_seq_len))
    )

    assert num_heads % num_kv_heads == 0
    group_size = num_heads // num_kv_heads
    if group_size > 1:
        # Expand the KV heads so the reference sees one KV head per query head.
        k = torch.repeat_interleave(k, group_size, dim=2)
        v = torch.repeat_interleave(v, group_size, dim=2)

    q_parts = torch.split(q, var_seq_len, dim=1)
    k_parts = torch.split(k, var_seq_len, dim=1)
    v_parts = torch.split(v, var_seq_len, dim=1)
    ref_output = torch.cat(
        [
            ref_attention(q_i, k_i, v_i, scale=scale)
            for q_i, k_i, v_i in zip(q_parts, k_parts, v_parts)
        ],
        dim=1,
    )
    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
...@@ -4,7 +4,9 @@ ...@@ -4,7 +4,9 @@
import pytest import pytest
import torch import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
from vllm.platforms.rocm import RocmPlatform from vllm.platforms.rocm import RocmPlatform
...@@ -16,40 +18,56 @@ def clear_cache(): ...@@ -16,40 +18,56 @@ def clear_cache():
@pytest.mark.skip(reason="Skipped for now. Should be revisited.") @pytest.mark.skip(reason="Skipped for now. Should be revisited.")
def test_selector(monkeypatch: pytest.MonkeyPatch): def test_selector(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: # Set the current platform to ROCm using monkeypatch
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN") monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
# Set the current platform to ROCm using monkeypatch # Test standard ROCm attention
monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform()) attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN)
vllm_config = VllmConfig(attention_config=attention_config)
# Test standard ROCm attention with set_current_vllm_config(vllm_config):
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN" assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN"
# MLA test for deepseek related # MLA test for deepseek related
# Change the attention backend to triton MLA
attention_config = AttentionConfig(backend=AttentionBackendEnum.TRITON_MLA)
vllm_config = VllmConfig(attention_config=attention_config)
# change the attention backend to triton MLA with set_current_vllm_config(vllm_config):
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA")
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True) backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
assert backend.get_name() == "TRITON_MLA" assert backend.get_name() == "TRITON_MLA"
# If attention backend is None # If attention backend is None
# If use_mla is true # If use_mla is true
# The selected backend is triton MLA # The selected backend is triton MLA
m.setenv("VLLM_ATTENTION_BACKEND", "") attention_config = AttentionConfig(backend=None)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True) backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
assert backend.get_name() == "TRITON_MLA" assert backend.get_name() == "TRITON_MLA"
# change the attention backend to AITER MLA # Change the attention backend to AITER MLA
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA") attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_AITER_MLA)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True) backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
assert backend.get_name() == "ROCM_AITER_MLA" assert backend.get_name() == "ROCM_AITER_MLA"
# If attention backend is None # If attention backend is None
# If use_mla is true # If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled # If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA # The selected backend is ROCM_AITER_MLA
m.setenv("VLLM_ATTENTION_BACKEND", "") with monkeypatch.context() as m:
m.setenv("VLLM_ROCM_USE_AITER", "1") m.setenv("VLLM_ROCM_USE_AITER", "1")
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
assert backend.get_name() == "ROCM_AITER_MLA" attention_config = AttentionConfig(backend=None)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(
576, torch.bfloat16, "auto", 1, False, use_mla=True
)
assert backend.get_name() == "ROCM_AITER_MLA"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
from vllm.platforms import current_platform
if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True)
# Parameter grids consumed by the parametrized CPU fused-MoE test.
EXPERT_NUM = [8]
HIDDEN_DIM = [128, 2880]
INTERMEDIATE_DIM = [128, 2880]
BATCH_SIZE = [1, 64, 256]
ACT = ["silu", "swigluoai"]
USE_BIAS = [True, False]
DTYPE = [torch.bfloat16]
# "amx" is only included when torch reports AMX tile support on this CPU.
ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]

# Activation name -> module whose forward_native the reference path calls.
_CPU_MOE_ACT = {
    "silu": SiluAndMul(),
    "swigluoai": SwigluOAIAndMul(),
}
def ref_fused_moe(
    input: torch.Tensor,
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_bias: torch.Tensor | None,
    w2_bias: torch.Tensor | None,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
) -> torch.Tensor:
    """Reference mixture-of-experts forward pass computed expert by expert.

    Token/expert assignments are sorted by expert id, each expert's slice is
    pushed through ``linear -> activation -> linear`` in float32, and the
    per-assignment results are scattered back, scaled by ``topk_weights`` and
    summed over the top-k axis.

    Args:
        input: Token activations, indexed along dim 0 by token.
        w13: Per-expert weights of the first projection (dim 0 is the expert).
        w2: Per-expert weights of the second projection (dim 0 is the expert).
        w13_bias: Optional per-expert bias for the first projection.
        w2_bias: Optional per-expert bias for the second projection.
        topk_weights: Routing weight for every (token, slot) pair.
        topk_ids: Selected expert id for every (token, slot) pair.
        activation: Key into ``_CPU_MOE_ACT`` naming the gated activation.

    Returns:
        The combined expert output, cast to ``input.dtype``.
    """
    num_experts = w13.size(0)

    # Count how many assignments were routed to each expert.
    routed = topk_ids.new_zeros((topk_ids.shape[0], num_experts))
    routed.scatter_(1, topk_ids.to(torch.int64), 1)
    per_expert_counts = routed.sum(dim=0).cpu().numpy()

    # Order the flattened (token, slot) assignments by expert id and gather the
    # matching token rows; integer division maps an assignment to its token.
    order = topk_ids.view(-1).argsort()
    gathered = input[order // topk_ids.shape[1]]

    expert_outputs = []
    offset = 0
    for expert_id, count in enumerate(per_expert_counts):
        if count == 0:
            continue
        chunk_end = offset + count
        chunk = gathered[offset:chunk_end].float()

        bias13 = w13_bias[expert_id].float() if w13_bias is not None else None
        bias2 = w2_bias[expert_id].float() if w2_bias is not None else None

        hidden = torch.nn.functional.linear(chunk, w13[expert_id].float(), bias13)
        # Note: to simulate the kernel implementation, the activation output is
        # cast to the input dtype before continuing in float32.
        hidden = (
            _CPU_MOE_ACT[activation]
            .forward_native(hidden)
            .to(dtype=input.dtype)
            .float()
        )
        expert_outputs.append(
            torch.nn.functional.linear(hidden, w2[expert_id].float(), bias2)
        )
        offset = chunk_end

    if expert_outputs:
        stacked = torch.cat(expert_outputs, dim=0)
    else:
        stacked = gathered.new_empty(0)

    # Undo the sort so that row j again belongs to flattened assignment j.
    restored = torch.empty_like(stacked)
    restored[order] = stacked

    weighted = restored.view(*topk_ids.shape, -1)
    weighted.mul_(topk_weights.unsqueeze(dim=-1))
    return weighted.sum(dim=1).type(input.dtype)
@pytest.mark.parametrize("batch_size", BATCH_SIZE)
@pytest.mark.parametrize("expert_num", EXPERT_NUM)
@pytest.mark.parametrize("hidden_size", HIDDEN_DIM)
@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_DIM)
@pytest.mark.parametrize("use_bias", USE_BIAS)
@pytest.mark.parametrize("dtype", DTYPE)
@pytest.mark.parametrize("act", ACT)
@pytest.mark.parametrize("isa", ISA)
def test_cpu_fused_moe(
    batch_size: int,
    expert_num: int,
    hidden_size: int,
    intermediate_size: int,
    use_bias: bool,
    dtype: torch.dtype,
    act: str,
    isa: str,
):
    """Check cpu_fused_moe against the ref_fused_moe reference implementation.

    Random activations, expert weights, optional biases and router logits are
    generated from a fixed seed. The weights are prepacked for the requested
    ISA, and the kernel output must match the reference within the default
    tolerances for the output dtype.
    """
    current_platform.seed_everything(0)

    topk_num = max(expert_num // 2, 1)
    up_dim = 2 * intermediate_size

    # Random tensors are divided by 0.5 * sqrt(dim) of the relevant dimension.
    hidden_states = torch.randn((batch_size, hidden_size), dtype=dtype) / (
        0.5 * hidden_size**0.5
    )
    w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
        0.5 * hidden_size**0.5
    )
    w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
        0.5 * intermediate_size**0.5
    )
    router_logits = torch.randn((batch_size, expert_num), dtype=dtype)

    w13_bias = None
    w2_bias = None
    if use_bias:
        w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
        w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
            0.5 * hidden_size**0.5
        )

    # Routing: softmax in float32, then keep the top-k experts per token.
    score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk_num)
    topk_ids = topk_ids.to(torch.int32)

    ref_output = ref_fused_moe(
        hidden_states,
        w13,
        w2,
        w13_bias,
        w2_bias,
        topk_weight,
        topk_ids,
        act,
    )

    # The kernel consumes ISA-specific prepacked weights.
    packed_w13 = cpu_prepack_moe_weight(w13, isa)
    packed_w2 = cpu_prepack_moe_weight(w2, isa)
    output = cpu_fused_moe(
        hidden_states,
        packed_w13,
        packed_w2,
        w13_bias,
        w2_bias,
        topk_weight,
        topk_ids,
        act,
        isa,
    )

    atol, rtol = get_default_atol(output), get_default_rtol(output)
    # A callable msg receives the default failure report, so the max absolute
    # difference is appended to it and only computed when the check fails.
    torch.testing.assert_close(
        output,
        ref_output,
        atol=atol,
        rtol=rtol,
        msg=lambda default_msg: (
            f"{default_msg}\n"
            f"max abs diff: {torch.max(torch.abs(output - ref_output))}"
        ),
    )
...@@ -9,8 +9,8 @@ import pytest ...@@ -9,8 +9,8 @@ import pytest
import torch import torch
from vllm.model_executor.layers.fused_moe.fused_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import (
GroupedTopk,
fused_grouped_topk, fused_grouped_topk,
grouped_topk,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -50,15 +50,17 @@ def test_grouped_topk( ...@@ -50,15 +50,17 @@ def test_grouped_topk(
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0") m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0")
baseline_topk_weights, baseline_topk_ids = grouped_topk( grouped_topk = GroupedTopk(
hidden_states=hidden_states,
gating_output=gating_output,
topk=topk, topk=topk,
renormalize=renormalize, renormalize=renormalize,
num_expert_group=num_expert_group, num_expert_group=num_expert_group,
topk_group=topk_group, topk_group=topk_group,
scoring_func=scoring_func, scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor, routed_scaling_factor=routed_scaling_factor,
)
baseline_topk_weights, baseline_topk_ids = grouped_topk(
hidden_states=hidden_states,
gating_output=gating_output,
e_score_correction_bias=e_score_correction_bias, e_score_correction_bias=e_score_correction_bias,
) )
......
...@@ -40,7 +40,7 @@ def set_seed(seed): ...@@ -40,7 +40,7 @@ def set_seed(seed):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7", reason="CUDA not available or PyTorch version < 2.7",
) )
def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): def test_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend. """Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with This test compares the outputs from the FlexAttention backend with
...@@ -57,35 +57,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -57,35 +57,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
] ]
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: set_seed(seed)
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") with vllm_runner(
model_name,
set_seed(seed) runner="generate",
with vllm_runner( tensor_parallel_size=1,
model_name, num_gpu_blocks_override=128,
runner="generate", enforce_eager=True,
tensor_parallel_size=1, attention_config={"backend": "FLEX_ATTENTION"},
num_gpu_blocks_override=128, ) as llm_flex:
enforce_eager=True, output_flex = llm_flex.generate_greedy_logprobs(
) as llm_flex: prompts, max_tokens, num_logprobs
output_flex = llm_flex.generate_greedy_logprobs( )
prompts, max_tokens, num_logprobs
)
# Run with default backend # Run with default backend
with monkeypatch.context() as m: set_seed(seed)
set_seed(seed) with vllm_runner(
with vllm_runner( model_name,
model_name, runner="generate",
runner="generate", tensor_parallel_size=1,
tensor_parallel_size=1, num_gpu_blocks_override=128,
num_gpu_blocks_override=128, enforce_eager=True,
enforce_eager=True, gpu_memory_utilization=0.85,
gpu_memory_utilization=0.85, ) as llm_default:
) as llm_default: output_default = llm_default.generate_greedy_logprobs(
output_default = llm_default.generate_greedy_logprobs( prompts, max_tokens, num_logprobs
prompts, max_tokens, num_logprobs )
)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=output_flex, outputs_0_lst=output_flex,
...@@ -99,7 +96,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -99,7 +96,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7", reason="CUDA not available or PyTorch version < 2.7",
) )
def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): def test_encoder_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend. """Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with This test compares the outputs from the FlexAttention backend with
...@@ -113,30 +110,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -113,30 +110,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
] ]
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with vllm_runner(
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") model_name,
with vllm_runner( runner="pooling",
model_name, dtype=torch.bfloat16,
runner="pooling", tensor_parallel_size=1,
dtype=torch.bfloat16, max_model_len=100,
tensor_parallel_size=1, enforce_eager=True,
max_model_len=100, attention_config={"backend": "FLEX_ATTENTION"},
enforce_eager=True, ) as llm_flex:
) as llm_flex: flex_outputs = llm_flex.embed(prompts)
flex_outputs = llm_flex.embed(prompts)
# Run with default backend # Run with default backend
with ( with vllm_runner(
monkeypatch.context() as m, model_name,
vllm_runner( runner="pooling",
model_name, dtype=torch.bfloat16,
runner="pooling", tensor_parallel_size=1,
dtype=torch.bfloat16, max_model_len=100,
tensor_parallel_size=1, enforce_eager=True,
max_model_len=100, ) as llm_default:
enforce_eager=True,
) as llm_default,
):
default_outputs = llm_default.embed(prompts) default_outputs = llm_default.embed(prompts)
check_embeddings_close( check_embeddings_close(
......
...@@ -76,6 +76,8 @@ def test_gpt_oss_lora(gptoss20b_lora_files): ...@@ -76,6 +76,8 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
enable_lora=True, enable_lora=True,
max_loras=4, max_loras=4,
max_lora_rank=8, max_lora_rank=8,
max_num_seqs=2,
max_num_batched_tokens=2048,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False, cudagraph_specialize_lora=False,
), ),
...@@ -94,8 +96,10 @@ def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras): ...@@ -94,8 +96,10 @@ def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
enable_lora=True, enable_lora=True,
max_loras=2, max_loras=2,
max_lora_rank=8, max_lora_rank=8,
max_num_seqs=16, max_num_seqs=2,
max_num_batched_tokens=2048,
tensor_parallel_size=2, tensor_parallel_size=2,
gpu_memory_utilization=0.8,
fully_sharded_loras=fully_sharded_loras, fully_sharded_loras=fully_sharded_loras,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False, cudagraph_specialize_lora=False,
......
...@@ -76,11 +76,18 @@ def do_sample( ...@@ -76,11 +76,18 @@ def do_sample(
if lora_id if lora_id
else None, else None,
) )
# Print the outputs. lora_request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
generated_texts: list[str] = [] generated_texts: list[str] = []
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
# The output should include correct lora_request info
if lora_request is not None:
assert output.lora_request.lora_name == lora_request.lora_name
assert output.lora_request.lora_int_id == lora_request.lora_int_id
assert output.lora_request.lora_path == lora_request.lora_path
else:
assert output.lora_request is None
generated_texts.append(generated_text) generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts return generated_texts
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
from collections import OrderedDict from collections import OrderedDict
from typing import NamedTuple from typing import NamedTuple
from unittest.mock import patch from unittest.mock import MagicMock, patch
import pytest import pytest
from huggingface_hub.utils import HfHubHTTPError from huggingface_hub.utils import HfHubHTTPError
...@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error( ...@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
# Hugging Face model identifier with download error # Hugging Face model identifier with download error
path = "org/repo" path = "org/repo"
mock_exist.return_value = False mock_exist.return_value = False
mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info") mock_snapshot_download.side_effect = HfHubHTTPError(
"failed to query model info",
response=MagicMock(),
)
assert get_adapter_absolute_path(path) == path assert get_adapter_absolute_path(path) == path
...@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME ...@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
models = [MODEL_NAME] models = [MODEL_NAME]
@pytest.fixture(autouse=True) @pytest.fixture
def set_attention_backend_for_rocm(monkeypatch): def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm(): if current_platform.is_rocm():
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") return {"backend": "TRITON_ATTN"}
return None
def run_test( def run_test(
...@@ -53,6 +55,7 @@ def run_test( ...@@ -53,6 +55,7 @@ def run_test(
num_logprobs: int, num_logprobs: int,
tensor_parallel_size: int, tensor_parallel_size: int,
distributed_executor_backend: str | None = None, distributed_executor_backend: str | None = None,
attention_config: dict | None = None,
): ):
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
...@@ -80,6 +83,7 @@ def run_test( ...@@ -80,6 +83,7 @@ def run_test(
enable_lora=True, enable_lora=True,
max_lora_rank=64, max_lora_rank=64,
enforce_eager=True, enforce_eager=True,
attention_config=attention_config,
) as vllm_model: ) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path) lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [ vllm_outputs_per_case = [
...@@ -131,6 +135,7 @@ def test_models( ...@@ -131,6 +135,7 @@ def test_models(
vllm_runner, vllm_runner,
model: str, model: str,
audio_assets: AudioTestAssets, audio_assets: AudioTestAssets,
granite_speech_attention_config,
dtype: str, dtype: str,
max_model_len: int, max_model_len: int,
max_tokens: int, max_tokens: int,
...@@ -157,4 +162,5 @@ def test_models( ...@@ -157,4 +162,5 @@ def test_models(
max_tokens=max_tokens, max_tokens=max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tensor_parallel_size=1, tensor_parallel_size=1,
attention_config=granite_speech_attention_config,
) )
...@@ -2,23 +2,17 @@ ...@@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests.""" """Pytest configuration for vLLM pooling tests."""
import os import pytest
import warnings
from vllm.platforms import current_platform from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items): @pytest.fixture
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm.""" def siglip_attention_config():
if not current_platform.is_rocm(): """Return attention config for SigLIP tests on ROCm.
return
siglip_tests = [item for item in items if "test_siglip" in item.nodeid] On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if siglip_tests: if current_platform.is_rocm():
os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION" return {"backend": "FLEX_ATTENTION"}
warnings.warn( return None
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
UserWarning,
stacklevel=1,
)
...@@ -38,6 +38,7 @@ def _run_test( ...@@ -38,6 +38,7 @@ def _run_test(
*, *,
dtype: str, dtype: str,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
attention_config: dict[str, Any] | None = None,
) -> None: ) -> None:
if tokenization_kwargs is None: if tokenization_kwargs is None:
tokenization_kwargs = {} tokenization_kwargs = {}
...@@ -49,6 +50,7 @@ def _run_test( ...@@ -49,6 +50,7 @@ def _run_test(
enforce_eager=True, enforce_eager=True,
max_model_len=64, max_model_len=64,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
attention_config=attention_config,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.embed( vllm_outputs = vllm_model.embed(
input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
...@@ -90,6 +92,7 @@ def test_models_text( ...@@ -90,6 +92,7 @@ def test_models_text(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
image_assets, image_assets,
siglip_attention_config,
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
...@@ -108,6 +111,7 @@ def test_models_text( ...@@ -108,6 +111,7 @@ def test_models_text(
"padding": "max_length", "padding": "max_length",
"max_length": 64, "max_length": 64,
}, # siglip2 was trained with this padding setting. }, # siglip2 was trained with this padding setting.
attention_config=siglip_attention_config,
) )
...@@ -117,6 +121,7 @@ def test_models_image( ...@@ -117,6 +121,7 @@ def test_models_image(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
image_assets, image_assets,
siglip_attention_config,
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
...@@ -133,6 +138,7 @@ def test_models_image( ...@@ -133,6 +138,7 @@ def test_models_image(
input_images, input_images,
model, model,
dtype=dtype, dtype=dtype,
attention_config=siglip_attention_config,
) )
...@@ -141,6 +147,7 @@ def test_models_image( ...@@ -141,6 +147,7 @@ def test_models_image(
def test_models_text_image_no_crash( def test_models_text_image_no_crash(
vllm_runner, vllm_runner,
image_assets, image_assets,
siglip_attention_config,
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
...@@ -154,6 +161,7 @@ def test_models_text_image_no_crash( ...@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager=True, enforce_eager=True,
max_model_len=64, max_model_len=64,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
attention_config=siglip_attention_config,
) as vllm_model: ) as vllm_model:
with pytest.raises(ValueError, match="not both"): with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images) vllm_model.embed(texts, images=images)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment