Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a810671a
Commit
a810671a
authored
Jan 08, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori
parents
86b5aefe
6a09612b
Changes
291
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
496 additions
and
173 deletions
+496
-173
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
+1
-2
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+12
-0
tests/evals/gsm8k/configs/models-blackwell.txt
tests/evals/gsm8k/configs/models-blackwell.txt
+1
-0
tests/evals/gsm8k/conftest.py
tests/evals/gsm8k/conftest.py
+3
-5
tests/evals/gsm8k/test_gsm8k_correctness.py
tests/evals/gsm8k/test_gsm8k_correctness.py
+41
-29
tests/kernels/attention/test_attention.py
tests/kernels/attention/test_attention.py
+3
-2
tests/kernels/attention/test_attention_selector.py
tests/kernels/attention/test_attention_selector.py
+27
-25
tests/kernels/attention/test_flashinfer_trtllm_attention.py
tests/kernels/attention/test_flashinfer_trtllm_attention.py
+35
-0
tests/kernels/attention/test_mha_attn.py
tests/kernels/attention/test_mha_attn.py
+67
-11
tests/kernels/attention/test_rocm_attention_selector.py
tests/kernels/attention/test_rocm_attention_selector.py
+39
-21
tests/kernels/moe/test_cpu_fused_moe.py
tests/kernels/moe/test_cpu_fused_moe.py
+172
-0
tests/kernels/moe/test_grouped_topk.py
tests/kernels/moe/test_grouped_topk.py
+6
-4
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+44
-51
tests/lora/test_gptoss_tp.py
tests/lora/test_gptoss_tp.py
+5
-1
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+8
-1
tests/lora/test_utils.py
tests/lora/test_utils.py
+5
-2
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_granite_speech.py
+9
-3
tests/models/multimodal/pooling/conftest.py
tests/models/multimodal/pooling/conftest.py
+9
-15
tests/models/multimodal/pooling/test_siglip.py
tests/models/multimodal/pooling/test_siglip.py
+8
-0
No files found.
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
View file @
a810671a
...
...
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold
:
0.375
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
View file @
a810671a
...
...
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
accuracy_threshold
:
0.89
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
0 → 100644
View file @
a810671a
model_name
:
"
nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
accuracy_threshold
:
0.75
num_questions
:
1319
num_fewshot
:
5
server_args
:
>-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
env
:
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
tests/evals/gsm8k/configs/models-blackwell.txt
View file @
a810671a
...
...
@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
tests/evals/gsm8k/conftest.py
View file @
a810671a
...
...
@@ -11,14 +11,12 @@ def pytest_addoption(parser):
default
=
"configs/models-small.txt"
,
help
=
"File containing list of config files to test"
,
)
parser
.
addoption
(
"--tp-size"
,
default
=
1
,
type
=
int
,
help
=
"Tensor parallel size"
)
def
pytest_generate_tests
(
metafunc
):
"""Generate test parameters from config files."""
if
"config_filename"
in
metafunc
.
fixturenames
:
config_list_file
=
metafunc
.
config
.
getoption
(
"--config-list-file"
)
tp_size
=
metafunc
.
config
.
getoption
(
"--tp-size"
)
# Handle both relative and absolute paths
config_list_path
=
Path
(
config_list_file
)
...
...
@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
# Generate test parameters
if
config_files
:
metafunc
.
parametrize
(
[
"config_filename"
,
"tp_size"
],
[(
config_file
,
int
(
tp_size
))
for
config_file
in
config_files
]
,
ids
=
[
f
"
{
config_file
.
stem
}
-tp
{
tp_size
}
"
for
config_file
in
config_files
],
"config_filename"
,
config_files
,
ids
=
[
config_file
.
stem
for
config_file
in
config_files
],
)
else
:
print
(
"No config files found, test will be skipped"
)
tests/evals/gsm8k/test_gsm8k_correctness.py
View file @
a810671a
...
...
@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
Replacement for lm-eval-harness with better performance and control.
Usage:
pytest -s -v test_gsm8k_correctness.py
\
--config-list-file=configs/models-small.txt
\
--tp-size=1
pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py
\
--config-list-file=configs/models-small.txt
"""
import
shlex
import
yaml
from
tests.utils
import
RemoteOpenAIServer
from
.gsm8k_eval
import
evaluate_gsm8k
R
TOL
=
0.08
#
Relativ
e tolerance for accuracy comparison
TOL
=
0.08
#
Absolut
e tolerance for accuracy comparison
def
launch
_gsm8k_eval
(
eval_config
,
server_url
,
tp_size
)
:
"""
Launch
GSM8K evaluation using our isolated script."""
def
run
_gsm8k_eval
(
eval_config
:
dict
,
server_url
:
str
)
->
dict
:
"""
Run
GSM8K evaluation using our isolated script."""
# Extract host and port from server URL
if
"://"
in
server_url
:
server_url
=
server_url
.
split
(
"://"
)[
1
]
host_port
=
server_url
.
split
(
"/"
)[
0
]
# Remove path if present
if
":"
in
host_port
:
host
,
p
ort
=
host_port
.
split
(
":"
)
port
=
int
(
p
ort
)
host
,
p
=
host_port
.
split
(
":"
)
port
=
int
(
p
)
else
:
host
=
host_port
port
=
8000
...
...
@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
return
results
def
test_gsm8k_correctness
_param
(
config_filename
,
tp_size
):
def
test_gsm8k_correctness
(
config_filename
):
"""Test GSM8K correctness for a given model configuration."""
eval_config
=
yaml
.
safe_load
(
config_filename
.
read_text
(
encoding
=
"utf-8"
))
# Server arguments
server_args
=
[
"--max-model-len"
,
str
(
eval_config
.
get
(
"max_model_len"
,
4096
)),
"--enforce-eager"
,
"--trust-remote-code"
,
"--tensor-parallel-size"
,
str
(
tp_size
),
]
# Parse server arguments from config (use shlex to handle quoted strings)
server_args_str
=
eval_config
.
get
(
"server_args"
,
""
)
server_args
=
shlex
.
split
(
server_args_str
)
if
server_args_str
else
[]
# Add standard server arguments
server_args
.
extend
(
[
"--trust-remote-code"
,
]
)
env_dict
=
eval_config
.
get
(
"env"
,
None
)
print
(
f
"Starting GSM8K evaluation for model:
{
eval_config
[
'model_name'
]
}
"
)
print
(
f
"Expected metric threshold:
{
eval_config
[
'accuracy_threshold'
]
}
"
)
print
(
f
"Number of questions:
{
eval_config
[
'num_questions'
]
}
"
)
print
(
f
"Number of few-shot examples:
{
eval_config
[
'num_fewshot'
]
}
"
)
print
(
f
"Server args:
{
' '
.
join
(
server_args
)
}
"
)
# Launch server and run evaluation
with
RemoteOpenAIServer
(
eval_config
[
"model_name"
],
server_args
,
env_dict
=
env_dict
,
max_wait_seconds
=
480
eval_config
[
"model_name"
],
server_args
,
env_dict
=
env_dict
,
max_wait_seconds
=
600
,
)
as
remote_server
:
server_url
=
remote_server
.
url_for
(
"v1"
)
print
(
f
"Server started at:
{
server_url
}
"
)
results
=
launch
_gsm8k_eval
(
eval_config
,
server_url
,
tp_size
)
results
=
run
_gsm8k_eval
(
eval_config
,
server_url
)
# Check accuracy against threshold
measured_accuracy
=
results
[
"accuracy"
]
expected_accuracy
=
eval_config
[
"accuracy_threshold"
]
measured_metric
=
results
[
"accuracy"
]
expected_metric
=
eval_config
[
"accuracy_threshold"
]
print
(
f
"GSM8K Results for
{
eval_config
[
'model_name'
]
}
:"
)
print
(
f
" Accuracy:
{
measured_accuracy
:.
3
f
}
"
)
print
(
f
" Expected:
{
expected_accuracy
:.
3
f
}
"
)
print
(
f
" Measured metric:
{
measured_metric
:.
4
f
}
"
)
print
(
f
" Expected metric:
{
expected_metric
:.
4
f
}
"
)
print
(
f
" Tolerance:
{
TOL
:.
4
f
}
"
)
print
(
f
" Questions:
{
results
[
'num_questions'
]
}
"
)
print
(
f
" Invalid rate:
{
results
[
'invalid_rate'
]:.
3
f
}
"
)
print
(
f
" Latency:
{
results
[
'latency'
]:.
1
f
}
s"
)
print
(
f
" QPS:
{
results
[
'questions_per_second'
]:.
1
f
}
"
)
# Verify
accuracy
is within tolerance
assert
measured_
accuracy
>=
expected_
accuracy
-
R
TOL
,
(
f
"
Accuracy
too low:
{
measured_
accuracy
:.
3
f
}
< "
f
"
{
expected_
accuracy
:.
3
f
}
-
{
R
TOL
:.
3
f
}
"
# Verify
metric
is within tolerance
assert
measured_
metric
>=
expected_
metric
-
TOL
,
(
f
"
GSM8K metric
too low:
{
measured_
metric
:.
4
f
}
< "
f
"
{
expected_
metric
:.
4
f
}
-
{
TOL
:.
4
f
}
=
{
expected_metric
-
TOL
:.
4
f
}
"
)
print
(
f
"✅ GSM8K test passed for
{
eval_config
[
'model_name'
]
}
"
)
tests/kernels/attention/test_attention.py
View file @
a810671a
...
...
@@ -9,7 +9,8 @@ import torch
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
tests.kernels.utils
import
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm.attention.layer
import
Attention
,
MultiHeadAttention
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.platforms
import
current_platform
from
vllm.utils.mem_utils
import
get_max_shared_memory_bytes
...
...
@@ -442,7 +443,7 @@ def ref_multi_query_kv_attention(
return
torch
.
cat
(
ref_outputs
,
dim
=
0
)
@
pytest
.
mark
.
parametrize
(
"attention_cls"
,
[
Attention
,
M
ultiHead
Attention
])
@
pytest
.
mark
.
parametrize
(
"attention_cls"
,
[
Attention
,
M
MEncoder
Attention
])
def
test_num_heads_not_divisble_by_num_kv_heads
(
attention_cls
:
type
)
->
None
:
head_size
=
64
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
...
...
tests/kernels/attention/test_attention_selector.py
View file @
a810671a
...
...
@@ -6,7 +6,9 @@ from unittest.mock import patch
import
pytest
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.config
import
AttentionConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.platforms
import
current_platform
from
vllm.platforms.cpu
import
CpuPlatform
from
vllm.platforms.cuda
import
CudaPlatform
...
...
@@ -73,18 +75,18 @@ def generate_params():
@
pytest
.
mark
.
parametrize
(
"device, name, use_mla, block_size"
,
generate_params
())
def
test_
env
(
def
test_
backend_selection
(
device
:
str
,
name
:
str
,
use_mla
:
bool
,
block_size
:
int
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
"""Test attention backend selection with valid device-backend pairs."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
name
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
if
use_mla
else
"0"
)
# Create AttentionConfig with the specified backend
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
[
name
]
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
if
device
==
"cpu"
:
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
block_size
)
...
...
@@ -217,27 +219,32 @@ def test_env(
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"cuda"
])
def
test_fp32_fallback
(
device
:
str
):
"""Test attention backend selection with fp32."""
if
device
==
"cpu"
:
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"CPU_ATTN"
# Use default config (no backend specified)
vllm_config
=
VllmConfig
()
elif
device
==
"cuda"
:
with
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"FLEX_ATTENTION"
with
set_current_vllm_config
(
vllm_config
):
if
device
==
"cpu"
:
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"CPU_ATTN"
elif
device
==
"cuda"
:
with
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"FLEX_ATTENTION"
def
test_flash_attn
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Test FlashAttn validation."""
pytest
.
skip
(
"Skipping as current backend selector does not "
"handle fallbacks when a backend is
set via env var
."
"handle fallbacks when a backend is
explicitly set
."
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLASH_ATTN"
)
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
FLASH_ATTN
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
# Unsupported CUDA arch
monkeypatch
.
setattr
(
torch
.
cuda
,
"get_device_capability"
,
lambda
_
=
None
:
(
7
,
5
))
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
16
)
...
...
@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
assert
backend
.
get_name
()
!=
"FLASH_ATTN"
def
test_invalid_
env
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_invalid_
backend
(
):
"""Test that invalid attention backend names raise ValueError."""
with
(
monkeypatch
.
context
()
as
m
,
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()),
pytest
.
raises
(
ValueError
),
):
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"INVALID"
)
# Should raise ValueError for invalid backend
with
pytest
.
raises
(
ValueError
)
as
exc_info
:
get_attn_backend
(
32
,
torch
.
float16
,
None
,
16
)
assert
"Invalid value 'INVALID'"
in
str
(
exc_info
.
value
)
# Invalid backend name should raise ValueError when creating enum
AttentionConfig
(
backend
=
AttentionBackendEnum
[
"INVALID"
])
tests/kernels/attention/test_flashinfer_trtllm_attention.py
View file @
a810671a
...
...
@@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline(
torch
.
testing
.
assert_close
(
output
,
output_trtllm
,
atol
=
atol
,
rtol
=
rtol
),
f
"
{
torch
.
max
(
torch
.
abs
(
output
-
output_trtllm
))
}
"
,
)
def test_trtllm_attention_rejects_num_kv_heads_1() -> None:
    """Check that the TRTLLM attention gate refuses MQA (num_kv_heads=1).

    With a single KV head the KV-cache strides degenerate
    (stride_heads == stride_batch). CUDA's cuTensorMapEncodeTiled then fails,
    because TMA descriptors cannot describe a degenerate 4D tensor that has
    singleton dimensions.

    The test therefore expects ``can_use_trtllm_attention`` to answer False
    for every num_kv_heads=1 configuration it is asked about.
    """
    from vllm.utils.flashinfer import can_use_trtllm_attention

    # MQA has to be refused no matter how many query heads are requested.
    for qo_heads in (64, 32):
        mqa_allowed = can_use_trtllm_attention(
            num_qo_heads=qo_heads, num_kv_heads=1
        )
        assert not mqa_allowed, (
            "can_use_trtllm_attention should return False for num_kv_heads=1"
        )

    # num_kv_heads > 1 may be accepted when the platform supports it.
    # A False answer on non-Blackwell platforms is fine as well.
    gqa_allowed = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=8)
    mqa_allowed = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1)

    # Whatever the platform, the two configurations must never both be
    # accepted: support for num_kv_heads=8 implies rejection of num_kv_heads=1.
    assert not (gqa_allowed and mqa_allowed), (
        "If TRTLLM is supported for num_kv_heads=8, "
        "it must be rejected for num_kv_heads=1"
    )
tests/kernels/attention/test_mha_attn.py
View file @
a810671a
...
...
@@ -3,16 +3,17 @@
"""
Test:
* Tests for M
ultiHead
Attention layer
* Tests for M
MEncoder
Attention layer
"""
import
itertools
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layer
import
M
ultiHead
Attention
from
vllm.attention.layer
s.mm_encoder_attention
import
M
MEncoder
Attention
from
vllm.attention.selector
import
_cached_get_attn_backend
from
vllm.platforms
import
current_platform
from
vllm.platforms.cpu
import
CpuPlatform
...
...
@@ -42,35 +43,31 @@ def test_mha_attn_platform(device: str):
if
device
==
"cpu"
:
with
(
patch
(
"vllm.attention.layer.current_platform"
,
CpuPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CpuPlatform
()),
):
attn
=
M
ultiHead
Attention
(
16
,
64
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
64
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
TORCH_SDPA
elif
device
==
"hip"
:
with
(
patch
(
"vllm.attention.layer.current_platform"
,
RocmPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
RocmPlatform
()),
):
attn
=
M
ultiHead
Attention
(
16
,
64
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
64
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
else
:
# Test CUDA with head_size=64 (divisible by 32)
# - should use vLLM's FlashAttention
with
(
patch
(
"vllm.attention.layer.current_platform"
,
CudaPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CudaPlatform
()),
):
attn
=
M
ultiHead
Attention
(
16
,
64
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
64
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
# Test CUDA with head_size=72 (not divisible by 32)
# - should use vLLM's FlashAttention
with
(
patch
(
"vllm.attention.layer.current_platform"
,
CudaPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CudaPlatform
()),
):
attn
=
M
ultiHead
Attention
(
16
,
72
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
72
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
...
...
@@ -94,6 +91,10 @@ def ref_attention(
BATCH_SIZES
=
[
1
,
16
]
SEQ_LENS
=
[
1
]
VAR_SEQ_LENS
=
[
[
2
,
2
],
[
2
,
3
,
4
],
]
NUM_HEADS
=
[
1
,
16
]
NUM_KV_HEADS
=
[
1
]
HEAD_SIZES
=
[
64
,
80
]
...
...
@@ -130,7 +131,7 @@ def test_mha_attn_forward(
k
=
torch
.
randn
(
batch_size
,
seq_len
,
num_kv_heads
*
head_size
)
v
=
torch
.
randn
(
batch_size
,
seq_len
,
num_kv_heads
*
head_size
)
scale
=
1.0
/
head_size
**
0.5
attn
=
M
ultiHead
Attention
(
attn
=
M
MEncoder
Attention
(
num_heads
,
head_size
,
scale
=
scale
,
num_kv_heads
=
num_kv_heads
)
output
=
attn
(
q
,
k
,
v
)
...
...
@@ -151,3 +152,58 @@ def test_mha_attn_forward(
scale
=
scale
,
).
reshape
(
batch_size
,
seq_len
,
num_heads
*
head_size
)
torch
.
testing
.
assert_close
(
output
,
ref_output
)
@pytest.mark.parametrize("var_seq_len", VAR_SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_mha_attn_varlen_forward(
    var_seq_len: list[int],
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    dtype: torch.dtype,
    device: str,
):
    """Compare MMEncoderAttention on packed variable-length input to a reference.

    All sequences are packed along dim 1 of a single batch entry and their
    boundaries are passed as cumulative offsets (``cu_seqlens``). The expected
    result is produced by running ``ref_attention`` on each sequence slice
    separately and concatenating the pieces back along dim 1.
    """
    current_platform.seed_everything(0)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    total_tokens = sum(var_seq_len)
    # Draw order (q, k, v) is kept so the seeded values stay the same.
    q = torch.randn(1, total_tokens, num_heads, head_size)
    k = torch.randn(1, total_tokens, num_kv_heads, head_size)
    v = torch.randn(1, total_tokens, num_kv_heads, head_size)

    # Cumulative boundaries: [0, len_0, len_0 + len_1, ...].
    seq_boundaries = list(itertools.accumulate(var_seq_len, initial=0))
    cu_seqlens = torch.tensor(seq_boundaries, dtype=torch.int32)
    longest_seq = torch.tensor(max(var_seq_len))

    scale = 1.0 / head_size**0.5
    attn = MMEncoderAttention(
        num_heads,
        head_size,
        scale=scale,
        num_kv_heads=num_kv_heads,
    )
    output = attn(q, k, v, cu_seqlens=cu_seqlens, max_seqlen=longest_seq)

    assert num_heads % num_kv_heads == 0
    kv_repeat = num_heads // num_kv_heads
    if kv_repeat > 1:
        # Expand the KV heads so the reference sees one KV head per query head.
        k = torch.repeat_interleave(k, kv_repeat, dim=2)
        v = torch.repeat_interleave(v, kv_repeat, dim=2)

    q_chunks = torch.split(q, var_seq_len, dim=1)
    k_chunks = torch.split(k, var_seq_len, dim=1)
    v_chunks = torch.split(v, var_seq_len, dim=1)
    per_seq_outputs = [
        ref_attention(q_seq, k_seq, v_seq, scale=scale)
        for q_seq, k_seq, v_seq in zip(q_chunks, k_chunks, v_chunks)
    ]
    ref_output = torch.cat(per_seq_outputs, dim=1)

    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
tests/kernels/attention/test_rocm_attention_selector.py
View file @
a810671a
...
...
@@ -4,7 +4,9 @@
import
pytest
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.config
import
AttentionConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.platforms.rocm
import
RocmPlatform
...
...
@@ -16,40 +18,56 @@ def clear_cache():
@
pytest
.
mark
.
skip
(
reason
=
"Skipped for now. Should be revisited."
)
def
test_selector
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_ATTN"
)
# Set the current platform to ROCm using monkeypatch
m
onkeypatch
.
setattr
(
"vllm.attention.selector.current_platform"
,
RocmPlatform
()
)
# Set the current platform to ROCm using monkeypatch
monkeypatch
.
setattr
(
"vllm.attention.selector.current_platform"
,
RocmPlatform
())
# Test standard ROCm attention
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
ROCM_ATTN
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
# Test standard ROCm attention
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
assert
backend
.
get_name
()
==
"ROCM_FLASH"
or
backend
.
get_name
()
==
"TRITON_ATTN"
# MLA test for deepseek related
# MLA test for deepseek related
# Change the attention backend to triton MLA
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
TRITON_MLA
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
# change the attention backend to triton MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_MLA"
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"TRITON_MLA"
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
""
)
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
attention_config
=
AttentionConfig
(
backend
=
None
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"TRITON_MLA"
# change the attention backend to AITER MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_AITER_MLA"
)
# Change the attention backend to AITER MLA
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
ROCM_AITER_MLA
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
""
)
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
attention_config
=
AttentionConfig
(
backend
=
None
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
tests/kernels/moe/test_cpu_fused_moe.py
0 → 100644
View file @
a810671a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
vllm._custom_ops
import
cpu_fused_moe
,
cpu_prepack_moe_weight
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
SwigluOAIAndMul
from
vllm.platforms
import
current_platform
if
not
current_platform
.
is_cpu
():
pytest
.
skip
(
"skipping CPU-only tests"
,
allow_module_level
=
True
)
EXPERT_NUM
=
[
8
,
]
HIDDEN_DIM
=
[
128
,
2880
]
INTERMEDIATE_DIM
=
[
128
,
2880
]
BATCH_SIZE
=
[
1
,
64
,
256
]
ACT
=
[
"silu"
,
"swigluoai"
]
USE_BIAS
=
[
True
,
False
]
ISA
=
[
"amx"
,
"vec"
]
if
torch
.
_C
.
_cpu
.
_is_amx_tile_supported
()
else
[
"vec"
]
DTYPE
=
[
torch
.
bfloat16
]
_CPU_MOE_ACT
=
{
"silu"
:
SiluAndMul
(),
"swigluoai"
:
SwigluOAIAndMul
(),
}
def ref_fused_moe(
    input: torch.Tensor,
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_bias: torch.Tensor | None,
    w2_bias: torch.Tensor | None,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
) -> torch.Tensor:
    """Reference (unfused) mixture-of-experts forward pass built from torch ops.

    Every (token, selected expert) pair is grouped by expert id, pushed
    through that expert's two linear layers in float32, scattered back to its
    original slot, scaled by ``topk_weights`` and summed per token.

    Args:
        input: Token activations; rows are selected with indices derived from
            ``topk_ids``.
        w13: Per-expert first-layer weights, indexed by expert on dim 0.
        w2: Per-expert second-layer weights, indexed by expert on dim 0.
        w13_bias: Optional per-expert bias for the first layer.
        w2_bias: Optional per-expert bias for the second layer.
        topk_weights: Weight applied to each selected expert's output.
        topk_ids: 2D tensor of selected expert ids per token.
        activation: Key into ``_CPU_MOE_ACT`` choosing the activation module.

    Returns:
        The combined expert outputs, cast to ``input.dtype``.
    """
    linear = torch.nn.functional.linear
    act_fn = _CPU_MOE_ACT[activation]
    num_experts = w13.size(0)
    experts_per_token = topk_ids.shape[1]

    # Count how many token copies were routed to each expert.
    routing_hits = topk_ids.new_zeros((topk_ids.shape[0], num_experts))
    routing_hits.scatter_(1, topk_ids.to(torch.int64), 1)
    expert_counts = routing_hits.sum(dim=0).tolist()

    # Sorting the flattened ids makes each expert's token copies contiguous.
    order = topk_ids.view(-1).argsort()
    grouped_tokens = input[order // experts_per_token]

    expert_results = []
    begin = 0
    for expert_id, count in enumerate(expert_counts):
        if count == 0:
            continue
        stop = begin + count

        tokens = grouped_tokens[begin:stop].float()
        bias13 = w13_bias[expert_id].float() if w13_bias is not None else None
        bias2 = w2_bias[expert_id].float() if w2_bias is not None else None

        hidden = linear(tokens, w13[expert_id].float(), bias13)
        # Note: to simulate the kernel implementation, the activation result
        # is rounded through the input dtype before the second projection.
        hidden = act_fn.forward_native(hidden).to(dtype=input.dtype).float()
        expert_results.append(linear(hidden, w2[expert_id].float(), bias2))

        begin = stop

    if len(expert_results):
        stacked = torch.cat(expert_results, dim=0)
    else:
        stacked = grouped_tokens.new_empty(0)

    # Undo the expert-sorted ordering so row j matches flattened slot j.
    restored = torch.empty_like(stacked)
    restored[order] = stacked

    weighted = restored.view(*topk_ids.shape, -1)
    weighted.mul_(topk_weights.unsqueeze(dim=-1))
    return weighted.sum(dim=1).type(input.dtype)
@pytest.mark.parametrize("batch_size", BATCH_SIZE)
@pytest.mark.parametrize("expert_num", EXPERT_NUM)
@pytest.mark.parametrize("hidden_size", HIDDEN_DIM)
@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_DIM)
@pytest.mark.parametrize("use_bias", USE_BIAS)
@pytest.mark.parametrize("dtype", DTYPE)
@pytest.mark.parametrize("act", ACT)
@pytest.mark.parametrize("isa", ISA)
def test_cpu_fused_moe(
    batch_size: int,
    expert_num: int,
    hidden_size: int,
    intermediate_size: int,
    use_bias: bool,
    dtype: torch.dtype,
    act: str,
    isa: str,
):
    """Check the CPU fused-MoE op against the pure-torch ``ref_fused_moe``.

    Random inputs, weights and optional biases are generated from a fixed
    seed, half of the experts (at least one) are selected per token via
    softmax + top-k, and the output of ``cpu_fused_moe`` on prepacked weights
    is compared with the reference using the default tolerances for the
    output dtype.
    """
    current_platform.seed_everything(0)

    topk_num = max(expert_num // 2, 1)
    up_dim = 2 * intermediate_size

    # Random draws stay in the original order so seeded values are unchanged.
    hidden_states = torch.randn((batch_size, hidden_size), dtype=dtype) / (
        0.5 * hidden_size**0.5
    )
    w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
        0.5 * hidden_size**0.5
    )
    w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
        0.5 * intermediate_size**0.5
    )
    router_logits = torch.randn((batch_size, expert_num), dtype=dtype)

    w13_bias = None
    w2_bias = None
    if use_bias:
        w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (
            0.5 * up_dim**0.5
        )
        w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
            0.5 * hidden_size**0.5
        )

    score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk_num)
    topk_ids = topk_ids.to(torch.int32)

    ref_output = ref_fused_moe(
        hidden_states,
        w13,
        w2,
        w13_bias,
        w2_bias,
        topk_weight,
        topk_ids,
        act,
    )

    packed_w13 = cpu_prepack_moe_weight(w13, isa)
    packed_w2 = cpu_prepack_moe_weight(w2, isa)
    output = cpu_fused_moe(
        hidden_states,
        packed_w13,
        packed_w2,
        w13_bias,
        w2_bias,
        topk_weight,
        topk_ids,
        act,
        isa,
    )

    atol, rtol = get_default_atol(output), get_default_rtol(output)
    # assert_close raises on mismatch with its own diagnostic. The previous
    # "(assert_close(...), f'...')" tuple only built its message after the
    # check had already passed and then discarded it, so it is dropped.
    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
tests/kernels/moe/test_grouped_topk.py
View file @
a810671a
...
...
@@ -9,8 +9,8 @@ import pytest
import
torch
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
GroupedTopk
,
fused_grouped_topk
,
grouped_topk
,
)
from
vllm.platforms
import
current_platform
...
...
@@ -50,15 +50,17 @@ def test_grouped_topk(
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_FUSED_MOE_GROUPED_TOPK"
,
"0"
)
baseline_topk_weights
,
baseline_topk_ids
=
grouped_topk
(
hidden_states
=
hidden_states
,
gating_output
=
gating_output
,
grouped_topk
=
GroupedTopk
(
topk
=
topk
,
renormalize
=
renormalize
,
num_expert_group
=
num_expert_group
,
topk_group
=
topk_group
,
scoring_func
=
scoring_func
,
routed_scaling_factor
=
routed_scaling_factor
,
)
baseline_topk_weights
,
baseline_topk_ids
=
grouped_topk
(
hidden_states
=
hidden_states
,
gating_output
=
gating_output
,
e_score_correction_bias
=
e_score_correction_bias
,
)
...
...
tests/kernels/test_flex_attention.py
View file @
a810671a
...
...
@@ -40,7 +40,7 @@ def set_seed(seed):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
def
test_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
...
...
@@ -57,35 +57,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
)
as
llm_flex
:
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
)
as
llm_flex
:
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
output_flex
,
...
...
@@ -99,7 +96,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
...
...
@@ -113,30 +110,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
# Run with default backend
with
(
monkeypatch
.
context
()
as
m
,
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_default
,
):
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_default
:
default_outputs
=
llm_default
.
embed
(
prompts
)
check_embeddings_close
(
...
...
tests/lora/test_gptoss_tp.py
View file @
a810671a
...
...
@@ -76,6 +76,8 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
8
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
2048
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
...
...
@@ -94,8 +96,10 @@ def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
enable_lora
=
True
,
max_loras
=
2
,
max_lora_rank
=
8
,
max_num_seqs
=
16
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
2048
,
tensor_parallel_size
=
2
,
gpu_memory_utilization
=
0.8
,
fully_sharded_loras
=
fully_sharded_loras
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
...
...
tests/lora/test_llama_tp.py
View file @
a810671a
...
...
@@ -76,11 +76,18 @@ def do_sample(
if
lora_id
else
None
,
)
# Print the outputs.
lora_request
=
LoRARequest
(
str
(
lora_id
),
lora_id
,
lora_path
)
if
lora_id
else
None
generated_texts
:
list
[
str
]
=
[]
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
# The output should include correct lora_request info
if
lora_request
is
not
None
:
assert
output
.
lora_request
.
lora_name
==
lora_request
.
lora_name
assert
output
.
lora_request
.
lora_int_id
==
lora_request
.
lora_int_id
assert
output
.
lora_request
.
lora_path
==
lora_request
.
lora_path
else
:
assert
output
.
lora_request
is
None
generated_texts
.
append
(
generated_text
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
return
generated_texts
...
...
tests/lora/test_utils.py
View file @
a810671a
...
...
@@ -3,7 +3,7 @@
from
collections
import
OrderedDict
from
typing
import
NamedTuple
from
unittest.mock
import
patch
from
unittest.mock
import
MagicMock
,
patch
import
pytest
from
huggingface_hub.utils
import
HfHubHTTPError
...
...
@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
# Hugging Face model identifier with download error
path
=
"org/repo"
mock_exist
.
return_value
=
False
mock_snapshot_download
.
side_effect
=
HfHubHTTPError
(
"failed to query model info"
)
mock_snapshot_download
.
side_effect
=
HfHubHTTPError
(
"failed to query model info"
,
response
=
MagicMock
(),
)
assert
get_adapter_absolute_path
(
path
)
==
path
tests/models/multimodal/generation/test_granite_speech.py
View file @
a810671a
...
...
@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
models
=
[
MODEL_NAME
]
@
pytest
.
fixture
(
autouse
=
True
)
def
set_attention_backend_for_rocm
(
monkeypatch
):
@
pytest
.
fixture
def
granite_speech_attention_config
():
"""Return attention config for Granite Speech tests on ROCm."""
if
current_platform
.
is_rocm
():
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
return
{
"backend"
:
"TRITON_ATTN"
}
return
None
def
run_test
(
...
...
@@ -53,6 +55,7 @@ def run_test(
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
str
|
None
=
None
,
attention_config
:
dict
|
None
=
None
,
):
"""Inference result should be the same between hf and vllm.
...
...
@@ -80,6 +83,7 @@ def run_test(
enable_lora
=
True
,
max_lora_rank
=
64
,
enforce_eager
=
True
,
attention_config
=
attention_config
,
)
as
vllm_model
:
lora_request
=
LoRARequest
(
"audio"
,
1
,
audio_lora_path
)
vllm_outputs_per_case
=
[
...
...
@@ -131,6 +135,7 @@ def test_models(
vllm_runner
,
model
:
str
,
audio_assets
:
AudioTestAssets
,
granite_speech_attention_config
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
...
...
@@ -157,4 +162,5 @@ def test_models(
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
attention_config
=
granite_speech_attention_config
,
)
tests/models/multimodal/pooling/conftest.py
View file @
a810671a
...
...
@@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import
os
import
warnings
import
pytest
from
vllm.platforms
import
current_platform
def
pytest_collection_modifyitems
(
config
,
items
):
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
if
not
current_platform
.
is_rocm
():
return
@
pytest
.
fixture
def
siglip_attention_config
():
"""Return attention config for SigLIP tests on ROCm.
siglip_tests
=
[
item
for
item
in
items
if
"test_siglip"
in
item
.
nodeid
]
if
siglip_tests
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"FLEX_ATTENTION"
warnings
.
warn
(
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests"
,
UserWarning
,
stacklevel
=
1
,
)
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if
current_platform
.
is_rocm
():
return
{
"backend"
:
"FLEX_ATTENTION"
}
return
None
tests/models/multimodal/pooling/test_siglip.py
View file @
a810671a
...
...
@@ -38,6 +38,7 @@ def _run_test(
*
,
dtype
:
str
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
attention_config
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
None
:
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
{}
...
...
@@ -49,6 +50,7 @@ def _run_test(
enforce_eager
=
True
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
attention_config
=
attention_config
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
...
...
@@ -90,6 +92,7 @@ def test_models_text(
hf_runner
,
vllm_runner
,
image_assets
,
siglip_attention_config
,
model
:
str
,
dtype
:
str
,
)
->
None
:
...
...
@@ -108,6 +111,7 @@ def test_models_text(
"padding"
:
"max_length"
,
"max_length"
:
64
,
},
# siglip2 was trained with this padding setting.
attention_config
=
siglip_attention_config
,
)
...
...
@@ -117,6 +121,7 @@ def test_models_image(
hf_runner
,
vllm_runner
,
image_assets
,
siglip_attention_config
,
model
:
str
,
dtype
:
str
,
)
->
None
:
...
...
@@ -133,6 +138,7 @@ def test_models_image(
input_images
,
model
,
dtype
=
dtype
,
attention_config
=
siglip_attention_config
,
)
...
...
@@ -141,6 +147,7 @@ def test_models_image(
def
test_models_text_image_no_crash
(
vllm_runner
,
image_assets
,
siglip_attention_config
,
model
:
str
,
dtype
:
str
,
)
->
None
:
...
...
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager
=
True
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
attention_config
=
siglip_attention_config
,
)
as
vllm_model
:
with
pytest
.
raises
(
ValueError
,
match
=
"not both"
):
vllm_model
.
embed
(
texts
,
images
=
images
)
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment