Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a810671a
Commit
a810671a
authored
Jan 08, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori
parents
86b5aefe
6a09612b
Changes
291
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
496 additions
and
173 deletions
+496
-173
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
+1
-2
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+12
-0
tests/evals/gsm8k/configs/models-blackwell.txt
tests/evals/gsm8k/configs/models-blackwell.txt
+1
-0
tests/evals/gsm8k/conftest.py
tests/evals/gsm8k/conftest.py
+3
-5
tests/evals/gsm8k/test_gsm8k_correctness.py
tests/evals/gsm8k/test_gsm8k_correctness.py
+41
-29
tests/kernels/attention/test_attention.py
tests/kernels/attention/test_attention.py
+3
-2
tests/kernels/attention/test_attention_selector.py
tests/kernels/attention/test_attention_selector.py
+27
-25
tests/kernels/attention/test_flashinfer_trtllm_attention.py
tests/kernels/attention/test_flashinfer_trtllm_attention.py
+35
-0
tests/kernels/attention/test_mha_attn.py
tests/kernels/attention/test_mha_attn.py
+67
-11
tests/kernels/attention/test_rocm_attention_selector.py
tests/kernels/attention/test_rocm_attention_selector.py
+39
-21
tests/kernels/moe/test_cpu_fused_moe.py
tests/kernels/moe/test_cpu_fused_moe.py
+172
-0
tests/kernels/moe/test_grouped_topk.py
tests/kernels/moe/test_grouped_topk.py
+6
-4
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+44
-51
tests/lora/test_gptoss_tp.py
tests/lora/test_gptoss_tp.py
+5
-1
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+8
-1
tests/lora/test_utils.py
tests/lora/test_utils.py
+5
-2
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_granite_speech.py
+9
-3
tests/models/multimodal/pooling/conftest.py
tests/models/multimodal/pooling/conftest.py
+9
-15
tests/models/multimodal/pooling/test_siglip.py
tests/models/multimodal/pooling/test_siglip.py
+8
-0
No files found.
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
View file @
a810671a
...
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
...
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold
:
0.375
accuracy_threshold
:
0.375
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
View file @
a810671a
...
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
...
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
accuracy_threshold
:
0.89
accuracy_threshold
:
0.89
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
0 → 100644
View file @
a810671a
model_name
:
"
nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
accuracy_threshold
:
0.75
num_questions
:
1319
num_fewshot
:
5
server_args
:
>-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
env
:
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
tests/evals/gsm8k/configs/models-blackwell.txt
View file @
a810671a
...
@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...
@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
tests/evals/gsm8k/conftest.py
View file @
a810671a
...
@@ -11,14 +11,12 @@ def pytest_addoption(parser):
...
@@ -11,14 +11,12 @@ def pytest_addoption(parser):
default
=
"configs/models-small.txt"
,
default
=
"configs/models-small.txt"
,
help
=
"File containing list of config files to test"
,
help
=
"File containing list of config files to test"
,
)
)
parser
.
addoption
(
"--tp-size"
,
default
=
1
,
type
=
int
,
help
=
"Tensor parallel size"
)
def
pytest_generate_tests
(
metafunc
):
def
pytest_generate_tests
(
metafunc
):
"""Generate test parameters from config files."""
"""Generate test parameters from config files."""
if
"config_filename"
in
metafunc
.
fixturenames
:
if
"config_filename"
in
metafunc
.
fixturenames
:
config_list_file
=
metafunc
.
config
.
getoption
(
"--config-list-file"
)
config_list_file
=
metafunc
.
config
.
getoption
(
"--config-list-file"
)
tp_size
=
metafunc
.
config
.
getoption
(
"--tp-size"
)
# Handle both relative and absolute paths
# Handle both relative and absolute paths
config_list_path
=
Path
(
config_list_file
)
config_list_path
=
Path
(
config_list_file
)
...
@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
...
@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
# Generate test parameters
# Generate test parameters
if
config_files
:
if
config_files
:
metafunc
.
parametrize
(
metafunc
.
parametrize
(
[
"config_filename"
,
"tp_size"
],
"config_filename"
,
[(
config_file
,
int
(
tp_size
))
for
config_file
in
config_files
]
,
config_files
,
ids
=
[
f
"
{
config_file
.
stem
}
-tp
{
tp_size
}
"
for
config_file
in
config_files
],
ids
=
[
config_file
.
stem
for
config_file
in
config_files
],
)
)
else
:
else
:
print
(
"No config files found, test will be skipped"
)
print
(
"No config files found, test will be skipped"
)
tests/evals/gsm8k/test_gsm8k_correctness.py
View file @
a810671a
...
@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
...
@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
Replacement for lm-eval-harness with better performance and control.
Replacement for lm-eval-harness with better performance and control.
Usage:
Usage:
pytest -s -v test_gsm8k_correctness.py
\
pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py
\
--config-list-file=configs/models-small.txt
\
--config-list-file=configs/models-small.txt
--tp-size=1
"""
"""
import
shlex
import
yaml
import
yaml
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
.gsm8k_eval
import
evaluate_gsm8k
from
.gsm8k_eval
import
evaluate_gsm8k
R
TOL
=
0.08
#
Relativ
e tolerance for accuracy comparison
TOL
=
0.08
#
Absolut
e tolerance for accuracy comparison
def
launch
_gsm8k_eval
(
eval_config
,
server_url
,
tp_size
)
:
def
run
_gsm8k_eval
(
eval_config
:
dict
,
server_url
:
str
)
->
dict
:
"""
Launch
GSM8K evaluation using our isolated script."""
"""
Run
GSM8K evaluation using our isolated script."""
# Extract host and port from server URL
# Extract host and port from server URL
if
"://"
in
server_url
:
if
"://"
in
server_url
:
server_url
=
server_url
.
split
(
"://"
)[
1
]
server_url
=
server_url
.
split
(
"://"
)[
1
]
host_port
=
server_url
.
split
(
"/"
)[
0
]
# Remove path if present
host_port
=
server_url
.
split
(
"/"
)[
0
]
# Remove path if present
if
":"
in
host_port
:
if
":"
in
host_port
:
host
,
p
ort
=
host_port
.
split
(
":"
)
host
,
p
=
host_port
.
split
(
":"
)
port
=
int
(
p
ort
)
port
=
int
(
p
)
else
:
else
:
host
=
host_port
host
=
host_port
port
=
8000
port
=
8000
...
@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
...
@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
return
results
return
results
def
test_gsm8k_correctness
_param
(
config_filename
,
tp_size
):
def
test_gsm8k_correctness
(
config_filename
):
"""Test GSM8K correctness for a given model configuration."""
"""Test GSM8K correctness for a given model configuration."""
eval_config
=
yaml
.
safe_load
(
config_filename
.
read_text
(
encoding
=
"utf-8"
))
eval_config
=
yaml
.
safe_load
(
config_filename
.
read_text
(
encoding
=
"utf-8"
))
# Server arguments
# Parse server arguments from config (use shlex to handle quoted strings)
server_args
=
[
server_args_str
=
eval_config
.
get
(
"server_args"
,
""
)
"--max-model-len"
,
server_args
=
shlex
.
split
(
server_args_str
)
if
server_args_str
else
[]
str
(
eval_config
.
get
(
"max_model_len"
,
4096
)),
"--enforce-eager"
,
# Add standard server arguments
"--trust-remote-code"
,
server_args
.
extend
(
"--tensor-parallel-size"
,
[
str
(
tp_size
),
"--trust-remote-code"
,
]
]
)
env_dict
=
eval_config
.
get
(
"env"
,
None
)
env_dict
=
eval_config
.
get
(
"env"
,
None
)
print
(
f
"Starting GSM8K evaluation for model:
{
eval_config
[
'model_name'
]
}
"
)
print
(
f
"Expected metric threshold:
{
eval_config
[
'accuracy_threshold'
]
}
"
)
print
(
f
"Number of questions:
{
eval_config
[
'num_questions'
]
}
"
)
print
(
f
"Number of few-shot examples:
{
eval_config
[
'num_fewshot'
]
}
"
)
print
(
f
"Server args:
{
' '
.
join
(
server_args
)
}
"
)
# Launch server and run evaluation
# Launch server and run evaluation
with
RemoteOpenAIServer
(
with
RemoteOpenAIServer
(
eval_config
[
"model_name"
],
server_args
,
env_dict
=
env_dict
,
max_wait_seconds
=
480
eval_config
[
"model_name"
],
server_args
,
env_dict
=
env_dict
,
max_wait_seconds
=
600
,
)
as
remote_server
:
)
as
remote_server
:
server_url
=
remote_server
.
url_for
(
"v1"
)
server_url
=
remote_server
.
url_for
(
"v1"
)
print
(
f
"Server started at:
{
server_url
}
"
)
results
=
launch
_gsm8k_eval
(
eval_config
,
server_url
,
tp_size
)
results
=
run
_gsm8k_eval
(
eval_config
,
server_url
)
# Check accuracy against threshold
measured_metric
=
results
[
"accuracy"
]
measured_accuracy
=
results
[
"accuracy"
]
expected_metric
=
eval_config
[
"accuracy_threshold"
]
expected_accuracy
=
eval_config
[
"accuracy_threshold"
]
print
(
f
"GSM8K Results for
{
eval_config
[
'model_name'
]
}
:"
)
print
(
f
"GSM8K Results for
{
eval_config
[
'model_name'
]
}
:"
)
print
(
f
" Accuracy:
{
measured_accuracy
:.
3
f
}
"
)
print
(
f
" Measured metric:
{
measured_metric
:.
4
f
}
"
)
print
(
f
" Expected:
{
expected_accuracy
:.
3
f
}
"
)
print
(
f
" Expected metric:
{
expected_metric
:.
4
f
}
"
)
print
(
f
" Tolerance:
{
TOL
:.
4
f
}
"
)
print
(
f
" Questions:
{
results
[
'num_questions'
]
}
"
)
print
(
f
" Questions:
{
results
[
'num_questions'
]
}
"
)
print
(
f
" Invalid rate:
{
results
[
'invalid_rate'
]:.
3
f
}
"
)
print
(
f
" Invalid rate:
{
results
[
'invalid_rate'
]:.
3
f
}
"
)
print
(
f
" Latency:
{
results
[
'latency'
]:.
1
f
}
s"
)
print
(
f
" Latency:
{
results
[
'latency'
]:.
1
f
}
s"
)
print
(
f
" QPS:
{
results
[
'questions_per_second'
]:.
1
f
}
"
)
print
(
f
" QPS:
{
results
[
'questions_per_second'
]:.
1
f
}
"
)
# Verify
accuracy
is within tolerance
# Verify
metric
is within tolerance
assert
measured_
accuracy
>=
expected_
accuracy
-
R
TOL
,
(
assert
measured_
metric
>=
expected_
metric
-
TOL
,
(
f
"
Accuracy
too low:
{
measured_
accuracy
:.
3
f
}
< "
f
"
GSM8K metric
too low:
{
measured_
metric
:.
4
f
}
< "
f
"
{
expected_
accuracy
:.
3
f
}
-
{
R
TOL
:.
3
f
}
"
f
"
{
expected_
metric
:.
4
f
}
-
{
TOL
:.
4
f
}
=
{
expected_metric
-
TOL
:.
4
f
}
"
)
)
print
(
f
"✅ GSM8K test passed for
{
eval_config
[
'model_name'
]
}
"
)
print
(
f
"✅ GSM8K test passed for
{
eval_config
[
'model_name'
]
}
"
)
tests/kernels/attention/test_attention.py
View file @
a810671a
...
@@ -9,7 +9,8 @@ import torch
...
@@ -9,7 +9,8 @@ import torch
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.attention.layer
import
Attention
,
MultiHeadAttention
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.mem_utils
import
get_max_shared_memory_bytes
from
vllm.utils.mem_utils
import
get_max_shared_memory_bytes
...
@@ -442,7 +443,7 @@ def ref_multi_query_kv_attention(
...
@@ -442,7 +443,7 @@ def ref_multi_query_kv_attention(
return
torch
.
cat
(
ref_outputs
,
dim
=
0
)
return
torch
.
cat
(
ref_outputs
,
dim
=
0
)
@
pytest
.
mark
.
parametrize
(
"attention_cls"
,
[
Attention
,
M
ultiHead
Attention
])
@
pytest
.
mark
.
parametrize
(
"attention_cls"
,
[
Attention
,
M
MEncoder
Attention
])
def
test_num_heads_not_divisble_by_num_kv_heads
(
attention_cls
:
type
)
->
None
:
def
test_num_heads_not_divisble_by_num_kv_heads
(
attention_cls
:
type
)
->
None
:
head_size
=
64
head_size
=
64
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
...
...
tests/kernels/attention/test_attention_selector.py
View file @
a810671a
...
@@ -6,7 +6,9 @@ from unittest.mock import patch
...
@@ -6,7 +6,9 @@ from unittest.mock import patch
import
pytest
import
pytest
import
torch
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.config
import
AttentionConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.platforms.cpu
import
CpuPlatform
from
vllm.platforms.cpu
import
CpuPlatform
from
vllm.platforms.cuda
import
CudaPlatform
from
vllm.platforms.cuda
import
CudaPlatform
...
@@ -73,18 +75,18 @@ def generate_params():
...
@@ -73,18 +75,18 @@ def generate_params():
@
pytest
.
mark
.
parametrize
(
"device, name, use_mla, block_size"
,
generate_params
())
@
pytest
.
mark
.
parametrize
(
"device, name, use_mla, block_size"
,
generate_params
())
def
test_
env
(
def
test_
backend_selection
(
device
:
str
,
device
:
str
,
name
:
str
,
name
:
str
,
use_mla
:
bool
,
use_mla
:
bool
,
block_size
:
int
,
block_size
:
int
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
):
"""Test attention backend selection with valid device-backend pairs."""
"""Test attention backend selection with valid device-backend pairs."""
with
monkeypatch
.
context
()
as
m
:
# Create AttentionConfig with the specified backend
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
name
)
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
[
name
]
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
if
use_mla
else
"0"
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
if
device
==
"cpu"
:
if
device
==
"cpu"
:
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
block_size
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
block_size
)
...
@@ -217,27 +219,32 @@ def test_env(
...
@@ -217,27 +219,32 @@ def test_env(
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"cuda"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"cuda"
])
def
test_fp32_fallback
(
device
:
str
):
def
test_fp32_fallback
(
device
:
str
):
"""Test attention backend selection with fp32."""
"""Test attention backend selection with fp32."""
if
device
==
"cpu"
:
# Use default config (no backend specified)
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
vllm_config
=
VllmConfig
()
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"CPU_ATTN"
elif
device
==
"cuda"
:
with
set_current_vllm_config
(
vllm_config
):
with
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()):
if
device
==
"cpu"
:
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
assert
backend
.
get_name
()
==
"FLEX_ATTENTION"
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"CPU_ATTN"
elif
device
==
"cuda"
:
with
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"FLEX_ATTENTION"
def
test_flash_attn
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_flash_attn
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Test FlashAttn validation."""
"""Test FlashAttn validation."""
pytest
.
skip
(
pytest
.
skip
(
"Skipping as current backend selector does not "
"Skipping as current backend selector does not "
"handle fallbacks when a backend is
set via env var
."
"handle fallbacks when a backend is
explicitly set
."
)
)
with
monkeypatch
.
context
()
as
m
:
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
FLASH_ATTN
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLASH_ATTN"
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
# Unsupported CUDA arch
# Unsupported CUDA arch
monkeypatch
.
setattr
(
torch
.
cuda
,
"get_device_capability"
,
lambda
_
=
None
:
(
7
,
5
))
monkeypatch
.
setattr
(
torch
.
cuda
,
"get_device_capability"
,
lambda
_
=
None
:
(
7
,
5
))
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
16
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
16
)
...
@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
...
@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
assert
backend
.
get_name
()
!=
"FLASH_ATTN"
assert
backend
.
get_name
()
!=
"FLASH_ATTN"
def
test_invalid_
env
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_invalid_
backend
(
):
"""Test that invalid attention backend names raise ValueError."""
"""Test that invalid attention backend names raise ValueError."""
with
(
with
(
monkeypatch
.
context
()
as
m
,
pytest
.
raises
(
ValueError
),
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()),
):
):
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"INVALID"
)
# Invalid backend name should raise ValueError when creating enum
AttentionConfig
(
backend
=
AttentionBackendEnum
[
"INVALID"
])
# Should raise ValueError for invalid backend
with
pytest
.
raises
(
ValueError
)
as
exc_info
:
get_attn_backend
(
32
,
torch
.
float16
,
None
,
16
)
assert
"Invalid value 'INVALID'"
in
str
(
exc_info
.
value
)
tests/kernels/attention/test_flashinfer_trtllm_attention.py
View file @
a810671a
...
@@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline(
...
@@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline(
torch
.
testing
.
assert_close
(
output
,
output_trtllm
,
atol
=
atol
,
rtol
=
rtol
),
torch
.
testing
.
assert_close
(
output
,
output_trtllm
,
atol
=
atol
,
rtol
=
rtol
),
f
"
{
torch
.
max
(
torch
.
abs
(
output
-
output_trtllm
))
}
"
,
f
"
{
torch
.
max
(
torch
.
abs
(
output
-
output_trtllm
))
}
"
,
)
)
def
test_trtllm_attention_rejects_num_kv_heads_1
()
->
None
:
"""Test that TRTLLM attention correctly rejects num_kv_heads=1.
When num_kv_heads=1 (MQA), the KV cache strides become degenerate
(stride_heads == stride_batch), which causes CUDA's cuTensorMapEncodeTiled
to fail because TMA descriptors cannot handle degenerate 4D tensors with
singleton dimensions.
This test verifies that can_use_trtllm_attention returns False for
num_kv_heads=1 configurations.
"""
from
vllm.utils.flashinfer
import
can_use_trtllm_attention
# num_kv_heads=1 should be rejected
assert
not
can_use_trtllm_attention
(
num_qo_heads
=
64
,
num_kv_heads
=
1
),
(
"can_use_trtllm_attention should return False for num_kv_heads=1"
)
assert
not
can_use_trtllm_attention
(
num_qo_heads
=
32
,
num_kv_heads
=
1
),
(
"can_use_trtllm_attention should return False for num_kv_heads=1"
)
# num_kv_heads > 1 should be accepted (if platform supports it)
# Note: This may return False on non-Blackwell platforms, which is fine
result_kv8
=
can_use_trtllm_attention
(
num_qo_heads
=
64
,
num_kv_heads
=
8
)
result_kv1
=
can_use_trtllm_attention
(
num_qo_heads
=
64
,
num_kv_heads
=
1
)
# Even if platform doesn't support TRTLLM, num_kv_heads=1 should never
# return True when num_kv_heads > 1 returns True
if
result_kv8
:
assert
not
result_kv1
,
(
"If TRTLLM is supported for num_kv_heads=8, "
"it must be rejected for num_kv_heads=1"
)
tests/kernels/attention/test_mha_attn.py
View file @
a810671a
...
@@ -3,16 +3,17 @@
...
@@ -3,16 +3,17 @@
"""
"""
Test:
Test:
* Tests for M
ultiHead
Attention layer
* Tests for M
MEncoder
Attention layer
"""
"""
import
itertools
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
pytest
import
pytest
import
torch
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layer
import
M
ultiHead
Attention
from
vllm.attention.layer
s.mm_encoder_attention
import
M
MEncoder
Attention
from
vllm.attention.selector
import
_cached_get_attn_backend
from
vllm.attention.selector
import
_cached_get_attn_backend
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.platforms.cpu
import
CpuPlatform
from
vllm.platforms.cpu
import
CpuPlatform
...
@@ -42,35 +43,31 @@ def test_mha_attn_platform(device: str):
...
@@ -42,35 +43,31 @@ def test_mha_attn_platform(device: str):
if
device
==
"cpu"
:
if
device
==
"cpu"
:
with
(
with
(
patch
(
"vllm.attention.layer.current_platform"
,
CpuPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CpuPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CpuPlatform
()),
):
):
attn
=
M
ultiHead
Attention
(
16
,
64
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
64
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
TORCH_SDPA
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
TORCH_SDPA
elif
device
==
"hip"
:
elif
device
==
"hip"
:
with
(
with
(
patch
(
"vllm.attention.layer.current_platform"
,
RocmPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
RocmPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
RocmPlatform
()),
):
):
attn
=
M
ultiHead
Attention
(
16
,
64
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
64
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
else
:
else
:
# Test CUDA with head_size=64 (divisible by 32)
# Test CUDA with head_size=64 (divisible by 32)
# - should use vLLM's FlashAttention
# - should use vLLM's FlashAttention
with
(
with
(
patch
(
"vllm.attention.layer.current_platform"
,
CudaPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CudaPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CudaPlatform
()),
):
):
attn
=
M
ultiHead
Attention
(
16
,
64
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
64
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
# Test CUDA with head_size=72 (not divisible by 32)
# Test CUDA with head_size=72 (not divisible by 32)
# - should use vLLM's FlashAttention
# - should use vLLM's FlashAttention
with
(
with
(
patch
(
"vllm.attention.layer.current_platform"
,
CudaPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CudaPlatform
()),
patch
(
"vllm.model_executor.models.vision.current_platform"
,
CudaPlatform
()),
):
):
attn
=
M
ultiHead
Attention
(
16
,
72
,
scale
=
1
)
attn
=
M
MEncoder
Attention
(
16
,
72
,
scale
=
1
)
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
assert
attn
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
...
@@ -94,6 +91,10 @@ def ref_attention(
...
@@ -94,6 +91,10 @@ def ref_attention(
BATCH_SIZES
=
[
1
,
16
]
BATCH_SIZES
=
[
1
,
16
]
SEQ_LENS
=
[
1
]
SEQ_LENS
=
[
1
]
VAR_SEQ_LENS
=
[
[
2
,
2
],
[
2
,
3
,
4
],
]
NUM_HEADS
=
[
1
,
16
]
NUM_HEADS
=
[
1
,
16
]
NUM_KV_HEADS
=
[
1
]
NUM_KV_HEADS
=
[
1
]
HEAD_SIZES
=
[
64
,
80
]
HEAD_SIZES
=
[
64
,
80
]
...
@@ -130,7 +131,7 @@ def test_mha_attn_forward(
...
@@ -130,7 +131,7 @@ def test_mha_attn_forward(
k
=
torch
.
randn
(
batch_size
,
seq_len
,
num_kv_heads
*
head_size
)
k
=
torch
.
randn
(
batch_size
,
seq_len
,
num_kv_heads
*
head_size
)
v
=
torch
.
randn
(
batch_size
,
seq_len
,
num_kv_heads
*
head_size
)
v
=
torch
.
randn
(
batch_size
,
seq_len
,
num_kv_heads
*
head_size
)
scale
=
1.0
/
head_size
**
0.5
scale
=
1.0
/
head_size
**
0.5
attn
=
M
ultiHead
Attention
(
attn
=
M
MEncoder
Attention
(
num_heads
,
head_size
,
scale
=
scale
,
num_kv_heads
=
num_kv_heads
num_heads
,
head_size
,
scale
=
scale
,
num_kv_heads
=
num_kv_heads
)
)
output
=
attn
(
q
,
k
,
v
)
output
=
attn
(
q
,
k
,
v
)
...
@@ -151,3 +152,58 @@ def test_mha_attn_forward(
...
@@ -151,3 +152,58 @@ def test_mha_attn_forward(
scale
=
scale
,
scale
=
scale
,
).
reshape
(
batch_size
,
seq_len
,
num_heads
*
head_size
)
).
reshape
(
batch_size
,
seq_len
,
num_heads
*
head_size
)
torch
.
testing
.
assert_close
(
output
,
ref_output
)
torch
.
testing
.
assert_close
(
output
,
ref_output
)
@
pytest
.
mark
.
parametrize
(
"var_seq_len"
,
VAR_SEQ_LENS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
@
pytest
.
mark
.
parametrize
(
"num_kv_heads"
,
NUM_KV_HEADS
)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_mha_attn_varlen_forward
(
var_seq_len
:
list
[
int
],
num_heads
:
int
,
num_kv_heads
:
int
,
head_size
:
int
,
dtype
:
torch
.
dtype
,
device
:
str
,
):
current_platform
.
seed_everything
(
0
)
torch
.
set_default_device
(
device
)
torch
.
set_default_dtype
(
dtype
)
q
=
torch
.
randn
(
1
,
sum
(
var_seq_len
),
num_heads
,
head_size
)
k
=
torch
.
randn
(
1
,
sum
(
var_seq_len
),
num_kv_heads
,
head_size
)
v
=
torch
.
randn
(
1
,
sum
(
var_seq_len
),
num_kv_heads
,
head_size
)
cu_seqlens
=
torch
.
tensor
(
[
0
]
+
list
(
itertools
.
accumulate
(
var_seq_len
)),
dtype
=
torch
.
int32
)
scale
=
1.0
/
head_size
**
0.5
attn
=
MMEncoderAttention
(
num_heads
,
head_size
,
scale
=
scale
,
num_kv_heads
=
num_kv_heads
)
output
=
attn
(
q
,
k
,
v
,
cu_seqlens
=
cu_seqlens
,
max_seqlen
=
torch
.
tensor
(
max
(
var_seq_len
))
)
assert
num_heads
%
num_kv_heads
==
0
num_queries_per_kv
=
num_heads
//
num_kv_heads
if
num_queries_per_kv
>
1
:
k
=
torch
.
repeat_interleave
(
k
,
num_queries_per_kv
,
dim
=
2
)
v
=
torch
.
repeat_interleave
(
v
,
num_queries_per_kv
,
dim
=
2
)
ref_output
=
[]
for
q_i
,
k_i
,
v_i
in
zip
(
torch
.
split
(
q
,
var_seq_len
,
dim
=
1
),
torch
.
split
(
k
,
var_seq_len
,
dim
=
1
),
torch
.
split
(
v
,
var_seq_len
,
dim
=
1
),
):
output_i
=
ref_attention
(
q_i
,
k_i
,
v_i
,
scale
=
scale
,
)
ref_output
.
append
(
output_i
)
ref_output
=
torch
.
cat
(
ref_output
,
dim
=
1
)
torch
.
testing
.
assert_close
(
output
,
ref_output
,
atol
=
1e-2
,
rtol
=
1e-2
)
tests/kernels/attention/test_rocm_attention_selector.py
View file @
a810671a
...
@@ -4,7 +4,9 @@
...
@@ -4,7 +4,9 @@
import
pytest
import
pytest
import
torch
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.config
import
AttentionConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.platforms.rocm
import
RocmPlatform
from
vllm.platforms.rocm
import
RocmPlatform
...
@@ -16,40 +18,56 @@ def clear_cache():
...
@@ -16,40 +18,56 @@ def clear_cache():
@
pytest
.
mark
.
skip
(
reason
=
"Skipped for now. Should be revisited."
)
@
pytest
.
mark
.
skip
(
reason
=
"Skipped for now. Should be revisited."
)
def
test_selector
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_selector
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
# Set the current platform to ROCm using monkeypatch
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_ATTN"
)
m
onkeypatch
.
setattr
(
"vllm.attention.selector.current_platform"
,
RocmPlatform
()
)
# Set the current platform to ROCm using monkeypatch
# Test standard ROCm attention
monkeypatch
.
setattr
(
"vllm.attention.selector.current_platform"
,
RocmPlatform
())
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
ROCM_ATTN
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
# Test standard ROCm attention
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
assert
backend
.
get_name
()
==
"ROCM_FLASH"
or
backend
.
get_name
()
==
"TRITON_ATTN"
assert
backend
.
get_name
()
==
"ROCM_FLASH"
or
backend
.
get_name
()
==
"TRITON_ATTN"
# MLA test for deepseek related
# MLA test for deepseek related
# Change the attention backend to triton MLA
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
TRITON_MLA
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
# change the attention backend to triton MLA
with
set_current_vllm_config
(
vllm_config
):
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_MLA"
)
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"TRITON_MLA"
assert
backend
.
get_name
()
==
"TRITON_MLA"
# If attention backend is None
# If attention backend is None
# If use_mla is true
# If use_mla is true
# The selected backend is triton MLA
# The selected backend is triton MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
""
)
attention_config
=
AttentionConfig
(
backend
=
None
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"TRITON_MLA"
assert
backend
.
get_name
()
==
"TRITON_MLA"
# change the attention backend to AITER MLA
# Change the attention backend to AITER MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_AITER_MLA"
)
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
ROCM_AITER_MLA
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
# If attention backend is None
# If attention backend is None
# If use_mla is true
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
# The selected backend is ROCM_AITER_MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
""
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
m
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
attention_config
=
AttentionConfig
(
backend
=
None
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
tests/kernels/moe/test_cpu_fused_moe.py
0 → 100644
View file @
a810671a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
vllm._custom_ops
import
cpu_fused_moe
,
cpu_prepack_moe_weight
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
SwigluOAIAndMul
from
vllm.platforms
import
current_platform
if
not
current_platform
.
is_cpu
():
pytest
.
skip
(
"skipping CPU-only tests"
,
allow_module_level
=
True
)
EXPERT_NUM
=
[
8
,
]
HIDDEN_DIM
=
[
128
,
2880
]
INTERMEDIATE_DIM
=
[
128
,
2880
]
BATCH_SIZE
=
[
1
,
64
,
256
]
ACT
=
[
"silu"
,
"swigluoai"
]
USE_BIAS
=
[
True
,
False
]
ISA
=
[
"amx"
,
"vec"
]
if
torch
.
_C
.
_cpu
.
_is_amx_tile_supported
()
else
[
"vec"
]
DTYPE
=
[
torch
.
bfloat16
]
_CPU_MOE_ACT
=
{
"silu"
:
SiluAndMul
(),
"swigluoai"
:
SwigluOAIAndMul
(),
}
def ref_fused_moe(
    input: torch.Tensor,
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_bias: torch.Tensor | None,
    w2_bias: torch.Tensor | None,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
) -> torch.Tensor:
    """Unfused PyTorch reference for the fused mixture-of-experts kernel.

    Token rows are gathered into per-expert groups, every non-empty group is
    pushed through that expert's two linear layers in float32, and the
    per-(token, slot) results are scattered back, scaled by ``topk_weights``
    and summed over the top-k axis.

    Args:
        input: Token activations; rows are selected along dim 0.
        w13: First-layer weights, indexed by expert along dim 0.
        w2: Second-layer weights, indexed by expert along dim 0.
        w13_bias: Optional first-layer bias, indexed by expert, or ``None``.
        w2_bias: Optional second-layer bias, indexed by expert, or ``None``.
        topk_weights: Routing weights, broadcast against the per-slot outputs
            (presumably the same shape as ``topk_ids`` -- confirm with caller).
        topk_ids: 2-D tensor of selected expert ids per token.
        activation: Key into ``_CPU_MOE_ACT`` selecting the activation module.

    Returns:
        The combined expert output, cast back to ``input.dtype``.
    """
    num_experts = w13.size(0)
    top_k = topk_ids.shape[1]
    linear = torch.nn.functional.linear

    # Mark every (token, expert) pair that was selected, then count how many
    # rows each expert has to process.
    selected = topk_ids.new_zeros((topk_ids.shape[0], num_experts))
    selected.scatter_(1, topk_ids.to(torch.int64), 1)
    rows_per_expert = selected.sum(dim=0).tolist()

    # Sorting the flattened ids groups the (token, slot) pairs by expert;
    # integer division by top_k maps a flat slot index back to its token row.
    order = topk_ids.view(-1).argsort()
    grouped_tokens = input[order // top_k]

    per_expert_results = []
    offset = 0
    for expert_id, row_count in enumerate(rows_per_expert):
        if row_count == 0:
            continue
        stop = offset + row_count

        expert_in = grouped_tokens[offset:stop].float()
        bias13 = None if w13_bias is None else w13_bias[expert_id].float()
        bias2 = None if w2_bias is None else w2_bias[expert_id].float()

        hidden = linear(expert_in, w13[expert_id].float(), bias13)
        # Note: to simulate the kernel implementation, the activation result
        # is rounded to the input dtype before being widened to float32 again.
        activated = _CPU_MOE_ACT[activation].forward_native(hidden)
        activated = activated.to(dtype=input.dtype).float()

        per_expert_results.append(linear(activated, w2[expert_id].float(), bias2))
        offset = stop

    if per_expert_results:
        stacked = torch.cat(per_expert_results, dim=0)
    else:
        stacked = grouped_tokens.new_empty(0)

    # Undo the expert-sorted ordering so row j again belongs to flat slot j.
    restored = torch.empty_like(stacked)
    restored[order] = stacked

    weighted = restored.view(*topk_ids.shape, -1).mul_(topk_weights.unsqueeze(dim=-1))
    return weighted.sum(dim=1).type(input.dtype)
@pytest.mark.parametrize("batch_size", BATCH_SIZE)
@pytest.mark.parametrize("expert_num", EXPERT_NUM)
@pytest.mark.parametrize("hidden_size", HIDDEN_DIM)
@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_DIM)
@pytest.mark.parametrize("use_bias", USE_BIAS)
@pytest.mark.parametrize("dtype", DTYPE)
@pytest.mark.parametrize("act", ACT)
@pytest.mark.parametrize("isa", ISA)
def test_cpu_fused_moe(
    batch_size: int,
    expert_num: int,
    hidden_size: int,
    intermediate_size: int,
    use_bias: bool,
    dtype: torch.dtype,
    act: str,
    isa: str,
):
    """Compare ``cpu_fused_moe`` against the PyTorch reference ``ref_fused_moe``.

    Random inputs, weights and (optionally) biases are generated from a fixed
    seed, routing is derived from a softmax over random router logits, and the
    kernel output on prepacked weights must match the reference within the
    default tolerances for the output dtype.
    """
    current_platform.seed_everything(0)

    topk_num = max(expert_num // 2, 1)
    # The first layer produces the gate and up projections side by side.
    up_dim = 2 * intermediate_size

    # NOTE: the order of the randn calls below is part of the seeded test
    # data; keep it stable.
    hidden_states = torch.randn((batch_size, hidden_size), dtype=dtype) / (
        0.5 * hidden_size**0.5
    )
    w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
        0.5 * hidden_size**0.5
    )
    w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
        0.5 * intermediate_size**0.5
    )
    router_logits = torch.randn((batch_size, expert_num), dtype=dtype)

    w13_bias = None
    w2_bias = None
    if use_bias:
        w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
        w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
            0.5 * hidden_size**0.5
        )

    # Routing: softmax in float32, keep the top-k experts per token.
    score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk_num)
    topk_ids = topk_ids.to(torch.int32)

    ref_output = ref_fused_moe(
        hidden_states,
        w13,
        w2,
        w13_bias,
        w2_bias,
        topk_weight,
        topk_ids,
        act,
    )

    # The kernel consumes weights prepacked for the selected ISA.
    packed_w13 = cpu_prepack_moe_weight(w13, isa)
    packed_w2 = cpu_prepack_moe_weight(w2, isa)
    output = cpu_fused_moe(
        hidden_states,
        packed_w13,
        packed_w2,
        w13_bias,
        w2_bias,
        topk_weight,
        topk_ids,
        act,
        isa,
    )

    atol, rtol = get_default_atol(output), get_default_rtol(output)
    # assert_close raises with the greatest absolute/relative difference in
    # its own message, so no extra message needs to be built here.
    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
tests/kernels/moe/test_grouped_topk.py
View file @
a810671a
...
@@ -9,8 +9,8 @@ import pytest
...
@@ -9,8 +9,8 @@ import pytest
import
torch
import
torch
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
GroupedTopk
,
fused_grouped_topk
,
fused_grouped_topk
,
grouped_topk
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -50,15 +50,17 @@ def test_grouped_topk(
...
@@ -50,15 +50,17 @@ def test_grouped_topk(
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_FUSED_MOE_GROUPED_TOPK"
,
"0"
)
m
.
setenv
(
"VLLM_USE_FUSED_MOE_GROUPED_TOPK"
,
"0"
)
baseline_topk_weights
,
baseline_topk_ids
=
grouped_topk
(
grouped_topk
=
GroupedTopk
(
hidden_states
=
hidden_states
,
gating_output
=
gating_output
,
topk
=
topk
,
topk
=
topk
,
renormalize
=
renormalize
,
renormalize
=
renormalize
,
num_expert_group
=
num_expert_group
,
num_expert_group
=
num_expert_group
,
topk_group
=
topk_group
,
topk_group
=
topk_group
,
scoring_func
=
scoring_func
,
scoring_func
=
scoring_func
,
routed_scaling_factor
=
routed_scaling_factor
,
routed_scaling_factor
=
routed_scaling_factor
,
)
baseline_topk_weights
,
baseline_topk_ids
=
grouped_topk
(
hidden_states
=
hidden_states
,
gating_output
=
gating_output
,
e_score_correction_bias
=
e_score_correction_bias
,
e_score_correction_bias
=
e_score_correction_bias
,
)
)
...
...
tests/kernels/test_flex_attention.py
View file @
a810671a
...
@@ -40,7 +40,7 @@ def set_seed(seed):
...
@@ -40,7 +40,7 @@ def set_seed(seed):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
)
def
test_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
This test compares the outputs from the FlexAttention backend with
...
@@ -57,35 +57,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -57,35 +57,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
]
# Run with flex attention
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
set_seed
(
seed
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
with
vllm_runner
(
model_name
,
set_seed
(
seed
)
runner
=
"generate"
,
with
vllm_runner
(
tensor_parallel_size
=
1
,
model_name
,
num_gpu_blocks_override
=
128
,
runner
=
"generate"
,
enforce_eager
=
True
,
tensor_parallel_size
=
1
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
num_gpu_blocks_override
=
128
,
)
as
llm_flex
:
enforce_eager
=
True
,
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
)
as
llm_flex
:
prompts
,
max_tokens
,
num_logprobs
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
)
prompts
,
max_tokens
,
num_logprobs
)
# Run with default backend
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
set_seed
(
seed
)
set_seed
(
seed
)
with
vllm_runner
(
with
vllm_runner
(
model_name
,
model_name
,
runner
=
"generate"
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
prompts
,
max_tokens
,
num_logprobs
)
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
output_flex
,
outputs_0_lst
=
output_flex
,
...
@@ -99,7 +96,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -99,7 +96,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
)
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
This test compares the outputs from the FlexAttention backend with
...
@@ -113,30 +110,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -113,30 +110,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
]
# Run with flex attention
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
with
vllm_runner
(
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
model_name
,
with
vllm_runner
(
runner
=
"pooling"
,
model_name
,
dtype
=
torch
.
bfloat16
,
runner
=
"pooling"
,
tensor_parallel_size
=
1
,
dtype
=
torch
.
bfloat16
,
max_model_len
=
100
,
tensor_parallel_size
=
1
,
enforce_eager
=
True
,
max_model_len
=
100
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
enforce_eager
=
True
,
)
as
llm_flex
:
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
flex_outputs
=
llm_flex
.
embed
(
prompts
)
# Run with default backend
# Run with default backend
with
(
with
vllm_runner
(
monkeypatch
.
context
()
as
m
,
model_name
,
vllm_runner
(
runner
=
"pooling"
,
model_name
,
dtype
=
torch
.
bfloat16
,
runner
=
"pooling"
,
tensor_parallel_size
=
1
,
dtype
=
torch
.
bfloat16
,
max_model_len
=
100
,
tensor_parallel_size
=
1
,
enforce_eager
=
True
,
max_model_len
=
100
,
)
as
llm_default
:
enforce_eager
=
True
,
)
as
llm_default
,
):
default_outputs
=
llm_default
.
embed
(
prompts
)
default_outputs
=
llm_default
.
embed
(
prompts
)
check_embeddings_close
(
check_embeddings_close
(
...
...
tests/lora/test_gptoss_tp.py
View file @
a810671a
...
@@ -76,6 +76,8 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
...
@@ -76,6 +76,8 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
enable_lora
=
True
,
enable_lora
=
True
,
max_loras
=
4
,
max_loras
=
4
,
max_lora_rank
=
8
,
max_lora_rank
=
8
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
2048
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
cudagraph_specialize_lora
=
False
,
),
),
...
@@ -94,8 +96,10 @@ def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
...
@@ -94,8 +96,10 @@ def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
enable_lora
=
True
,
enable_lora
=
True
,
max_loras
=
2
,
max_loras
=
2
,
max_lora_rank
=
8
,
max_lora_rank
=
8
,
max_num_seqs
=
16
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
2048
,
tensor_parallel_size
=
2
,
tensor_parallel_size
=
2
,
gpu_memory_utilization
=
0.8
,
fully_sharded_loras
=
fully_sharded_loras
,
fully_sharded_loras
=
fully_sharded_loras
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
cudagraph_specialize_lora
=
False
,
...
...
tests/lora/test_llama_tp.py
View file @
a810671a
...
@@ -76,11 +76,18 @@ def do_sample(
...
@@ -76,11 +76,18 @@ def do_sample(
if
lora_id
if
lora_id
else
None
,
else
None
,
)
)
# Print the outputs.
lora_request
=
LoRARequest
(
str
(
lora_id
),
lora_id
,
lora_path
)
if
lora_id
else
None
generated_texts
:
list
[
str
]
=
[]
generated_texts
:
list
[
str
]
=
[]
for
output
in
outputs
:
for
output
in
outputs
:
prompt
=
output
.
prompt
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
generated_text
=
output
.
outputs
[
0
].
text
# The output should include correct lora_request info
if
lora_request
is
not
None
:
assert
output
.
lora_request
.
lora_name
==
lora_request
.
lora_name
assert
output
.
lora_request
.
lora_int_id
==
lora_request
.
lora_int_id
assert
output
.
lora_request
.
lora_path
==
lora_request
.
lora_path
else
:
assert
output
.
lora_request
is
None
generated_texts
.
append
(
generated_text
)
generated_texts
.
append
(
generated_text
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
return
generated_texts
return
generated_texts
...
...
tests/lora/test_utils.py
View file @
a810671a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
from
collections
import
OrderedDict
from
collections
import
OrderedDict
from
typing
import
NamedTuple
from
typing
import
NamedTuple
from
unittest.mock
import
patch
from
unittest.mock
import
MagicMock
,
patch
import
pytest
import
pytest
from
huggingface_hub.utils
import
HfHubHTTPError
from
huggingface_hub.utils
import
HfHubHTTPError
...
@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
...
@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
# Hugging Face model identifier with download error
# Hugging Face model identifier with download error
path
=
"org/repo"
path
=
"org/repo"
mock_exist
.
return_value
=
False
mock_exist
.
return_value
=
False
mock_snapshot_download
.
side_effect
=
HfHubHTTPError
(
"failed to query model info"
)
mock_snapshot_download
.
side_effect
=
HfHubHTTPError
(
"failed to query model info"
,
response
=
MagicMock
(),
)
assert
get_adapter_absolute_path
(
path
)
==
path
assert
get_adapter_absolute_path
(
path
)
==
path
tests/models/multimodal/generation/test_granite_speech.py
View file @
a810671a
...
@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
...
@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
models
=
[
MODEL_NAME
]
models
=
[
MODEL_NAME
]
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
def
set_attention_backend_for_rocm
(
monkeypatch
):
def
granite_speech_attention_config
():
"""Return attention config for Granite Speech tests on ROCm."""
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
return
{
"backend"
:
"TRITON_ATTN"
}
return
None
def
run_test
(
def
run_test
(
...
@@ -53,6 +55,7 @@ def run_test(
...
@@ -53,6 +55,7 @@ def run_test(
num_logprobs
:
int
,
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
str
|
None
=
None
,
distributed_executor_backend
:
str
|
None
=
None
,
attention_config
:
dict
|
None
=
None
,
):
):
"""Inference result should be the same between hf and vllm.
"""Inference result should be the same between hf and vllm.
...
@@ -80,6 +83,7 @@ def run_test(
...
@@ -80,6 +83,7 @@ def run_test(
enable_lora
=
True
,
enable_lora
=
True
,
max_lora_rank
=
64
,
max_lora_rank
=
64
,
enforce_eager
=
True
,
enforce_eager
=
True
,
attention_config
=
attention_config
,
)
as
vllm_model
:
)
as
vllm_model
:
lora_request
=
LoRARequest
(
"audio"
,
1
,
audio_lora_path
)
lora_request
=
LoRARequest
(
"audio"
,
1
,
audio_lora_path
)
vllm_outputs_per_case
=
[
vllm_outputs_per_case
=
[
...
@@ -131,6 +135,7 @@ def test_models(
...
@@ -131,6 +135,7 @@ def test_models(
vllm_runner
,
vllm_runner
,
model
:
str
,
model
:
str
,
audio_assets
:
AudioTestAssets
,
audio_assets
:
AudioTestAssets
,
granite_speech_attention_config
,
dtype
:
str
,
dtype
:
str
,
max_model_len
:
int
,
max_model_len
:
int
,
max_tokens
:
int
,
max_tokens
:
int
,
...
@@ -157,4 +162,5 @@ def test_models(
...
@@ -157,4 +162,5 @@ def test_models(
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
attention_config
=
granite_speech_attention_config
,
)
)
tests/models/multimodal/pooling/conftest.py
View file @
a810671a
...
@@ -2,23 +2,17 @@
...
@@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
"""Pytest configuration for vLLM pooling tests."""
import
os
import
pytest
import
warnings
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
def
pytest_collection_modifyitems
(
config
,
items
):
@
pytest
.
fixture
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
def
siglip_attention_config
():
if
not
current_platform
.
is_rocm
():
"""Return attention config for SigLIP tests on ROCm.
return
siglip_tests
=
[
item
for
item
in
items
if
"test_siglip"
in
item
.
nodeid
]
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if
siglip_tests
:
if
current_platform
.
is_rocm
():
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"FLEX_ATTENTION"
return
{
"backend"
:
"FLEX_ATTENTION"
}
warnings
.
warn
(
return
None
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests"
,
UserWarning
,
stacklevel
=
1
,
)
tests/models/multimodal/pooling/test_siglip.py
View file @
a810671a
...
@@ -38,6 +38,7 @@ def _run_test(
...
@@ -38,6 +38,7 @@ def _run_test(
*
,
*
,
dtype
:
str
,
dtype
:
str
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
attention_config
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
None
:
)
->
None
:
if
tokenization_kwargs
is
None
:
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
{}
tokenization_kwargs
=
{}
...
@@ -49,6 +50,7 @@ def _run_test(
...
@@ -49,6 +50,7 @@ def _run_test(
enforce_eager
=
True
,
enforce_eager
=
True
,
max_model_len
=
64
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
attention_config
=
attention_config
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
...
@@ -90,6 +92,7 @@ def test_models_text(
...
@@ -90,6 +92,7 @@ def test_models_text(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
image_assets
,
image_assets
,
siglip_attention_config
,
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
...
@@ -108,6 +111,7 @@ def test_models_text(
...
@@ -108,6 +111,7 @@ def test_models_text(
"padding"
:
"max_length"
,
"padding"
:
"max_length"
,
"max_length"
:
64
,
"max_length"
:
64
,
},
# siglip2 was trained with this padding setting.
},
# siglip2 was trained with this padding setting.
attention_config
=
siglip_attention_config
,
)
)
...
@@ -117,6 +121,7 @@ def test_models_image(
...
@@ -117,6 +121,7 @@ def test_models_image(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
image_assets
,
image_assets
,
siglip_attention_config
,
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
...
@@ -133,6 +138,7 @@ def test_models_image(
...
@@ -133,6 +138,7 @@ def test_models_image(
input_images
,
input_images
,
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
attention_config
=
siglip_attention_config
,
)
)
...
@@ -141,6 +147,7 @@ def test_models_image(
...
@@ -141,6 +147,7 @@ def test_models_image(
def
test_models_text_image_no_crash
(
def
test_models_text_image_no_crash
(
vllm_runner
,
vllm_runner
,
image_assets
,
image_assets
,
siglip_attention_config
,
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
...
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
...
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager
=
True
,
enforce_eager
=
True
,
max_model_len
=
64
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
attention_config
=
siglip_attention_config
,
)
as
vllm_model
:
)
as
vllm_model
:
with
pytest
.
raises
(
ValueError
,
match
=
"not both"
):
with
pytest
.
raises
(
ValueError
,
match
=
"not both"
):
vllm_model
.
embed
(
texts
,
images
=
images
)
vllm_model
.
embed
(
texts
,
images
=
images
)
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment