Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7eb6cb6c
Unverified
Commit
7eb6cb6c
authored
Dec 17, 2025
by
Matthew Bonanni
Committed by
GitHub
Dec 17, 2025
Browse files
[Attention] Update tests to remove deprecated env vars (#30563)
Signed-off-by:
Matthew Bonanni
<
mbonanni@redhat.com
>
parent
9ca8cb38
Changes
34
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
415 additions
and
290 deletions
+415
-290
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+1
-1
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+41
-48
tests/compile/distributed/test_fusions_e2e.py
tests/compile/distributed/test_fusions_e2e.py
+6
-3
tests/compile/fullgraph/test_basic_correctness.py
tests/compile/fullgraph/test_basic_correctness.py
+40
-42
tests/compile/fullgraph/test_full_cudagraph.py
tests/compile/fullgraph/test_full_cudagraph.py
+3
-10
tests/compile/fullgraph/test_full_graph.py
tests/compile/fullgraph/test_full_graph.py
+3
-4
tests/distributed/test_context_parallel.py
tests/distributed/test_context_parallel.py
+1
-3
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pp_cudagraph.py
+12
-14
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+134
-1
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+4
-9
tests/kernels/attention/test_attention_selector.py
tests/kernels/attention/test_attention_selector.py
+27
-25
tests/kernels/attention/test_rocm_attention_selector.py
tests/kernels/attention/test_rocm_attention_selector.py
+39
-21
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+44
-51
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_granite_speech.py
+9
-3
tests/models/multimodal/pooling/conftest.py
tests/models/multimodal/pooling/conftest.py
+9
-15
tests/models/multimodal/pooling/test_siglip.py
tests/models/multimodal/pooling/test_siglip.py
+8
-0
tests/models/quantization/test_fp8.py
tests/models/quantization/test_fp8.py
+2
-1
tests/models/test_initialization.py
tests/models/test_initialization.py
+7
-5
tests/v1/attention/test_rocm_attention_backends_selection.py
tests/v1/attention/test_rocm_attention_backends_selection.py
+3
-9
tests/v1/attention/utils.py
tests/v1/attention/utils.py
+22
-25
No files found.
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
7eb6cb6c
...
...
@@ -39,7 +39,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
--attention-backend=TRITON_ATTN
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
7eb6cb6c
...
...
@@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs(
@
pytest
.
mark
.
parametrize
(
"model_executor"
,
[
"uni"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"enable_prompt_embeds"
,
[
True
,
False
])
def
test_models
(
monkeypatch
:
pytest
.
MonkeyPatch
,
hf_runner
,
model
:
str
,
backend
:
str
,
...
...
@@ -77,48 +76,46 @@ def test_models(
model_executor
:
str
,
enable_prompt_embeds
:
bool
,
)
->
None
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt
=
(
"The following numbers of the sequence "
+
", "
.
join
(
str
(
i
)
for
i
in
range
(
1024
))
+
" are:"
)
example_prompts
=
[
prompt
]
with
hf_runner
(
model
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
if
enable_prompt_embeds
:
with
torch
.
no_grad
():
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
with
VllmRunner
(
model
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
,
async_scheduling
=
async_scheduling
,
distributed_executor_backend
=
model_executor
,
)
as
vllm_model
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
vllm_outputs
=
_fix_prompt_embed_outputs
(
vllm_outputs
,
hf_model
,
example_prompts
)
else
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt
=
(
"The following numbers of the sequence "
+
", "
.
join
(
str
(
i
)
for
i
in
range
(
1024
))
+
" are:"
)
example_prompts
=
[
prompt
]
with
hf_runner
(
model
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
if
enable_prompt_embeds
:
with
torch
.
no_grad
():
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
with
VllmRunner
(
model
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
,
async_scheduling
=
async_scheduling
,
distributed_executor_backend
=
model_executor
,
attention_config
=
{
"backend"
:
backend
},
)
as
vllm_model
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
vllm_outputs
=
_fix_prompt_embed_outputs
(
vllm_outputs
,
hf_model
,
example_prompts
)
else
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
multi_gpu_test
(
num_gpus
=
2
)
...
...
@@ -161,12 +158,6 @@ def test_models_distributed(
):
# noqa
pytest
.
skip
(
"enable_prompt_embeds does not work with ray compiled dag."
)
if
attention_backend
:
monkeypatch_context
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attention_backend
,
)
for
k
,
v
in
extra_env
.
items
():
monkeypatch_context
.
setenv
(
k
,
v
)
...
...
@@ -178,6 +169,7 @@ def test_models_distributed(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method
# (the default method).
attention_config
=
{
"backend"
:
attention_backend
}
if
attention_backend
else
None
with
vllm_runner
(
model
,
dtype
=
dtype
,
...
...
@@ -185,6 +177,7 @@ def test_models_distributed(
distributed_executor_backend
=
distributed_executor_backend
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
,
attention_config
=
attention_config
,
)
as
vllm_model
:
if
enable_prompt_embeds
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
...
...
tests/compile/distributed/test_fusions_e2e.py
View file @
7eb6cb6c
...
...
@@ -208,7 +208,8 @@ def test_attn_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
.
name
)
model_kwargs
[
"attention_config"
]
=
{
"backend"
:
backend
.
name
}
compilation_config
=
CompilationConfig
(
# Testing properties
...
...
@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
.
name
)
model_kwargs
[
"attention_config"
]
=
{
"backend"
:
backend
.
name
}
compilation_config
=
CompilationConfig
(
# Testing properties
...
...
@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
.
name
)
model_kwargs
[
"attention_config"
]
=
{
"backend"
:
backend
.
name
}
compilation_config
=
CompilationConfig
(
# Testing properties
...
...
tests/compile/fullgraph/test_basic_correctness.py
View file @
7eb6cb6c
...
...
@@ -89,7 +89,6 @@ class TestSetting:
],
)
def
test_compile_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
test_setting
:
TestSetting
,
):
# this test is run under multiple suits, with different GPUs.
...
...
@@ -107,49 +106,48 @@ def test_compile_correctness(
f
"
{
cuda_device_count_stateless
()
}
"
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
final_args
=
[
*
model_args
,
"-pp"
,
str
(
pp_size
),
"-tp"
,
str
(
tp_size
),
"-cc.cudagraph_mode=none"
,
]
final_args
=
[
*
model_args
,
"-pp"
,
str
(
pp_size
),
"-tp"
,
str
(
tp_size
),
"-cc.cudagraph_mode=none"
,
f
"--attention-backend=
{
attn_backend
}
"
,
]
all_args
:
list
[
list
[
str
]]
=
[]
all_envs
:
list
[
dict
[
str
,
str
]
|
None
]
=
[]
all_args
:
list
[
list
[
str
]]
=
[]
all_envs
:
list
[
dict
[
str
,
str
]
|
None
]
=
[]
for
comp_mode
in
[
CompilationMode
.
STOCK_TORCH_COMPILE
,
CompilationMode
.
DYNAMO_TRACE_ONCE
,
CompilationMode
.
VLLM_COMPILE
,
]:
for
mode
in
[
CompilationMode
.
NONE
,
comp_mode
]:
all_args
.
append
(
final_args
+
[
f
"-cc.mode=
{
mode
.
name
}
"
,
"-cc.backend=inductor"
]
)
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings
(
model
,
all_args
,
all_envs
,
method
=
method
if
method
!=
"generate"
else
"generate_close"
,
for
comp_mode
in
[
CompilationMode
.
STOCK_TORCH_COMPILE
,
CompilationMode
.
DYNAMO_TRACE_ONCE
,
CompilationMode
.
VLLM_COMPILE
,
]:
for
mode
in
[
CompilationMode
.
NONE
,
comp_mode
]:
all_args
.
append
(
final_args
+
[
f
"-cc.mode=
{
mode
.
name
}
"
,
"-cc.backend=inductor"
]
)
all_envs
.
clear
()
all_args
.
clear
()
for
mode
in
[
CompilationMode
.
NONE
,
CompilationMode
.
STOCK_TORCH_COMPILE
,
CompilationMode
.
DYNAMO_TRACE_ONCE
,
CompilationMode
.
VLLM_COMPILE
,
]:
all_args
.
append
(
final_args
+
[
f
"-cc.mode=
{
mode
.
name
}
"
,
"-cc.backend=eager"
])
all_envs
.
append
({})
all_envs
.
append
({})
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings
(
model
,
all_args
,
all_envs
,
method
=
method
if
method
!=
"generate"
else
"generate_close"
,
)
all_envs
.
clear
()
all_args
.
clear
()
for
mode
in
[
CompilationMode
.
NONE
,
CompilationMode
.
STOCK_TORCH_COMPILE
,
CompilationMode
.
DYNAMO_TRACE_ONCE
,
CompilationMode
.
VLLM_COMPILE
,
]:
all_args
.
append
(
final_args
+
[
f
"-cc.mode=
{
mode
.
name
}
"
,
"-cc.backend=eager"
])
all_envs
.
append
({})
all_envs
.
append
({})
compare_all_settings
(
model
,
all_args
*
3
,
all_envs
,
method
=
method
)
compare_all_settings
(
model
,
all_args
*
3
,
all_envs
,
method
=
method
)
tests/compile/fullgraph/test_full_cudagraph.py
View file @
7eb6cb6c
...
...
@@ -74,7 +74,6 @@ def llm_pair(request):
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER"
:
"0"
,
**
backend_config
.
env_vars
,
}
with
temporary_environ
(
env_vars
):
full
=
LLM
(
...
...
@@ -170,16 +169,10 @@ class TestFullCUDAGraph:
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"Skip if not cuda"
)
def
test_full_cudagraph_with_invalid_backend
():
with
(
temporary_environ
(
{
"VLLM_ATTENTION_BACKEND"
:
"FLEX_ATTENTION"
,
# Flex_Attention is not supported with full cuda graph
}
),
pytest
.
raises
(
RuntimeError
),
):
# Flex_Attention is not supported with full cuda graph
with
pytest
.
raises
(
RuntimeError
):
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
,
compilation_config
=
CompilationConfig
(
cudagraph_mode
=
"FULL"
),
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
)
tests/compile/fullgraph/test_full_graph.py
View file @
7eb6cb6c
...
...
@@ -197,20 +197,19 @@ def test_custom_compile_config(
],
)
def
test_fp8_kv_scale_compile
(
monkeypatch
:
pytest
.
MonkeyPatch
,
compilation_mode
:
int
,
model
:
str
,
backend
:
AttentionBackendEnum
|
None
,
):
if
backend
:
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
.
name
)
model_kwargs
=
{
"quantization"
:
"fp8"
,
"kv_cache_dtype"
:
"fp8_e4m3"
,
"calculate_kv_scales"
:
True
,
"max_model_len"
:
512
,
}
if
backend
:
model_kwargs
[
"attention_config"
]
=
{
"backend"
:
backend
.
name
}
run_model
(
compilation_mode
,
model
,
**
model_kwargs
)
...
...
tests/distributed/test_context_parallel.py
View file @
7eb6cb6c
...
...
@@ -219,14 +219,12 @@ def _test_cp_gsm8k(
]
)
server_env
=
{}
if
attn_backend
:
server_
env
[
"VLLM_ATTENTION_BACKEND"
]
=
attn_backend
server_
args
.
append
(
f
"--attention-backend=
{
attn_backend
}
"
)
with
RemoteOpenAIServer
(
model_id
,
server_args
,
env_dict
=
server_env
,
max_wait_seconds
=
720
,
)
as
remote_server
:
host
=
f
"http://
{
remote_server
.
host
}
"
...
...
tests/distributed/test_pp_cudagraph.py
View file @
7eb6cb6c
...
...
@@ -20,23 +20,21 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
)
@
create_new_process_for_each_test
()
def
test_pp_cudagraph
(
monkeypatch
:
pytest
.
MonkeyPatch
,
PP_SIZE
:
int
,
MODEL_NAME
:
str
,
ATTN_BACKEND
:
LiteralString
,
):
with
monkeypatch
.
context
()
as
m
:
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
]
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
ATTN_BACKEND
)
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
f
"--attention-backend=
{
ATTN_BACKEND
}
"
,
]
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
tests/engine/test_arg_utils.py
View file @
7eb6cb6c
...
...
@@ -9,7 +9,7 @@ from typing import Annotated, Literal
import
pytest
from
vllm.config
import
CompilationConfig
,
config
from
vllm.config
import
AttentionConfig
,
CompilationConfig
,
config
from
vllm.engine.arg_utils
import
(
EngineArgs
,
contains_type
,
...
...
@@ -298,6 +298,139 @@ def test_compilation_config():
)
def
test_attention_config
():
from
vllm.attention.backends.registry
import
AttentionBackendEnum
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
# default value
args
=
parser
.
parse_args
([])
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
==
AttentionConfig
()
# set backend via dot notation
args
=
parser
.
parse_args
([
"--attention-config.backend"
,
"FLASH_ATTN"
])
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
.
backend
is
not
None
assert
engine_args
.
attention_config
.
backend
.
name
==
"FLASH_ATTN"
# set backend via --attention-backend shorthand
args
=
parser
.
parse_args
([
"--attention-backend"
,
"FLASHINFER"
])
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_backend
is
not
None
assert
engine_args
.
attention_backend
==
"FLASHINFER"
# set all fields via dot notation
args
=
parser
.
parse_args
(
[
"--attention-config.backend"
,
"FLASH_ATTN"
,
"--attention-config.flash_attn_version"
,
"3"
,
"--attention-config.use_prefill_decode_attention"
,
"true"
,
"--attention-config.flash_attn_max_num_splits_for_cuda_graph"
,
"16"
,
"--attention-config.use_cudnn_prefill"
,
"true"
,
"--attention-config.use_trtllm_ragged_deepseek_prefill"
,
"true"
,
"--attention-config.use_trtllm_attention"
,
"true"
,
"--attention-config.disable_flashinfer_prefill"
,
"true"
,
"--attention-config.disable_flashinfer_q_quantization"
,
"true"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
.
backend
is
not
None
assert
engine_args
.
attention_config
.
backend
.
name
==
"FLASH_ATTN"
assert
engine_args
.
attention_config
.
flash_attn_version
==
3
assert
engine_args
.
attention_config
.
use_prefill_decode_attention
is
True
assert
engine_args
.
attention_config
.
flash_attn_max_num_splits_for_cuda_graph
==
16
assert
engine_args
.
attention_config
.
use_cudnn_prefill
is
True
assert
engine_args
.
attention_config
.
use_trtllm_ragged_deepseek_prefill
is
True
assert
engine_args
.
attention_config
.
use_trtllm_attention
is
True
assert
engine_args
.
attention_config
.
disable_flashinfer_prefill
is
True
assert
engine_args
.
attention_config
.
disable_flashinfer_q_quantization
is
True
# set to string form of a dict with all fields
args
=
parser
.
parse_args
(
[
"--attention-config="
'{"backend": "FLASHINFER", "flash_attn_version": 2, '
'"use_prefill_decode_attention": false, '
'"flash_attn_max_num_splits_for_cuda_graph": 8, '
'"use_cudnn_prefill": false, '
'"use_trtllm_ragged_deepseek_prefill": false, '
'"use_trtllm_attention": false, '
'"disable_flashinfer_prefill": false, '
'"disable_flashinfer_q_quantization": false}'
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
.
backend
is
not
None
assert
engine_args
.
attention_config
.
backend
.
name
==
"FLASHINFER"
assert
engine_args
.
attention_config
.
flash_attn_version
==
2
assert
engine_args
.
attention_config
.
use_prefill_decode_attention
is
False
assert
engine_args
.
attention_config
.
flash_attn_max_num_splits_for_cuda_graph
==
8
assert
engine_args
.
attention_config
.
use_cudnn_prefill
is
False
assert
engine_args
.
attention_config
.
use_trtllm_ragged_deepseek_prefill
is
False
assert
engine_args
.
attention_config
.
use_trtllm_attention
is
False
assert
engine_args
.
attention_config
.
disable_flashinfer_prefill
is
False
assert
engine_args
.
attention_config
.
disable_flashinfer_q_quantization
is
False
# test --attention-backend flows into VllmConfig.attention_config
args
=
parser
.
parse_args
(
[
"--model"
,
"facebook/opt-125m"
,
"--attention-backend"
,
"FLASH_ATTN"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
vllm_config
=
engine_args
.
create_engine_config
()
assert
vllm_config
.
attention_config
.
backend
==
AttentionBackendEnum
.
FLASH_ATTN
# test --attention-config.backend flows into VllmConfig.attention_config
args
=
parser
.
parse_args
(
[
"--model"
,
"facebook/opt-125m"
,
"--attention-config.backend"
,
"FLASHINFER"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
vllm_config
=
engine_args
.
create_engine_config
()
assert
vllm_config
.
attention_config
.
backend
==
AttentionBackendEnum
.
FLASHINFER
# test --attention-backend and --attention-config.backend are mutually exclusive
args
=
parser
.
parse_args
(
[
"--model"
,
"facebook/opt-125m"
,
"--attention-backend"
,
"FLASH_ATTN"
,
"--attention-config.backend"
,
"FLASHINFER"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
with
pytest
.
raises
(
ValueError
,
match
=
"mutually exclusive"
):
engine_args
.
create_engine_config
()
def
test_prefix_cache_default
():
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
args
=
parser
.
parse_args
([])
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
7eb6cb6c
...
...
@@ -76,15 +76,10 @@ def default_server_args(with_tool_parser: bool):
@
pytest
.
fixture
(
scope
=
"module"
)
def
gptoss_server
(
monkeypatch_module
:
pytest
.
MonkeyPatch
,
default_server_args
:
list
[
str
]
):
with
monkeypatch_module
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
def
gptoss_server
(
default_server_args
:
list
[
str
]):
server_args
=
default_server_args
+
[
"--attention-backend=TRITON_ATTN"
]
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
server_args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
...
...
tests/kernels/attention/test_attention_selector.py
View file @
7eb6cb6c
...
...
@@ -6,7 +6,9 @@ from unittest.mock import patch
import
pytest
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.config
import
AttentionConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.platforms
import
current_platform
from
vllm.platforms.cpu
import
CpuPlatform
from
vllm.platforms.cuda
import
CudaPlatform
...
...
@@ -73,18 +75,18 @@ def generate_params():
@
pytest
.
mark
.
parametrize
(
"device, name, use_mla, block_size"
,
generate_params
())
def
test_
env
(
def
test_
backend_selection
(
device
:
str
,
name
:
str
,
use_mla
:
bool
,
block_size
:
int
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
"""Test attention backend selection with valid device-backend pairs."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
name
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
if
use_mla
else
"0"
)
# Create AttentionConfig with the specified backend
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
[
name
]
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
if
device
==
"cpu"
:
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
block_size
)
...
...
@@ -217,27 +219,32 @@ def test_env(
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"cuda"
])
def
test_fp32_fallback
(
device
:
str
):
"""Test attention backend selection with fp32."""
if
device
==
"cpu"
:
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"CPU_ATTN"
# Use default config (no backend specified)
vllm_config
=
VllmConfig
()
elif
device
==
"cuda"
:
with
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"FLEX_ATTENTION"
with
set_current_vllm_config
(
vllm_config
):
if
device
==
"cpu"
:
with
patch
(
"vllm.platforms.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"CPU_ATTN"
elif
device
==
"cuda"
:
with
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
assert
backend
.
get_name
()
==
"FLEX_ATTENTION"
def
test_flash_attn
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Test FlashAttn validation."""
pytest
.
skip
(
"Skipping as current backend selector does not "
"handle fallbacks when a backend is
set via env var
."
"handle fallbacks when a backend is
explicitly set
."
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLASH_ATTN"
)
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
FLASH_ATTN
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
# Unsupported CUDA arch
monkeypatch
.
setattr
(
torch
.
cuda
,
"get_device_capability"
,
lambda
_
=
None
:
(
7
,
5
))
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
None
,
16
)
...
...
@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
assert
backend
.
get_name
()
!=
"FLASH_ATTN"
def
test_invalid_
env
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_invalid_
backend
(
):
"""Test that invalid attention backend names raise ValueError."""
with
(
monkeypatch
.
context
()
as
m
,
patch
(
"vllm.platforms.current_platform"
,
CudaPlatform
()),
pytest
.
raises
(
ValueError
),
):
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"INVALID"
)
# Should raise ValueError for invalid backend
with
pytest
.
raises
(
ValueError
)
as
exc_info
:
get_attn_backend
(
32
,
torch
.
float16
,
None
,
16
)
assert
"Invalid value 'INVALID'"
in
str
(
exc_info
.
value
)
# Invalid backend name should raise ValueError when creating enum
AttentionConfig
(
backend
=
AttentionBackendEnum
[
"INVALID"
])
tests/kernels/attention/test_rocm_attention_selector.py
View file @
7eb6cb6c
...
...
@@ -4,7 +4,9 @@
import
pytest
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.selector
import
_cached_get_attn_backend
,
get_attn_backend
from
vllm.config
import
AttentionConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.platforms.rocm
import
RocmPlatform
...
...
@@ -16,40 +18,56 @@ def clear_cache():
@
pytest
.
mark
.
skip
(
reason
=
"Skipped for now. Should be revisited."
)
def
test_selector
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_ATTN"
)
# Set the current platform to ROCm using monkeypatch
m
onkeypatch
.
setattr
(
"vllm.attention.selector.current_platform"
,
RocmPlatform
()
)
# Set the current platform to ROCm using monkeypatch
monkeypatch
.
setattr
(
"vllm.attention.selector.current_platform"
,
RocmPlatform
())
# Test standard ROCm attention
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
ROCM_ATTN
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
# Test standard ROCm attention
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
assert
backend
.
get_name
()
==
"ROCM_FLASH"
or
backend
.
get_name
()
==
"TRITON_ATTN"
# MLA test for deepseek related
# MLA test for deepseek related
# Change the attention backend to triton MLA
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
TRITON_MLA
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
# change the attention backend to triton MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_MLA"
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"TRITON_MLA"
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
""
)
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
attention_config
=
AttentionConfig
(
backend
=
None
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
16
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"TRITON_MLA"
# change the attention backend to AITER MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_AITER_MLA"
)
# Change the attention backend to AITER MLA
attention_config
=
AttentionConfig
(
backend
=
AttentionBackendEnum
.
ROCM_AITER_MLA
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
""
)
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
attention_config
=
AttentionConfig
(
backend
=
None
)
vllm_config
=
VllmConfig
(
attention_config
=
attention_config
)
with
set_current_vllm_config
(
vllm_config
):
backend
=
get_attn_backend
(
576
,
torch
.
bfloat16
,
"auto"
,
1
,
False
,
use_mla
=
True
)
assert
backend
.
get_name
()
==
"ROCM_AITER_MLA"
tests/kernels/test_flex_attention.py
View file @
7eb6cb6c
...
...
@@ -37,7 +37,7 @@ def set_seed(seed):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
def
test_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
...
...
@@ -54,35 +54,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
)
as
llm_flex
:
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
)
as
llm_flex
:
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
output_flex
,
...
...
@@ -96,7 +93,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
...
...
@@ -110,30 +107,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
# Run with default backend
with
(
monkeypatch
.
context
()
as
m
,
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_default
,
):
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_default
:
default_outputs
=
llm_default
.
embed
(
prompts
)
check_embeddings_close
(
...
...
tests/models/multimodal/generation/test_granite_speech.py
View file @
7eb6cb6c
...
...
@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
models
=
[
MODEL_NAME
]
@
pytest
.
fixture
(
autouse
=
True
)
def
set_attention_backend_for_rocm
(
monkeypatch
):
@
pytest
.
fixture
def
granite_speech_attention_config
():
"""Return attention config for Granite Speech tests on ROCm."""
if
current_platform
.
is_rocm
():
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
return
{
"backend"
:
"TRITON_ATTN"
}
return
None
def
run_test
(
...
...
@@ -53,6 +55,7 @@ def run_test(
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
str
|
None
=
None
,
attention_config
:
dict
|
None
=
None
,
):
"""Inference result should be the same between hf and vllm.
...
...
@@ -80,6 +83,7 @@ def run_test(
enable_lora
=
True
,
max_lora_rank
=
64
,
enforce_eager
=
True
,
attention_config
=
attention_config
,
)
as
vllm_model
:
lora_request
=
LoRARequest
(
"audio"
,
1
,
audio_lora_path
)
vllm_outputs_per_case
=
[
...
...
@@ -131,6 +135,7 @@ def test_models(
vllm_runner
,
model
:
str
,
audio_assets
:
AudioTestAssets
,
granite_speech_attention_config
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
...
...
@@ -157,4 +162,5 @@ def test_models(
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
attention_config
=
granite_speech_attention_config
,
)
tests/models/multimodal/pooling/conftest.py
View file @
7eb6cb6c
...
...
@@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import
os
import
warnings
import
pytest
from
vllm.platforms
import
current_platform
def
pytest_collection_modifyitems
(
config
,
items
):
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
if
not
current_platform
.
is_rocm
():
return
@
pytest
.
fixture
def
siglip_attention_config
():
"""Return attention config for SigLIP tests on ROCm.
siglip_tests
=
[
item
for
item
in
items
if
"test_siglip"
in
item
.
nodeid
]
if
siglip_tests
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"FLEX_ATTENTION"
warnings
.
warn
(
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests"
,
UserWarning
,
stacklevel
=
1
,
)
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if
current_platform
.
is_rocm
():
return
{
"backend"
:
"FLEX_ATTENTION"
}
return
None
tests/models/multimodal/pooling/test_siglip.py
View file @
7eb6cb6c
...
...
@@ -38,6 +38,7 @@ def _run_test(
*
,
dtype
:
str
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
attention_config
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
None
:
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
{}
...
...
@@ -49,6 +50,7 @@ def _run_test(
enforce_eager
=
True
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
attention_config
=
attention_config
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
...
...
@@ -90,6 +92,7 @@ def test_models_text(
hf_runner
,
vllm_runner
,
image_assets
,
siglip_attention_config
,
model
:
str
,
dtype
:
str
,
)
->
None
:
...
...
@@ -108,6 +111,7 @@ def test_models_text(
"padding"
:
"max_length"
,
"max_length"
:
64
,
},
# siglip2 was trained with this padding setting.
attention_config
=
siglip_attention_config
,
)
...
...
@@ -117,6 +121,7 @@ def test_models_image(
hf_runner
,
vllm_runner
,
image_assets
,
siglip_attention_config
,
model
:
str
,
dtype
:
str
,
)
->
None
:
...
...
@@ -133,6 +138,7 @@ def test_models_image(
input_images
,
model
,
dtype
=
dtype
,
attention_config
=
siglip_attention_config
,
)
...
...
@@ -141,6 +147,7 @@ def test_models_image(
def
test_models_text_image_no_crash
(
vllm_runner
,
image_assets
,
siglip_attention_config
,
model
:
str
,
dtype
:
str
,
)
->
None
:
...
...
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager
=
True
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
attention_config
=
siglip_attention_config
,
)
as
vllm_model
:
with
pytest
.
raises
(
ValueError
,
match
=
"not both"
):
vllm_model
.
embed
(
texts
,
images
=
images
)
...
...
tests/models/quantization/test_fp8.py
View file @
7eb6cb6c
...
...
@@ -75,7 +75,6 @@ def test_models(
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"TOKENIZERS_PARALLELISM"
,
"true"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
MAX_MODEL_LEN
=
1024
NUM_LOG_PROBS
=
8
...
...
@@ -86,6 +85,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
"auto"
,
attention_config
=
{
"backend"
:
backend
},
)
as
vllm_model
:
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
...
...
@@ -97,6 +97,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
kv_cache_dtype
,
attention_config
=
{
"backend"
:
backend
},
)
as
vllm_model
:
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
...
...
tests/models/test_initialization.py
View file @
7eb6cb6c
...
...
@@ -108,11 +108,12 @@ def can_initialize(
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v1
),
monkeypatch
.
context
()
as
m
,
):
if
model_arch
==
"GptOssForCausalLM"
:
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
attention_config
=
(
{
"backend"
:
"TRITON_ATTN"
}
if
model_arch
==
"GptOssForCausalLM"
else
None
)
if
model_arch
==
"WhisperForConditionalGeneration"
:
m
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
...
...
@@ -143,6 +144,7 @@ def can_initialize(
else
"vllm"
,
hf_overrides
=
hf_overrides_fn
,
max_num_seqs
=
model_info
.
max_num_seqs
,
attention_config
=
attention_config
,
)
...
...
tests/v1/attention/test_rocm_attention_backends_selection.py
View file @
7eb6cb6c
...
...
@@ -94,26 +94,20 @@ def mock_on_gfx9():
None
,
AttentionBackendEnum
.
ROCM_AITER_UNIFIED_ATTN
.
get_path
(),
),
# Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
(
{
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
:
"1"
},
None
,
AttentionBackendEnum
.
ROCM_ATTN
.
get_path
(),
),
# Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
# Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
(
{
"VLLM_ROCM_USE_AITER"
:
"1"
},
"TRITON_ATTN"
,
AttentionBackendEnum
.
TRITON_ATTN
.
get_path
(),
),
# Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# (explicitly disabled)
(
{
"VLLM_ROCM_USE_AITER"
:
"1"
,
"VLLM_ROCM_USE_AITER_MHA"
:
"0"
},
None
,
AttentionBackendEnum
.
TRITON_ATTN
.
get_path
(),
),
# Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
# Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
(
{
"VLLM_ROCM_USE_AITER"
:
"1"
},
"ROCM_ATTN"
,
...
...
tests/v1/attention/utils.py
View file @
7eb6cb6c
...
...
@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
@
dataclass
class
BackendConfig
:
name
:
str
env_vars
:
dict
comp_config
:
dict
# compilation config
attention_config
:
dict
comp_config
:
dict
specific_gpu_arch
:
tuple
|
None
=
None
...
...
@@ -259,10 +259,10 @@ full_cg_backend_configs = {
# FA3 on Hopper
"FA3"
:
BackendConfig
(
name
=
"FA3"
,
env_vars
=
{
"
VLLM_ATTENTION_BACKEND
"
:
"FLASH_ATTN"
,
"
VLLM_FLASH_ATTN_VERSION"
:
"3"
,
"
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
"
:
"
16
"
,
attention_config
=
{
"
backend
"
:
"FLASH_ATTN"
,
"
flash_attn_version"
:
3
,
"
flash_attn_max_num_splits_for_cuda_graph
"
:
16
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL"
,
...
...
@@ -272,9 +272,7 @@ full_cg_backend_configs = {
# FlashMLA on Hopper
"FlashMLA"
:
BackendConfig
(
name
=
"FlashMLA"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND"
:
"FLASHMLA"
,
},
attention_config
=
{
"backend"
:
"FLASHMLA"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -283,9 +281,7 @@ full_cg_backend_configs = {
# Cutlass MLA on Blackwell
"CutlassMLA"
:
BackendConfig
(
name
=
"CutlassMLA"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND"
:
"CUTLASS_MLA"
,
},
attention_config
=
{
"backend"
:
"CUTLASS_MLA"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -294,9 +290,7 @@ full_cg_backend_configs = {
# FlashInfer MLA on Blackwell
"FlashInferMLA"
:
BackendConfig
(
name
=
"FlashInferMLA"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND"
:
"FLASHINFER_MLA"
,
},
attention_config
=
{
"backend"
:
"FLASHINFER_MLA"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -305,9 +299,9 @@ full_cg_backend_configs = {
# FlashAttention MLA on Hopper
"FlashAttentionMLA"
:
BackendConfig
(
name
=
"FlashAttentionMLA"
,
env_vars
=
{
"
VLLM_ATTENTION_BACKEND
"
:
"FLASH_ATTN_MLA"
,
"
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
"
:
"
16
"
,
attention_config
=
{
"
backend
"
:
"FLASH_ATTN_MLA"
,
"
flash_attn_max_num_splits_for_cuda_graph
"
:
16
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_DECODE_ONLY"
,
...
...
@@ -317,10 +311,10 @@ full_cg_backend_configs = {
# FA2
"FA2"
:
BackendConfig
(
name
=
"FA2"
,
env_vars
=
{
"
VLLM_ATTENTION_BACKEND
"
:
"FLASH_ATTN"
,
"
VLLM_FLASH_ATTN_VERSION"
:
"2"
,
"
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
"
:
"
16
"
,
attention_config
=
{
"
backend
"
:
"FLASH_ATTN"
,
"
flash_attn_version"
:
2
,
"
flash_attn_max_num_splits_for_cuda_graph
"
:
16
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
...
...
@@ -329,7 +323,7 @@ full_cg_backend_configs = {
# Triton Attention
"TritonAttn"
:
BackendConfig
(
name
=
"TritonAttn"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND
"
:
"TRITON_ATTN"
},
attention_config
=
{
"backend
"
:
"TRITON_ATTN"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -337,14 +331,17 @@ full_cg_backend_configs = {
# FlashInfer
"FlashInfer"
:
BackendConfig
(
name
=
"FlashInfer"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND
"
:
"FLASHINFER"
},
attention_config
=
{
"backend
"
:
"FLASHINFER"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
),
"RocmAttn"
:
BackendConfig
(
name
=
"RocmAttn"
,
env_vars
=
{
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
:
"1"
},
attention_config
=
{
"backend"
:
"ROCM_ATTN"
,
"use_prefill_decode_attention"
:
True
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL"
,
},
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment