Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1338 additions
and
223 deletions
+1338
-223
tests/compile/test_aot_compile.py
tests/compile/test_aot_compile.py
+105
-1
tests/compile/test_config.py
tests/compile/test_config.py
+67
-49
tests/compile/test_dynamic_shapes_compilation.py
tests/compile/test_dynamic_shapes_compilation.py
+1
-0
tests/compile/test_noop_elimination.py
tests/compile/test_noop_elimination.py
+5
-2
tests/compile/test_qk_norm_rope_fusion.py
tests/compile/test_qk_norm_rope_fusion.py
+1
-1
tests/compile/untest_fusion.py
tests/compile/untest_fusion.py
+189
-127
tests/compile/untest_fusion_attn.py
tests/compile/untest_fusion_attn.py
+18
-10
tests/config/base_model_arch_groundtruth.json
tests/config/base_model_arch_groundtruth.json
+359
-0
tests/config/draft_model_arch_groundtruth.json
tests/config/draft_model_arch_groundtruth.json
+87
-0
tests/config/test_model_arch_config.py
tests/config/test_model_arch_config.py
+152
-0
tests/config/test_multimodal_config.py
tests/config/test_multimodal_config.py
+1
-1
tests/conftest.py
tests/conftest.py
+20
-4
tests/distributed/test_context_parallel.py
tests/distributed/test_context_parallel.py
+1
-3
tests/distributed/test_eplb_algo.py
tests/distributed/test_eplb_algo.py
+141
-0
tests/distributed/test_eplb_execute.py
tests/distributed/test_eplb_execute.py
+9
-8
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pp_cudagraph.py
+12
-14
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+144
-1
tests/entrypoints/instrumentator/__init__.py
tests/entrypoints/instrumentator/__init__.py
+0
-0
tests/entrypoints/instrumentator/test_metrics.py
tests/entrypoints/instrumentator/test_metrics.py
+2
-2
tests/entrypoints/openai/conftest.py
tests/entrypoints/openai/conftest.py
+24
-0
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/compile/test_aot_compile.py
View file @
7e63ef82
...
...
@@ -5,10 +5,12 @@ import functools
import
multiprocessing
import
tempfile
from
contextlib
import
contextmanager
from
pathlib
import
Path
import
pytest
import
torch
import
vllm.model_executor.layers.activation
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
...
...
@@ -16,9 +18,19 @@ from vllm.config import (
VllmConfig
,
set_current_vllm_config
,
)
from
vllm.envs
import
disable_envs_cache
from
vllm.forward_context
import
set_forward_context
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
from
..utils
import
create_new_process_for_each_test
@
pytest
.
fixture
def
vllm_tmp_cache
(
tmp_path
:
Path
,
monkeypatch
:
pytest
.
MonkeyPatch
)
->
Path
:
"""Fixture that sets VLLM_CACHE_ROOT to a temporary directory."""
monkeypatch
.
setenv
(
"VLLM_CACHE_ROOT"
,
str
(
tmp_path
/
"vllm_cache"
))
return
tmp_path
def
reference_fn
(
x
:
torch
.
Tensor
):
assert
x
.
shape
[
0
]
<=
42
...
...
@@ -66,6 +78,7 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
torch
.
compiler
.
set_stance
(
"fail_on_recompile"
),
):
CompiledMod
(
vllm_config
=
vllm_config
)(
*
args
)
disable_envs_cache
()
m
.
setenv
(
"VLLM_USE_AOT_COMPILE"
,
"1"
)
torch
.
_dynamo
.
reset
()
...
...
@@ -101,6 +114,7 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
vllm_config
=
make_vllm_config
()
with
use_vllm_config
(
vllm_config
):
expected
=
CompiledMod
(
vllm_config
=
vllm_config
)(
*
args
)
disable_envs_cache
()
m
.
setenv
(
"VLLM_FORCE_AOT_LOAD"
,
"1"
)
vllm_config
=
make_vllm_config
()
...
...
@@ -130,6 +144,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
artifacts
=
compiled_mod
.
aot_compiled_fn
.
_artifacts
guards_string
=
artifacts
.
compiled_fn
.
shape_env
.
format_guards
()
assert
guards_string
==
" - s77 <= 42
\n
- Eq(Mod(s77, 2), 0)"
disable_envs_cache
()
m
.
setenv
(
"VLLM_FORCE_AOT_LOAD"
,
"1"
)
vllm_config
=
make_vllm_config
()
...
...
@@ -144,7 +159,94 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
skipif
(
not
is_torch_equal_or_newer
(
"2.10.0.dev"
),
reason
=
"requires torch 2.10"
)
@
use_vllm_config
(
make_vllm_config
())
def
test_partition_wrapper_applied_on_aot_load
(
monkeypatch
:
pytest
.
MonkeyPatch
,
vllm_tmp_cache
:
Path
,
mocker
):
"""
Test that partition wrappers are applied when loading AOT cached functions.
This test verifies the fix for GitHub issue #31439 where AOT compile
caused 2x latency regression when use_inductor_graph_partition=True.
The root cause was that partition wrapper context was bypassed when
loading from AOT cache.
"""
from
vllm.config
import
CUDAGraphMode
args
=
(
torch
.
randn
(
10
,
10
),)
monkeypatch
.
setenv
(
"VLLM_USE_AOT_COMPILE"
,
"1"
)
# Create config with partition enabled
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
use_inductor_graph_partition
=
True
,
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
,
)
)
# First compilation - save to cache
with
use_vllm_config
(
vllm_config
):
compiled_mod
=
CompiledMod
(
vllm_config
=
vllm_config
)
compiled_mod
(
*
args
)
disable_envs_cache
()
# Second run - load from cache, verify partition wrapper applied
monkeypatch
.
setenv
(
"VLLM_FORCE_AOT_LOAD"
,
"1"
)
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
use_inductor_graph_partition
=
True
,
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
,
)
)
# Use mocker to spy on set_customized_partition_wrappers
spy
=
mocker
.
spy
(
torch
.
_inductor
.
utils
,
"set_customized_partition_wrappers"
)
with
use_vllm_config
(
vllm_config
):
compiled_mod
=
CompiledMod
(
vllm_config
=
vllm_config
)
# First call after restart: loads from AOT cache.
# This tests the fix for the first call after a restart.
compiled_mod
(
*
args
)
# Verify partition wrapper was called on AOT load.
assert
spy
.
call_count
>=
2
,
(
"Expected partition wrapper to be set and cleared on AOT load, "
f
"got
{
spy
.
call_count
}
calls"
)
# First call should set a wrapper, last call should clear it
assert
spy
.
call_args_list
[
0
][
0
][
0
]
is
not
None
,
(
"First call on AOT load should set a wrapper function"
)
assert
spy
.
call_args_list
[
-
1
][
0
][
0
]
is
None
,
(
"Last call on AOT load should clear the wrapper"
)
# Reset for the next check.
spy
.
reset_mock
()
# Subsequent call: uses the cached `aot_compiled_fn`.
# This tests the fix for subsequent calls.
compiled_mod
(
*
args
)
# Verify partition wrapper was called on the subsequent call.
assert
spy
.
call_count
>=
2
,
(
"Expected partition wrapper set and cleared on subsequent "
f
"call, got
{
spy
.
call_count
}
calls"
)
assert
spy
.
call_args_list
[
0
][
0
][
0
]
is
not
None
,
(
"First call on subsequent call should set a wrapper function"
)
assert
spy
.
call_args_list
[
-
1
][
0
][
0
]
is
None
,
(
"Last call on subsequent call should clear the wrapper"
)
@
pytest
.
mark
.
skipif
(
not
is_torch_equal_or_newer
(
"2.10.0.dev"
),
reason
=
"requires torch 2.10"
)
@
create_new_process_for_each_test
(
"spawn"
)
def
test_gpt2_cache_hit
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Test that compiling gpt2 twice results in a cache hit and
...
...
@@ -186,6 +288,8 @@ def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
# Clean up first model
del
llm_model
disable_envs_cache
()
vllm
.
model_executor
.
layers
.
activation
.
_ACTIVATION_REGISTRY
.
_dict
.
clear
()
# Second compilation - should hit cache
m
.
setenv
(
"VLLM_FORCE_AOT_LOAD"
,
"1"
)
...
...
tests/compile/test_config.py
View file @
7e63ef82
...
...
@@ -15,7 +15,10 @@ from vllm.config.compilation import CompilationMode, PassConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.logger
import
_print_warning_once
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
_is_torch_equal_or_newer
from
vllm.utils.torch_utils
import
(
_is_torch_equal_or_newer
,
is_torch_equal
,
)
# This import automatically registers `torch.ops.silly.attention`
from
.
import
silly_attention
# noqa: F401
...
...
@@ -30,6 +33,29 @@ def test_version():
assert
not
_is_torch_equal_or_newer
(
"2.7.1"
,
"2.8.0.dev"
)
def
test_get_raw_stream_patch
():
"""Test that get_raw_stream patch is applied only for torch 2.9.0 or 2.9.1."""
import
builtins
# Check if get_raw_stream exists in builtins
has_patch
=
hasattr
(
builtins
,
"get_raw_stream"
)
# Import torch to get actual version
is_torch_2_9
=
is_torch_equal
(
"2.9.0"
)
or
is_torch_equal
(
"2.9.1"
)
if
is_torch_2_9
:
# For torch 2.9.x, the patch should be applied
assert
has_patch
,
"get_raw_stream should be patched for torch 2.9.x"
# Verify it's callable (it should be the _cuda_getCurrentRawStream function)
get_raw_stream
=
builtins
.
get_raw_stream
# type: ignore[attr-defined]
assert
callable
(
get_raw_stream
)
# Verify it's the correct function from torch._C
from
torch._C
import
_cuda_getCurrentRawStream
assert
get_raw_stream
is
_cuda_getCurrentRawStream
def
test_copy_pass
():
vllm_config
=
VllmConfig
()
inductor_pass
=
FixFunctionalizationPass
(
vllm_config
)
...
...
@@ -406,51 +432,43 @@ def test_cudagraph_sizes_post_init(
)
def
test_pass_config_deprecation
(
caplog_vllm
):
caplog_vllm
.
set_level
(
logging
.
WARNING
)
# Clear cache to ensure warnings are re-issued
_print_warning_once
.
cache_clear
()
# Test enable_fusion -> fuse_norm_quant, fuse_act_quant
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_fusion
=
True
)
assert
"enable_fusion is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_norm_quant
is
True
assert
config
.
fuse_act_quant
is
True
assert
config
.
enable_fusion
is
None
# Test enable_attn_fusion -> fuse_attn_quant
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_attn_fusion
=
True
)
assert
"enable_attn_fusion is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_attn_quant
is
True
assert
config
.
enable_attn_fusion
is
None
# Test enable_noop -> eliminate_noops
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_noop
=
True
)
assert
"enable_noop is deprecated"
in
caplog_vllm
.
text
assert
config
.
eliminate_noops
is
True
assert
config
.
enable_noop
is
None
# Test enable_sequence_parallelism -> enable_sp
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_sequence_parallelism
=
True
)
assert
"enable_sequence_parallelism is deprecated"
in
caplog_vllm
.
text
assert
config
.
enable_sp
is
True
assert
config
.
enable_sequence_parallelism
is
None
# Test enable_async_tp -> fuse_gemm_comms
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_async_tp
=
True
)
assert
"enable_async_tp is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_gemm_comms
is
True
assert
config
.
enable_async_tp
is
None
# Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_fi_allreduce_fusion
=
True
)
assert
"enable_fi_allreduce_fusion is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_allreduce_rms
is
True
assert
config
.
enable_fi_allreduce_fusion
is
None
def
test_cached_compilation_config
(
default_vllm_config
):
import
torch
from
torch._inductor.utils
import
run_and_get_code
from
vllm.config
import
get_cached_compilation_config
,
set_current_vllm_config
from
vllm.model_executor.layers.quantization.input_quant_fp8
import
QuantFP8
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
dtype
=
torch
.
bfloat16
device
=
torch
.
device
(
"cuda:0"
)
batch_size
,
num_qo_heads
,
head_size
=
8
,
16
,
128
# access and cache default compilation config
# default compilation config does not contain +quant_fp8 custom op. If this is
# used, the generated code would use inductor-generated triton kernel instead
# of the custom op `torch.ops._C.static_scaled_fp8_quant`.
get_cached_compilation_config
()
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
custom_ops
=
[
"+quant_fp8"
],
)
)
# set_current_vllm_config should clear cached compilation config and
# use the new compilation_config in vllm_config
with
set_current_vllm_config
(
vllm_config
):
query_quant
=
QuantFP8
(
static
=
True
,
group_shape
=
GroupShape
.
PER_TENSOR
)
query_quant
=
torch
.
compile
(
query_quant
)
_q_scale
=
torch
.
tensor
(
1.0
,
dtype
=
torch
.
float32
,
device
=
"cuda"
)
query
=
torch
.
randn
(
batch_size
,
num_qo_heads
*
head_size
,
dtype
=
dtype
,
device
=
device
)
_
,
code
=
run_and_get_code
(
query_quant
,
query
,
_q_scale
)
code
=
" "
.
join
(
code
)
assert
"torch.ops._C.static_scaled_fp8_quant.default("
in
code
tests/compile/test_dynamic_shapes_compilation.py
View file @
7e63ef82
...
...
@@ -77,6 +77,7 @@ def test_dynamic_shapes_compilation(
"evaluate_guards"
:
evaluate_guards
,
},
},
max_model_len
=
1024
,
)
output
=
model
.
generate
(
prompt
)
...
...
tests/compile/test_noop_elimination.py
View file @
7e63ef82
...
...
@@ -25,10 +25,13 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
class
Model
(
torch
.
nn
.
Module
):
def
__init__
(
self
)
->
None
:
super
().
__init__
()
self
.
pos_embed
=
torch
.
empty
(
buffer_size
,
hidden_size
,
dtype
=
dtype
)
# Avoid using empty, since on rocm torch.empty
# does not initialize the memory.
self
.
pos_embed
=
torch
.
randn
(
buffer_size
,
hidden_size
,
dtype
=
dtype
)
def
forward
(
self
,
x
):
x
+=
self
.
pos_embed
[:
x
.
shape
[
0
]]
# Avoid += to prevent inplace addition.
x
=
x
+
self
.
pos_embed
[:
x
.
shape
[
0
]]
# Chain of reshapes
y
=
x
.
reshape
(
-
1
,
128
,
32
)
z
=
y
.
reshape
(
-
1
,
4096
)
...
...
tests/compile/test_qk_norm_rope_fusion.py
View file @
7e63ef82
...
...
@@ -5,7 +5,6 @@ import pytest
import
torch
from
tests.compile.backend
import
TestBackend
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.matcher_utils
import
FLASHINFER_ROTARY_OP
,
RMS_OP
,
ROTARY_OP
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
...
...
@@ -25,6 +24,7 @@ from vllm.config import (
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.platforms
import
current_platform
from
vllm.v1.attention.backend
import
AttentionType
RSQRT_OP
=
torch
.
ops
.
aten
.
rsqrt
.
default
INDEX_SELECT_OP
=
torch
.
ops
.
aten
.
index
.
Tensor
...
...
tests/compile/untest_fusion.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
itertools
import
pytest
import
torch
...
...
@@ -53,37 +52,61 @@ class TestModel(torch.nn.Module):
hidden_size
:
int
,
eps
:
float
,
group_shape
:
GroupShape
,
cuda_force_torch
:
bool
,
use_aiter
:
bool
=
False
,
cuda_force_torch
:
bool
=
False
,
use_aiter_quant_op
:
bool
=
True
,
*
args
,
**
kwargs
,
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
use_aiter
=
use_aiter
self
.
use_aiter_quant_op
=
use_aiter_quant_op
self
.
cuda_force_torch
=
cuda_force_torch
self
.
group_shape
=
group_shape
self
.
enable_quant_fp8_custom_op
=
None
# Will be set later if applicable
self
.
norm
=
[
RMSNorm
(
hidden_size
,
eps
)
for
_
in
range
(
4
)]
if
group_shape
.
is_per_group
():
self
.
wscale
=
[
torch
.
rand
(
(
hidden_size
//
group_shape
[
1
],
hidden_size
//
group_shape
[
1
]),
dtype
=
torch
.
float32
,
)
for
_
in
range
(
3
)
]
else
:
self
.
wscale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
3
)]
static
=
group_shape
==
GroupShape
.
PER_TENSOR
# Setup quantization scale descriptor
static
=
group_shape
==
GroupShape
.
PER_TENSOR
and
not
use_aiter
quant_scale
=
ScaleDesc
(
torch
.
float32
,
static
,
group_shape
)
self
.
quant_key
=
QuantKey
(
dtype
=
FP8_DTYPE
,
scale
=
quant_scale
,
symmetric
=
True
)
# Setup scales
if
static
:
self
.
scale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
3
)]
else
:
self
.
scale
=
[
None
for
_
in
range
(
3
)]
# Setup weights
self
.
w
=
[
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
FP8_DTYPE
)
for
_
in
range
(
3
)
]
if
not
group_shape
.
is_per_group
():
if
not
group_shape
.
is_per_group
()
or
use_aiter
:
self
.
w
=
[
self
.
w
[
0
].
t
()
for
_
in
range
(
3
)]
# Setup weight scales
if
group_shape
.
is_per_group
():
scale_size
=
(
(
hidden_size
+
128
-
1
)
//
128
if
use_aiter
else
hidden_size
//
group_shape
[
1
]
)
wscale_shape
:
tuple
[
int
,
...]
=
(
scale_size
,
scale_size
)
else
:
wscale_shape
=
(
1
,)
self
.
wscale
=
[
torch
.
rand
(
wscale_shape
,
dtype
=
torch
.
float32
)
for
_
in
range
(
3
)]
# Setup FP8 linear operation
is_per_group
=
group_shape
.
is_per_group
()
if
is_per_group
and
use_aiter
:
self
.
fp8_linear
=
W8A8BlockFp8LinearOp
(
weight_group_shape
=
GroupShape
(
128
,
128
),
act_quant_group_shape
=
group_shape
,
use_aiter_and_is_supported
=
use_aiter_quant_op
,
)
# AITER blockwise doesn't use enable_quant_fp8_custom_op
elif
is_per_group
:
self
.
fp8_linear
=
W8A8BlockFp8LinearOp
(
weight_group_shape
=
GroupShape
(
group_shape
[
1
],
group_shape
[
1
]),
act_quant_group_shape
=
group_shape
,
...
...
@@ -91,6 +114,13 @@ class TestModel(torch.nn.Module):
use_aiter_and_is_supported
=
False
,
)
self
.
enable_quant_fp8_custom_op
=
self
.
fp8_linear
.
input_quant_op
.
enabled
()
elif
use_aiter
:
self
.
fp8_linear
=
Fp8LinearOp
(
act_quant_static
=
False
,
act_quant_group_shape
=
group_shape
,
)
self
.
fp8_linear
.
quant_fp8
.
use_aiter
=
use_aiter_quant_op
self
.
enable_quant_fp8_custom_op
=
self
.
fp8_linear
.
quant_fp8
.
enabled
()
else
:
with
override_cutlass_fp8_supported
(
not
cuda_force_torch
):
self
.
fp8_linear
=
Fp8LinearOp
(
...
...
@@ -100,7 +130,6 @@ class TestModel(torch.nn.Module):
self
.
enable_quant_fp8_custom_op
=
self
.
fp8_linear
.
quant_fp8
.
enabled
()
self
.
enable_rms_norm_custom_op
=
self
.
norm
[
0
].
enabled
()
self
.
group_shape
=
group_shape
def
forward
(
self
,
x
):
# avoid having graph input be an arg to a pattern directly
...
...
@@ -126,19 +155,49 @@ class TestModel(torch.nn.Module):
y4
,
resid
=
self
.
norm
[
3
](
x4
,
resid
)
# use resid here
return
y4
def
ops_in_model_before
(
self
):
if
(
self
.
use_aiter
and
self
.
group_shape
.
is_per_group
()
and
current_platform
.
is_fp8_fnuz
()
):
return
[
rocm_aiter_ops
.
get_group_quant_op
()]
if
self
.
use_aiter
and
self
.
group_shape
.
is_per_group
():
return
[
torch
.
ops
.
vllm
.
triton_per_token_group_quant_fp8
.
default
]
if
self
.
use_aiter
and
self
.
use_aiter_quant_op
:
return
[
rocm_aiter_ops
.
get_per_token_quant_op
()]
if
self
.
use_aiter
:
return
[
QUANT_OPS
[
self
.
quant_key
]]
if
self
.
enable_quant_fp8_custom_op
:
return
[
QUANT_OPS
[
self
.
quant_key
]]
return
[
torch
.
ops
.
aten
.
reciprocal
]
def
ops_in_model_after
(
self
):
if
self
.
use_aiter
and
self
.
group_shape
.
is_per_group
():
from
vllm.compilation.rocm_aiter_fusion
import
(
AiterFusedAddRMSFp8GroupQuantPattern
,
AiterRMSFp8GroupQuantPattern
,
)
return
[
AiterFusedAddRMSFp8GroupQuantPattern
.
FUSED_OP
,
AiterRMSFp8GroupQuantPattern
.
FUSED_OP
,
]
if
self
.
use_aiter
:
from
vllm.compilation.rocm_aiter_fusion
import
(
AiterFusedAddRMSNormDynamicQuantPattern
,
AiterRMSNormDynamicQuantPattern
,
)
return
[
AiterFusedAddRMSNormDynamicQuantPattern
.
FUSED_OP
,
AiterRMSNormDynamicQuantPattern
.
FUSED_OP
,
]
return
[
FUSED_OPS
[
FusedRMSQuantKey
(
self
.
quant_key
,
True
)],
FUSED_OPS
[
FusedRMSQuantKey
(
self
.
quant_key
,
False
)],
]
def
ops_in_model_before
(
self
):
return
(
[
QUANT_OPS
[
self
.
quant_key
]]
if
self
.
enable_quant_fp8_custom_op
else
[
torch
.
ops
.
aten
.
reciprocal
]
)
def
ops_in_model_before_partial
(
self
):
return
(
[
RMS_OP
,
RMS_ADD_OP
]
...
...
@@ -155,67 +214,45 @@ GROUP_SHAPES = [
]
class
TestRmsnormGroupFp8QuantModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
eps
:
float
,
**
kwargs
):
super
().
__init__
()
self
.
w8a8_block_fp8_linear
=
W8A8BlockFp8LinearOp
(
weight_group_shape
=
GroupShape
(
128
,
128
),
act_quant_group_shape
=
GroupShape
(
1
,
128
),
cutlass_block_fp8_supported
=
False
,
use_aiter_and_is_supported
=
True
,
)
self
.
w
=
[
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
FP8_DTYPE
).
t
()
for
_
in
range
(
3
)
]
def
_run_fusion_test
(
model
,
fusion_pass
,
vllm_config
,
dtype
,
hidden_size
,
num_tokens
,
):
"""Helper function for common fusion test logic.
scale_hidden_size
=
(
hidden_size
+
128
-
1
)
//
128
self
.
wscale
=
[
torch
.
rand
((
scale_hidden_size
,
scale_hidden_size
),
dtype
=
torch
.
float32
)
for
_
in
range
(
3
)
]
Must be called within vllm_config context.
"""
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
cleanup_pass
=
PostCleanupPass
(
vllm_config
)
self
.
norm_weight
=
[
torch
.
ones
(
hidden_size
)
for
_
in
range
(
4
)]
self
.
eps
=
eps
backend
=
TestBackend
(
noop_pass
,
fusion_pass
,
cleanup_pass
)
backend2
=
TestBackend
(
noop_pass
,
cleanup_pass
)
def
forward
(
self
,
x
):
# avoid having graph input be an arg to a pattern directly
x
=
resid
=
torch
.
relu
(
x
)
y
=
rocm_aiter_ops
.
rms_norm
(
x
,
self
.
norm_weight
[
0
],
self
.
eps
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
)
torch
.
_dynamo
.
mark_dynamic
(
x
,
0
)
x2
=
self
.
w8a8_block_fp8_linear
.
apply
(
y
,
self
.
w
[
0
],
self
.
wscale
[
0
])
# make sure resid is used for replacement to work
y2
,
resid
=
rocm_aiter_ops
.
rms_norm2d_with_add
(
x2
,
resid
,
self
.
norm_weight
[
1
],
self
.
eps
)
model_fused
=
torch
.
compile
(
model
,
backend
=
backend
)
result_fused
=
model_fused
(
x
)
x3
=
self
.
w8a8_block_fp8_linear
.
apply
(
y2
,
self
.
w
[
1
],
self
.
wscale
[
1
])
model_unfused
=
torch
.
compile
(
model
,
backend
=
backend2
)
result_unfused
=
model_unfused
(
x
)
y3
,
resid
=
rocm_aiter_ops
.
rms_norm2d_with_add
(
x3
,
resid
,
self
.
norm_weight
[
2
],
self
.
eps
)
if
dtype
==
torch
.
float16
:
ATOL
,
RTOL
=
(
2e-3
,
2e-3
)
else
:
ATOL
,
RTOL
=
(
1e-2
,
1e-2
)
x4
=
self
.
w8a8_block_fp8_linear
.
apply
(
y3
,
self
.
w
[
2
],
self
.
wscale
[
2
]
)
torch
.
testing
.
assert_close
(
result_fused
,
result_unfused
,
atol
=
ATOL
,
rtol
=
RTOL
)
y4
,
resid
=
rocm_aiter_ops
.
rms_norm2d_with_add
(
x4
,
resid
,
self
.
norm_weight
[
3
],
self
.
eps
)
return
y4
assert
fusion_pass
.
matched_count
==
3
backend
.
check_before_ops
(
model
.
ops_in_model_before
())
backend
.
check_after_ops
(
model
.
ops_in_model_after
())
def
ops_in_model_before
(
self
):
return
[
torch
.
ops
.
vllm
.
rocm_aiter_rms_norm
,
torch
.
ops
.
vllm
.
rocm_aiter_group_fp8_quant
,
]
def
ops_in_model_before_partial
(
self
):
return
[]
def
ops_in_model_after
(
self
):
return
[
torch
.
ops
.
vllm
.
rocm_aiter_rmsnorm_fp8_group_quant
,
torch
.
ops
.
vllm
.
rocm_aiter_rmsnorm_with_add_fp8_group_quant
,
]
return
backend
,
backend2
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
...
...
@@ -223,11 +260,8 @@ class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
257
])
@
pytest
.
mark
.
parametrize
(
"eps"
,
[
1e-5
,
1e-6
])
@
pytest
.
mark
.
parametrize
(
"group_shape"
,
GROUP_SHAPES
)
@
pytest
.
mark
.
parametrize
(
"model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op"
,
list
(
itertools
.
product
([
TestModel
],
[
True
,
False
],
[
True
,
False
]))
+
[(
TestRmsnormGroupFp8QuantModel
,
False
,
False
)],
)
@
pytest
.
mark
.
parametrize
(
"enable_rms_norm_custom_op"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"enable_quant_fp8_custom_op"
,
[
True
,
False
])
# cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True.
@
pytest
.
mark
.
parametrize
(
...
...
@@ -242,23 +276,13 @@ def test_fusion_rmsnorm_quant(
num_tokens
,
eps
,
group_shape
,
model_class
,
enable_rms_norm_custom_op
,
enable_quant_fp8_custom_op
,
cuda_force_torch
,
):
if
model_class
is
TestRmsnormGroupFp8QuantModel
and
not
IS_AITER_FOUND
:
pytest
.
skip
(
"AITER is not supported on this GPU."
)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
dtype
)
torch
.
manual_seed
(
1
)
maybe_create_device_identity
()
# needed for certain non-cutlass fp8 paths
if
not
enable_quant_fp8_custom_op
and
group_shape
.
is_per_group
():
pytest
.
skip
(
"Unsupported unwrapped quant fp8 op for blockwise quantization"
)
# Skip test for 64-bit group shape when running with cutlass or deepgemm
if
group_shape
==
GroupShape
(
1
,
64
)
and
(
cutlass_block_fp8_supported
()
or
is_deep_gemm_supported
()
):
...
...
@@ -269,6 +293,7 @@ def test_fusion_rmsnorm_quant(
custom_ops
.
append
(
"+rms_norm"
)
if
enable_quant_fp8_custom_op
:
custom_ops
.
append
(
"+quant_fp8"
)
vllm_config
=
VllmConfig
(
model_config
=
ModelConfig
(
dtype
=
dtype
),
compilation_config
=
CompilationConfig
(
...
...
@@ -279,60 +304,97 @@ def test_fusion_rmsnorm_quant(
),
),
)
with
vllm
.
config
.
set_current_vllm_config
(
vllm_config
):
# Reshape pass is needed for the fusion pass to work
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
if
model_class
is
TestRmsnormGroupFp8QuantModel
:
from
vllm.compilation.rocm_aiter_fusion
import
(
RocmAiterRMSNormFp8GroupQuantFusionPass
,
)
fusion_pass
=
RocmAiterRMSNormFp8GroupQuantFusionPass
(
vllm_config
)
else
:
fusion_pass
=
RMSNormQuantFusionPass
(
vllm_config
)
cleanup_pass
=
PostCleanupPass
(
vllm_config
)
backend
=
TestBackend
(
noop_pass
,
fusion_pass
,
cleanup_pass
)
backend2
=
TestBackend
(
noop_pass
,
cleanup_pass
)
model
=
model_class
(
with
vllm
.
config
.
set_current_vllm_config
(
vllm_config
):
# Setup device before model creation
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
dtype
)
torch
.
manual_seed
(
1
)
maybe_create_device_identity
()
fusion_pass
=
RMSNormQuantFusionPass
(
vllm_config
)
model
=
TestModel
(
hidden_size
=
hidden_size
,
eps
=
eps
,
group_shape
=
group_shape
,
use_aiter
=
False
,
cuda_force_torch
=
cuda_force_torch
,
)
# First dimension dynamic
x
=
torch
.
rand
(
num_tokens
,
hidden_size
)
torch
.
_dynamo
.
mark_dynamic
(
x
,
0
)
model_fused
=
torch
.
compile
(
model
,
backend
=
backend
)
result_fused
=
model_fused
(
x
)
model_unfused
=
torch
.
compile
(
model
,
backend
=
backend2
)
result_unfused
=
model_unfused
(
x
)
if
dtype
==
torch
.
float16
:
ATOL
,
RTOL
=
(
2e-3
,
2e-3
)
else
:
ATOL
,
RTOL
=
(
1e-2
,
1e-2
)
torch
.
testing
.
assert_close
(
result_fused
,
result_unfused
,
atol
=
ATOL
,
rtol
=
RTOL
)
assert
fusion_pass
.
matched_count
==
3
backend
.
check_before_ops
(
model
.
ops_in_model_before
())
backend
,
_
=
_run_fusion_test
(
model
,
fusion_pass
,
vllm_config
,
dtype
,
hidden_size
,
num_tokens
)
backend
.
check_before_ops
(
model
.
ops_in_model_before_partial
(),
fully_replaced
=
False
)
backend
.
check_after_ops
(
model
.
ops_in_model_after
())
# If RMSNorm custom op is disabled (native/torch impl used),
# there's a risk that the fused add doesn't get included in the
# replacement and only the rms part gets fused with quant.
# Hence, we check only 2 add nodes are left (final fused rmsnorm add).
if
(
not
enable_rms_norm_custom_op
and
model_class
is
not
TestRmsnormGroupFp8QuantModel
):
if
not
enable_rms_norm_custom_op
:
n_add_nodes
=
lambda
g
:
sum
(
1
for
_
in
find_op_nodes
(
torch
.
ops
.
aten
.
add
,
g
))
# 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
assert
n_add_nodes
(
backend
.
graph_pre_pass
)
==
7
assert
n_add_nodes
(
backend
.
graph_post_pass
)
==
2
GROUP_SHAPE_QUANT_OPS_MATCHS
=
[
(
GroupShape
.
PER_TOKEN
,
True
),
(
GroupShape
.
PER_TOKEN
,
False
),
(
GroupShape
(
1
,
128
),
True
),
]
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
256
])
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
257
])
@
pytest
.
mark
.
parametrize
(
"eps"
,
[
1e-5
,
1e-6
])
@
pytest
.
mark
.
parametrize
(
"group_shape, use_aiter_quant_op"
,
GROUP_SHAPE_QUANT_OPS_MATCHS
)
@
pytest
.
mark
.
skipif
(
(
not
current_platform
.
is_rocm
()
or
not
IS_AITER_FOUND
),
reason
=
"Only test on ROCm with aiter package installed"
,
)
def
test_aiter_fusion_rmsnorm_quant
(
dtype
:
torch
.
dtype
,
hidden_size
:
int
,
num_tokens
:
int
,
eps
:
float
,
group_shape
:
GroupShape
,
use_aiter_quant_op
:
bool
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
vllm_config
=
VllmConfig
(
model_config
=
ModelConfig
(
dtype
=
dtype
),
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
custom_ops
=
[
"+rms_norm"
,
"+quant_fp8"
],
pass_config
=
PassConfig
(
fuse_norm_quant
=
True
,
eliminate_noops
=
True
),
),
)
with
vllm
.
config
.
set_current_vllm_config
(
vllm_config
),
monkeypatch
.
context
()
as
m
:
from
vllm.compilation.rocm_aiter_fusion
import
RocmAiterRMSNormFusionPass
m
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
rocm_aiter_ops
.
refresh_env_variables
()
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
dtype
)
torch
.
manual_seed
(
1
)
maybe_create_device_identity
()
fusion_pass
=
RocmAiterRMSNormFusionPass
(
vllm_config
)
model
=
TestModel
(
hidden_size
=
hidden_size
,
eps
=
eps
,
group_shape
=
group_shape
,
use_aiter
=
True
,
use_aiter_quant_op
=
use_aiter_quant_op
,
)
_run_fusion_test
(
model
,
fusion_pass
,
vllm_config
,
dtype
,
hidden_size
,
num_tokens
)
tests/compile/untest_fusion_attn.py
View file @
7e63ef82
...
...
@@ -9,8 +9,6 @@ from tests.compile.backend import LazyInitPass, TestBackend
from
tests.utils
import
flat_product
from
tests.v1.attention.utils
import
BatchSpec
,
create_common_attn_metadata
from
vllm._custom_ops
import
cutlass_scaled_fp4_mm
,
scaled_fp4_quant
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layer
import
Attention
from
vllm.compilation.fusion_attn
import
ATTN_OP
,
AttnFusionPass
from
vllm.compilation.fx_utils
import
find_op_nodes
...
...
@@ -37,6 +35,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
Fp8LinearOp
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
has_flashinfer
from
vllm.v1.attention.backend
import
AttentionMetadata
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.kv_cache_interface
import
AttentionSpec
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
...
...
@@ -305,8 +305,12 @@ def test_attention_quant_pattern(
model_class
:
type
[
AttentionQuantPatternModel
],
backend
:
AttentionBackendEnum
,
dist_init
,
monkeypatch
,
use_fresh_inductor_cache
,
):
"""Test AttentionStaticQuantPattern fusion pass"""
monkeypatch
.
setenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
"1"
)
if
backend
==
AttentionBackendEnum
.
FLASHINFER
and
(
not
current_platform
.
is_device_capability
((
10
,
0
))
or
not
has_flashinfer
()
):
...
...
@@ -363,13 +367,15 @@ def test_attention_quant_pattern(
vllm_config
=
vllm_config_unfused
,
)
model_unfused
=
model_unfused
.
to
(
device
)
result_unfused_0
=
model_unfused
(
q
,
k
,
v
)
# noqa: F841 HACK: See #131044
forward_ctx
=
get_forward_context
()
forward_ctx
.
attn_metadata
=
model_unfused
.
build_attn_metadata
(
batch_size
)
# Run model directly without fusion
# Still compile so query QuantFP8 has closer numerics
result_unfused
=
torch
.
compile
(
model_unfused
,
fullgraph
=
True
)(
q
,
k
,
v
)
compiled_unfused
=
torch
.
compile
(
model_unfused
,
fullgraph
=
True
)
result_unfused
=
compiled_unfused
(
q
,
k
,
v
)
# Run model with attn fusion enabled
vllm_config
.
compilation_config
.
pass_config
=
PassConfig
(
...
...
@@ -399,24 +405,26 @@ def test_attention_quant_pattern(
cleanup_pass
=
PostCleanupPass
(
vllm_config
)
test_backend
=
TestBackend
(
noop_pass
,
attn_pass
,
cleanup_pass
)
# HACK: See https://github.com/vllm-project/vllm/issues/31044
result_fused_0
=
model_fused
(
q
,
k
,
v
)
# noqa: F841
# Compile model with fusion enabled
model_
compiled
=
torch
.
compile
(
compil
ed_fus
ed
=
torch
.
compile
(
model_fused
,
backend
=
test_backend
,
fullgraph
=
True
)
assert
model_
compiled
.
attn
.
_o_scale_float
is
None
assert
compil
ed_fus
ed
.
attn
.
_o_scale_float
is
None
result_fused
_1
=
model_
compiled
(
q
,
k
,
v
)
result_fused
=
compil
ed_fus
ed
(
q
,
k
,
v
)
if
backend
==
AttentionBackendEnum
.
FLASHINFER
:
# With the Flashinfer backend after the 1st round of the forward
# pass, output quant scale should be loaded into the attn layer's
# _o_scale_float, the 2nd round should reuse the loaded
# _o_scale_float
assert
model_
compiled
.
attn
.
_o_scale_float
is
not
None
result_fused_2
=
model_
compiled
(
q
,
k
,
v
)
assert
compil
ed_fus
ed
.
attn
.
_o_scale_float
is
not
None
result_fused_2
=
compil
ed_fus
ed
(
q
,
k
,
v
)
assert
model_
compiled
.
attn
.
_o_scale_float
is
not
None
assert
compil
ed_fus
ed
.
attn
.
_o_scale_float
is
not
None
torch
.
testing
.
assert_close
(
result_unfused
,
result_fused_2
,
atol
=
1e-2
,
rtol
=
1e-2
...
...
@@ -474,4 +482,4 @@ def test_attention_quant_pattern(
)
# Check that results are close
torch
.
testing
.
assert_close
(
result_unfused
,
result_fused
_1
,
atol
=
1e-2
,
rtol
=
1e-2
)
torch
.
testing
.
assert_close
(
result_unfused
,
result_fused
,
atol
=
1e-2
,
rtol
=
1e-2
)
tests/config/base_model_arch_groundtruth.json
0 → 100644
View file @
7e63ef82
{
"state-spaces/mamba-130m-hf"
:
{
"architectures"
:
[
"MambaForCausalLM"
],
"model_type"
:
"mamba"
,
"text_model_type"
:
"mamba"
,
"hidden_size"
:
768
,
"total_num_hidden_layers"
:
24
,
"total_num_attention_heads"
:
0
,
"head_size"
:
0
,
"vocab_size"
:
50280
,
"total_num_kv_heads"
:
0
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.float32"
},
"mistralai/Mamba-Codestral-7B-v0.1"
:
{
"architectures"
:
[
"Mamba2ForCausalLM"
],
"model_type"
:
"mamba"
,
"text_model_type"
:
"mamba"
,
"hidden_size"
:
4096
,
"total_num_hidden_layers"
:
64
,
"total_num_attention_heads"
:
0
,
"head_size"
:
0
,
"vocab_size"
:
32768
,
"total_num_kv_heads"
:
0
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
:
{
"architectures"
:
[
"Terratorch"
],
"model_type"
:
"timm_wrapper"
,
"text_model_type"
:
"timm_wrapper"
,
"hidden_size"
:
0
,
"total_num_hidden_layers"
:
0
,
"total_num_attention_heads"
:
0
,
"head_size"
:
0
,
"vocab_size"
:
0
,
"total_num_kv_heads"
:
0
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
true
,
"dtype"
:
"torch.float32"
},
"tiiuae/falcon-mamba-7b-instruct"
:
{
"architectures"
:
[
"FalconMambaForCausalLM"
],
"model_type"
:
"falcon_mamba"
,
"text_model_type"
:
"falcon_mamba"
,
"hidden_size"
:
4096
,
"total_num_hidden_layers"
:
64
,
"total_num_attention_heads"
:
0
,
"head_size"
:
0
,
"vocab_size"
:
65024
,
"total_num_kv_heads"
:
0
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"Zyphra/Zamba2-7B-instruct"
:
{
"architectures"
:
[
"Zamba2ForCausalLM"
],
"model_type"
:
"zamba2"
,
"text_model_type"
:
"zamba2"
,
"hidden_size"
:
3584
,
"total_num_hidden_layers"
:
81
,
"total_num_attention_heads"
:
32
,
"head_size"
:
224
,
"vocab_size"
:
32000
,
"total_num_kv_heads"
:
32
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"mosaicml/mpt-7b"
:
{
"architectures"
:
[
"MPTForCausalLM"
],
"model_type"
:
"mpt"
,
"text_model_type"
:
"mpt"
,
"hidden_size"
:
4096
,
"total_num_hidden_layers"
:
32
,
"total_num_attention_heads"
:
32
,
"head_size"
:
128
,
"vocab_size"
:
50432
,
"total_num_kv_heads"
:
32
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"databricks/dbrx-instruct"
:
{
"architectures"
:
[
"DbrxForCausalLM"
],
"model_type"
:
"dbrx"
,
"text_model_type"
:
"dbrx"
,
"hidden_size"
:
6144
,
"total_num_hidden_layers"
:
40
,
"total_num_attention_heads"
:
48
,
"head_size"
:
128
,
"vocab_size"
:
100352
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"tiiuae/falcon-7b"
:
{
"architectures"
:
[
"FalconForCausalLM"
],
"model_type"
:
"falcon"
,
"text_model_type"
:
"falcon"
,
"hidden_size"
:
4544
,
"total_num_hidden_layers"
:
32
,
"total_num_attention_heads"
:
71
,
"head_size"
:
64
,
"vocab_size"
:
65024
,
"total_num_kv_heads"
:
1
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"tiiuae/falcon-40b"
:
{
"architectures"
:
[
"FalconForCausalLM"
],
"model_type"
:
"falcon"
,
"text_model_type"
:
"falcon"
,
"hidden_size"
:
8192
,
"total_num_hidden_layers"
:
60
,
"total_num_attention_heads"
:
128
,
"head_size"
:
64
,
"vocab_size"
:
65024
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"luccafong/deepseek_mtp_main_random"
:
{
"architectures"
:
[
"DeepseekV3ForCausalLM"
],
"model_type"
:
"deepseek_v3"
,
"text_model_type"
:
"deepseek_v3"
,
"hidden_size"
:
2560
,
"total_num_hidden_layers"
:
5
,
"total_num_attention_heads"
:
32
,
"head_size"
:
576
,
"vocab_size"
:
129280
,
"total_num_kv_heads"
:
32
,
"num_experts"
:
72
,
"is_deepseek_mla"
:
true
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"luccafong/deepseek_mtp_draft_random"
:
{
"architectures"
:
[
"DeepseekV3ForCausalLM"
],
"model_type"
:
"deepseek_v3"
,
"text_model_type"
:
"deepseek_v3"
,
"hidden_size"
:
2560
,
"total_num_hidden_layers"
:
10
,
"total_num_attention_heads"
:
32
,
"head_size"
:
576
,
"vocab_size"
:
129280
,
"total_num_kv_heads"
:
32
,
"num_experts"
:
72
,
"is_deepseek_mla"
:
true
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"Qwen/Qwen3-Next-80B-A3B-Instruct"
:
{
"architectures"
:
[
"Qwen3NextForCausalLM"
],
"model_type"
:
"qwen3_next"
,
"text_model_type"
:
"qwen3_next"
,
"hidden_size"
:
2048
,
"total_num_hidden_layers"
:
48
,
"total_num_attention_heads"
:
16
,
"head_size"
:
256
,
"vocab_size"
:
151936
,
"total_num_kv_heads"
:
2
,
"num_experts"
:
512
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"tiny-random/qwen3-next-moe"
:
{
"architectures"
:
[
"Qwen3NextForCausalLM"
],
"model_type"
:
"qwen3_next"
,
"text_model_type"
:
"qwen3_next"
,
"hidden_size"
:
8
,
"total_num_hidden_layers"
:
4
,
"total_num_attention_heads"
:
16
,
"head_size"
:
32
,
"vocab_size"
:
151936
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
32
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"zai-org/GLM-4.5"
:
{
"architectures"
:
[
"Glm4MoeForCausalLM"
],
"model_type"
:
"glm4_moe"
,
"text_model_type"
:
"glm4_moe"
,
"hidden_size"
:
5120
,
"total_num_hidden_layers"
:
92
,
"total_num_attention_heads"
:
96
,
"head_size"
:
128
,
"vocab_size"
:
151552
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
160
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"baidu/ERNIE-4.5-21B-A3B-PT"
:
{
"architectures"
:
[
"Ernie4_5_MoeForCausalLM"
],
"model_type"
:
"ernie4_5_moe"
,
"text_model_type"
:
"ernie4_5_moe"
,
"hidden_size"
:
2560
,
"total_num_hidden_layers"
:
28
,
"total_num_attention_heads"
:
20
,
"head_size"
:
128
,
"vocab_size"
:
103424
,
"total_num_kv_heads"
:
4
,
"num_experts"
:
64
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"lmsys/gpt-oss-20b-bf16"
:
{
"architectures"
:
[
"GptOssForCausalLM"
],
"model_type"
:
"gpt_oss"
,
"text_model_type"
:
"gpt_oss"
,
"hidden_size"
:
2880
,
"total_num_hidden_layers"
:
24
,
"total_num_attention_heads"
:
64
,
"head_size"
:
64
,
"vocab_size"
:
201088
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
32
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"deepseek-ai/DeepSeek-V3.2-Exp"
:
{
"architectures"
:
[
"DeepseekV32ForCausalLM"
],
"model_type"
:
"deepseek_v32"
,
"text_model_type"
:
"deepseek_v32"
,
"hidden_size"
:
7168
,
"total_num_hidden_layers"
:
61
,
"total_num_attention_heads"
:
128
,
"head_size"
:
576
,
"vocab_size"
:
129280
,
"total_num_kv_heads"
:
128
,
"num_experts"
:
256
,
"is_deepseek_mla"
:
true
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
:
{
"architectures"
:
[
"Llama4ForConditionalGeneration"
],
"model_type"
:
"llama4"
,
"text_model_type"
:
"llama4_text"
,
"hidden_size"
:
5120
,
"total_num_hidden_layers"
:
48
,
"total_num_attention_heads"
:
40
,
"head_size"
:
128
,
"vocab_size"
:
202048
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
16
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
true
,
"dtype"
:
"torch.bfloat16"
},
"nvidia/Llama-3_3-Nemotron-Super-49B-v1"
:
{
"architectures"
:
[
"DeciLMForCausalLM"
],
"model_type"
:
"nemotron-nas"
,
"text_model_type"
:
"nemotron-nas"
,
"hidden_size"
:
8192
,
"total_num_hidden_layers"
:
80
,
"total_num_attention_heads"
:
64
,
"head_size"
:
128
,
"vocab_size"
:
128256
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"XiaomiMiMo/MiMo-7B-RL"
:
{
"architectures"
:
[
"MiMoForCausalLM"
],
"model_type"
:
"mimo"
,
"text_model_type"
:
"mimo"
,
"hidden_size"
:
4096
,
"total_num_hidden_layers"
:
36
,
"total_num_attention_heads"
:
32
,
"head_size"
:
128
,
"vocab_size"
:
151680
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"meituan-longcat/LongCat-Flash-Chat"
:
{
"architectures"
:
[
"LongcatFlashForCausalLM"
],
"model_type"
:
"longcat_flash"
,
"text_model_type"
:
"longcat_flash"
,
"hidden_size"
:
6144
,
"total_num_hidden_layers"
:
28
,
"total_num_attention_heads"
:
64
,
"head_size"
:
576
,
"vocab_size"
:
131072
,
"total_num_kv_heads"
:
64
,
"num_experts"
:
512
,
"is_deepseek_mla"
:
true
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.float32"
}
}
tests/config/draft_model_arch_groundtruth.json
0 → 100644
View file @
7e63ef82
{
"abhigoyal/vllm-medusa-llama-68m-random"
:
{
"architectures"
:
[
"MedusaModel"
],
"model_type"
:
"medusa"
,
"text_model_type"
:
"medusa"
,
"hidden_size"
:
768
,
"total_num_hidden_layers"
:
1
,
"total_num_attention_heads"
:
0
,
"head_size"
:
"Error: integer division or modulo by zero"
,
"vocab_size"
:
32000
,
"total_num_kv_heads"
:
0
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.float32"
},
"luccafong/deepseek_mtp_draft_random"
:
{
"architectures"
:
[
"DeepSeekMTPModel"
],
"model_type"
:
"deepseek_mtp"
,
"text_model_type"
:
"deepseek_mtp"
,
"hidden_size"
:
2560
,
"total_num_hidden_layers"
:
1
,
"total_num_attention_heads"
:
32
,
"head_size"
:
576
,
"vocab_size"
:
129280
,
"total_num_kv_heads"
:
32
,
"num_experts"
:
72
,
"is_deepseek_mla"
:
true
,
"is_multimodal_model"
:
false
,
"dtype"
:
"torch.bfloat16"
},
"eagle618/eagle-deepseek-v3-random"
:
{
"architectures"
:
[
"EagleDeepSeekMTPModel"
],
"model_type"
:
"eagle"
,
"text_model_type"
:
"deepseek_mtp"
,
"hidden_size"
:
2560
,
"total_num_hidden_layers"
:
1
,
"total_num_attention_heads"
:
32
,
"head_size"
:
576
,
"vocab_size"
:
129280
,
"total_num_kv_heads"
:
32
,
"num_experts"
:
72
,
"is_deepseek_mla"
:
true
,
"is_multimodal_model"
:
false
,
"dtype"
:
"bfloat16"
},
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
:
{
"architectures"
:
[
"EagleLlamaForCausalLM"
],
"model_type"
:
"eagle"
,
"text_model_type"
:
"llama"
,
"hidden_size"
:
4096
,
"total_num_hidden_layers"
:
1
,
"total_num_attention_heads"
:
32
,
"head_size"
:
128
,
"vocab_size"
:
128256
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"float16"
},
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
:
{
"architectures"
:
[
"Eagle3LlamaForCausalLM"
],
"model_type"
:
"eagle"
,
"text_model_type"
:
"llama"
,
"hidden_size"
:
4096
,
"total_num_hidden_layers"
:
1
,
"total_num_attention_heads"
:
32
,
"head_size"
:
128
,
"vocab_size"
:
128256
,
"total_num_kv_heads"
:
8
,
"num_experts"
:
0
,
"is_deepseek_mla"
:
false
,
"is_multimodal_model"
:
false
,
"dtype"
:
"float16"
}
}
tests/config/test_model_arch_config.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ModelArchitectureConfig and its integration with ModelConfig."""
import
json
from
pathlib
import
Path
import
pytest
from
vllm.config
import
ModelConfig
,
ParallelConfig
,
SpeculativeConfig
from
vllm.transformers_utils.model_arch_config_convertor
import
(
ModelArchConfigConvertorBase
,
)
# Models that test_base_model_arch_config loads with trust_remote_code=True.
BASE_TRUST_REMOTE_CODE_MODELS = {
    "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    "XiaomiMiMo/MiMo-7B-RL",
    # Excluded: Not available online right now
    # "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1",
    "meituan-longcat/LongCat-Flash-Chat",
}

# Every entry must have a matching key in base_model_arch_groundtruth.json.
BASE_MODELS_TO_TEST = [
    "state-spaces/mamba-130m-hf",
    "mistralai/Mamba-Codestral-7B-v0.1",
    # Excluded: terratorch/torchgeo version mismatch in CPU CI environment
    # (NonGeoDataset import error). Tested in model initialization tests.
    # "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
    "Zyphra/Zamba2-7B-instruct",
    # FIXME: mosaicml/mpt-7b has been deleted
    # "mosaicml/mpt-7b",
    # FIXME: databricks/dbrx-instruct has been deleted
    # "databricks/dbrx-instruct",
    "tiiuae/falcon-7b",
    "tiiuae/falcon-40b",
    "luccafong/deepseek_mtp_main_random",
    "Qwen/Qwen3-Next-80B-A3B-Instruct",
    "tiny-random/qwen3-next-moe",
    "zai-org/GLM-4.5",
    "baidu/ERNIE-4.5-21B-A3B-PT",
    # Models using base convertor
    "lmsys/gpt-oss-20b-bf16",
    "deepseek-ai/DeepSeek-V3.2-Exp",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    # Iteration order of a set of strings changes between interpreter runs
    # (hash randomization). Sort so the parametrized test order and IDs are
    # reproducible, which parallel test collection also relies on.
] + sorted(BASE_TRUST_REMOTE_CODE_MODELS)

# (target_model, draft_model, trust_remote_code)
SPECULATIVE_MODELS = [
    ("JackFram/llama-68m", "abhigoyal/vllm-medusa-llama-68m-random", False),
    ("luccafong/deepseek_mtp_main_random", "luccafong/deepseek_mtp_draft_random", True),
    ("eagle618/deepseek-v3-random", "eagle618/eagle-deepseek-v3-random", True),
    ("meta-llama/Meta-Llama-3-8B-Instruct", "yuhuili/EAGLE-LLaMA3-Instruct-8B", True),
    ("meta-llama/Llama-3.1-8B-Instruct", "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", True),
]
def _load_groundtruth(filename: str) -> dict:
    """Load groundtruth JSON from the test directory.

    Args:
        filename: Name of a JSON file located next to this test module.

    Returns:
        The decoded JSON document.
    """
    groundtruth_path = Path(__file__).parent / filename
    # Decode as UTF-8 explicitly so the result does not depend on the
    # locale's default text encoding.
    with open(groundtruth_path, encoding="utf-8") as f:
        return json.load(f)
def _assert_model_arch_config(
    model_config, expected: dict, check_head_size: bool = True
):
    """Compare ``model_config.model_arch_config`` with a groundtruth entry.

    Args:
        model_config: Config object exposing ``model_arch_config``,
            ``hf_config``, ``model`` and ``revision``.
        expected: Groundtruth values keyed by field name.
        check_head_size: When False, the ``head_size`` comparison is skipped.
    """
    arch_config = model_config.model_arch_config

    # Attributes that are compared verbatim with the groundtruth key of the
    # same name, in the order they are checked.
    directly_compared = (
        "architectures",
        "model_type",
        "text_model_type",
        "hidden_size",
        "total_num_hidden_layers",
        "total_num_attention_heads",
        "vocab_size",
        "total_num_kv_heads",
        "num_experts",
        "is_deepseek_mla",
    )
    for field_name in directly_compared:
        assert getattr(arch_config, field_name) == expected[field_name]

    # The dtype is resolved through the convertor and compared by its string form.
    resolved_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
        model_config.hf_config,
        model_config.model,
        revision=model_config.revision,
    )
    assert str(resolved_dtype) == expected["dtype"]

    if not check_head_size:
        return
    assert arch_config.head_size == expected["head_size"]
def _assert_model_config_methods(
    model_config, expected: dict, check_head_size: bool = True
):
    """Check that the ``model_config`` accessors agree with the groundtruth.

    Args:
        model_config: Config object exposing ``architectures`` and the
            ``get_*`` accessor methods called below.
        expected: Groundtruth values keyed by field name.
        check_head_size: When False, ``get_head_size()`` is not called.
    """
    assert model_config.architectures == expected["architectures"]

    # (accessor method, groundtruth key) pairs, in the order they are checked.
    accessor_to_key = (
        ("get_vocab_size", "vocab_size"),
        ("get_hidden_size", "hidden_size"),
        ("get_total_num_kv_heads", "total_num_kv_heads"),
        ("get_num_experts", "num_experts"),
        ("get_total_num_hidden_layers", "total_num_hidden_layers"),
    )
    for accessor, key in accessor_to_key:
        assert getattr(model_config, accessor)() == expected[key]

    if not check_head_size:
        return
    assert model_config.get_head_size() == expected["head_size"]
@pytest.mark.parametrize("model", BASE_MODELS_TO_TEST)
def test_base_model_arch_config(model: str):
    """Check a base model's architecture config against its groundtruth entry.

    The model is loaded with ``trust_remote_code=True`` only when it is listed
    in ``BASE_TRUST_REMOTE_CODE_MODELS``.
    """
    expected = _load_groundtruth("base_model_arch_groundtruth.json")[model]

    needs_remote_code = model in BASE_TRUST_REMOTE_CODE_MODELS
    model_config = ModelConfig(model, trust_remote_code=needs_remote_code)

    # Check the raw arch config first, then the ModelConfig accessors.
    for check in (_assert_model_arch_config, _assert_model_config_methods):
        check(model_config, expected)
@pytest.mark.parametrize(
    "target_model,draft_model,trust_remote_code", SPECULATIVE_MODELS
)
def test_draft_model_arch_config(
    target_model: str, draft_model: str, trust_remote_code: bool
):
    """Check a draft model's architecture config against its groundtruth entry.

    The draft config is obtained from a ``SpeculativeConfig`` built on top of
    the target model's ``ModelConfig``.
    """
    expected = _load_groundtruth("draft_model_arch_groundtruth.json")[draft_model]

    target_model_config = ModelConfig(
        target_model, trust_remote_code=trust_remote_code
    )
    draft_model_config = SpeculativeConfig(
        model=draft_model,
        num_speculative_tokens=1,
        target_model_config=target_model_config,
        target_parallel_config=ParallelConfig(),
    ).draft_model_config

    # For medusa models, head_size may cause division by zero before
    # model_arch_config was introduced; the groundtruth then holds an error
    # string instead of an int, and the head_size comparison is skipped.
    head_size_is_int = isinstance(expected["head_size"], int)

    for check in (_assert_model_arch_config, _assert_model_config_methods):
        check(draft_model_config, expected, check_head_size=head_size_is_int)
tests/config/test_multimodal_config.py
View file @
7e63ef82
...
...
@@ -3,8 +3,8 @@
import
pytest
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.config.multimodal
import
MultiModalConfig
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
def
test_mm_encoder_attn_backend_str_conversion
():
...
...
tests/conftest.py
View file @
7e63ef82
...
...
@@ -47,7 +47,11 @@ from transformers import (
)
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
tests.models.utils
import
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
from
tests.models.utils
import
(
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
,
softmax
,
)
from
vllm
import
LLM
,
SamplingParams
,
envs
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
...
...
@@ -189,6 +193,17 @@ def dist_init():
cleanup_dist_env_and_memory
()
@pytest.fixture
def default_vllm_config():
    """Run the test body with a default-constructed VllmConfig set as current.

    Intended for tests that exercise CustomOps, or other code paths that call
    get_current_vllm_config(), without building a full engine.
    """
    from vllm.config import VllmConfig, set_current_vllm_config

    default_config = VllmConfig()
    with set_current_vllm_config(default_config):
        yield
@
pytest
.
fixture
()
def
should_do_global_cleanup_after_test
(
request
)
->
bool
:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
...
...
@@ -414,7 +429,7 @@ class HfRunner:
# don't put this import at the top level
# it will call torch.cuda.device_count()
from
transformers
import
AutoProcessor
# noqa: F401
from
transformers
import
AutoProcessor
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
...
...
@@ -517,7 +532,7 @@ class HfRunner:
elif
problem_type
==
"multi_label_classification"
:
logits
=
output
.
logits
.
sigmoid
()[
0
].
tolist
()
else
:
logits
=
output
.
logits
.
softmax
(
dim
=-
1
)[
0
].
tolist
()
logits
=
softmax
(
output
.
logits
)[
0
].
tolist
()
outputs
.
append
(
logits
)
return
outputs
...
...
@@ -685,6 +700,7 @@ class HfRunner:
images
:
PromptImageInput
|
None
=
None
,
audios
:
PromptAudioInput
|
None
=
None
,
videos
:
PromptVideoInput
|
None
=
None
,
use_cache
:
bool
=
True
,
**
kwargs
:
Any
,
)
->
list
[
TokensTextLogprobs
]:
all_inputs
=
self
.
get_inputs
(
...
...
@@ -698,7 +714,7 @@ class HfRunner:
for
inputs
in
all_inputs
:
output
:
"GenerateOutput"
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
use_cache
=
Tru
e
,
use_cache
=
use_cach
e
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
output_hidden_states
=
True
,
...
...
tests/distributed/test_context_parallel.py
View file @
7e63ef82
...
...
@@ -219,14 +219,12 @@ def _test_cp_gsm8k(
]
)
server_env
=
{}
if
attn_backend
:
server_
env
[
"VLLM_ATTENTION_BACKEND"
]
=
attn_backend
server_
args
.
append
(
f
"--attention-backend=
{
attn_backend
}
"
)
with
RemoteOpenAIServer
(
model_id
,
server_args
,
env_dict
=
server_env
,
max_wait_seconds
=
720
,
)
as
remote_server
:
host
=
f
"http://
{
remote_server
.
host
}
"
...
...
tests/distributed/test_eplb_algo.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
numpy
as
np
import
pytest
import
torch
...
...
@@ -310,3 +311,143 @@ if __name__ == "__main__":
print
(
phy2log
)
test_basic_rebalance
()
def _make_phy_replicas_idx_from_phy2log(phy2log: np.ndarray) -> np.ndarray:
    """Build the replica-index mapping that corresponds to ``phy2log``.

    For each row, the k-th occurrence (0-based, left to right) of a logical
    expert id is assigned replica index k.

    Args:
        phy2log: 2-D array of logical expert ids, one row per layer.

    Returns:
        An int64 array with the same shape as ``phy2log``.
    """
    replica_idx = np.zeros_like(phy2log, dtype=np.int64)
    for layer, experts_in_layer in enumerate(phy2log.tolist()):
        # Number of times each expert has been seen so far in this row.
        occurrences: dict[int, int] = {}
        for slot, expert in enumerate(experts_in_layer):
            replica = occurrences.get(expert, 0)
            replica_idx[layer, slot] = replica
            occurrences[expert] = replica + 1
    return replica_idx
def _validate_intragpu_rearrangement(
    old_global_expert_indices: np.ndarray,
    new_phy2log: np.ndarray,
    new_phy_replicas_idx: np.ndarray,
    post_phy2log: np.ndarray,
    post_phy_replicas_idx: np.ndarray,
    num_ranks: int,
    slots_per_gpu: int,
):
    """Assert that the post-processed mapping is a valid per-GPU rearrangement.

    Only row 0 of each array is inspected. For every GPU, i.e. every
    consecutive window of ``slots_per_gpu`` slots:

    * the multiset of (expert, replica) pairs after post-processing equals
      the one in the new mapping;
    * every expert present in both the old and new window sits, after
      post-processing, at the first slot it occupied in the old window, and
      the replica index there is one the new mapping assigned to it.

    Raises:
        AssertionError: If any of the conditions above is violated.
    """
    for gpu_idx in range(num_ranks):
        window = slice(gpu_idx * slots_per_gpu, (gpu_idx + 1) * slots_per_gpu)

        old_experts = old_global_expert_indices[0, window].tolist()
        new_experts = new_phy2log[0, window].tolist()
        new_replicas = new_phy_replicas_idx[0, window].tolist()
        post_experts = post_phy2log[0, window].tolist()
        post_replicas = post_phy_replicas_idx[0, window].tolist()

        # Pairwise equality for (expert, rank) pairs to ensure nothing is lost
        post_pairs = sorted(zip(post_experts, post_replicas))
        new_pairs = sorted(zip(new_experts, new_replicas))
        assert post_pairs == new_pairs, (
            f"Per-GPU pairs of (expert,rank) must match new mapping for GPU {gpu_idx}"
        )

        # Replica indices the new mapping assigns to each expert on this GPU.
        replicas_by_expert: dict[int, list[int]] = {}
        for new_expert, new_replica in zip(new_experts, new_replicas):
            replicas_by_expert.setdefault(new_expert, []).append(new_replica)

        # For experts that remain on the same GPU, the old slot is preserved
        # for at least one occurrence; rank at that slot must be valid for
        # that expert.
        for expert in set(old_experts) & set(new_experts):
            old_pos = old_experts.index(expert)
            assert post_experts[old_pos] == expert, (
                f"Expert {expert} on GPU {gpu_idx} should stay at old slot {old_pos}"
            )
            assert post_replicas[old_pos] in replicas_by_expert[expert], (
                f"Rank for expert {expert} at preserved slot on GPU {gpu_idx} "
                "must come from new mapping"
            )
@pytest.mark.parametrize(
    "num_ranks, slots_per_gpu, old_phy2log, new_phy2log",
    [
        pytest.param(
            # Setup: 2 GPUs, 4 slots each, 1 layer
            # Old mapping: GPU0 -> [0,1,2,3], GPU1 -> [4,5,6,7]
            # New mapping shuffles within GPU0 and brings 4,5 into GPU0.
            # GPU0 new -> [1,5,0,4]; GPU1 new -> [6,2,7,3]
            2,
            4,
            np.array([[0, 1, 2, 3, 4, 5, 6, 7]]),
            np.array([[1, 5, 0, 4, 6, 2, 7, 3]]),
            id="simple",
        ),
        pytest.param(
            # Setup: 2 GPUs, 5 slots each (total 10 physical experts), 1 layer
            # Old mapping:
            #   GPU0 -> [0, 1, 0, 2, 3]  (expert 0 duplicated)
            #   GPU1 -> [4, 5, 6, 1, 2]
            # New mapping reorders within GPUs and moves some experts across GPUs,
            # while still including duplicates:
            #   GPU0 new -> [0, 5, 4, 0, 1]  (expert 0 duplicated, 4/5 incoming)
            #   GPU1 new -> [6, 2, 3, 2, 1]  (expert 2 duplicated)
            2,
            5,
            np.array([[0, 1, 0, 2, 3, 4, 5, 6, 1, 2]]),
            np.array([[0, 5, 4, 0, 1, 6, 2, 3, 2, 1]]),
            id="duplicates",
        ),
        pytest.param(
            # Setup: 3 GPUs, 4 slots each (total 12 physical experts), 1 layer
            # Old mapping:
            #   GPU0 -> [0, 1, 2, 3]
            #   GPU1 -> [0, 1, 2, 3]
            #   GPU2 -> [0, 1, 2, 3]
            # New mapping decides to use one expert on 2 GPUs and shuffles
            # experts on the third GPU,
            #   GPU0 new -> [0, 0, 0, 0]
            #   GPU1 new -> [0, 0, 0, 0]
            #   GPU2 new -> [1, 2, 3, 0]
            3,
            4,
            np.array([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]]),
            np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0]]),
            id="skewed_expert",
        ),
    ],
)
def test_preserve_intragpu_slots(
    num_ranks: int,
    slots_per_gpu: int,
    # The parametrized mappings above are NumPy arrays, not torch tensors.
    old_phy2log: np.ndarray,
    new_phy2log: np.ndarray,
):
    """Experts that stay on a GPU keep their old slots; incoming not lost.

    Args:
        num_ranks: Number of GPUs the physical slots are split across.
        slots_per_gpu: Number of consecutive physical slots owned by each GPU.
        old_phy2log: Physical-to-logical expert mapping before rebalancing.
        new_phy2log: Physical-to-logical expert mapping proposed by rebalancing.
    """
    phy_replicas_idx = _make_phy_replicas_idx_from_phy2log(new_phy2log)

    post_phy2log, post_phy_replicas_idx = DefaultEplbPolicy.preserve_intragpu_slots(
        new_phy2log, phy_replicas_idx, num_ranks, old_phy2log
    )

    # Shapes preserved
    assert post_phy2log.shape == new_phy2log.shape
    assert post_phy_replicas_idx.shape == phy_replicas_idx.shape

    _validate_intragpu_rearrangement(
        old_phy2log,
        new_phy2log,
        phy_replicas_idx,
        post_phy2log,
        post_phy_replicas_idx,
        num_ranks,
        slots_per_gpu,
    )
tests/distributed/test_eplb_execute.py
View file @
7e63ef82
...
...
@@ -286,15 +286,17 @@ def _test_async_transfer_layer_without_mtp_worker(
device
,
old_indices
,
)
old_indices_cpu
=
old_indices
.
cpu
()
new_indices_cpu
=
new_indices
.
cpu
()
expert_buffer
=
[
torch
.
empty_like
(
w
)
for
w
in
expert_weights
[
0
]]
cuda_stream
=
torch
.
cuda
.
Stream
(
device
=
device
)
for
layer_idx
in
range
(
num_layers
):
is_unchanged
,
is_received_locally
,
experts_recv_loc
=
asyncio
.
run
(
is_unchanged
,
is_received_locally
,
recv_metadata
=
asyncio
.
run
(
transfer_layer
(
old_global_expert_indices
=
old_indices
,
new_global_expert_indices
=
new_indices
,
old_global_expert_indices
=
old_indices
_cpu
,
new_global_expert_indices
=
new_indices
_cpu
,
expert_weights
=
expert_weights
,
expert_weights_buffer
=
expert_buffer
,
ep_group
=
ep_group
,
...
...
@@ -302,16 +304,15 @@ def _test_async_transfer_layer_without_mtp_worker(
cuda_stream
=
cuda_stream
,
)
)
cuda_stream
.
synchronize
()
move_from_buffer
(
expert_weights
=
expert_weights
[
layer_idx
],
expert_weights_buffer
=
expert_buffer
,
expert_weights_buffer
s
=
expert_buffer
,
is_unchanged
=
is_unchanged
,
is_received_locally
=
is_received_locally
,
experts_recv_loc
=
experts_recv_loc
,
new_indices
=
new_indices
[
layer_idx
].
tolist
(),
ep_
group
=
ep_group
,
recv_metadata
=
recv_metadata
,
new_indices
=
new_indices
_cpu
[
layer_idx
].
numpy
(),
ep_
rank
=
ep_rank
,
)
verify_expert_weights_after_shuffle
(
...
...
tests/distributed/test_pp_cudagraph.py
View file @
7e63ef82
...
...
@@ -21,23 +21,21 @@ from ..utils import compare_two_settings, create_new_process_for_each_test, mode
)
@
create_new_process_for_each_test
()
def
test_pp_cudagraph
(
monkeypatch
:
pytest
.
MonkeyPatch
,
PP_SIZE
:
int
,
MODEL_NAME
:
str
,
ATTN_BACKEND
:
LiteralString
,
):
with
monkeypatch
.
context
()
as
m
:
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
]
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
ATTN_BACKEND
)
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
f
"--attention-backend=
{
ATTN_BACKEND
}
"
,
]
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
tests/engine/test_arg_utils.py
View file @
7e63ef82
...
...
@@ -9,7 +9,7 @@ from typing import Annotated, Literal
import
pytest
from
vllm.config
import
CompilationConfig
,
config
from
vllm.config
import
AttentionConfig
,
CompilationConfig
,
config
from
vllm.engine.arg_utils
import
(
EngineArgs
,
contains_type
,
...
...
@@ -298,6 +298,139 @@ def test_compilation_config():
)
def test_attention_config():
    """Exercise the CLI routes for configuring attention on ``EngineArgs``.

    Covers the default config, ``--attention-config.<field>`` dot notation,
    the ``--attention-backend`` shorthand, the JSON string form of
    ``--attention-config``, propagation into ``VllmConfig.attention_config``
    and the error raised when both backend options are given.
    """
    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    def engine_args_for(argv):
        # Parse argv with the shared parser and convert it to EngineArgs.
        namespace = parser.parse_args(argv)
        assert namespace is not None
        return EngineArgs.from_cli_args(namespace)

    def check_all_fields(attn_cfg, backend_name, fa_version, max_splits, flag):
        # Every AttentionConfig field set below; booleans are compared by identity.
        assert attn_cfg.backend is not None
        assert attn_cfg.backend.name == backend_name
        assert attn_cfg.flash_attn_version == fa_version
        assert attn_cfg.use_prefill_decode_attention is flag
        assert attn_cfg.flash_attn_max_num_splits_for_cuda_graph == max_splits
        for bool_field in (
            "use_cudnn_prefill",
            "use_trtllm_ragged_deepseek_prefill",
            "use_trtllm_attention",
            "disable_flashinfer_prefill",
            "disable_flashinfer_q_quantization",
        ):
            assert getattr(attn_cfg, bool_field) is flag

    # default value
    assert engine_args_for([]).attention_config == AttentionConfig()

    # set backend via dot notation
    dotted_backend = engine_args_for(["--attention-config.backend", "FLASH_ATTN"])
    assert dotted_backend.attention_config.backend is not None
    assert dotted_backend.attention_config.backend.name == "FLASH_ATTN"

    # set backend via --attention-backend shorthand
    shorthand = engine_args_for(["--attention-backend", "FLASHINFER"])
    assert shorthand.attention_backend is not None
    assert shorthand.attention_backend == "FLASHINFER"

    # set all fields via dot notation
    dotted_fields = (
        ("backend", "FLASH_ATTN"),
        ("flash_attn_version", "3"),
        ("use_prefill_decode_attention", "true"),
        ("flash_attn_max_num_splits_for_cuda_graph", "16"),
        ("use_cudnn_prefill", "true"),
        ("use_trtllm_ragged_deepseek_prefill", "true"),
        ("use_trtllm_attention", "true"),
        ("disable_flashinfer_prefill", "true"),
        ("disable_flashinfer_q_quantization", "true"),
    )
    dotted_argv = []
    for field_name, raw_value in dotted_fields:
        dotted_argv += [f"--attention-config.{field_name}", raw_value]
    check_all_fields(
        engine_args_for(dotted_argv).attention_config, "FLASH_ATTN", 3, 16, True
    )

    # set to string form of a dict with all fields
    json_argv = [
        "--attention-config="
        '{"backend": "FLASHINFER", "flash_attn_version": 2, '
        '"use_prefill_decode_attention": false, '
        '"flash_attn_max_num_splits_for_cuda_graph": 8, '
        '"use_cudnn_prefill": false, '
        '"use_trtllm_ragged_deepseek_prefill": false, '
        '"use_trtllm_attention": false, '
        '"disable_flashinfer_prefill": false, '
        '"disable_flashinfer_q_quantization": false}',
    ]
    check_all_fields(
        engine_args_for(json_argv).attention_config, "FLASHINFER", 2, 8, False
    )

    # test that --attention-backend and --attention-config.backend each flow
    # into VllmConfig.attention_config
    propagation_cases = (
        ("--attention-backend", "FLASH_ATTN", AttentionBackendEnum.FLASH_ATTN),
        ("--attention-config.backend", "FLASHINFER", AttentionBackendEnum.FLASHINFER),
    )
    for option, backend_name, expected_backend in propagation_cases:
        vllm_config = engine_args_for(
            ["--model", "facebook/opt-125m", option, backend_name]
        ).create_engine_config()
        assert vllm_config.attention_config.backend == expected_backend

    # test --attention-backend and --attention-config.backend are mutually exclusive
    conflicting = engine_args_for(
        [
            "--model",
            "facebook/opt-125m",
            "--attention-backend",
            "FLASH_ATTN",
            "--attention-config.backend",
            "FLASHINFER",
        ]
    )
    with pytest.raises(ValueError, match="mutually exclusive"):
        conflicting.create_engine_config()
def
test_prefix_cache_default
():
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
args
=
parser
.
parse_args
([])
...
...
@@ -378,6 +511,16 @@ def test_human_readable_model_len():
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.2123451234567t"
])
assert
args
.
max_model_len
==
10212345123456
# Special value -1 for auto-fit to GPU memory
args
=
parser
.
parse_args
([
"--max-model-len"
,
"-1"
])
assert
args
.
max_model_len
==
-
1
# 'auto' is an alias for -1
args
=
parser
.
parse_args
([
"--max-model-len"
,
"auto"
])
assert
args
.
max_model_len
==
-
1
args
=
parser
.
parse_args
([
"--max-model-len"
,
"AUTO"
])
assert
args
.
max_model_len
==
-
1
# Invalid (do not allow decimals with binary multipliers)
for
invalid
in
[
"1a"
,
"pwd"
,
"10.24"
,
"1.23M"
,
"1.22T"
]:
with
pytest
.
raises
(
ArgumentError
):
...
...
tests/
tpu/lora
/__init__.py
→
tests/
entrypoints/instrumentator
/__init__.py
View file @
7e63ef82
File moved
tests/entrypoints/
openai
/test_metrics.py
→
tests/entrypoints/
instrumentator
/test_metrics.py
View file @
7e63ef82
...
...
@@ -15,9 +15,9 @@ import requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
transformers
import
AutoTokenizer
from
tests.conftest
import
LocalAssetServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm
import
version
from
...conftest
import
LocalAssetServer
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODELS
=
{
...
...
tests/entrypoints/openai/conftest.py
View file @
7e63ef82
...
...
@@ -5,6 +5,30 @@ import pytest
from
vllm.assets.audio
import
AudioAsset
def add_attention_backend(server_args, attention_config):
    """Add the ``--attention-backend`` CLI flag to ``server_args`` when requested.

    Args:
        server_args: List of server arguments; extended in-place.
        attention_config: Dict that may carry a 'backend' key, or None.
            Nothing is appended when it is falsy or lacks 'backend'.
    """
    # Guard clause: no config at all, or a config without a backend entry.
    if not attention_config or "backend" not in attention_config:
        return

    backend_name = attention_config["backend"]
    server_args.extend(["--attention-backend", backend_name])
@pytest.fixture(scope="module")
def rocm_aiter_fa_attention():
    """Provide the attention config used by transcription/translation tests.

    On ROCm, audio tests require the ROCM_AITER_FA attention backend, so a
    dict selecting it is returned there; on every other platform the fixture
    yields None.
    """
    # Imported lazily inside the fixture, as in the original.
    from vllm.platforms import current_platform

    return {"backend": "ROCM_AITER_FA"} if current_platform.is_rocm() else None
@
pytest
.
fixture
def
mary_had_lamb
():
path
=
AudioAsset
(
"mary_had_lamb"
).
get_local_path
()
...
...
Prev
1
…
11
12
13
14
15
16
17
18
19
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment