Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1406 additions
and
113 deletions
+1406
-113
tests/compile/passes/test_rope_kvcache_fusion.py
tests/compile/passes/test_rope_kvcache_fusion.py
+334
-0
tests/compile/passes/test_scatter_split_replace.py
tests/compile/passes/test_scatter_split_replace.py
+107
-0
tests/compile/passes/test_silu_mul_quant_fusion.py
tests/compile/passes/test_silu_mul_quant_fusion.py
+27
-15
tests/compile/test_aot_compile.py
tests/compile/test_aot_compile.py
+177
-0
tests/compile/test_cold_start.py
tests/compile/test_cold_start.py
+0
-48
tests/compile/test_compile_ranges.py
tests/compile/test_compile_ranges.py
+89
-6
tests/compile/test_config.py
tests/compile/test_config.py
+43
-0
tests/compile/test_decorator.py
tests/compile/test_decorator.py
+1
-1
tests/compile/test_dynamic_shapes_compilation.py
tests/compile/test_dynamic_shapes_compilation.py
+2
-2
tests/compile/test_graph_partition.py
tests/compile/test_graph_partition.py
+144
-1
tests/compile/test_sequence_parallelism_threshold.py
tests/compile/test_sequence_parallelism_threshold.py
+110
-0
tests/compile/test_startup.py
tests/compile/test_startup.py
+71
-0
tests/compile/test_structured_logging.py
tests/compile/test_structured_logging.py
+3
-3
tests/compile/test_wrapper.py
tests/compile/test_wrapper.py
+1
-1
tests/config/test_config_generation.py
tests/config/test_config_generation.py
+31
-0
tests/config/test_multimodal_config.py
tests/config/test_multimodal_config.py
+18
-0
tests/conftest.py
tests/conftest.py
+60
-34
tests/cuda/scripts/check_device_count_respects_env.py
tests/cuda/scripts/check_device_count_respects_env.py
+1
-1
tests/cuda/test_cuda_compatibility_path.py
tests/cuda/test_cuda_compatibility_path.py
+187
-0
tests/detokenizer/test_disable_detokenization.py
tests/detokenizer/test_disable_detokenization.py
+0
-1
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
tests/compile/passes/test_rope_kvcache_fusion.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
import
vllm.config
from
tests.compile.backend
import
TestBackend
from
tests.v1.attention.utils
import
BatchSpec
,
create_common_attn_metadata
from
vllm._aiter_ops
import
is_aiter_found_and_supported
,
rocm_aiter_ops
from
vllm.compilation.passes.fusion.matcher_utils
import
ROTARY_OP
from
vllm.compilation.passes.fusion.rope_kvcache_fusion
import
RopeKVCacheFusionPass
from
vllm.compilation.passes.utility.noop_elimination
import
NoOpEliminationPass
from
vllm.compilation.passes.utility.post_cleanup
import
PostCleanupPass
from
vllm.compilation.passes.utility.scatter_split_replace
import
(
ScatterSplitReplacementPass
,
)
from
vllm.compilation.passes.utility.split_coalescing
import
SplitCoalescingPass
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
CompilationMode
,
ModelConfig
,
PassConfig
,
VllmConfig
,
)
from
vllm.forward_context
import
get_forward_context
,
set_forward_context
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.platforms
import
current_platform
from
vllm.v1.attention.backend
import
(
AttentionBackend
,
CommonAttentionMetadata
,
)
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.kv_cache_interface
import
AttentionSpec
# ATen overload that QKRoPEKVCacheTestModel.ops_in_model_before() expects to
# see when the RoPE custom op is disabled.
INDEX_SELECT_OP = torch.ops.aten.index.Tensor
# Handle to the vLLM KV-cache update op (the op packet, not a specific overload).
VLLM_UNIFIED_KV_CACHE_UPDATE_OP = torch.ops.vllm.unified_kv_cache_update
# FP8 dtype reported by the current platform; used as the KV-cache dtype when
# the configured cache dtype string starts with "fp8".
FP8_DTYPE = current_platform.fp8_dtype()
class QKRoPEKVCacheTestModel(torch.nn.Module):
    """Minimal module that applies RoPE to q/k and then updates the KV cache.

    The forward pass only splits a packed qkv tensor, runs the rotary
    embedding, reshapes q/k/v per head and calls
    ``torch.ops.vllm.unified_kv_cache_update``; no attention is computed.
    An ``Attention`` layer is still constructed so that layer metadata is
    registered under ``prefix`` and a metadata builder is available.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        attn_backend: AttentionBackendEnum,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        is_neox: bool,
        dtype: torch.dtype,
        device: torch.device,
        prefix: str = "model.layers.0.self_attn.attn",
    ):
        """Build the rotary embedding, attention layer and metadata builder.

        Args:
            vllm_config: Supplies ``cache_config`` (block size, cache dtype)
                and ``quant_config``; also passed to the metadata builder.
            attn_backend: Backend enum whose class is handed to ``Attention``.
            num_heads: Number of query heads.
            num_kv_heads: Number of key/value heads.
            head_size: Size of each head; also used as the rotary dimension.
            is_neox: Forwarded to ``RotaryEmbedding`` as ``is_neox_style``.
            dtype: Model dtype; also the KV-cache dtype unless the configured
                cache dtype string starts with ``"fp8"``.
            device: Device for the attention scales, KV cache and builder.
            prefix: Layer name under which the attention layer is registered.

        Raises:
            AssertionError: If the resolved backend reports
                ``forward_includes_kv_cache_update``.
        """
        super().__init__()
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_size = head_size
        self.block_size = vllm_config.cache_config.block_size
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size
        self.is_neox = is_neox
        self.dtype = dtype
        self.device = device
        self.layer_name = prefix

        self.rotary_emb = RotaryEmbedding(
            head_size,
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=is_neox,
            dtype=self.dtype,
        )

        # Whether to check for the RoPE custom op or component index_select
        self.enable_rope_custom_op = self.rotary_emb.enabled()

        # Register layer metadata for the fusion pass via Attention.
        self.attn = Attention(
            num_heads=num_heads,
            head_size=head_size,
            scale=1.0 / head_size**0.5,
            num_kv_heads=num_kv_heads,
            cache_config=vllm_config.cache_config,
            quant_config=vllm_config.quant_config,
            prefix=prefix,
            attn_backend=attn_backend.get_class(),
        )
        self.attn_backend: type[AttentionBackend] = self.attn.get_attn_backend()
        assert not self.attn_backend.forward_includes_kv_cache_update, (
            f"Attention backend {self.attn_backend} does not support fuse_rope_kvcache."
        )
        self.attn._k_scale = self.attn._k_scale.to(device)
        self.attn._v_scale = self.attn._v_scale.to(device)

        kv_cache_dtype_str = vllm_config.cache_config.cache_dtype
        self.kv_cache_dtype = (
            FP8_DTYPE if kv_cache_dtype_str.startswith("fp8") else self.dtype
        )

        # Initialize attn MetadataBuilder
        self.builder = self.attn.attn_backend.get_builder_cls()(
            kv_cache_spec=AttentionSpec(
                block_size=self.block_size,
                num_kv_heads=self.num_kv_heads,
                head_size=head_size,
                dtype=self.kv_cache_dtype,
            ),
            layer_names=[self.attn.layer_name],
            vllm_config=vllm_config,
            device=device,
        )

    def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
        """Build attention metadata and attach a fresh zero-filled KV cache.

        The batch consists of ``batch_size`` sequences, each with sequence
        length 1 and query length 1. As a side effect ``self.attn.kv_cache``
        is replaced by a single-element list holding the new cache tensor.

        Args:
            batch_size: Number of single-token sequences in the batch.

        Returns:
            Whatever ``self.builder.build`` returns for this batch.
            NOTE(review): the annotation says ``CommonAttentionMetadata`` but
            the value comes from the backend's builder — confirm the type.
        """
        # Create common attn metadata
        batch_spec = BatchSpec(seq_lens=[1] * batch_size, query_lens=[1] * batch_size)
        common_attn_metadata = create_common_attn_metadata(
            batch_spec, self.block_size, self.device, arange_block_indices=True
        )

        # Ceil-divide the longest sequence by the block size.
        max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
        num_blocks = batch_size * max_blocks

        # Fetch the attention backend and kv cache shape and stride order
        attn_backend = self.attn.attn_backend
        kv_cache_shape = attn_backend.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_kv_heads, self.head_size
        )
        try:
            kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
        except (AttributeError, NotImplementedError):
            # Backend does not define a stride order: fall back to identity.
            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))

        # Physical (memory) shape, plus the permutation that maps it back to
        # the logical dimension order reported by the backend.
        kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
        inv_order = [
            kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order))
        ]

        # Create dummy KV cache. Allocate straight from the backend-provided
        # shape instead of a hand-computed element count, so the buffer always
        # matches whatever layout the backend reports.
        raw_tensor = torch.zeros(
            kv_cache_shape,
            dtype=self.kv_cache_dtype,
            device=self.device,
        )
        kv_cache = raw_tensor.permute(*inv_order)

        self.attn.kv_cache = [kv_cache]

        # Build attn metadata
        attn_metadata = self.builder.build(
            common_prefix_len=0, common_attn_metadata=common_attn_metadata
        )
        return attn_metadata

    def forward(
        self, qkv: torch.Tensor, positions: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Apply RoPE to q/k and issue the KV-cache update op.

        Args:
            qkv: Packed tensor whose last dimension is split into
                ``[q_size, kv_size, kv_size]``.
            positions: Positions passed to the rotary embedding.

        Returns:
            ``(q, k, v, kv_cache_dummy_dep)`` where q/k/v are viewed as
            ``(-1, heads, head_size)`` and the last item is the return value
            of ``unified_kv_cache_update``.
        """
        # Create copy so inplace ops do not modify the original tensors
        qkv = qkv.clone()
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)

        # Instead of a full forward pass, match only the KV cache update op here
        q = q.view(-1, self.num_heads, self.head_size)
        k = k.view(-1, self.num_kv_heads, self.head_size)
        v = v.view(-1, self.num_kv_heads, self.head_size)
        kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
            k, v, self.layer_name
        )
        return q, k, v, kv_cache_dummy_dep

    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Return the ops expected in the graph before the fusion pass."""
        ops = []
        if self.enable_rope_custom_op:
            if rocm_aiter_ops.is_triton_rotary_embed_enabled():
                ops.append(torch.ops.vllm.rocm_aiter_triton_rotary_embedding.default)
            else:
                ops.append(ROTARY_OP)
        else:
            ops.append(INDEX_SELECT_OP)
        ops.append(torch.ops.vllm.unified_kv_cache_update.default)
        return ops

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Return the ops expected in the graph after the fusion pass."""
        return [torch.ops.vllm.fused_rope_and_unified_kv_cache_update.default]
@pytest.mark.parametrize(
    "attn_backend",
    [
        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
        AttentionBackendEnum.TRITON_ATTN,
        AttentionBackendEnum.ROCM_ATTN,
        AttentionBackendEnum.ROCM_AITER_FA,
    ],
)
@pytest.mark.parametrize("enable_rope_custom_op", [True])  # [True, False])
@pytest.mark.parametrize("enable_aiter_triton_rope", [True, False])
@pytest.mark.parametrize("num_heads", [64])
@pytest.mark.parametrize("num_kv_heads", [8])
@pytest.mark.parametrize("head_size", [64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("is_neox", [True, False])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.skipif(
    not is_aiter_found_and_supported(),
    reason="Only test on ROCm with AITER installed and supported",
)
def test_rope_kvcache_fusion(
    attn_backend: AttentionBackendEnum,
    enable_rope_custom_op: bool,
    enable_aiter_triton_rope: bool,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    block_size: int,
    is_neox: bool,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    monkeypatch: pytest.MonkeyPatch,
):
    """Check that RoPE + KV-cache update is fused and matches the eager result.

    The model is first run eagerly, then compiled with a TestBackend that
    applies the fusion pass. The test asserts that exactly one pattern was
    matched, that the expected ops appear before/after the pass, and that
    q, k, v and the KV cache agree between the two runs.
    """
    # Global torch defaults; note they are not restored by this test.
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    custom_ops: list[str] = []
    if enable_rope_custom_op:
        custom_ops.append("+rotary_embedding")

    vllm_config = VllmConfig(
        model_config=ModelConfig(dtype=dtype),
        cache_config=CacheConfig(
            block_size=block_size,
            cache_dtype=kv_cache_dtype,
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=custom_ops,
            pass_config=PassConfig(
                fuse_rope_kvcache=True,
                eliminate_noops=True,
            ),
        ),
    )

    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
        # The env vars must be set (and re-read) before the model is built.
        m.setenv("VLLM_ROCM_USE_AITER", "1")
        m.setenv(
            "VLLM_ROCM_USE_AITER_TRITON_ROPE", "1" if enable_aiter_triton_rope else "0"
        )
        rocm_aiter_ops.refresh_env_variables()

        model = QKRoPEKVCacheTestModel(
            vllm_config=vllm_config,
            attn_backend=attn_backend,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            is_neox=is_neox,
            dtype=dtype,
            device=torch.get_default_device(),
        )

        fusion_pass = RopeKVCacheFusionPass(vllm_config)
        passes = [
            NoOpEliminationPass(vllm_config),
            SplitCoalescingPass(vllm_config),
            ScatterSplitReplacementPass(vllm_config),
            fusion_pass,
            PostCleanupPass(vllm_config),
        ]
        backend = TestBackend(*passes)

        # T tokens; qkv packs q, k and v along the last dimension.
        T = 5

        qkv = torch.randn(
            T, num_heads * head_size + 2 * num_kv_heads * head_size, dtype=dtype
        )
        pos = torch.arange(T, dtype=torch.long)

        # Separate copies for the eager run; the originals are marked dynamic
        # further below and fed to the compiled model.
        qkv_unfused = qkv.clone()
        pos_unfused = pos.clone()

        # Reference (eager, unfused) run.
        with set_forward_context(None, vllm_config):
            forward_context = get_forward_context()
            attn_metadata = model.build_attn_metadata(T)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_unfused, k_unfused, v_unfused, dummy = model(qkv_unfused, pos_unfused)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_unfused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        torch._dynamo.mark_dynamic(qkv, 0)
        torch._dynamo.mark_dynamic(pos, 0)
        # Compiled (fused) run; build_attn_metadata installs a fresh KV cache.
        with set_forward_context(None, vllm_config):
            model_fused = torch.compile(model, backend=backend)
            forward_context = get_forward_context()
            attn_metadata = model_fused.build_attn_metadata(T)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_fused, k_fused, v_fused, dummy = model_fused(qkv, pos)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_fused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        assert fusion_pass.matched_count == 1

        backend.check_before_ops(model.ops_in_model_before())
        backend.check_after_ops(model.ops_in_model_after())

        # With the current parametrization (bfloat16 only) the else branch is
        # always taken.
        if dtype == torch.float16:
            ATOL, RTOL = (2e-3, 2e-3)
        else:
            ATOL, RTOL = (1e-2, 1e-2)

        torch.testing.assert_close(q_unfused, q_fused, atol=ATOL, rtol=RTOL)
        torch.testing.assert_close(k_unfused, k_fused, atol=ATOL, rtol=RTOL)
        torch.testing.assert_close(v_unfused, v_fused, atol=ATOL, rtol=RTOL)
        # Cannot compare fp8_* directly here, so both caches are viewed as the
        # model dtype instead.
        # NOTE(review): Tensor.view(dtype) reinterprets the underlying bytes
        # rather than converting values — confirm this is the intended check.
        torch.testing.assert_close(
            kv_cache_unfused.view(dtype),
            kv_cache_fused.view(dtype),
            atol=ATOL,
            rtol=RTOL,
        )
tests/compile/passes/test_scatter_split_replace.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
import
torch.nn
as
nn
import
vllm
from
tests.compile.backend
import
TestBackend
from
vllm.compilation.passes.utility.scatter_split_replace
import
(
ScatterSplitReplacementPass
,
)
from
vllm.compilation.passes.utility.split_coalescing
import
SplitCoalescingPass
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
VllmConfig
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
class ScatterSplitReplacementModel(nn.Module):
    """Model with a rope+getitem+slice_scatter+split_with_sizes sequence."""

    def __init__(
        self,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        dtype: torch.dtype,
    ):
        """Record the q/kv split sizes and build a NeoX-style rotary embedding."""
        super().__init__()
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size

        rope_kwargs = dict(
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=True,
            dtype=dtype,
        )
        self.rotary_emb = RotaryEmbedding(head_size, **rope_kwargs)

    def forward(self, qkv: torch.Tensor, positions: torch.Tensor):
        """Split qkv, rotate q/k, and return (q + 1, k + 2, v + 3)."""
        # Work on a copy so in-place ops never touch the caller's tensor
        section_sizes = [self.q_size, self.kv_size, self.kv_size]
        q, k, v = qkv.clone().split(section_sizes, dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        return q + 1, k + 2, v + 3

    # NOTE(review): the two helpers below are not called by the test in this
    # file, which checks op counts directly.
    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Ops listed as present before the replacement pass."""
        aten = torch.ops.aten
        return [
            aten.slice_scatter.default,
            aten.split_with_sizes.default,
            aten.getitem.default,
        ]

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Ops listed as present after the replacement pass."""
        aten = torch.ops.aten
        return [aten.getitem.default]
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scatter_split_replace(dtype):
    """Compiled output must match eager, with slice_scatter removed from the graph."""
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    num_heads, num_kv_heads, head_size = 8, 4, 64

    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        custom_ops=["+rotary_embedding"],
    )
    vllm_config = VllmConfig(compilation_config=compilation_config)

    with vllm.config.set_current_vllm_config(vllm_config):
        # ScatterSplitReplacementPass requires SplitCoalescingPass to be run before it
        backend = TestBackend(
            SplitCoalescingPass(vllm_config),
            ScatterSplitReplacementPass(vllm_config),
        )

        model = ScatterSplitReplacementModel(num_heads, num_kv_heads, head_size, dtype)

        num_tokens = 5
        qkv = torch.randn(
            num_tokens,
            num_heads * head_size + 2 * num_kv_heads * head_size,
            dtype=dtype,
        )
        pos = torch.arange(num_tokens, dtype=torch.long)

        # Eager reference on independent copies of the inputs.
        eager_outputs = model(qkv.clone(), pos.clone())

        for tensor in (qkv, pos):
            torch._dynamo.mark_dynamic(tensor, 0)
        compiled_model = torch.compile(model, backend=backend)
        compiled_outputs = compiled_model(qkv, pos)

        for expected, actual in zip(eager_outputs, compiled_outputs):
            torch.testing.assert_close(expected, actual)

        aten = torch.ops.aten
        assert backend.op_count(aten.slice_scatter.default) == 0
        assert backend.op_count(aten.split_with_sizes.default) == 1
tests/compile/passes/test_silu_mul_quant_fusion.py
View file @
3fb4b5fa
...
@@ -26,22 +26,14 @@ from vllm.config import (
...
@@ -26,22 +26,14 @@ from vllm.config import (
VllmConfig
,
VllmConfig
,
set_current_vllm_config
,
set_current_vllm_config
,
)
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.kernels.linear
import
(
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass
import
(
CutlassFP8ScaledMMLinearKernel
,
CutlassFP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.flashinfer
import
(
FlashInferFP8ScaledMMLinearKernel
,
FlashInferFP8ScaledMMLinearKernel
,
)
FP8ScaledMMLinearKernel
,
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch
import
(
PerTensorTorchFP8ScaledMMLinearKernel
,
PerTensorTorchFP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.rocm
import
(
ROCmFP8ScaledMMLinearKernel
,
ROCmFP8ScaledMMLinearKernel
,
)
)
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel
import
(
# noqa: E501
from
vllm.model_executor.layers.activation
import
SiluAndMul
FP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
W8A8BlockFp8LinearOp
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
W8A8BlockFp8LinearOp
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
,
GroupShape
,
...
@@ -190,8 +182,24 @@ TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
...
@@ -190,8 +182,24 @@ TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
"model_class, enable_quant_fp8_custom_op, force_kernel"
,
"model_class, enable_quant_fp8_custom_op, force_kernel"
,
list
(
itertools
.
product
([
TestSiluMulFp8QuantModel
],
[
True
,
False
],
TEST_KERNELS
))
list
(
itertools
.
product
([
TestSiluMulFp8QuantModel
],
[
True
,
False
],
TEST_KERNELS
))
+
[
+
[
(
TestSiluMulNvfp4QuantModel
,
False
,
None
),
pytest
.
param
(
(
TestSiluMulGroupFp8QuantModel
,
False
,
None
),
TestSiluMulNvfp4QuantModel
,
False
,
None
,
marks
=
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"CUDA only"
),
),
# GroupFP8Quant fusion only works with AITER on ROCm.
# and the enable_quant_fp8_custom_op must be True.
pytest
.
param
(
TestSiluMulGroupFp8QuantModel
,
True
,
None
,
marks
=
pytest
.
mark
.
skipif
(
not
current_platform
.
is_rocm
(),
reason
=
"ROCm only"
),
),
],
],
)
)
@
pytest
.
mark
.
skipif
(
@
pytest
.
mark
.
skipif
(
...
@@ -209,6 +217,7 @@ def test_fusion_silu_and_mul_quant(
...
@@ -209,6 +217,7 @@ def test_fusion_silu_and_mul_quant(
enable_silu_mul_custom_op
:
bool
,
enable_silu_mul_custom_op
:
bool
,
enable_quant_fp8_custom_op
:
bool
,
enable_quant_fp8_custom_op
:
bool
,
force_kernel
:
FP8ScaledMMLinearKernel
|
None
,
force_kernel
:
FP8ScaledMMLinearKernel
|
None
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
):
if
model_class
is
TestSiluMulNvfp4QuantModel
and
not
is_nvfp4_supported
():
if
model_class
is
TestSiluMulNvfp4QuantModel
and
not
is_nvfp4_supported
():
pytest
.
skip
(
"NVFP4 is not supported on this GPU."
)
pytest
.
skip
(
"NVFP4 is not supported on this GPU."
)
...
@@ -235,13 +244,16 @@ def test_fusion_silu_and_mul_quant(
...
@@ -235,13 +244,16 @@ def test_fusion_silu_and_mul_quant(
),
),
)
)
with
set_current_vllm_config
(
config
):
with
set_current_vllm_config
(
config
)
,
monkeypatch
.
context
()
as
m
:
fusion_passes
=
[
ActivationQuantFusionPass
(
config
)]
fusion_passes
=
[
ActivationQuantFusionPass
(
config
)]
if
IS_AITER_FOUND
:
if
IS_AITER_FOUND
and
model_class
is
TestSiluMulGroupFp8QuantModel
:
from
vllm._aiter_ops
import
rocm_aiter_ops
from
vllm.compilation.passes.fusion.rocm_aiter_fusion
import
(
from
vllm.compilation.passes.fusion.rocm_aiter_fusion
import
(
RocmAiterSiluMulFp8GroupQuantFusionPass
,
RocmAiterSiluMulFp8GroupQuantFusionPass
,
)
)
m
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
rocm_aiter_ops
.
refresh_env_variables
()
fusion_passes
+=
[
RocmAiterSiluMulFp8GroupQuantFusionPass
(
config
)]
fusion_passes
+=
[
RocmAiterSiluMulFp8GroupQuantFusionPass
(
config
)]
passes
=
[
NoOpEliminationPass
(
config
),
*
fusion_passes
,
PostCleanupPass
(
config
)]
passes
=
[
NoOpEliminationPass
(
config
),
*
fusion_passes
,
PostCleanupPass
(
config
)]
...
...
tests/compile/test_aot_compile.py
View file @
3fb4b5fa
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
import
functools
import
functools
import
hashlib
import
hashlib
import
multiprocessing
import
multiprocessing
import
os
import
pickle
import
pickle
import
tempfile
import
tempfile
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
...
@@ -14,9 +15,12 @@ import pytest
...
@@ -14,9 +15,12 @@ import pytest
import
torch
import
torch
import
vllm.model_executor.layers.activation
import
vllm.model_executor.layers.activation
from
vllm.compilation.backends
import
VllmBackend
from
vllm.compilation.caching
import
(
from
vllm.compilation.caching
import
(
StandaloneCompiledArtifacts
,
StandaloneCompiledArtifacts
,
VllmSerializableFunction
,
)
)
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
from
vllm.config
import
(
CompilationConfig
,
CompilationConfig
,
...
@@ -156,6 +160,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
...
@@ -156,6 +160,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
assert
torch
.
allclose
(
ret
,
expected
)
assert
torch
.
allclose
(
ret
,
expected
)
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
    """A traced graph containing a slice must survive a serialize/deserialize round trip."""

    def foo(x: torch.Tensor):
        return x[slice(0, x.shape[0])]

    vllm_config = make_vllm_config()

    example_input = torch.randn(10, 10)
    torch._dynamo.mark_dynamic(example_input, 0)

    gm = torch.fx.symbolic_trace(foo)
    # The traced code must contain the slice expression for the test to be meaningful.
    assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code

    with use_vllm_config(vllm_config):
        serializable = VllmSerializableFunction(gm, (example_input,), "", foo)
        payload = VllmSerializableFunction.serialize_compile_artifacts(serializable)
        restored = VllmSerializableFunction.deserialize_compile_artifacts(payload)

    assert gm.code == restored.graph_module.code
@
pytest
.
mark
.
skipif
(
not
is_torch_equal_or_newer
(
"2.10.0"
),
reason
=
"requires torch 2.10"
)
@
pytest
.
mark
.
skipif
(
not
is_torch_equal_or_newer
(
"2.10.0"
),
reason
=
"requires torch 2.10"
)
def
test_cache_load_returns_tuple_consistency
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_cache_load_returns_tuple_consistency
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
"""
...
@@ -700,3 +724,156 @@ class TestStandaloneCompiledArtifactsIntegration:
...
@@ -700,3 +724,156 @@ class TestStandaloneCompiledArtifactsIntegration:
(
"mod3"
,
"shape3"
),
(
"mod3"
,
"shape3"
),
]:
]:
assert
cache
.
get
(
submod
,
shape
)
==
shared_data
assert
cache
.
get
(
submod
,
shape
)
==
shared_data
def test_functorch_config(self):
    """Check the functorch config seen by the backend after deserialization.

    The function is serialized while ``bundled_autograd_cache`` is patched to
    True, then deserialized and called; a stand-in for ``VllmBackend.__call__``
    records the functorch config it observes, which must have
    ``bundled_autograd_cache`` set to True.
    """
    vllm_config = make_vllm_config()
    example_inputs = (torch.randn(10, 10),)

    def add_1(x: torch.Tensor):
        return x + 1

    gm = torch._dynamo.functional_export.dynamo_graph_capture_for_export(add_1)(
        *example_inputs
    )

    # Reset codegen and the dynamo flatten/unflatten hooks on the captured module.
    gm.graph._codegen = torch.fx.graph.CodeGen()
    gm._dynamo_bytecode_flatten = None
    gm._dynamo_bytecode_unflatten = None

    with (
        torch._functorch.config.patch(bundled_autograd_cache=False),
        set_current_vllm_config(vllm_config),
    ):
        # Serialize with the flag enabled, inside an ambient context that has
        # it disabled.
        with torch._functorch.config.patch(bundled_autograd_cache=True):
            fn = VllmSerializableFunction(gm, example_inputs, "", add_1)
            payload = VllmSerializableFunction.serialize_compile_artifacts(fn)

        config = None

        def backend(*args, **kwargs) -> VllmSerializableFunction:
            nonlocal config
            # bundled_autograd_cache should be True even compiler backend
            # runs with bundled_autograd_cache=False in ambient context.
            config = torch._functorch.config.save_config_portable()
            return fn

        loaded_fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
        # Swap in the recording stand-in for the real backend call.
        with patch.object(VllmBackend, "__call__", backend):
            loaded_fn(*example_inputs)

        assert isinstance(config, dict)
        assert "bundled_autograd_cache" in config
        assert config["bundled_autograd_cache"] is True
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_save(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be saved."""
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    inputs = (torch.randn(10, 10),)
    expected = reference_fn(*inputs)
    vllm_config = make_vllm_config()

    # One AOT compile is expected, with nothing saved to or loaded from disk.
    counter_expectation = compilation_counter.expect(
        num_aot_compiles=1,
        num_aot_artifacts_saved=0,
        num_aot_artifacts_loaded=0,
    )
    with use_vllm_config(vllm_config), counter_expectation:
        compiled = CompiledMod(vllm_config=vllm_config)
        actual = compiled(*inputs)
        assert torch.allclose(actual, expected)

    # No cached artifact should exist on disk
    aot_dir = os.path.join(fresh_vllm_cache, "torch_compile_cache", "torch_aot_compile")
    if not os.path.isdir(aot_dir):
        return
    for root, _dirs, files in os.walk(aot_dir):
        for f in files:
            assert f != "model", (
                f"AOT artifact unexpectedly saved at {os.path.join(root, f)}"
            )
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be loaded."""
    # Phase 1: compile and save with cache enabled
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    inputs = (torch.randn(10, 10),)
    vllm_config = make_vllm_config()

    save_expectation = compilation_counter.expect(num_aot_artifacts_saved=1)
    with use_vllm_config(vllm_config), save_expectation:
        CompiledMod(vllm_config=vllm_config)(*inputs)

    # Phase 2: disable cache, compile again — should NOT load from disk
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    disable_envs_cache()
    torch._dynamo.reset()

    vllm_config = make_vllm_config()
    recompile_expectation = compilation_counter.expect(
        num_aot_compiles=1,
        num_aot_artifacts_saved=0,
        num_aot_artifacts_loaded=0,
    )
    with use_vllm_config(vllm_config), recompile_expectation:
        compiled = CompiledMod(vllm_config=vllm_config)
        compiled(*inputs)
        assert not compiled.was_aot_compile_fn_loaded_from_disk
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_aot_counters_on_save_and_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """Verify AOT counters are incremented correctly on save and load."""
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    inputs = (torch.randn(10, 10),)

    # Phase 1: fresh compile + save
    vllm_config = make_vllm_config()
    fresh_compile = compilation_counter.expect(
        num_aot_compiles=1,
        num_aot_artifacts_saved=1,
        num_aot_artifacts_loaded=0,
    )
    with use_vllm_config(vllm_config), fresh_compile:
        CompiledMod(vllm_config=vllm_config)(*inputs)

    # Phase 2: load from cache
    monkeypatch.setenv("VLLM_FORCE_AOT_LOAD", "1")
    disable_envs_cache()

    vllm_config = make_vllm_config()
    cache_load = compilation_counter.expect(
        num_aot_compiles=0,
        num_aot_artifacts_saved=0,
        num_aot_artifacts_loaded=1,
    )
    with use_vllm_config(vllm_config), cache_load:
        CompiledMod(vllm_config=vllm_config)(*inputs)
tests/compile/test_cold_start.py
deleted
100644 → 0
View file @
bcf25339
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
torch._dynamo.utils
import
counters
from
vllm
import
LLM
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
CUDAGraphMode
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """Cold-start compile of a small MoE model must only compile unique subgraphs."""
    env_overrides = {
        # Run in same process so we can access PyTorch's internal counters
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
        # I'm not sure if this is going to affect the numbers
        "VLLM_USE_AOT_COMPILE": "0",
        # Force cold compilation
        "VLLM_DISABLE_COMPILE_CACHE": "1",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
    )

    counters.clear()

    # Keep a reference so the engine stays alive until the counters are read.
    _llm = LLM(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        load_format="dummy",  # make the model loading faster
        compilation_config=compilation_config,
        num_gpu_blocks_override=8,  # make the model loading faster
    )

    # vLLM-compile cold start is special. By default, we do
    # one full dynamo capture of the entire forward pass.
    # The forward pass consists of 32 transformer layers.
    # Then, we split on the attention operation. This results in
    # 33 subgraphs (not including the attention operation).
    # We then generate compiled artifacts for the unique subgraphs.
    #
    # There are actually only 3 unique subgraphs for this model
    # (all of its transformer layers are the same modulo weights);
    # this is true for most vLLM models.
    # So we test that during cold start, we are only compiling
    # for 3 unique subgraphs.
    aot_autograd_counters = counters["aot_autograd"]
    assert aot_autograd_counters["autograd_cache_miss"] == 3
    assert aot_autograd_counters["autograd_cache_hit"] == 0
tests/compile/test_compile_ranges.py
View file @
3fb4b5fa
...
@@ -73,6 +73,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
...
@@ -73,6 +73,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
Range
(
start
=
16
,
end
=
16
),
Range
(
start
=
16
,
end
=
16
),
Range
(
start
=
9
,
end
=
32
),
Range
(
start
=
9
,
end
=
32
),
Range
(
start
=
64
,
end
=
64
),
Range
(
start
=
64
,
end
=
64
),
Range
(
start
=
128
,
end
=
128
),
Range
(
start
=
33
,
end
=
8192
),
Range
(
start
=
33
,
end
=
8192
),
]
]
)
)
...
@@ -85,7 +86,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
...
@@ -85,7 +86,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
),
),
compilation_config
=
CompilationConfig
(
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
mode
=
CompilationMode
.
VLLM_COMPILE
,
compile_ranges_
split_
points
=
[
8
,
32
],
compile_ranges_
end
points
=
[
8
,
32
],
compile_sizes
=
[
16
,
64
,
128
],
compile_sizes
=
[
16
,
64
,
128
],
inductor_compile_config
=
{
inductor_compile_config
=
{
"post_grad_custom_post_pass"
:
post_grad_range_checker
,
"post_grad_custom_post_pass"
:
post_grad_range_checker
,
...
@@ -95,21 +96,21 @@ def test_compile_ranges(use_fresh_inductor_cache):
...
@@ -95,21 +96,21 @@ def test_compile_ranges(use_fresh_inductor_cache):
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
model
=
TestModel
(
vllm_config
=
vllm_config
,
prefix
=
""
).
eval
()
model
=
TestModel
(
vllm_config
=
vllm_config
,
prefix
=
""
).
eval
()
# Number of compilations: 3
for each
compile range +
2
compile sizes
# Number of compilations: 3 compile range
s
+
3
compile sizes
batch_sizes
=
[
1
,
4
,
16
,
24
,
48
,
64
,
8192
]
batch_sizes
=
[
1
,
4
,
16
,
24
,
48
,
64
,
8192
]
with
compilation_counter
.
expect
(
with
compilation_counter
.
expect
(
num_graphs_seen
=
1
,
num_graphs_seen
=
1
,
num_piecewise_graphs_seen
=
1
,
num_piecewise_graphs_seen
=
1
,
num_backend_compilations
=
5
,
num_backend_compilations
=
6
,
):
):
run_model
(
vllm_config
,
model
,
batch_sizes
)
run_model
(
vllm_config
,
model
,
batch_sizes
)
assert
post_grad_range_checker
.
num_calls
==
5
assert
post_grad_range_checker
.
num_calls
==
6
def
test_compile_config_get_compile_ranges
():
def
test_compile_config_get_compile_ranges
():
compilation_config
=
CompilationConfig
(
compilation_config
=
CompilationConfig
(
compile_ranges_
split_
points
=
[
8
,
32
],
compile_ranges_
end
points
=
[
8
,
32
],
)
)
VllmConfig
(
VllmConfig
(
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
...
@@ -126,6 +127,88 @@ def test_compile_config_get_compile_ranges():
...
@@ -126,6 +127,88 @@ def test_compile_config_get_compile_ranges():
]
]
class PostGradStaticShapeChecker(InductorPass):
    """Post-grad pass that checks how shapes are specialized per compile range.

    When the current compile range is a single size, every tensor-valued node
    in the graph must have a fully concrete shape (this is asserted). For any
    other range the pass records whether at least one node carries a symbolic
    dimension.
    """

    def __init__(self):
        # Graphs seen for a single-size range whose shapes were all concrete.
        self.num_static_calls = 0
        # Graphs seen for a multi-size range with at least one symbolic dim.
        self.num_dynamic_calls = 0

    def __call__(self, graph: fx.Graph):
        """Inspect ``graph`` and update the static/dynamic call counters."""
        from torch.fx.experimental.symbolic_shapes import is_symbolic

        compile_range = get_pass_context().compile_range
        expect_static = compile_range.is_single_size()

        for node in graph.nodes:
            # Prefer "val"; fall back to "example_value" when it is absent.
            val = node.meta.get("val")
            if val is None:
                val = node.meta.get("example_value")
            if not isinstance(val, torch.Tensor):
                continue

            found_symbolic = any(is_symbolic(dim) for dim in val.shape)
            if expect_static:
                assert not found_symbolic, (
                    f"compile_sizes entry {compile_range}: "
                    f"node '{node.name}' has symbolic shape "
                    f"{val.shape}"
                )
            elif found_symbolic:
                # compile_ranges should have at least some
                # symbolic shapes (the batch dimension); one is enough.
                self.num_dynamic_calls += 1
                return

        if expect_static:
            self.num_static_calls += 1

    def uuid(self) -> str:
        """Return the hash of an empty state dict (the pass has no config)."""
        empty_state: dict[str, Any] = {}
        return InductorPass.hash_dict(empty_state)
def test_compile_sizes_produce_static_shapes(use_fresh_inductor_cache):
    """Check that a compile_sizes entry is compiled without SymInts while
    the compile_ranges entries keep dynamic shapes."""
    torch.set_default_device("cuda")
    checker = PostGradStaticShapeChecker()

    scheduler_config = SchedulerConfig(
        max_num_batched_tokens=8192,
        max_model_len=8192,
        is_encoder_decoder=False,
    )
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        compile_ranges_endpoints=[8],
        compile_sizes=[16],
        inductor_compile_config={
            "post_grad_custom_post_pass": checker,
        },
    )
    vllm_config = VllmConfig(
        scheduler_config=scheduler_config,
        compilation_config=compilation_config,
    )

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval()
        # 3 compilations: Range(1,8), Range(9,8192), single-size 16
        expected_counts = compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=1,
            num_backend_compilations=3,
        )
        with expected_counts:
            run_model(vllm_config, model, [1, 16, 64])

    # The single compile_sizes entry (16) must have been fully static.
    assert checker.num_static_calls == 1, (
        f"Expected 1 static compilation, got {checker.num_static_calls}"
    )
    # Both compile_ranges entries must have kept a symbolic dimension.
    assert checker.num_dynamic_calls == 2, (
        f"Expected 2 dynamic compilations, got {checker.num_dynamic_calls}"
    )
def
test_inductor_cache_compile_ranges
(
monkeypatch
,
use_fresh_inductor_cache
):
def
test_inductor_cache_compile_ranges
(
monkeypatch
,
use_fresh_inductor_cache
):
# To force multiple compilations, we disable the compile cache
# To force multiple compilations, we disable the compile cache
monkeypatch
.
setenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
"1"
)
...
@@ -148,7 +231,7 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
...
@@ -148,7 +231,7 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
scheduler_config
=
scheduler_config
,
scheduler_config
=
scheduler_config
,
compilation_config
=
CompilationConfig
(
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
mode
=
CompilationMode
.
VLLM_COMPILE
,
compile_ranges_
split_
points
=
[
8
],
compile_ranges_
end
points
=
[
8
],
inductor_compile_config
=
{
inductor_compile_config
=
{
"post_grad_custom_post_pass"
:
post_grad_range_checker
,
"post_grad_custom_post_pass"
:
post_grad_range_checker
,
},
},
...
...
tests/compile/test_config.py
View file @
3fb4b5fa
...
@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init(
...
@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init(
fuse_norm_quant
=
True
,
fuse_norm_quant
=
True
,
fuse_act_quant
=
True
,
fuse_act_quant
=
True
,
eliminate_noops
=
True
,
eliminate_noops
=
True
,
sp_min_token_num
=
512
if
enable_sp
else
None
,
),
),
cudagraph_mode
=
cudagraph_mode
,
cudagraph_mode
=
cudagraph_mode
,
)
)
...
@@ -569,3 +570,45 @@ def test_compile_sizes_padding_validation():
...
@@ -569,3 +570,45 @@ def test_compile_sizes_padding_validation():
assert
sorted
(
config
.
compile_sizes
)
==
[
3
,
5
,
7
]
assert
sorted
(
config
.
compile_sizes
)
==
[
3
,
5
,
7
]
dispatcher
=
CudagraphDispatcher
(
_create_vllm_config_for_validation
(
config
))
dispatcher
=
CudagraphDispatcher
(
_create_vllm_config_for_validation
(
config
))
dispatcher
.
initialize_cudagraph_keys
(
CUDAGraphMode
.
NONE
)
# Should not raise
dispatcher
.
initialize_cudagraph_keys
(
CUDAGraphMode
.
NONE
)
# Should not raise
@pytest.mark.parametrize(
    "capture_sizes, max_size, num_blocks, expected_sizes, expected_max",
    [
        # Normal capping: sizes filtered to <= num_blocks
        (
            [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
            512,
            200,
            [1, 2, 4, 8, 16, 32, 64, 128],
            128,
        ),
        # No capping needed: num_blocks >= max
        ([1, 2, 4, 8, 16], 16, 1000, [1, 2, 4, 8, 16], 16),
        # Exact boundary: num_blocks == max (no capping)
        ([1, 2, 4, 8, 16, 32], 32, 32, [1, 2, 4, 8, 16, 32], 32),
        # All sizes capped: num_blocks < smallest size
        ([8, 16, 32], 32, 4, [], 0),
        # num_blocks <= 0: early return, no change
        ([1, 2, 4], 4, 0, [1, 2, 4], 4),
    ],
)
def test_adjust_cudagraph_sizes_for_mamba_cache(
    capture_sizes, max_size, num_blocks, expected_sizes, expected_max
):
    """Cudagraph capture sizes must be capped to the available Mamba cache
    blocks.

    See: https://github.com/vllm-project/vllm/issues/34094
    """
    compilation_config = CompilationConfig(
        cudagraph_capture_sizes=capture_sizes,
        max_cudagraph_capture_size=max_size,
        cudagraph_mode=CUDAGraphMode.NONE,
    )

    compilation_config.adjust_cudagraph_sizes_for_mamba_cache(num_blocks)

    assert compilation_config.cudagraph_capture_sizes == expected_sizes
    assert compilation_config.max_cudagraph_capture_size == expected_max

    # Invariant: last element == max_cudagraph_capture_size
    if not expected_sizes:
        return
    assert (
        compilation_config.cudagraph_capture_sizes[-1]
        == compilation_config.max_cudagraph_capture_size
    )
tests/compile/test_decorator.py
View file @
3fb4b5fa
...
@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
...
@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
expected_num_backend_compilations
=
4
expected_num_backend_compilations
=
4
# A has support_torch_compile but enable_if fn returns False
# A has support_torch_compile but enable_if fn returns False
# ena
l
be_if will be True for B, so we expect mod1 and mod2
# enab
l
e_if will be True for B, so we expect mod1 and mod2
# to be compiled
# to be compiled
with
compilation_counter
.
expect
(
with
compilation_counter
.
expect
(
num_graphs_seen
=
2
,
num_graphs_seen
=
2
,
...
...
tests/compile/test_dynamic_shapes_compilation.py
View file @
3fb4b5fa
...
@@ -99,8 +99,8 @@ def test_dynamic_shapes_compilation(
...
@@ -99,8 +99,8 @@ def test_dynamic_shapes_compilation(
# Clean up GPU memory
# Clean up GPU memory
del
model
del
model
gc
.
collect
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
torch
.
accelerator
.
empty_cache
()
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
print
(
"GPU memory cleared"
)
print
(
"GPU memory cleared"
)
...
...
tests/compile/test_graph_partition.py
View file @
3fb4b5fa
...
@@ -7,7 +7,7 @@ import pytest
...
@@ -7,7 +7,7 @@ import pytest
import
torch
import
torch
from
torch.fx.experimental.proxy_tensor
import
make_fx
from
torch.fx.experimental.proxy_tensor
import
make_fx
from
vllm.compilation.backends
import
split_graph
from
vllm.compilation.backends
import
_is_empty_allocation_node
,
split_graph
from
vllm.compilation.passes.fx_utils
import
find_op_nodes
from
vllm.compilation.passes.fx_utils
import
find_op_nodes
# This import automatically registers `torch.ops.silly.attention`
# This import automatically registers `torch.ops.silly.attention`
...
@@ -184,3 +184,146 @@ def test_consecutive_ops_in_split():
...
@@ -184,3 +184,146 @@ def test_consecutive_ops_in_split():
assert
[
node
.
op
for
node
in
splitting_gm
.
graph
.
nodes
]
==
[
"placeholder"
]
+
2
*
[
assert
[
node
.
op
for
node
in
splitting_gm
.
graph
.
nodes
]
==
[
"placeholder"
]
+
2
*
[
"call_function"
"call_function"
]
+
[
"output"
]
]
+
[
"output"
]
def _get_empty_nodes(split_item):
    """Return the nodes of ``split_item``'s graph module that
    ``_is_empty_allocation_node`` accepts, in graph order."""
    empty_nodes = []
    for node in split_item.graph.graph.nodes:
        if _is_empty_allocation_node(node):
            empty_nodes.append(node)
    return empty_nodes
def _subgraphs_with_empty_nodes(split_items, *, is_splitting_graph):
    """Return the split items whose ``is_splitting_graph`` flag equals the
    requested value and that contain at least one empty-allocation node."""
    selected = []
    for split_item in split_items:
        # Check the flag first so non-matching subgraphs are never scanned.
        if split_item.is_splitting_graph != is_splitting_graph:
            continue
        if _get_empty_nodes(split_item):
            selected.append(split_item)
    return selected
def test_empty_only_partition_stays_separate_after_splitting_predecessor():
    """
    An empty-only subgraph whose sole predecessor is a splitting-op subgraph
    must stay a separate partition rather than being merged into it.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        y = torch.sin(x)
        out = torch.empty_like(y)
        torch.ops.aten.cos.out(y, out=out)
        return out

    sample = torch.randn(4, 3)
    traced = make_fx(model_fn)(sample)

    split_gm, split_items = split_graph(traced, ["aten::sin", "aten::cos.out"])

    # Graph partitioning for this pattern is:
    # [sin], [empty_like], [cos.out].
    assert len(split_items) == 3, (
        "Empty-only partition should not merge into splitting-op subgraph"
    )

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    # The split module must compute the same result as the unsplit one.
    reference = traced(sample)
    actual = split_gm(sample)
    assert torch.allclose(reference, actual), "Output mismatch after split"
def test_empty_only_partition_is_merged():
    """
    When a non-splitting predecessor exists, an empty-only subgraph is merged
    into it. The merged empty node must still live outside every splitting-op
    subgraph.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        base = x + 1
        y = torch.sin(base)
        out = torch.empty_like(base)
        torch.ops.aten.cos.out(base, out=out)
        return out + y

    sample = torch.randn(4, 3)
    traced = make_fx(model_fn)(sample)
    split_gm, split_items = split_graph(traced, ["aten::sin", "aten::cos.out"])

    # Partitioning should be:
    # [add, empty_like], [sin], [cos.out], [add].
    assert len(split_items) == 4, (
        "Empty-only partition should be merged into non-splitting predecessor"
    )

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    non_splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=False
    )
    assert len(non_splitting_with_empty) == 1, (
        "Exactly one non-splitting subgraph should contain the merged empty node"
    )
    merged_subgraph = non_splitting_with_empty[0]
    assert len(_get_empty_nodes(merged_subgraph)) == 1, (
        "Expected exactly one empty allocation node in merged subgraph"
    )

    # The split module must compute the same result as the unsplit one.
    reference = traced(sample)
    actual = split_gm(sample)
    assert torch.allclose(reference, actual), "Output mismatch after split"
def test_builtin_empty_only_partition_is_merged():
    """
    Dynamo graphs may record torch.empty/empty_like as builtin call targets
    instead of aten OpOverloads. Empty-only partitions made of such nodes
    must still be merged.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        hidden = x + 1
        out1 = torch.empty_like(hidden)
        torch.ops.silly.attention(hidden, hidden, hidden, out1)
        out2 = torch.empty_like(hidden)
        torch.ops.silly.attention(out1, out1, hidden, out2)
        return out2 + hidden

    traced = torch.fx.symbolic_trace(model_fn)
    split_gm, split_items = split_graph(traced, ["silly::attention"])

    # Without empty-only merge, this graph would split into:
    # [add, empty_like], [attention], [empty_like], [attention], [add].
    assert len(split_items) == 4, "Builtin empty-only partition should be merged"

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    non_splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=False
    )
    assert len(non_splitting_with_empty) == 1, (
        "Exactly one non-splitting subgraph should contain merged empty nodes"
    )
    merged_subgraph = non_splitting_with_empty[0]
    assert len(_get_empty_nodes(merged_subgraph)) == 2, (
        "Expected two builtin empty_like nodes in merged non-splitting subgraph"
    )

    # The split module must compute the same result as the unsplit one.
    sample = torch.randn(2, 3, device="cuda")
    reference = traced(sample)
    actual = split_gm(sample)
    assert torch.allclose(reference, actual), "Output mismatch after split"
tests/compile/test_sequence_parallelism_threshold.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.compilation.passes.fusion.sequence_parallelism
import
(
SP_MIN_HIDDEN_SIZE
,
SP_MIN_PER_GPU_SIZE_MB
,
get_sequence_parallelism_threshold
,
)
class TestGetSequenceParallelismThreshold:
    """Unit tests covering get_sequence_parallelism_threshold."""

    def test_non_cuda_returns_none(self, mock_cuda_platform):
        """A non-CUDA platform yields no threshold."""
        with mock_cuda_platform(is_cuda=False):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
        assert threshold is None

    def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
        """An unsupported device capability (e.g., sm80) yields no threshold."""
        with mock_cuda_platform(capability=(8, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
        assert threshold is None

    def test_small_hidden_size_returns_none(self, mock_cuda_platform):
        """On H100, a hidden_size under the minimum yields no threshold."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=4096,
                tp_size=2,
                element_size=2,  # 4096 < 8192
            )
        assert threshold is None

    def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
        """On H100, a large enough hidden_size yields the computed threshold."""
        hidden_size = 8192
        tp_size = 2
        element_size = 2  # float16/bfloat16

        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )

        # Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
        MiB = 1024 * 1024
        total_bytes = SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * MiB
        bytes_per_token = hidden_size * element_size
        expected = int(total_bytes // bytes_per_token)
        assert threshold == expected
        assert threshold == 1024

    @pytest.mark.parametrize(
        "hidden_size,tp_size,element_size,expected",
        [
            # Boundary: exactly at min hidden size threshold, tp_size=1
            # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
            (8192, 1, 2, 512),
            # Larger hidden size reduces token threshold
            # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
            (16384, 1, 2, 256),
            # Larger tp_size increases token threshold
            # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
            (8192, 4, 2, 2048),
            # Larger element_size (fp32) reduces token threshold
            # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
            (8192, 2, 4, 512),
        ],
    )
    def test_threshold_calculation_variations(
        self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
    ):
        """The threshold matches the expected value for each combination."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )
        assert threshold == expected

    def test_hidden_size_boundary(self, mock_cuda_platform):
        """Check both sides of the minimum hidden_size boundary."""
        min_hidden_size = SP_MIN_HIDDEN_SIZE[90]
        with mock_cuda_platform(capability=(9, 0)):
            # Just below threshold
            below = get_sequence_parallelism_threshold(
                hidden_size=min_hidden_size - 1,
                tp_size=2,
                element_size=2,
            )
            assert below is None

            # Exactly at threshold
            at_boundary = get_sequence_parallelism_threshold(
                hidden_size=min_hidden_size,
                tp_size=2,
                element_size=2,
            )
            assert at_boundary is not None
tests/compile/test_startup.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cold start and warm start tests for vLLM-compile.
Cold start runs in a forked child (must fork before CUDA init) which
populates on-disk caches and asserts cold-start counters. Warm start
then runs in the parent with clean in-memory state but populated caches.
"""
import
multiprocessing
as
mp
from
torch._dynamo.utils
import
counters
from
vllm.compilation.counter
import
compilation_counter
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
CUDAGraphMode
MODEL
=
"microsoft/Phi-tiny-MoE-instruct"
def _run_vllm(vllm_runner):
    """Enter and immediately leave a ``vllm_runner`` context for ``MODEL``
    using VLLM_COMPILE mode, dummy weights and cudagraphs disabled."""
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
    )
    runner_kwargs = dict(
        trust_remote_code=False,
        max_model_len=256,
        max_num_batched_tokens=1024,
        load_format="dummy",
        compilation_config=compilation_config,
        num_gpu_blocks_override=8,
    )
    # Nothing runs inside the context; entering and leaving it is the test.
    with vllm_runner(MODEL, **runner_kwargs):
        pass
def _cold_start(vllm_runner):
    """Run one startup with cleared dynamo counters and assert that 3
    artifacts are saved and none loaded, with 33 aot_autograd calls of which
    3 are cache misses and 0 are cache hits."""
    counters.clear()

    expected_artifacts = compilation_counter.expect(
        num_compiled_artifacts_saved=3,
        num_compiled_artifacts_loaded=0,
    )
    with expected_artifacts:
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 33
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
    """Run a cold start in a forked child, then a warm start in this
    process, and check the compilation counters for both."""
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
    # 33 subgraphs after splitting on attention — only 3 are unique.
    fork_ctx = mp.get_context("fork")
    child = fork_ctx.Process(target=_cold_start, args=(vllm_runner,))
    child.start()
    child.join()
    assert child.exitcode == 0, "Cold-start child failed"

    # Warm start — compiled artifacts loaded from disk cache.
    counters.clear()
    expected_artifacts = compilation_counter.expect(
        num_compiled_artifacts_loaded=3,
        num_compiled_artifacts_saved=0,
    )
    with expected_artifacts:
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 30
    assert aot_stats["autograd_cache_miss"] == 0
    # No miss at aot_autograd level causing disk I/O.
    assert aot_stats["autograd_cache_hit"] == 0
tests/compile/test_structured_logging.py
View file @
3fb4b5fa
...
@@ -109,9 +109,9 @@ def test_vllm_structured_logging_artifacts(use_fresh_inductor_cache):
...
@@ -109,9 +109,9 @@ def test_vllm_structured_logging_artifacts(use_fresh_inductor_cache):
f
"got
{
len
(
vllm_piecewise_split_graph
)
}
"
f
"got
{
len
(
vllm_piecewise_split_graph
)
}
"
)
)
compile_start_artifacts
=
capture
.
get
(
"artifact"
,
"vllm_piecewise_compile_start"
)
compile_start_artifacts
=
capture
.
get
(
"artifact"
,
"vllm_piecewise_compile_start"
)
assert
len
(
compile_start_artifacts
)
==
2
,
(
assert
len
(
compile_start_artifacts
)
==
4
,
(
"Expected
2
vllm_piecewise_compile_start "
"Expected
4
vllm_piecewise_compile_start "
"(
one for dynamic ranges, one for
compile size), "
"(
2 subgraphs x 2 ranges each: dynamic +
compile size), "
f
"got
{
len
(
compile_start_artifacts
)
}
"
f
"got
{
len
(
compile_start_artifacts
)
}
"
)
)
submod_dumps
=
capture
.
get
(
"graph_dump"
,
r
"vllm_submod_.*"
)
submod_dumps
=
capture
.
get
(
"graph_dump"
,
r
"vllm_submod_.*"
)
...
...
tests/compile/test_wrapper.py
View file @
3fb4b5fa
...
@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
...
@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
f
"Expected
{
expected1
}
, got
{
result1
}
"
f
"Expected
{
expected1
}
, got
{
result1
}
"
)
)
# Second call should triger another compilation
# Second call should trig
g
er another compilation
x2
=
torch
.
tensor
([
1
,
2
,
3
])
x2
=
torch
.
tensor
([
1
,
2
,
3
])
result2
=
wrapper
(
x2
)
result2
=
wrapper
(
x2
)
expected2
=
torch
.
tensor
([
100
,
200
,
300
])
expected2
=
torch
.
tensor
([
100
,
200
,
300
])
...
...
tests/config/test_config_generation.py
View file @
3fb4b5fa
...
@@ -78,3 +78,34 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
...
@@ -78,3 +78,34 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
)
)
ray
.
shutdown
()
ray
.
shutdown
()
def test_unrecognized_env(monkeypatch):
    """Exercise ``fail_on_environ_validation`` with and without an unknown
    ``VLLM_``-prefixed environment variable set."""
    import os

    from vllm.envs import environment_variables

    # Remove any existing unrecognized VLLM env vars that might interfere
    stale_vars = [
        name
        for name in os.environ
        if name.startswith("VLLM_") and name not in environment_variables
    ]
    for name in stale_vars:
        monkeypatch.delenv(name, raising=False)

    # With validation enabled, an unrecognized vLLM variable must raise.
    monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value")
    strict_args = EngineArgs(
        fail_on_environ_validation=True,
    )
    with pytest.raises(
        ValueError, match="Unknown vLLM environment variable detected"
    ):
        strict_args.create_engine_config()

    # With default arguments the same variable must not raise.
    default_args = EngineArgs()
    default_args.create_engine_config()

    # With the variable removed, strict validation must pass.
    monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR")
    strict_args = EngineArgs(
        fail_on_environ_validation=True,
    )
    strict_args.create_engine_config()
tests/config/test_multimodal_config.py
View file @
3fb4b5fa
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
import
pytest
import
pytest
from
vllm.config.model
import
ModelConfig
from
vllm.config.multimodal
import
MultiModalConfig
from
vllm.config.multimodal
import
MultiModalConfig
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
...
@@ -23,3 +24,20 @@ def test_mm_encoder_attn_backend_hash_updates():
...
@@ -23,3 +24,20 @@ def test_mm_encoder_attn_backend_hash_updates():
mm_encoder_attn_backend
=
AttentionBackendEnum
.
FLASH_ATTN
mm_encoder_attn_backend
=
AttentionBackendEnum
.
FLASH_ATTN
).
compute_hash
()
).
compute_hash
()
assert
base_hash
!=
overridden_hash
assert
base_hash
!=
overridden_hash
def test_language_model_only_does_not_affect_mm_hash():
    """The multimodal config hash must be identical with and without
    language_model_only, since that flag leaves the ViT computation graph
    unchanged."""
    default_config = MultiModalConfig()
    lm_only_config = MultiModalConfig(language_model_only=True)
    assert default_config.compute_hash() == lm_only_config.compute_hash()
def test_language_model_only_affects_model_hash():
    """The model config hash must differ when language_model_only is set,
    since that flag changes the LM computation graph."""
    model = "llava-hf/llava-1.5-7b-hf"
    default_config = ModelConfig(model)
    base_hash = default_config.compute_hash()
    lm_only_config = ModelConfig(model, language_model_only=True)
    lm_only_hash = lm_only_config.compute_hash()
    assert base_hash != lm_only_hash
tests/conftest.py
View file @
3fb4b5fa
...
@@ -176,16 +176,20 @@ def init_test_http_connection():
...
@@ -176,16 +176,20 @@ def init_test_http_connection():
@
pytest
.
fixture
@
pytest
.
fixture
def
dist_init
():
def
dist_init
():
from
tests.utils
import
ensure_current_vllm_config
temp_file
=
tempfile
.
mkstemp
()[
1
]
temp_file
=
tempfile
.
mkstemp
()[
1
]
init_distributed_environment
(
world_size
=
1
,
with
ensure_current_vllm_config
():
rank
=
0
,
init_distributed_environment
(
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
world_size
=
1
,
local_rank
=
0
,
rank
=
0
,
backend
=
"nccl"
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
)
local_rank
=
0
,
initialize_model_parallel
(
1
,
1
)
backend
=
"nccl"
,
yield
)
initialize_model_parallel
(
1
,
1
)
yield
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
...
@@ -419,18 +423,16 @@ class HfRunner:
...
@@ -419,18 +423,16 @@ class HfRunner:
self
.
tokenizer
:
"PreTrainedTokenizer | PreTrainedTokenizerFast"
=
(
self
.
tokenizer
:
"PreTrainedTokenizer | PreTrainedTokenizerFast"
=
(
AutoTokenizer
.
from_pretrained
(
AutoTokenizer
.
from_pretrained
(
model_name
,
model_name
,
dtype
=
dtype
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
)
)
)
)
# don't put this import at the top level
# don't put this import at the top level
# it will call torch.
cuda
.device_count()
# it will call torch.
accelerator
.device_count()
from
transformers
import
AutoProcessor
from
transformers
import
AutoProcessor
self
.
processor
=
AutoProcessor
.
from_pretrained
(
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
model_name
,
dtype
=
dtype
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
)
)
if
skip_tokenizer_init
:
if
skip_tokenizer_init
:
...
@@ -792,7 +794,6 @@ class VllmRunner:
...
@@ -792,7 +794,6 @@ class VllmRunner:
tensor_parallel_size
:
int
=
1
,
tensor_parallel_size
:
int
=
1
,
block_size
:
int
=
16
if
not
torch
.
xpu
.
is_available
()
else
64
,
block_size
:
int
=
16
if
not
torch
.
xpu
.
is_available
()
else
64
,
enable_chunked_prefill
:
bool
|
None
=
False
,
enable_chunked_prefill
:
bool
|
None
=
False
,
swap_space
:
int
=
4
,
enforce_eager
:
bool
|
None
=
False
,
enforce_eager
:
bool
|
None
=
False
,
# Set this to avoid hanging issue
# Set this to avoid hanging issue
default_torch_num_threads
:
int
|
None
=
None
,
default_torch_num_threads
:
int
|
None
=
None
,
...
@@ -829,7 +830,6 @@ class VllmRunner:
...
@@ -829,7 +830,6 @@ class VllmRunner:
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
dtype
=
dtype
,
dtype
=
dtype
,
seed
=
seed
,
seed
=
seed
,
swap_space
=
swap_space
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
disable_log_stats
=
disable_log_stats
,
disable_log_stats
=
disable_log_stats
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
...
@@ -841,7 +841,10 @@ class VllmRunner:
...
@@ -841,7 +841,10 @@ class VllmRunner:
def
get_inputs
(
def
get_inputs
(
self
,
self
,
prompts
:
list
[
str
]
|
list
[
torch
.
Tensor
]
|
list
[
list
[
int
]],
prompts
:
list
[
str
]
|
list
[
torch
.
Tensor
]
|
list
[
list
[
int
]]
|
list
[
dict
[
str
,
Any
]],
images
:
PromptImageInput
|
None
=
None
,
images
:
PromptImageInput
|
None
=
None
,
videos
:
PromptVideoInput
|
None
=
None
,
videos
:
PromptVideoInput
|
None
=
None
,
audios
:
PromptAudioInput
|
None
=
None
,
audios
:
PromptAudioInput
|
None
=
None
,
...
@@ -855,26 +858,32 @@ class VllmRunner:
...
@@ -855,26 +858,32 @@ class VllmRunner:
inputs
=
list
[
dict
[
str
,
Any
]]()
inputs
=
list
[
dict
[
str
,
Any
]]()
for
i
,
prompt
in
enumerate
(
prompts
):
for
i
,
prompt
in
enumerate
(
prompts
):
prompt_dict
=
dict
[
str
,
Any
]()
# If we're passing an encoder/decoder prompt, we assume it
if
isinstance
(
prompt
,
str
):
# already contains the multimodal data in the prompt
prompt_dict
[
"prompt"
]
=
prompt
if
isinstance
(
prompt
,
dict
):
elif
isinstance
(
prompt
,
list
):
assert
images
is
None
and
audios
is
None
and
videos
is
None
prompt_dict
[
"prompt_token_ids"
]
=
prompt
inputs
.
append
(
prompt
.
copy
())
else
:
else
:
prompt_dict
[
"prompt_embeds"
]
=
prompt
prompt_dict
=
dict
[
str
,
Any
]()
if
isinstance
(
prompt
,
str
):
multi_modal_data
=
dict
[
str
,
Any
]()
prompt_dict
[
"prompt"
]
=
prompt
if
images
is
not
None
and
(
image
:
=
images
[
i
])
is
not
None
:
elif
isinstance
(
prompt
,
list
):
multi_modal_data
[
"image"
]
=
image
prompt_dict
[
"prompt_token_ids"
]
=
prompt
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
else
:
multi_modal_data
[
"video"
]
=
video
prompt_dict
[
"prompt_embeds"
]
=
prompt
if
audios
is
not
None
and
(
audio
:
=
audios
[
i
])
is
not
None
:
multi_modal_data
[
"audio"
]
=
audio
multi_modal_data
=
dict
[
str
,
Any
]()
if
images
is
not
None
and
(
image
:
=
images
[
i
])
is
not
None
:
multi_modal_data
[
"image"
]
=
image
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
multi_modal_data
[
"video"
]
=
video
if
audios
is
not
None
and
(
audio
:
=
audios
[
i
])
is
not
None
:
multi_modal_data
[
"audio"
]
=
audio
if
multi_modal_data
:
if
multi_modal_data
:
prompt_dict
[
"multi_modal_data"
]
=
multi_modal_data
prompt_dict
[
"multi_modal_data"
]
=
multi_modal_data
inputs
.
append
(
prompt_dict
)
inputs
.
append
(
prompt_dict
)
return
inputs
return
inputs
...
@@ -1138,6 +1147,15 @@ class VllmRunner:
...
@@ -1138,6 +1147,15 @@ class VllmRunner:
return
self
return
self
def
__exit__
(
self
,
exc_type
,
exc_value
,
traceback
):
def
__exit__
(
self
,
exc_type
,
exc_value
,
traceback
):
# Explicitly shutdown the engine core to release GPU resources
# This is needed because when executing consecutive tests, the GC
# might not be fast enough in shutting down the llm engine. This can lead to OOMs
# because when the next test starts some GPU memory is still in use.
try
:
self
.
llm
.
llm_engine
.
engine_core
.
shutdown
()
except
Exception
:
# Ignore shutdown errors as cleanup will still proceed
pass
del
self
.
llm
del
self
.
llm
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
...
@@ -1517,7 +1535,7 @@ def clean_gpu_memory_between_tests():
...
@@ -1517,7 +1535,7 @@ def clean_gpu_memory_between_tests():
from
tests.utils
import
wait_for_gpu_memory_to_clear
from
tests.utils
import
wait_for_gpu_memory_to_clear
num_gpus
=
torch
.
cuda
.
device_count
()
num_gpus
=
torch
.
accelerator
.
device_count
()
if
num_gpus
>
0
:
if
num_gpus
>
0
:
try
:
try
:
wait_for_gpu_memory_to_clear
(
wait_for_gpu_memory_to_clear
(
...
@@ -1531,7 +1549,7 @@ def clean_gpu_memory_between_tests():
...
@@ -1531,7 +1549,7 @@ def clean_gpu_memory_between_tests():
# Clean up GPU memory after the test
# Clean up GPU memory after the test
if
torch
.
cuda
.
is_available
():
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
empty_cache
()
torch
.
accelerator
.
empty_cache
()
gc
.
collect
()
gc
.
collect
()
...
@@ -1546,6 +1564,14 @@ def use_fresh_inductor_cache():
...
@@ -1546,6 +1564,14 @@ def use_fresh_inductor_cache():
yield
yield
@pytest.fixture
def fresh_vllm_cache(monkeypatch, use_fresh_inductor_cache):
    """Point VLLM_CACHE_ROOT at a throwaway directory for one test.

    Requesting ``use_fresh_inductor_cache`` as a parameter makes that
    fixture active for the same test. The temporary directory path is
    yielded to the test and removed when the ``with`` block exits.
    """
    with tempfile.TemporaryDirectory() as cache_root:
        # monkeypatch restores the previous environment value on teardown.
        monkeypatch.setenv("VLLM_CACHE_ROOT", cache_root)
        yield cache_root
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
enable_pickle
(
monkeypatch
):
def
enable_pickle
(
monkeypatch
):
"""`LLM.apply_model` requires pickling a function."""
"""`LLM.apply_model` requires pickling a function."""
...
...
tests/cuda/scripts/check_device_count_respects_env.py
View file @
3fb4b5fa
...
@@ -14,7 +14,7 @@ import torch # noqa: E402
...
@@ -14,7 +14,7 @@ import torch # noqa: E402
from
vllm.platforms
import
current_platform
# noqa: F401, E402
from
vllm.platforms
import
current_platform
# noqa: F401, E402
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
"0"
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
"0"
count
=
torch
.
cuda
.
device_count
()
count
=
torch
.
accelerator
.
device_count
()
if
count
==
0
:
if
count
==
0
:
sys
.
exit
(
0
)
# Skip: no GPUs available
sys
.
exit
(
0
)
# Skip: no GPUs available
...
...
tests/cuda/test_cuda_compatibility_path.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for CUDA forward compatibility path logic in env_override.py.
Verifies the opt-in LD_LIBRARY_PATH manipulation for CUDA compat libs,
including env var parsing, path detection, and deduplication.
"""
import
os
from
unittest.mock
import
patch
import
pytest
# Import the functions directly (they're module-level in env_override)
# We must import them without triggering the module-level side effects,
# so we import the functions by name after the module is already loaded.
from
vllm.env_override
import
(
_get_torch_cuda_version
,
_maybe_set_cuda_compatibility_path
,
)
class TestCudaCompatibilityEnvParsing:
    """Checks how VLLM_ENABLE_CUDA_COMPATIBILITY values are interpreted."""

    def test_disabled_by_default(self, monkeypatch):
        """With the flag unset, LD_LIBRARY_PATH must stay absent or empty."""
        for name in ("VLLM_ENABLE_CUDA_COMPATIBILITY", "LD_LIBRARY_PATH"):
            monkeypatch.delenv(name, raising=False)

        _maybe_set_cuda_compatibility_path()

        # A missing variable and an empty one are both acceptable outcomes;
        # the default of "" folds the two cases into a single comparison.
        assert os.environ.get("LD_LIBRARY_PATH", "") == ""

    @pytest.mark.parametrize("value", ["0", "false", "False", "no", ""])
    def test_disabled_values(self, monkeypatch, value):
        """Falsy flag values must not add a compat entry to LD_LIBRARY_PATH."""
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)

        _maybe_set_cuda_compatibility_path()

        # LD_LIBRARY_PATH may be unset or empty, but never mention "compat".
        assert "compat" not in os.environ.get("LD_LIBRARY_PATH", "")

    @pytest.mark.parametrize("value", ["1", "true", "True", " 1 ", " TRUE "])
    def test_enabled_values_with_valid_path(self, monkeypatch, tmp_path, value):
        """Truthy flag values add an existing compat directory to the path."""
        target = tmp_path / "compat"
        target.mkdir()

        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(target))
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)

        _maybe_set_cuda_compatibility_path()

        assert str(target) in os.environ.get("LD_LIBRARY_PATH", "")
class TestCudaCompatibilityPathDetection:
    """Test path detection: custom override, conda, default."""

    def test_custom_path_override(self, monkeypatch, tmp_path):
        """VLLM_CUDA_COMPATIBILITY_PATH takes highest priority."""
        custom_dir = tmp_path / "my-compat"
        custom_dir.mkdir()
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(custom_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert ld_path.startswith(str(custom_dir))

    def test_conda_prefix_fallback(self, monkeypatch, tmp_path):
        """Falls back to $CONDA_PREFIX/cuda-compat if custom not set."""
        conda_dir = tmp_path / "conda-env"
        compat_dir = conda_dir / "cuda-compat"
        compat_dir.mkdir(parents=True)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.setenv("CONDA_PREFIX", str(conda_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert str(compat_dir) in ld_path

    def test_no_valid_path_does_nothing(self, monkeypatch):
        """When enabled but no valid path exists, LD_LIBRARY_PATH unchanged."""
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", "/nonexistent/path")
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        # Stub out the torch version lookup so it reports no CUDA version.
        with patch("vllm.env_override._get_torch_cuda_version", return_value=None):
            _maybe_set_cuda_compatibility_path()

        assert os.environ.get("LD_LIBRARY_PATH", "") == ""

    def test_default_cuda_path_fallback(self, monkeypatch):
        """Falls back to /usr/local/cuda-{ver}/compat via torch version."""
        default_compat = "/usr/local/cuda-12.8/compat"
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        # Capture the genuine implementation BEFORE patching. The patch
        # target "vllm.env_override.os.path.isdir" resolves to the shared
        # os.path module, so looking up os.path.isdir inside the stub would
        # return the mock itself and recurse without bound for any path
        # other than the stubbed one.
        real_isdir = os.path.isdir

        def fake_isdir(path):
            # Pretend only the default compat directory exists; defer to the
            # real filesystem check for everything else.
            return path == default_compat or real_isdir(path)

        with (
            patch("vllm.env_override._get_torch_cuda_version", return_value="12.8"),
            patch("vllm.env_override.os.path.isdir", side_effect=fake_isdir),
        ):
            _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert default_compat in ld_path
class TestCudaCompatibilityLdPathManipulation:
    """Checks how the compat directory is merged into LD_LIBRARY_PATH."""

    @staticmethod
    def _make_compat_dir(tmp_path):
        """Create and return an empty ``compat`` directory under tmp_path."""
        directory = tmp_path / "compat"
        directory.mkdir()
        return directory

    @staticmethod
    def _enable_compat(monkeypatch, directory, ld_library_path=None):
        """Turn the feature on, point it at ``directory``, seed LD_LIBRARY_PATH.

        ``ld_library_path=None`` removes the variable; any other value is
        set verbatim.
        """
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(directory))
        if ld_library_path is None:
            monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
        else:
            monkeypatch.setenv("LD_LIBRARY_PATH", ld_library_path)

    def test_prepends_to_empty_ld_path(self, monkeypatch, tmp_path):
        """An unset LD_LIBRARY_PATH ends up equal to the compat directory."""
        directory = self._make_compat_dir(tmp_path)
        self._enable_compat(monkeypatch, directory)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == str(directory)

    def test_prepends_to_existing_ld_path(self, monkeypatch, tmp_path):
        """The compat directory goes first; prior entries are retained."""
        directory = self._make_compat_dir(tmp_path)
        self._enable_compat(monkeypatch, directory, "/usr/lib:/other/lib")

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == str(directory)
        assert "/usr/lib" in entries
        assert "/other/lib" in entries

    def test_deduplicates_existing_compat_path(self, monkeypatch, tmp_path):
        """A compat entry already present is moved to the front, not repeated."""
        directory = self._make_compat_dir(tmp_path)
        self._enable_compat(
            monkeypatch,
            directory,
            f"/usr/lib:{directory}:/other/lib",
        )

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == str(directory)
        assert entries.count(str(directory)) == 1

    def test_already_at_front_is_noop(self, monkeypatch, tmp_path):
        """LD_LIBRARY_PATH is untouched when compat is already the first entry."""
        directory = self._make_compat_dir(tmp_path)
        initial = f"{directory}:/usr/lib"
        self._enable_compat(monkeypatch, directory, initial)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == initial
class TestGetTorchCudaVersion:
    """Exercises the _get_torch_cuda_version() helper."""

    def test_returns_string_when_torch_available(self):
        """The helper yields either a version string or None."""
        result = _get_torch_cuda_version()
        # None is tolerated here, so this only pins the return type.
        assert isinstance(result, (str, type(None)))

    def test_returns_none_when_torch_missing(self):
        """The helper yields None when find_spec reports no module."""
        no_spec = patch(
            "vllm.env_override.importlib.util.find_spec",
            return_value=None,
        )
        with no_spec:
            result = _get_torch_cuda_version()
        assert result is None
tests/detokenizer/test_disable_detokenization.py
View file @
3fb4b5fa
...
@@ -7,7 +7,6 @@ from vllm.entrypoints.llm import LLM
...
@@ -7,7 +7,6 @@ from vllm.entrypoints.llm import LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
def
test_computed_prefix_blocks
(
model
:
str
):
def
test_computed_prefix_blocks
(
model
:
str
):
# This test checks if the engine generates completions both with and
# This test checks if the engine generates completions both with and
...
...
Prev
1
…
18
19
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment