Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1406 additions
and
113 deletions
+1406
-113
tests/compile/passes/test_rope_kvcache_fusion.py
tests/compile/passes/test_rope_kvcache_fusion.py
+334
-0
tests/compile/passes/test_scatter_split_replace.py
tests/compile/passes/test_scatter_split_replace.py
+107
-0
tests/compile/passes/test_silu_mul_quant_fusion.py
tests/compile/passes/test_silu_mul_quant_fusion.py
+27
-15
tests/compile/test_aot_compile.py
tests/compile/test_aot_compile.py
+177
-0
tests/compile/test_cold_start.py
tests/compile/test_cold_start.py
+0
-48
tests/compile/test_compile_ranges.py
tests/compile/test_compile_ranges.py
+89
-6
tests/compile/test_config.py
tests/compile/test_config.py
+43
-0
tests/compile/test_decorator.py
tests/compile/test_decorator.py
+1
-1
tests/compile/test_dynamic_shapes_compilation.py
tests/compile/test_dynamic_shapes_compilation.py
+2
-2
tests/compile/test_graph_partition.py
tests/compile/test_graph_partition.py
+144
-1
tests/compile/test_sequence_parallelism_threshold.py
tests/compile/test_sequence_parallelism_threshold.py
+110
-0
tests/compile/test_startup.py
tests/compile/test_startup.py
+71
-0
tests/compile/test_structured_logging.py
tests/compile/test_structured_logging.py
+3
-3
tests/compile/test_wrapper.py
tests/compile/test_wrapper.py
+1
-1
tests/config/test_config_generation.py
tests/config/test_config_generation.py
+31
-0
tests/config/test_multimodal_config.py
tests/config/test_multimodal_config.py
+18
-0
tests/conftest.py
tests/conftest.py
+60
-34
tests/cuda/scripts/check_device_count_respects_env.py
tests/cuda/scripts/check_device_count_respects_env.py
+1
-1
tests/cuda/test_cuda_compatibility_path.py
tests/cuda/test_cuda_compatibility_path.py
+187
-0
tests/detokenizer/test_disable_detokenization.py
tests/detokenizer/test_disable_detokenization.py
+0
-1
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
tests/compile/passes/test_rope_kvcache_fusion.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
import
vllm.config
from
tests.compile.backend
import
TestBackend
from
tests.v1.attention.utils
import
BatchSpec
,
create_common_attn_metadata
from
vllm._aiter_ops
import
is_aiter_found_and_supported
,
rocm_aiter_ops
from
vllm.compilation.passes.fusion.matcher_utils
import
ROTARY_OP
from
vllm.compilation.passes.fusion.rope_kvcache_fusion
import
RopeKVCacheFusionPass
from
vllm.compilation.passes.utility.noop_elimination
import
NoOpEliminationPass
from
vllm.compilation.passes.utility.post_cleanup
import
PostCleanupPass
from
vllm.compilation.passes.utility.scatter_split_replace
import
(
ScatterSplitReplacementPass
,
)
from
vllm.compilation.passes.utility.split_coalescing
import
SplitCoalescingPass
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
CompilationMode
,
ModelConfig
,
PassConfig
,
VllmConfig
,
)
from
vllm.forward_context
import
get_forward_context
,
set_forward_context
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.platforms
import
current_platform
from
vllm.v1.attention.backend
import
(
AttentionBackend
,
CommonAttentionMetadata
,
)
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.kv_cache_interface
import
AttentionSpec
# ATen overload the test model expects to find in the graph when the RoPE
# custom op is disabled (see QKRoPEKVCacheTestModel.ops_in_model_before).
INDEX_SELECT_OP = torch.ops.aten.index.Tensor
# Handle to the vLLM ``unified_kv_cache_update`` custom op.
# NOTE(review): the model below references
# ``torch.ops.vllm.unified_kv_cache_update`` directly rather than through this
# alias -- confirm whether the alias is still needed.
VLLM_UNIFIED_KV_CACHE_UPDATE_OP = torch.ops.vllm.unified_kv_cache_update
# FP8 dtype reported by the current platform; used as the KV-cache dtype when
# the configured cache dtype string starts with "fp8".
FP8_DTYPE = current_platform.fp8_dtype()
class QKRoPEKVCacheTestModel(torch.nn.Module):
    """Test module that splits QKV, applies RoPE to Q/K and updates the KV cache.

    Only the ``unified_kv_cache_update`` op is issued after the rotary
    embedding (no full attention forward), so the RoPE + KV-cache fusion
    pass has exactly one pattern to match.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        attn_backend: AttentionBackendEnum,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        is_neox: bool,
        dtype: torch.dtype,
        device: torch.device,
        prefix: str = "model.layers.0.self_attn.attn",
    ):
        super().__init__()
        cache_config = vllm_config.cache_config

        # Plain bookkeeping attributes.
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_size = head_size
        self.block_size = cache_config.block_size
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size
        self.is_neox = is_neox
        self.dtype = dtype
        self.device = device
        self.layer_name = prefix

        self.rotary_emb = RotaryEmbedding(
            head_size,
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=is_neox,
            dtype=self.dtype,
        )

        # Decides whether the graph is checked for the RoPE custom op or
        # for the index op of the decomposed implementation.
        self.enable_rope_custom_op = self.rotary_emb.enabled()

        # The Attention layer registers the layer metadata that the fusion
        # pass looks up.
        self.attn = Attention(
            num_heads=num_heads,
            head_size=head_size,
            scale=1.0 / head_size**0.5,
            num_kv_heads=num_kv_heads,
            cache_config=cache_config,
            quant_config=vllm_config.quant_config,
            prefix=prefix,
            attn_backend=attn_backend.get_class(),
        )
        self.attn_backend: type[AttentionBackend] = self.attn.get_attn_backend()
        assert not self.attn_backend.forward_includes_kv_cache_update, (
            f"Attention backend {self.attn_backend} does not support fuse_rope_kvcache."
        )
        self.attn._k_scale = self.attn._k_scale.to(device)
        self.attn._v_scale = self.attn._v_scale.to(device)

        # Any "fp8*" cache dtype string maps to the platform FP8 dtype;
        # everything else stores the cache in the model dtype.
        if cache_config.cache_dtype.startswith("fp8"):
            self.kv_cache_dtype = FP8_DTYPE
        else:
            self.kv_cache_dtype = self.dtype

        # Metadata builder of the selected attention backend.
        builder_cls = self.attn.attn_backend.get_builder_cls()
        self.builder = builder_cls(
            kv_cache_spec=AttentionSpec(
                block_size=self.block_size,
                num_kv_heads=self.num_kv_heads,
                head_size=head_size,
                dtype=self.kv_cache_dtype,
            ),
            layer_names=[self.attn.layer_name],
            vllm_config=vllm_config,
            device=device,
        )

    def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
        """Allocate a zeroed KV cache and build attention metadata.

        Each of the ``batch_size`` requests has sequence and query length 1.
        The new cache is installed on ``self.attn.kv_cache`` as a side effect.
        """
        ones = [1] * batch_size
        batch_spec = BatchSpec(seq_lens=ones, query_lens=ones)
        common_attn_metadata = create_common_attn_metadata(
            batch_spec, self.block_size, self.device, arange_block_indices=True
        )

        # Blocks needed per request (ceil division), times the batch size.
        blocks_per_request = (
            max(batch_spec.seq_lens) + self.block_size - 1
        ) // self.block_size
        num_blocks = batch_size * blocks_per_request

        # Logical cache shape and physical stride order of the backend.
        backend_cls = self.attn.attn_backend
        logical_shape = backend_cls.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_kv_heads, self.head_size
        )
        try:
            stride_order = backend_cls.get_kv_cache_stride_order()
        except (AttributeError, NotImplementedError):
            # Backend declares no custom layout: use the identity order.
            stride_order = tuple(range(len(logical_shape)))

        physical_shape = tuple(logical_shape[axis] for axis in stride_order)
        inverse_order = [
            stride_order.index(axis) for axis in range(len(stride_order))
        ]

        # Dummy cache: allocate flat, view in physical order, then permute
        # back so the tensor is indexed in logical order.
        flat_cache = torch.zeros(
            2 * num_blocks * self.block_size * self.num_kv_heads * self.head_size,
            dtype=self.kv_cache_dtype,
            device=self.device,
        )
        kv_cache = flat_cache.view(physical_shape).permute(*inverse_order)
        self.attn.kv_cache = [kv_cache]

        return self.builder.build(
            common_prefix_len=0, common_attn_metadata=common_attn_metadata
        )

    def forward(
        self, qkv: torch.Tensor, positions: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(q, k, v, kv_cache_dummy_dep)`` after RoPE and cache update."""
        # Work on a copy so in-place ops never touch the caller's tensor.
        qkv = qkv.clone()
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)

        # Only the KV cache update op is issued; there is no full attention
        # forward pass here.
        q = q.view(-1, self.num_heads, self.head_size)
        k = k.view(-1, self.num_kv_heads, self.head_size)
        v = v.view(-1, self.num_kv_heads, self.head_size)
        kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
            k, v, self.layer_name
        )
        return q, k, v, kv_cache_dummy_dep

    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph before the fusion pass runs."""
        if not self.enable_rope_custom_op:
            rope_op = INDEX_SELECT_OP
        elif rocm_aiter_ops.is_triton_rotary_embed_enabled():
            rope_op = torch.ops.vllm.rocm_aiter_triton_rotary_embedding.default
        else:
            rope_op = ROTARY_OP
        return [rope_op, torch.ops.vllm.unified_kv_cache_update.default]

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph after the fusion pass runs."""
        return [torch.ops.vllm.fused_rope_and_unified_kv_cache_update.default]
@pytest.mark.parametrize(
    "attn_backend",
    [
        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
        AttentionBackendEnum.TRITON_ATTN,
        AttentionBackendEnum.ROCM_ATTN,
        AttentionBackendEnum.ROCM_AITER_FA,
    ],
)
# NOTE: only the RoPE custom-op path (True) is parametrized; the False case
# is currently not exercised.
@pytest.mark.parametrize("enable_rope_custom_op", [True])
@pytest.mark.parametrize("enable_aiter_triton_rope", [True, False])
@pytest.mark.parametrize("num_heads", [64])
@pytest.mark.parametrize("num_kv_heads", [8])
@pytest.mark.parametrize("head_size", [64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("is_neox", [True, False])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.skipif(
    not is_aiter_found_and_supported(),
    reason="Only test on ROCm with AITER installed and supported",
)
def test_rope_kvcache_fusion(
    attn_backend: AttentionBackendEnum,
    enable_rope_custom_op: bool,
    enable_aiter_triton_rope: bool,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    block_size: int,
    is_neox: bool,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    monkeypatch: pytest.MonkeyPatch,
):
    """Compare eager and compiled (fused) RoPE + KV-cache-update results.

    The model runs once eagerly and once through ``torch.compile`` with the
    fusion pass enabled. The pass must match exactly once, and Q/K/V and the
    KV cache contents must agree within tolerance.
    """
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    custom_ops: list[str] = ["+rotary_embedding"] if enable_rope_custom_op else []

    vllm_config = VllmConfig(
        model_config=ModelConfig(dtype=dtype),
        cache_config=CacheConfig(
            block_size=block_size,
            cache_dtype=kv_cache_dtype,
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=custom_ops,
            pass_config=PassConfig(
                fuse_rope_kvcache=True,
                eliminate_noops=True,
            ),
        ),
    )

    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
        # AITER is always on; the Triton RoPE kernel follows the parameter.
        m.setenv("VLLM_ROCM_USE_AITER", "1")
        triton_rope_flag = "1" if enable_aiter_triton_rope else "0"
        m.setenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", triton_rope_flag)
        rocm_aiter_ops.refresh_env_variables()

        model = QKRoPEKVCacheTestModel(
            vllm_config=vllm_config,
            attn_backend=attn_backend,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            is_neox=is_neox,
            dtype=dtype,
            device=torch.get_default_device(),
        )

        fusion_pass = RopeKVCacheFusionPass(vllm_config)
        backend = TestBackend(
            NoOpEliminationPass(vllm_config),
            SplitCoalescingPass(vllm_config),
            ScatterSplitReplacementPass(vllm_config),
            fusion_pass,
            PostCleanupPass(vllm_config),
        )

        num_tokens = 5
        hidden_size = num_heads * head_size + 2 * num_kv_heads * head_size
        qkv = torch.randn(num_tokens, hidden_size, dtype=dtype)
        pos = torch.arange(num_tokens, dtype=torch.long)
        qkv_unfused = qkv.clone()
        pos_unfused = pos.clone()

        # Reference run: eager model, fresh KV cache.
        with set_forward_context(None, vllm_config):
            forward_context = get_forward_context()
            attn_metadata = model.build_attn_metadata(num_tokens)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_unfused, k_unfused, v_unfused, dummy = model(qkv_unfused, pos_unfused)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_unfused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        # Compiled run: mark the token dimension dynamic, rebuild the cache.
        torch._dynamo.mark_dynamic(qkv, 0)
        torch._dynamo.mark_dynamic(pos, 0)
        with set_forward_context(None, vllm_config):
            model_fused = torch.compile(model, backend=backend)
            forward_context = get_forward_context()
            attn_metadata = model_fused.build_attn_metadata(num_tokens)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_fused, k_fused, v_fused, dummy = model_fused(qkv, pos)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_fused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        assert fusion_pass.matched_count == 1

        backend.check_before_ops(model.ops_in_model_before())
        backend.check_after_ops(model.ops_in_model_after())

        # Tighter tolerance for float16, looser for every other dtype.
        atol, rtol = (2e-3, 2e-3) if dtype == torch.float16 else (1e-2, 1e-2)

        for unfused, fused in (
            (q_unfused, q_fused),
            (k_unfused, k_fused),
            (v_unfused, v_fused),
        ):
            torch.testing.assert_close(unfused, fused, atol=atol, rtol=rtol)

        # fp8 tensors cannot be compared directly, so both caches are viewed
        # as the model dtype first (``Tensor.view(dtype)`` reinterprets the
        # storage rather than converting values).
        torch.testing.assert_close(
            kv_cache_unfused.view(dtype),
            kv_cache_fused.view(dtype),
            atol=atol,
            rtol=rtol,
        )
tests/compile/passes/test_scatter_split_replace.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
import
torch.nn
as
nn
import
vllm
from
tests.compile.backend
import
TestBackend
from
vllm.compilation.passes.utility.scatter_split_replace
import
(
ScatterSplitReplacementPass
,
)
from
vllm.compilation.passes.utility.split_coalescing
import
SplitCoalescingPass
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
VllmConfig
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
class ScatterSplitReplacementModel(nn.Module):
    """Toy model: split QKV, apply RoPE to Q/K, then offset each output.

    Per ``ops_in_model_before``, the traced graph is expected to contain
    ``slice_scatter``, ``split_with_sizes`` and ``getitem`` ops.
    """

    def __init__(
        self,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        dtype: torch.dtype,
    ):
        super().__init__()
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size

        self.rotary_emb = RotaryEmbedding(
            head_size,
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=True,
            dtype=dtype,
        )

    def forward(self, qkv: torch.Tensor, positions: torch.Tensor):
        """Return ``(q + 1, k + 2, v + 3)`` with RoPE applied to q and k."""
        # Clone first so in-place ops cannot modify the caller's tensor.
        qkv = qkv.clone()
        split_sizes = [self.q_size, self.kv_size, self.kv_size]
        q, k, v = qkv.split(split_sizes, dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        return q + 1, k + 2, v + 3

    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph before the replacement pass."""
        aten = torch.ops.aten
        return [
            aten.slice_scatter.default,
            aten.split_with_sizes.default,
            aten.getitem.default,
        ]

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph after the replacement pass."""
        return [torch.ops.aten.getitem.default]
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scatter_split_replace(dtype):
    """Compiled output must match eager, with slice_scatter removed.

    After SplitCoalescingPass and ScatterSplitReplacementPass the graph must
    contain no ``slice_scatter`` and exactly one ``split_with_sizes``.
    """
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    num_heads, num_kv_heads, head_size = 8, 4, 64

    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=["+rotary_embedding"],
        ),
    )

    with vllm.config.set_current_vllm_config(vllm_config):
        # SplitCoalescingPass has to run before ScatterSplitReplacementPass.
        backend = TestBackend(
            SplitCoalescingPass(vllm_config),
            ScatterSplitReplacementPass(vllm_config),
        )

        model = ScatterSplitReplacementModel(
            num_heads, num_kv_heads, head_size, dtype
        )

        num_tokens = 5
        hidden_size = num_heads * head_size + 2 * num_kv_heads * head_size
        qkv = torch.randn(num_tokens, hidden_size, dtype=dtype)
        pos = torch.arange(num_tokens, dtype=torch.long)

        # Eager reference on copies of the inputs.
        result_eager = model(qkv.clone(), pos.clone())

        # Compiled run with a dynamic token dimension.
        torch._dynamo.mark_dynamic(qkv, 0)
        torch._dynamo.mark_dynamic(pos, 0)
        model_compiled = torch.compile(model, backend=backend)
        result_compiled = model_compiled(qkv, pos)

        for expected, actual in zip(result_eager, result_compiled):
            torch.testing.assert_close(expected, actual)

        aten = torch.ops.aten
        assert backend.op_count(aten.slice_scatter.default) == 0
        assert backend.op_count(aten.split_with_sizes.default) == 1
tests/compile/passes/test_silu_mul_quant_fusion.py
View file @
3fb4b5fa
...
...
@@ -26,22 +26,14 @@ from vllm.config import (
VllmConfig
,
set_current_vllm_config
,
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass
import
(
from
vllm.model_executor.kernels.linear
import
(
CutlassFP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.flashinfer
import
(
FlashInferFP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch
import
(
FP8ScaledMMLinearKernel
,
PerTensorTorchFP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.rocm
import
(
ROCmFP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel
import
(
# noqa: E501
FP8ScaledMMLinearKernel
,
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
W8A8BlockFp8LinearOp
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
,
...
...
@@ -190,8 +182,24 @@ TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
"model_class, enable_quant_fp8_custom_op, force_kernel"
,
list
(
itertools
.
product
([
TestSiluMulFp8QuantModel
],
[
True
,
False
],
TEST_KERNELS
))
+
[
(
TestSiluMulNvfp4QuantModel
,
False
,
None
),
(
TestSiluMulGroupFp8QuantModel
,
False
,
None
),
pytest
.
param
(
TestSiluMulNvfp4QuantModel
,
False
,
None
,
marks
=
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"CUDA only"
),
),
# GroupFP8Quant fusion only works with AITER on ROCm.
# and the enable_quant_fp8_custom_op must be True.
pytest
.
param
(
TestSiluMulGroupFp8QuantModel
,
True
,
None
,
marks
=
pytest
.
mark
.
skipif
(
not
current_platform
.
is_rocm
(),
reason
=
"ROCm only"
),
),
],
)
@
pytest
.
mark
.
skipif
(
...
...
@@ -209,6 +217,7 @@ def test_fusion_silu_and_mul_quant(
enable_silu_mul_custom_op
:
bool
,
enable_quant_fp8_custom_op
:
bool
,
force_kernel
:
FP8ScaledMMLinearKernel
|
None
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
if
model_class
is
TestSiluMulNvfp4QuantModel
and
not
is_nvfp4_supported
():
pytest
.
skip
(
"NVFP4 is not supported on this GPU."
)
...
...
@@ -235,13 +244,16 @@ def test_fusion_silu_and_mul_quant(
),
)
with
set_current_vllm_config
(
config
):
with
set_current_vllm_config
(
config
)
,
monkeypatch
.
context
()
as
m
:
fusion_passes
=
[
ActivationQuantFusionPass
(
config
)]
if
IS_AITER_FOUND
:
if
IS_AITER_FOUND
and
model_class
is
TestSiluMulGroupFp8QuantModel
:
from
vllm._aiter_ops
import
rocm_aiter_ops
from
vllm.compilation.passes.fusion.rocm_aiter_fusion
import
(
RocmAiterSiluMulFp8GroupQuantFusionPass
,
)
m
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
rocm_aiter_ops
.
refresh_env_variables
()
fusion_passes
+=
[
RocmAiterSiluMulFp8GroupQuantFusionPass
(
config
)]
passes
=
[
NoOpEliminationPass
(
config
),
*
fusion_passes
,
PostCleanupPass
(
config
)]
...
...
tests/compile/test_aot_compile.py
View file @
3fb4b5fa
...
...
@@ -4,6 +4,7 @@
import
functools
import
hashlib
import
multiprocessing
import
os
import
pickle
import
tempfile
from
contextlib
import
contextmanager
...
...
@@ -14,9 +15,12 @@ import pytest
import
torch
import
vllm.model_executor.layers.activation
from
vllm.compilation.backends
import
VllmBackend
from
vllm.compilation.caching
import
(
StandaloneCompiledArtifacts
,
VllmSerializableFunction
,
)
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
...
...
@@ -156,6 +160,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
assert
torch
.
allclose
(
ret
,
expected
)
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
    """A graph with a ``slice`` argument must survive serialize/deserialize."""

    def foo(x: torch.Tensor):
        return x[slice(0, x.shape[0])]

    vllm_config = make_vllm_config()

    example_input = torch.randn(10, 10)
    torch._dynamo.mark_dynamic(example_input, 0)

    gm = torch.fx.symbolic_trace(foo)
    # The traced code must really contain the slice-with-symbolic-bound form.
    assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code

    with use_vllm_config(vllm_config):
        serializable = VllmSerializableFunction(gm, (example_input,), "", foo)
        payload = VllmSerializableFunction.serialize_compile_artifacts(serializable)
        fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)

    # The round trip must preserve the generated code exactly.
    assert gm.code == fn.graph_module.code
@
pytest
.
mark
.
skipif
(
not
is_torch_equal_or_newer
(
"2.10.0"
),
reason
=
"requires torch 2.10"
)
def
test_cache_load_returns_tuple_consistency
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
...
...
@@ -700,3 +724,156 @@ class TestStandaloneCompiledArtifactsIntegration:
(
"mod3"
,
"shape3"
),
]:
assert
cache
.
get
(
submod
,
shape
)
==
shared_data
def test_functorch_config(self):
    """The functorch config captured at serialization must be restored on load.

    The function is serialized with ``bundled_autograd_cache=True`` while the
    ambient context sets it to ``False``; the backend invoked by the loaded
    function must observe ``True``.
    """
    vllm_config = make_vllm_config()
    example_inputs = (torch.randn(10, 10),)

    def add_1(x: torch.Tensor):
        return x + 1

    capture = torch._dynamo.functional_export.dynamo_graph_capture_for_export(add_1)
    gm = capture(*example_inputs)

    gm.graph._codegen = torch.fx.graph.CodeGen()
    gm._dynamo_bytecode_flatten = None
    gm._dynamo_bytecode_unflatten = None

    with (
        torch._functorch.config.patch(bundled_autograd_cache=False),
        set_current_vllm_config(vllm_config),
    ):
        with torch._functorch.config.patch(bundled_autograd_cache=True):
            fn = VllmSerializableFunction(gm, example_inputs, "", add_1)
            payload = VllmSerializableFunction.serialize_compile_artifacts(fn)

        observed_config = None

        def backend(*args, **kwargs) -> VllmSerializableFunction:
            nonlocal observed_config
            # bundled_autograd_cache is expected to be True here even though
            # the ambient context has it set to False.
            observed_config = torch._functorch.config.save_config_portable()
            return fn

        loaded_fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
        with patch.object(VllmBackend, "__call__", backend):
            loaded_fn(*example_inputs)

        assert isinstance(observed_config, dict)
        assert "bundled_autograd_cache" in observed_config
        assert observed_config["bundled_autograd_cache"] is True
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_save(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be saved."""
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)
    expected = reference_fn(*args)
    vllm_config = make_vllm_config()

    # One AOT compile is expected, with nothing saved and nothing loaded.
    with (
        use_vllm_config(vllm_config),
        compilation_counter.expect(
            num_aot_compiles=1,
            num_aot_artifacts_saved=0,
            num_aot_artifacts_loaded=0,
        ),
    ):
        mod = CompiledMod(vllm_config=vllm_config)
        actual = mod(*args)

    assert torch.allclose(actual, expected)

    # No cached artifact named "model" may exist under the AOT cache dir.
    aot_dir = os.path.join(fresh_vllm_cache, "torch_compile_cache", "torch_aot_compile")
    if not os.path.isdir(aot_dir):
        return
    for root, _dirs, files in os.walk(aot_dir):
        for filename in files:
            assert filename != "model", (
                f"AOT artifact unexpectedly saved at {os.path.join(root, filename)}"
            )
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be loaded."""
    # Phase 1: with the cache enabled, compile once so an artifact is saved.
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)
    first_config = make_vllm_config()
    with (
        use_vllm_config(first_config),
        compilation_counter.expect(num_aot_artifacts_saved=1),
    ):
        CompiledMod(vllm_config=first_config)(*args)

    # Phase 2: disable the cache and compile again; the artifact saved in
    # phase 1 must not be loaded from disk.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    disable_envs_cache()
    torch._dynamo.reset()

    second_config = make_vllm_config()
    with (
        use_vllm_config(second_config),
        compilation_counter.expect(
            num_aot_compiles=1,
            num_aot_artifacts_saved=0,
            num_aot_artifacts_loaded=0,
        ),
    ):
        mod = CompiledMod(vllm_config=second_config)
        mod(*args)

    assert not mod.was_aot_compile_fn_loaded_from_disk
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_aot_counters_on_save_and_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """Verify AOT counters are incremented correctly on save and load."""
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)

    # Phase 1: fresh compile -> one compile, one artifact saved, none loaded.
    save_config = make_vllm_config()
    save_expectation = compilation_counter.expect(
        num_aot_compiles=1,
        num_aot_artifacts_saved=1,
        num_aot_artifacts_loaded=0,
    )
    with use_vllm_config(save_config), save_expectation:
        CompiledMod(vllm_config=save_config)(*args)

    # Phase 2: forced load -> no compile, nothing saved, one artifact loaded.
    monkeypatch.setenv("VLLM_FORCE_AOT_LOAD", "1")
    disable_envs_cache()

    load_config = make_vllm_config()
    load_expectation = compilation_counter.expect(
        num_aot_compiles=0,
        num_aot_artifacts_saved=0,
        num_aot_artifacts_loaded=1,
    )
    with use_vllm_config(load_config), load_expectation:
        CompiledMod(vllm_config=load_config)(*args)
tests/compile/test_cold_start.py
deleted
100644 → 0
View file @
bcf25339
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
torch._dynamo.utils
import
counters
from
vllm
import
LLM
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
CUDAGraphMode
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """A cold start must compile only the unique subgraphs of the model."""
    env_overrides = (
        # Stay in-process so PyTorch's internal counters are readable.
        ("VLLM_ENABLE_V1_MULTIPROCESSING", "0"),
        # Unclear whether AOT compile would affect the counts; keep it off.
        ("VLLM_USE_AOT_COMPILE", "0"),
        # Force a cold compilation.
        ("VLLM_DISABLE_COMPILE_CACHE", "1"),
    )
    for name, value in env_overrides:
        monkeypatch.setenv(name, value)

    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
    )

    counters.clear()

    # Dummy weights, a short max length and few GPU blocks keep loading fast.
    _ = LLM(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        load_format="dummy",
        compilation_config=compilation_config,
        num_gpu_blocks_override=8,
    )

    # vLLM-compile cold start is special: by default there is one full dynamo
    # capture of the whole forward pass (32 transformer layers), which is
    # then split on the attention operation into 33 subgraphs (not counting
    # the attention op itself). Compiled artifacts are generated only for the
    # unique subgraphs.
    #
    # This model has just 3 unique subgraphs, because all transformer layers
    # are identical modulo weights (true for most vLLM models). So a cold
    # start should compile exactly 3 unique subgraphs and hit the cache 0 times.
    aot_stats = counters["aot_autograd"]
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
tests/compile/test_compile_ranges.py
View file @
3fb4b5fa
...
...
@@ -73,6 +73,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
Range
(
start
=
16
,
end
=
16
),
Range
(
start
=
9
,
end
=
32
),
Range
(
start
=
64
,
end
=
64
),
Range
(
start
=
128
,
end
=
128
),
Range
(
start
=
33
,
end
=
8192
),
]
)
...
...
@@ -85,7 +86,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
),
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
compile_ranges_
split_
points
=
[
8
,
32
],
compile_ranges_
end
points
=
[
8
,
32
],
compile_sizes
=
[
16
,
64
,
128
],
inductor_compile_config
=
{
"post_grad_custom_post_pass"
:
post_grad_range_checker
,
...
...
@@ -95,21 +96,21 @@ def test_compile_ranges(use_fresh_inductor_cache):
with
set_current_vllm_config
(
vllm_config
):
model
=
TestModel
(
vllm_config
=
vllm_config
,
prefix
=
""
).
eval
()
# Number of compilations: 3
for each
compile range +
2
compile sizes
# Number of compilations: 3 compile range
s
+
3
compile sizes
batch_sizes
=
[
1
,
4
,
16
,
24
,
48
,
64
,
8192
]
with
compilation_counter
.
expect
(
num_graphs_seen
=
1
,
num_piecewise_graphs_seen
=
1
,
num_backend_compilations
=
5
,
num_backend_compilations
=
6
,
):
run_model
(
vllm_config
,
model
,
batch_sizes
)
assert
post_grad_range_checker
.
num_calls
==
5
assert
post_grad_range_checker
.
num_calls
==
6
def
test_compile_config_get_compile_ranges
():
compilation_config
=
CompilationConfig
(
compile_ranges_
split_
points
=
[
8
,
32
],
compile_ranges_
end
points
=
[
8
,
32
],
)
VllmConfig
(
scheduler_config
=
SchedulerConfig
(
...
...
@@ -126,6 +127,88 @@ def test_compile_config_get_compile_ranges():
]
class PostGradStaticShapeChecker(InductorPass):
    """Post-grad pass that checks shape staticness per compile range.

    Single-size ranges (``compile_sizes`` entries) must have fully concrete
    tensor shapes. For all other ranges the pass counts a call as dynamic as
    soon as one tensor with a symbolic dimension is seen.
    """

    def __init__(self):
        # Calls for single-size ranges that passed the static check.
        self.num_static_calls = 0
        # Calls for non-single ranges that showed a symbolic shape.
        self.num_dynamic_calls = 0

    def __call__(self, graph: fx.Graph):
        from torch.fx.experimental.symbolic_shapes import is_symbolic

        compile_range = get_pass_context().compile_range
        is_single = compile_range.is_single_size()

        for node in graph.nodes:
            # Prefer "val" metadata and fall back to "example_value".
            val = node.meta.get("val")
            if val is None:
                val = node.meta.get("example_value")
            if not isinstance(val, torch.Tensor):
                continue

            has_symbolic = any(is_symbolic(dim) for dim in val.shape)
            if is_single:
                assert not has_symbolic, (
                    f"compile_sizes entry {compile_range}: "
                    f"node '{node.name}' has symbolic shape "
                    f"{val.shape}"
                )
            elif has_symbolic:
                # A compile range should have at least some symbolic shapes
                # (the batch dimension); one occurrence is enough.
                self.num_dynamic_calls += 1
                return

        if is_single:
            self.num_static_calls += 1

    def uuid(self) -> str:
        """Return the hash of an empty state dict."""
        state: dict[str, Any] = {}
        return InductorPass.hash_dict(state)
def test_compile_sizes_produce_static_shapes(use_fresh_inductor_cache):
    """Verify that compile_sizes entries are compiled with fully concrete
    shapes (no SymInts), while compile_ranges entries retain dynamic shapes."""
    checker = PostGradStaticShapeChecker()
    torch.set_default_device("cuda")

    scheduler_config = SchedulerConfig(
        max_num_batched_tokens=8192,
        max_model_len=8192,
        is_encoder_decoder=False,
    )
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        compile_ranges_endpoints=[8],
        compile_sizes=[16],
        inductor_compile_config={
            "post_grad_custom_post_pass": checker,
        },
    )
    vllm_config = VllmConfig(
        scheduler_config=scheduler_config,
        compilation_config=compilation_config,
    )

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval()

        # Three backend compilations are expected:
        # Range(1, 8), Range(9, 8192) and the single size 16.
        with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=1,
            num_backend_compilations=3,
        ):
            run_model(vllm_config, model, [1, 16, 64])

        # The compile_sizes entry (16) must yield a static-shape graph.
        assert checker.num_static_calls == 1, (
            f"Expected 1 static compilation, got {checker.num_static_calls}"
        )
        # Both compile ranges must yield dynamic-shape graphs.
        assert checker.num_dynamic_calls == 2, (
            f"Expected 2 dynamic compilations, got {checker.num_dynamic_calls}"
        )
def
test_inductor_cache_compile_ranges
(
monkeypatch
,
use_fresh_inductor_cache
):
# To force multiple compilations, we disable the compile cache
monkeypatch
.
setenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
"1"
)
...
...
@@ -148,7 +231,7 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
scheduler_config
=
scheduler_config
,
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
compile_ranges_
split_
points
=
[
8
],
compile_ranges_
end
points
=
[
8
],
inductor_compile_config
=
{
"post_grad_custom_post_pass"
:
post_grad_range_checker
,
},
...
...
tests/compile/test_config.py
View file @
3fb4b5fa
...
...
@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init(
fuse_norm_quant
=
True
,
fuse_act_quant
=
True
,
eliminate_noops
=
True
,
sp_min_token_num
=
512
if
enable_sp
else
None
,
),
cudagraph_mode
=
cudagraph_mode
,
)
...
...
@@ -569,3 +570,45 @@ def test_compile_sizes_padding_validation():
assert
sorted
(
config
.
compile_sizes
)
==
[
3
,
5
,
7
]
dispatcher
=
CudagraphDispatcher
(
_create_vllm_config_for_validation
(
config
))
dispatcher
.
initialize_cudagraph_keys
(
CUDAGraphMode
.
NONE
)
# Should not raise
@pytest.mark.parametrize(
    "capture_sizes, max_size, num_blocks, expected_sizes, expected_max",
    [
        # Normal capping: sizes filtered to <= num_blocks
        (
            [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
            512,
            200,
            [1, 2, 4, 8, 16, 32, 64, 128],
            128,
        ),
        # No capping needed: num_blocks >= max
        ([1, 2, 4, 8, 16], 16, 1000, [1, 2, 4, 8, 16], 16),
        # Exact boundary: num_blocks == max (no capping)
        ([1, 2, 4, 8, 16, 32], 32, 32, [1, 2, 4, 8, 16, 32], 32),
        # All sizes capped: num_blocks < smallest size
        ([8, 16, 32], 32, 4, [], 0),
        # num_blocks <= 0: early return, no change
        ([1, 2, 4], 4, 0, [1, 2, 4], 4),
    ],
)
def test_adjust_cudagraph_sizes_for_mamba_cache(
    capture_sizes, max_size, num_blocks, expected_sizes, expected_max
):
    """Capture sizes must be capped to the available Mamba cache blocks.

    Each case builds a CompilationConfig, applies
    ``adjust_cudagraph_sizes_for_mamba_cache(num_blocks)`` and compares the
    resulting sizes and maximum against the expected values.

    See: https://github.com/vllm-project/vllm/issues/34094
    """
    config = CompilationConfig(
        cudagraph_capture_sizes=capture_sizes,
        max_cudagraph_capture_size=max_size,
        cudagraph_mode=CUDAGraphMode.NONE,
    )
    config.adjust_cudagraph_sizes_for_mamba_cache(num_blocks)

    assert config.cudagraph_capture_sizes == expected_sizes
    assert config.max_cudagraph_capture_size == expected_max

    # Invariant: last element == max_cudagraph_capture_size
    if not expected_sizes:
        return
    largest_size = config.cudagraph_capture_sizes[-1]
    assert largest_size == config.max_cudagraph_capture_size
tests/compile/test_decorator.py
View file @
3fb4b5fa
...
...
@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
expected_num_backend_compilations
=
4
# A has support_torch_compile but enable_if fn returns False
# ena
l
be_if will be True for B, so we expect mod1 and mod2
# enab
l
e_if will be True for B, so we expect mod1 and mod2
# to be compiled
with
compilation_counter
.
expect
(
num_graphs_seen
=
2
,
...
...
tests/compile/test_dynamic_shapes_compilation.py
View file @
3fb4b5fa
...
...
@@ -99,8 +99,8 @@ def test_dynamic_shapes_compilation(
# Clean up GPU memory
del
model
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
empty_cache
()
torch
.
accelerator
.
synchronize
()
print
(
"GPU memory cleared"
)
...
...
tests/compile/test_graph_partition.py
View file @
3fb4b5fa
...
...
@@ -7,7 +7,7 @@ import pytest
import
torch
from
torch.fx.experimental.proxy_tensor
import
make_fx
from
vllm.compilation.backends
import
split_graph
from
vllm.compilation.backends
import
_is_empty_allocation_node
,
split_graph
from
vllm.compilation.passes.fx_utils
import
find_op_nodes
# This import automatically registers `torch.ops.silly.attention`
...
...
@@ -184,3 +184,146 @@ def test_consecutive_ops_in_split():
assert
[
node
.
op
for
node
in
splitting_gm
.
graph
.
nodes
]
==
[
"placeholder"
]
+
2
*
[
"call_function"
]
+
[
"output"
]
def _get_empty_nodes(split_item):
    """Return the nodes of ``split_item.graph.graph`` that
    ``_is_empty_allocation_node`` accepts, in graph order."""
    empty_nodes = []
    for candidate in split_item.graph.graph.nodes:
        if _is_empty_allocation_node(candidate):
            empty_nodes.append(candidate)
    return empty_nodes
def _subgraphs_with_empty_nodes(split_items, *, is_splitting_graph):
    """Select split items with the requested ``is_splitting_graph`` flag
    that contain at least one empty-allocation node."""
    selected = []
    for item in split_items:
        # Flag comparison first, so the node scan only runs for matching items.
        if item.is_splitting_graph == is_splitting_graph and _get_empty_nodes(item):
            selected.append(item)
    return selected
def test_empty_only_partition_stays_separate_after_splitting_predecessor():
    """
    An empty-only subgraph whose only predecessor is a splitting-op subgraph
    must stay a partition of its own instead of being merged.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        y = torch.sin(x)
        out = torch.empty_like(y)
        torch.ops.aten.cos.out(y, out=out)
        return out

    x = torch.randn(4, 3)
    gm = make_fx(model_fn)(x)

    split_ops = ["aten::sin", "aten::cos.out"]
    split_gm, split_items = split_graph(gm, split_ops)

    # Graph partitioning for this pattern is:
    # [sin], [empty_like], [cos.out].
    num_partitions = len(split_items)
    assert num_partitions == 3, (
        "Empty-only partition should not merge into splitting-op subgraph"
    )

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    # The split module must compute the same result as the traced original.
    output_original = gm(x)
    output_split = split_gm(x)
    outputs_match = torch.allclose(output_original, output_split)
    assert outputs_match, "Output mismatch after split"
def test_empty_only_partition_is_merged():
    """
    When a non-splitting predecessor exists, an empty-only subgraph is merged
    into it; the merged empty node must not end up in a splitting-op subgraph.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        base = x + 1
        y = torch.sin(base)
        out = torch.empty_like(base)
        torch.ops.aten.cos.out(base, out=out)
        return out + y

    x = torch.randn(4, 3)
    gm = make_fx(model_fn)(x)
    split_gm, split_items = split_graph(gm, ["aten::sin", "aten::cos.out"])

    # Partitioning should be:
    # [add, empty_like], [sin], [cos.out], [add].
    num_partitions = len(split_items)
    assert num_partitions == 4, (
        "Empty-only partition should be merged into non-splitting predecessor"
    )

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    non_splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=False
    )
    assert len(non_splitting_with_empty) == 1, (
        "Exactly one non-splitting subgraph should contain the merged empty node"
    )
    merged_item = non_splitting_with_empty[0]
    assert len(_get_empty_nodes(merged_item)) == 1, (
        "Expected exactly one empty allocation node in merged subgraph"
    )

    # The split module must compute the same result as the traced original.
    output_original = gm(x)
    output_split = split_gm(x)
    outputs_match = torch.allclose(output_original, output_split)
    assert outputs_match, "Output mismatch after split"
def test_builtin_empty_only_partition_is_merged():
    """
    In Dynamo graphs, torch.empty/empty_like may show up as builtin call
    targets rather than aten OpOverloads. Empty-only partitions built from
    such nodes must still be merged.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        hidden = x + 1
        out1 = torch.empty_like(hidden)
        torch.ops.silly.attention(hidden, hidden, hidden, out1)
        out2 = torch.empty_like(hidden)
        torch.ops.silly.attention(out1, out1, hidden, out2)
        return out2 + hidden

    gm = torch.fx.symbolic_trace(model_fn)
    split_gm, split_items = split_graph(gm, ["silly::attention"])

    # Without empty-only merge, this graph would split into:
    # [add, empty_like], [attention], [empty_like], [attention], [add].
    num_partitions = len(split_items)
    assert num_partitions == 4, "Builtin empty-only partition should be merged"

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    non_splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=False
    )
    assert len(non_splitting_with_empty) == 1, (
        "Exactly one non-splitting subgraph should contain merged empty nodes"
    )
    merged_item = non_splitting_with_empty[0]
    assert len(_get_empty_nodes(merged_item)) == 2, (
        "Expected two builtin empty_like nodes in merged non-splitting subgraph"
    )

    # Run both modules on the same CUDA input and compare the results.
    x = torch.randn(2, 3, device="cuda")
    output_original = gm(x)
    output_split = split_gm(x)
    outputs_match = torch.allclose(output_original, output_split)
    assert outputs_match, "Output mismatch after split"
tests/compile/test_sequence_parallelism_threshold.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.compilation.passes.fusion.sequence_parallelism
import
(
SP_MIN_HIDDEN_SIZE
,
SP_MIN_PER_GPU_SIZE_MB
,
get_sequence_parallelism_threshold
,
)
class TestGetSequenceParallelismThreshold:
    """Tests for get_sequence_parallelism_threshold function."""

    def test_non_cuda_returns_none(self, mock_cuda_platform):
        """A platform mocked as non-CUDA yields no threshold."""
        with mock_cuda_platform(is_cuda=False):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
            assert threshold is None

    def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
        """An unsupported capability (e.g., sm80) yields no threshold."""
        with mock_cuda_platform(capability=(8, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
            assert threshold is None

    def test_small_hidden_size_returns_none(self, mock_cuda_platform):
        """Capability (9, 0) with too small a hidden_size yields no threshold."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=4096,
                tp_size=2,
                element_size=2,  # 4096 < 8192
            )
            assert threshold is None

    def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
        """Capability (9, 0) with a large hidden_size yields the computed value."""
        with mock_cuda_platform(capability=(9, 0)):
            hidden_size = 8192
            tp_size = 2
            element_size = 2  # float16/bfloat16

            result = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )

            # Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
            MiB = 1024 * 1024
            total_bytes = SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * MiB
            bytes_per_token = hidden_size * element_size
            expected = int(total_bytes // bytes_per_token)
            assert result == expected
            assert result == 1024

    @pytest.mark.parametrize(
        "hidden_size,tp_size,element_size,expected",
        [
            # Boundary: exactly at min hidden size threshold, tp_size=1
            # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
            (8192, 1, 2, 512),
            # Larger hidden size reduces token threshold
            # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
            (16384, 1, 2, 256),
            # Larger tp_size increases token threshold
            # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
            (8192, 4, 2, 2048),
            # Larger element_size (fp32) reduces token threshold
            # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
            (8192, 2, 4, 512),
        ],
    )
    def test_threshold_calculation_variations(
        self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
    ):
        """The threshold follows the expected value for each parameter set."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )
            assert threshold == expected

    def test_hidden_size_boundary(self, mock_cuda_platform):
        """One below the minimum hidden_size gives None; the minimum does not."""
        min_hidden_size = SP_MIN_HIDDEN_SIZE[90]
        with mock_cuda_platform(capability=(9, 0)):
            # Just below threshold
            below = get_sequence_parallelism_threshold(
                hidden_size=min_hidden_size - 1,
                tp_size=2,
                element_size=2,
            )
            assert below is None

            # Exactly at threshold
            at_boundary = get_sequence_parallelism_threshold(
                hidden_size=min_hidden_size,
                tp_size=2,
                element_size=2,
            )
            assert at_boundary is not None
tests/compile/test_startup.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cold start and warm start tests for vLLM-compile.
Cold start runs in a forked child (must fork before CUDA init) which
populates on-disk caches and asserts cold-start counters. Warm start
then runs in the parent with clean in-memory state but populated caches.
"""
import
multiprocessing
as
mp
from
torch._dynamo.utils
import
counters
from
vllm.compilation.counter
import
compilation_counter
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
CUDAGraphMode
MODEL
=
"microsoft/Phi-tiny-MoE-instruct"
def _run_vllm(vllm_runner):
    """Enter and immediately leave a ``vllm_runner`` context for MODEL.

    The runner is created with dummy weights, VLLM_COMPILE mode and CUDA
    graphs disabled; nothing is executed inside the context.
    """
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
    )
    runner = vllm_runner(
        MODEL,
        trust_remote_code=False,
        max_model_len=256,
        max_num_batched_tokens=1024,
        load_format="dummy",
        compilation_config=compilation_config,
        num_gpu_blocks_override=8,
    )
    with runner:
        pass
def _cold_start(vllm_runner):
    """Run the model once from a cold cache and check the compile counters.

    Expects three compiled artifacts to be saved and none loaded, then checks
    the ``aot_autograd`` counters recorded by torch._dynamo.
    """
    counters.clear()
    expected_counts = compilation_counter.expect(
        num_compiled_artifacts_saved=3,
        num_compiled_artifacts_loaded=0,
    )
    with expected_counts:
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 33
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
    """Cold start in a forked child, then warm start in this process."""
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
    # 33 subgraphs after splitting on attention — only 3 are unique.
    fork_ctx = mp.get_context("fork")
    child = fork_ctx.Process(target=_cold_start, args=(vllm_runner,))
    child.start()
    child.join()
    assert child.exitcode == 0, "Cold-start child failed"

    # Warm start — compiled artifacts loaded from disk cache.
    counters.clear()
    expected_counts = compilation_counter.expect(
        num_compiled_artifacts_loaded=3,
        num_compiled_artifacts_saved=0,
    )
    with expected_counts:
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 30
    assert aot_stats["autograd_cache_miss"] == 0
    # No miss at aot_autograd level causing disk I/O.
    assert aot_stats["autograd_cache_hit"] == 0
tests/compile/test_structured_logging.py
View file @
3fb4b5fa
...
...
@@ -109,9 +109,9 @@ def test_vllm_structured_logging_artifacts(use_fresh_inductor_cache):
f
"got
{
len
(
vllm_piecewise_split_graph
)
}
"
)
compile_start_artifacts
=
capture
.
get
(
"artifact"
,
"vllm_piecewise_compile_start"
)
assert
len
(
compile_start_artifacts
)
==
2
,
(
"Expected
2
vllm_piecewise_compile_start "
"(
one for dynamic ranges, one for
compile size), "
assert
len
(
compile_start_artifacts
)
==
4
,
(
"Expected
4
vllm_piecewise_compile_start "
"(
2 subgraphs x 2 ranges each: dynamic +
compile size), "
f
"got
{
len
(
compile_start_artifacts
)
}
"
)
submod_dumps
=
capture
.
get
(
"graph_dump"
,
r
"vllm_submod_.*"
)
...
...
tests/compile/test_wrapper.py
View file @
3fb4b5fa
...
...
@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
f
"Expected
{
expected1
}
, got
{
result1
}
"
)
# Second call should triger another compilation
# Second call should trig
g
er another compilation
x2
=
torch
.
tensor
([
1
,
2
,
3
])
result2
=
wrapper
(
x2
)
expected2
=
torch
.
tensor
([
100
,
200
,
300
])
...
...
tests/config/test_config_generation.py
View file @
3fb4b5fa
...
...
@@ -78,3 +78,34 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
)
ray
.
shutdown
()
def test_unrecognized_env(monkeypatch):
    """Unknown VLLM_* variables fail config creation only when validation
    is requested via ``fail_on_environ_validation``."""
    import os

    from vllm.envs import environment_variables

    # Remove any existing unrecognized VLLM env vars that might interfere
    stale_names = [
        name
        for name in os.environ
        if name.startswith("VLLM_") and name not in environment_variables
    ]
    for name in stale_names:
        monkeypatch.delenv(name, raising=False)

    # Test that if fail_on_environ_validation is True, then an error
    # is raised when an unrecognized vLLM environment variable is set
    monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value")
    strict_args = EngineArgs(
        fail_on_environ_validation=True,
    )
    with pytest.raises(
        ValueError, match="Unknown vLLM environment variable detected"
    ):
        strict_args.create_engine_config()

    # Test that if fail_on_environ_validation is False, then no error is raised
    default_args = EngineArgs()
    default_args.create_engine_config()

    # Test that when the unrecognized env var is removed, no error is raised
    monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR")
    strict_args = EngineArgs(
        fail_on_environ_validation=True,
    )
    strict_args.create_engine_config()
tests/config/test_multimodal_config.py
View file @
3fb4b5fa
...
...
@@ -3,6 +3,7 @@
import
pytest
from
vllm.config.model
import
ModelConfig
from
vllm.config.multimodal
import
MultiModalConfig
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
...
...
@@ -23,3 +24,20 @@ def test_mm_encoder_attn_backend_hash_updates():
mm_encoder_attn_backend
=
AttentionBackendEnum
.
FLASH_ATTN
).
compute_hash
()
assert
base_hash
!=
overridden_hash
def test_language_model_only_does_not_affect_mm_hash():
    """The multimodal config hash must be the same with and without
    language_model_only, since that flag does not affect the ViT
    computation graph."""
    default_config = MultiModalConfig()
    base_hash = default_config.compute_hash()

    lm_only_config = MultiModalConfig(language_model_only=True)
    lm_only_hash = lm_only_config.compute_hash()

    assert base_hash == lm_only_hash
def test_language_model_only_affects_model_hash():
    """The model config hash must change when language_model_only is set,
    since that flag affects the LM computation graph."""
    model = "llava-hf/llava-1.5-7b-hf"

    default_config = ModelConfig(model)
    base_hash = default_config.compute_hash()

    lm_only_config = ModelConfig(model, language_model_only=True)
    lm_only_hash = lm_only_config.compute_hash()

    assert base_hash != lm_only_hash
tests/conftest.py
View file @
3fb4b5fa
...
...
@@ -176,16 +176,20 @@ def init_test_http_connection():
@
pytest
.
fixture
def
dist_init
():
from
tests.utils
import
ensure_current_vllm_config
temp_file
=
tempfile
.
mkstemp
()[
1
]
init_distributed_environment
(
world_size
=
1
,
rank
=
0
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
local_rank
=
0
,
backend
=
"nccl"
,
)
initialize_model_parallel
(
1
,
1
)
yield
with
ensure_current_vllm_config
():
init_distributed_environment
(
world_size
=
1
,
rank
=
0
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
local_rank
=
0
,
backend
=
"nccl"
,
)
initialize_model_parallel
(
1
,
1
)
yield
cleanup_dist_env_and_memory
()
...
...
@@ -419,18 +423,16 @@ class HfRunner:
self
.
tokenizer
:
"PreTrainedTokenizer | PreTrainedTokenizerFast"
=
(
AutoTokenizer
.
from_pretrained
(
model_name
,
dtype
=
dtype
,
trust_remote_code
=
trust_remote_code
,
)
)
# don't put this import at the top level
# it will call torch.
cuda
.device_count()
# it will call torch.
accelerator
.device_count()
from
transformers
import
AutoProcessor
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
dtype
=
dtype
,
trust_remote_code
=
trust_remote_code
,
)
if
skip_tokenizer_init
:
...
...
@@ -792,7 +794,6 @@ class VllmRunner:
tensor_parallel_size
:
int
=
1
,
block_size
:
int
=
16
if
not
torch
.
xpu
.
is_available
()
else
64
,
enable_chunked_prefill
:
bool
|
None
=
False
,
swap_space
:
int
=
4
,
enforce_eager
:
bool
|
None
=
False
,
# Set this to avoid hanging issue
default_torch_num_threads
:
int
|
None
=
None
,
...
...
@@ -829,7 +830,6 @@ class VllmRunner:
trust_remote_code
=
trust_remote_code
,
dtype
=
dtype
,
seed
=
seed
,
swap_space
=
swap_space
,
enforce_eager
=
enforce_eager
,
disable_log_stats
=
disable_log_stats
,
tensor_parallel_size
=
tensor_parallel_size
,
...
...
@@ -841,7 +841,10 @@ class VllmRunner:
def
get_inputs
(
self
,
prompts
:
list
[
str
]
|
list
[
torch
.
Tensor
]
|
list
[
list
[
int
]],
prompts
:
list
[
str
]
|
list
[
torch
.
Tensor
]
|
list
[
list
[
int
]]
|
list
[
dict
[
str
,
Any
]],
images
:
PromptImageInput
|
None
=
None
,
videos
:
PromptVideoInput
|
None
=
None
,
audios
:
PromptAudioInput
|
None
=
None
,
...
...
@@ -855,26 +858,32 @@ class VllmRunner:
inputs
=
list
[
dict
[
str
,
Any
]]()
for
i
,
prompt
in
enumerate
(
prompts
):
prompt_dict
=
dict
[
str
,
Any
]()
if
isinstance
(
prompt
,
str
):
prompt_dict
[
"prompt"
]
=
prompt
elif
isinstance
(
prompt
,
list
):
prompt_dict
[
"prompt_token_ids"
]
=
prompt
# If we're passing an encoder/decoder prompt, we assume it
# already contains the multimodal data in the prompt
if
isinstance
(
prompt
,
dict
):
assert
images
is
None
and
audios
is
None
and
videos
is
None
inputs
.
append
(
prompt
.
copy
())
else
:
prompt_dict
[
"prompt_embeds"
]
=
prompt
multi_modal_data
=
dict
[
str
,
Any
]()
if
images
is
not
None
and
(
image
:
=
images
[
i
])
is
not
None
:
multi_modal_data
[
"image"
]
=
image
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
multi_modal_data
[
"video"
]
=
video
if
audios
is
not
None
and
(
audio
:
=
audios
[
i
])
is
not
None
:
multi_modal_data
[
"audio"
]
=
audio
prompt_dict
=
dict
[
str
,
Any
]()
if
isinstance
(
prompt
,
str
):
prompt_dict
[
"prompt"
]
=
prompt
elif
isinstance
(
prompt
,
list
):
prompt_dict
[
"prompt_token_ids"
]
=
prompt
else
:
prompt_dict
[
"prompt_embeds"
]
=
prompt
multi_modal_data
=
dict
[
str
,
Any
]()
if
images
is
not
None
and
(
image
:
=
images
[
i
])
is
not
None
:
multi_modal_data
[
"image"
]
=
image
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
multi_modal_data
[
"video"
]
=
video
if
audios
is
not
None
and
(
audio
:
=
audios
[
i
])
is
not
None
:
multi_modal_data
[
"audio"
]
=
audio
if
multi_modal_data
:
prompt_dict
[
"multi_modal_data"
]
=
multi_modal_data
if
multi_modal_data
:
prompt_dict
[
"multi_modal_data"
]
=
multi_modal_data
inputs
.
append
(
prompt_dict
)
inputs
.
append
(
prompt_dict
)
return
inputs
...
...
@@ -1138,6 +1147,15 @@ class VllmRunner:
return
self
def
__exit__
(
self
,
exc_type
,
exc_value
,
traceback
):
# Explicitly shutdown the engine core to release GPU resources
# This is needed because when executing consecutive tests, the GC
# might not be fast enough in shutting down the llm engine. This can lead to OOMs
# because when the next test starts some GPU memory is still in use.
try
:
self
.
llm
.
llm_engine
.
engine_core
.
shutdown
()
except
Exception
:
# Ignore shutdown errors as cleanup will still proceed
pass
del
self
.
llm
cleanup_dist_env_and_memory
()
...
...
@@ -1517,7 +1535,7 @@ def clean_gpu_memory_between_tests():
from
tests.utils
import
wait_for_gpu_memory_to_clear
num_gpus
=
torch
.
cuda
.
device_count
()
num_gpus
=
torch
.
accelerator
.
device_count
()
if
num_gpus
>
0
:
try
:
wait_for_gpu_memory_to_clear
(
...
...
@@ -1531,7 +1549,7 @@ def clean_gpu_memory_between_tests():
# Clean up GPU memory after the test
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
empty_cache
()
torch
.
accelerator
.
empty_cache
()
gc
.
collect
()
...
...
@@ -1546,6 +1564,14 @@ def use_fresh_inductor_cache():
yield
@pytest.fixture
def fresh_vllm_cache(monkeypatch, use_fresh_inductor_cache):
    """Point VLLM_CACHE_ROOT at a temporary directory for one test.

    Depends on ``use_fresh_inductor_cache`` so both caches start empty. The
    directory path is yielded and the directory is removed on teardown.
    """
    with tempfile.TemporaryDirectory() as cache_root:
        monkeypatch.setenv("VLLM_CACHE_ROOT", cache_root)
        yield cache_root
@
pytest
.
fixture
(
scope
=
"function"
)
def
enable_pickle
(
monkeypatch
):
"""`LLM.apply_model` requires pickling a function."""
...
...
tests/cuda/scripts/check_device_count_respects_env.py
View file @
3fb4b5fa
...
...
@@ -14,7 +14,7 @@ import torch # noqa: E402
from
vllm.platforms
import
current_platform
# noqa: F401, E402
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
"0"
count
=
torch
.
cuda
.
device_count
()
count
=
torch
.
accelerator
.
device_count
()
if
count
==
0
:
sys
.
exit
(
0
)
# Skip: no GPUs available
...
...
tests/cuda/test_cuda_compatibility_path.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for CUDA forward compatibility path logic in env_override.py.
Verifies the opt-in LD_LIBRARY_PATH manipulation for CUDA compat libs,
including env var parsing, path detection, and deduplication.
"""
import
os
from
unittest.mock
import
patch
import
pytest
# Import the functions directly (they're module-level in env_override)
# We must import them without triggering the module-level side effects,
# so we import the functions by name after the module is already loaded.
from
vllm.env_override
import
(
_get_torch_cuda_version
,
_maybe_set_cuda_compatibility_path
,
)
class TestCudaCompatibilityEnvParsing:
    """Test VLLM_ENABLE_CUDA_COMPATIBILITY env var parsing."""

    def test_disabled_by_default(self, monkeypatch):
        """With the env var absent, LD_LIBRARY_PATH stays unset or empty."""
        monkeypatch.delenv("VLLM_ENABLE_CUDA_COMPATIBILITY", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        # A missing variable and an empty one are both acceptable.
        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert ld_path == ""

    @pytest.mark.parametrize("value", ["0", "false", "False", "no", ""])
    def test_disabled_values(self, monkeypatch, value):
        """Falsy values must not put a compat entry into LD_LIBRARY_PATH."""
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        # LD_LIBRARY_PATH should not be set (or remain empty)
        current_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert "compat" not in current_ld_path

    @pytest.mark.parametrize("value", ["1", "true", "True", " 1 ", " TRUE "])
    def test_enabled_values_with_valid_path(self, monkeypatch, tmp_path, value):
        """Truthy values add the compat directory when it exists."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)

        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        current_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert str(compat_dir) in current_ld_path
class TestCudaCompatibilityPathDetection:
    """Test path detection: custom override, conda, default."""

    def test_custom_path_override(self, monkeypatch, tmp_path):
        """VLLM_CUDA_COMPATIBILITY_PATH takes highest priority."""
        custom_dir = tmp_path / "my-compat"
        custom_dir.mkdir()
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(custom_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert ld_path.startswith(str(custom_dir))

    def test_conda_prefix_fallback(self, monkeypatch, tmp_path):
        """Falls back to $CONDA_PREFIX/cuda-compat if custom not set."""
        conda_dir = tmp_path / "conda-env"
        compat_dir = conda_dir / "cuda-compat"
        compat_dir.mkdir(parents=True)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.setenv("CONDA_PREFIX", str(conda_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert str(compat_dir) in ld_path

    def test_no_valid_path_does_nothing(self, monkeypatch):
        """When enabled but no valid path exists, LD_LIBRARY_PATH unchanged."""
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", "/nonexistent/path")
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        with patch("vllm.env_override._get_torch_cuda_version", return_value=None):
            _maybe_set_cuda_compatibility_path()

        assert os.environ.get("LD_LIBRARY_PATH", "") == ""

    def test_default_cuda_path_fallback(self, monkeypatch, tmp_path):
        """Falls back to /usr/local/cuda-{ver}/compat via torch version."""
        # NOTE(review): this directory is created but the lookup below is
        # steered to the hard-coded /usr/local path; kept as in the original.
        fake_cuda = tmp_path / "cuda-12.8" / "compat"
        fake_cuda.mkdir(parents=True)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        # Capture the real function before patching. The patch target resolves
        # through the `os` name in vllm.env_override (presumably the stdlib
        # module), so `os.path.isdir` as seen from this test is replaced too;
        # calling it inside the side effect would re-enter the mock and
        # recurse for every path other than the one matched explicitly.
        real_isdir = os.path.isdir

        def fake_isdir(path):
            return path == "/usr/local/cuda-12.8/compat" or real_isdir(path)

        with (
            patch("vllm.env_override._get_torch_cuda_version", return_value="12.8"),
            patch("vllm.env_override.os.path.isdir", side_effect=fake_isdir),
        ):
            _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert "/usr/local/cuda-12.8/compat" in ld_path
class TestCudaCompatibilityLdPathManipulation:
    """Test LD_LIBRARY_PATH prepend and deduplication logic."""

    @staticmethod
    def _make_enabled_compat_dir(monkeypatch, tmp_path):
        """Create ``tmp_path/compat`` and enable the compat path pointing at it."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
        return compat_dir

    def test_prepends_to_empty_ld_path(self, monkeypatch, tmp_path):
        """With LD_LIBRARY_PATH unset, it becomes exactly the compat path."""
        compat_dir = self._make_enabled_compat_dir(monkeypatch, tmp_path)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == str(compat_dir)

    def test_prepends_to_existing_ld_path(self, monkeypatch, tmp_path):
        """The compat path goes first and existing entries are kept."""
        compat_dir = self._make_enabled_compat_dir(monkeypatch, tmp_path)
        monkeypatch.setenv("LD_LIBRARY_PATH", "/usr/lib:/other/lib")

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == str(compat_dir)
        assert "/usr/lib" in entries
        assert "/other/lib" in entries

    def test_deduplicates_existing_compat_path(self, monkeypatch, tmp_path):
        """A compat path already present is moved to the front, not repeated."""
        compat_dir = self._make_enabled_compat_dir(monkeypatch, tmp_path)
        monkeypatch.setenv(
            "LD_LIBRARY_PATH",
            f"/usr/lib:{compat_dir}:/other/lib",
        )

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == str(compat_dir)
        assert entries.count(str(compat_dir)) == 1

    def test_already_at_front_is_noop(self, monkeypatch, tmp_path):
        """A compat path that is already first leaves LD_LIBRARY_PATH as is."""
        compat_dir = self._make_enabled_compat_dir(monkeypatch, tmp_path)
        original = f"{compat_dir}:/usr/lib"
        monkeypatch.setenv("LD_LIBRARY_PATH", original)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == original
class TestGetTorchCudaVersion:
    """Test _get_torch_cuda_version() helper."""

    def test_returns_string_when_torch_available(self):
        """The helper returns either None or a string (e.g. '12.8')."""
        # torch is installed in vllm's environment
        version = _get_torch_cuda_version()
        if version is not None:
            assert isinstance(version, str)

    def test_returns_none_when_torch_missing(self):
        """With find_spec patched to return None, the helper returns None."""
        find_spec_target = "vllm.env_override.importlib.util.find_spec"
        with patch(find_spec_target, return_value=None):
            version = _get_torch_cuda_version()
            assert version is None
tests/detokenizer/test_disable_detokenization.py
View file @
3fb4b5fa
...
...
@@ -7,7 +7,6 @@ from vllm.entrypoints.llm import LLM
from
vllm.sampling_params
import
SamplingParams
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
def
test_computed_prefix_blocks
(
model
:
str
):
# This test checks if the engine generates completions both with and
...
...
Prev
1
…
18
19
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment