Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
686 additions
and
151 deletions
+686
-151
tests/v1/attention/test_attention_splitting.py
tests/v1/attention/test_attention_splitting.py
+1
-0
tests/v1/attention/test_batch_reordering.py
tests/v1/attention/test_batch_reordering.py
+21
-0
tests/v1/attention/test_chunked_local_attention.py
tests/v1/attention/test_chunked_local_attention.py
+1
-1
tests/v1/attention/test_mla_backends.py
tests/v1/attention/test_mla_backends.py
+11
-7
tests/v1/attention/test_rocm_attention_backends_selection.py
tests/v1/attention/test_rocm_attention_backends_selection.py
+43
-20
tests/v1/attention/test_sparse_mla_backends.py
tests/v1/attention/test_sparse_mla_backends.py
+12
-3
tests/v1/attention/utils.py
tests/v1/attention/utils.py
+25
-28
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+57
-0
tests/v1/core/test_kv_sharing.py
tests/v1/core/test_kv_sharing.py
+3
-1
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+329
-12
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+65
-17
tests/v1/core/test_single_type_kv_cache_manager.py
tests/v1/core/test_single_type_kv_cache_manager.py
+62
-8
tests/v1/core/utils.py
tests/v1/core/utils.py
+7
-1
tests/v1/cudagraph/test_cudagraph_dispatch.py
tests/v1/cudagraph/test_cudagraph_dispatch.py
+4
-1
tests/v1/cudagraph/test_cudagraph_mode.py
tests/v1/cudagraph/test_cudagraph_mode.py
+7
-26
tests/v1/determinism/test_batch_invariance.py
tests/v1/determinism/test_batch_invariance.py
+13
-13
tests/v1/determinism/test_online_batch_invariance.py
tests/v1/determinism/test_online_batch_invariance.py
+2
-3
tests/v1/determinism/test_rms_norm_batch_invariant.py
tests/v1/determinism/test_rms_norm_batch_invariant.py
+12
-6
tests/v1/determinism/utils.py
tests/v1/determinism/utils.py
+1
-1
tests/v1/distributed/test_eagle_dp.py
tests/v1/distributed/test_eagle_dp.py
+10
-3
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/v1/attention/test_attention_splitting.py
View file @
7e63ef82
...
...
@@ -323,6 +323,7 @@ def test_prefill_split_across_ubatches(
num_tokens
,
batch_spec
.
batch_size
,
split_point
=
split_point
,
num_ubatches
=
2
,
)
assert
ubatch_slices
is
not
None
and
len
(
ubatch_slices
)
==
2
...
...
tests/v1/attention/test_batch_reordering.py
View file @
7e63ef82
...
...
@@ -98,6 +98,27 @@ REORDER_TEST_CASES = {
expected_order
=
[
0
,
1
,
6
,
8
,
4
,
3
,
2
,
7
,
5
],
expected_modified
=
True
,
),
"new_request_single_token_prefill"
:
ReorderTestCase
(
requests
=
[
(
100
,
0
),
(
1
,
0
),
# New request with only 1 token (STILL prefill)
(
50
,
100
),
(
1
,
10
),
],
# Only index 3 is a true decode (has num_computed_tokens > 0)
expected_order
=
[
3
,
2
,
0
,
1
],
expected_modified
=
True
,
),
"multiple_new_requests_single_token_prefill"
:
ReorderTestCase
(
requests
=
[
(
1
,
0
),
# New prefill (1 token, no computed)
(
1
,
0
),
# New prefill (1 token, no computed)
(
1
,
50
),
(
200
,
0
),
],
expected_order
=
[
2
,
1
,
0
,
3
],
expected_modified
=
True
,
),
}
...
...
tests/v1/attention/test_chunked_local_attention.py
View file @
7e63ef82
...
...
@@ -172,7 +172,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
)
# Call the function
result
=
make_local_attention_virtual_batches
(
result
,
_
=
make_local_attention_virtual_batches
(
attn_chunk_size
,
common_attn_metadata
,
block_size
)
...
...
tests/v1/attention/test_mla_backends.py
View file @
7e63ef82
...
...
@@ -18,15 +18,15 @@ from tests.v1.attention.utils import (
try_get_attention_backend
,
)
from
vllm
import
_custom_ops
as
ops
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.ops.flashmla
import
is_flashmla_dense_supported
from
vllm.attention.utils.fa_utils
import
flash_attn_supports_mla
from
vllm.config.vllm
import
set_current_vllm_config
from
vllm.model_executor.layers.attention_layer_base
import
AttentionLayerBase
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.torch_utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.v1.attention.backend
import
CommonAttentionMetadata
from
vllm.v1.attention.backends.fa_utils
import
flash_attn_supports_mla
from
vllm.v1.attention.backends.mla.common
import
QueryLenSupport
from
vllm.v1.attention.backends.utils
import
CommonAttentionMetadata
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.attention.ops.flashmla
import
is_flashmla_dense_supported
from
vllm.v1.kv_cache_interface
import
FullAttentionSpec
BACKENDS_TO_TEST
=
[
...
...
@@ -154,12 +154,12 @@ def create_and_prepopulate_kv_cache(
MLA KV cache tensor
"""
batch_size
=
len
(
kv_c_contexts
)
seq_lens
=
common_attn_metadata
.
seq_lens
_
cpu
seq_lens
=
common_attn_metadata
.
seq_lens
.
cpu
()
query_lens
=
(
common_attn_metadata
.
query_start_loc_cpu
[
1
:]
-
common_attn_metadata
.
query_start_loc_cpu
[:
-
1
]
)
context_lens
=
common_attn_metadata
.
num_computed_tokens_cpu
context_lens
=
seq_lens
-
query_lens
block_table
=
common_attn_metadata
.
block_table_tensor
slot_mapping
=
common_attn_metadata
.
slot_mapping
...
...
@@ -394,7 +394,11 @@ def run_attention_backend(
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"deepseek-ai/DeepSeek-R1"
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
,
4
,
8
,
16
])
def
test_backend_correctness
(
dist_init
,
batch_spec_name
:
str
,
model
:
str
,
tensor_parallel_size
:
int
default_vllm_config
,
dist_init
,
batch_spec_name
:
str
,
model
:
str
,
tensor_parallel_size
:
int
,
):
"""
Test that all backends produce similar outputs to a reference implementation
...
...
tests/v1/attention/test_rocm_attention_backends_selection.py
View file @
7e63ef82
...
...
@@ -7,8 +7,9 @@ from unittest.mock import MagicMock, patch
import
pytest
import
torch
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.platforms
import
current_platform
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.attention.selector
import
AttentionSelectorConfig
# ROCm-specific attention backend selection tests
pytestmark
=
pytest
.
mark
.
skipif
(
...
...
@@ -94,26 +95,20 @@ def mock_on_gfx9():
None
,
AttentionBackendEnum
.
ROCM_AITER_UNIFIED_ATTN
.
get_path
(),
),
# Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
(
{
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
:
"1"
},
None
,
AttentionBackendEnum
.
ROCM_ATTN
.
get_path
(),
),
# Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
# Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
(
{
"VLLM_ROCM_USE_AITER"
:
"1"
},
"TRITON_ATTN"
,
AttentionBackendEnum
.
TRITON_ATTN
.
get_path
(),
),
# Test Case 1
1
: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# Test Case 1
0
: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# (explicitly disabled)
(
{
"VLLM_ROCM_USE_AITER"
:
"1"
,
"VLLM_ROCM_USE_AITER_MHA"
:
"0"
},
None
,
AttentionBackendEnum
.
TRITON_ATTN
.
get_path
(),
),
# Test Case 1
2
: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
# Test Case 1
1
: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
(
{
"VLLM_ROCM_USE_AITER"
:
"1"
},
"ROCM_ATTN"
,
...
...
@@ -150,8 +145,7 @@ def test_standard_attention_backend_selection(
# Get the backend class path
from
vllm.platforms.rocm
import
RocmPlatform
backend_path
=
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
backend_enum
,
attn_selector_config
=
AttentionSelectorConfig
(
head_size
=
128
,
dtype
=
torch
.
float16
,
kv_cache_dtype
=
"auto"
,
...
...
@@ -160,6 +154,11 @@ def test_standard_attention_backend_selection(
has_sink
=
False
,
use_sparse
=
False
,
)
backend_path
=
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
backend_enum
,
attn_selector_config
=
attn_selector_config
)
assert
backend_path
==
expected_backend_path
...
...
@@ -273,8 +272,16 @@ def test_mla_backend_selection(
if
should_raise
:
with
pytest
.
raises
(
ValueError
):
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
backend_enum
,
attn_selector_config
=
AttentionSelectorConfig
(
head_size
=
128
,
dtype
=
torch
.
float16
,
kv_cache_dtype
=
"auto"
,
block_size
=
block_size
,
use_mla
=
True
,
has_sink
=
False
,
use_sparse
=
False
,
)
attn_selector_config
=
AttentionSelectorConfig
(
head_size
=
128
,
dtype
=
torch
.
float16
,
kv_cache_dtype
=
"auto"
,
...
...
@@ -283,9 +290,13 @@ def test_mla_backend_selection(
has_sink
=
False
,
use_sparse
=
False
,
)
backend_path
=
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
backend_enum
,
attn_selector_config
=
attn_selector_config
,
)
else
:
backend_path
=
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
backend_enum
,
attn_selector_config
=
AttentionSelectorConfig
(
head_size
=
128
,
dtype
=
torch
.
float16
,
kv_cache_dtype
=
"auto"
,
...
...
@@ -294,6 +305,11 @@ def test_mla_backend_selection(
has_sink
=
False
,
use_sparse
=
False
,
)
backend_path
=
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
backend_enum
,
attn_selector_config
=
attn_selector_config
)
assert
backend_path
==
expected_backend_path
...
...
@@ -309,8 +325,7 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
match
=
"only supported on gfx9"
,
),
):
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
AttentionBackendEnum
.
ROCM_AITER_FA
,
attn_selector_config
=
AttentionSelectorConfig
(
head_size
=
128
,
dtype
=
torch
.
float16
,
kv_cache_dtype
=
"auto"
,
...
...
@@ -320,6 +335,11 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
use_sparse
=
False
,
)
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
AttentionBackendEnum
.
ROCM_AITER_FA
,
attn_selector_config
=
attn_selector_config
,
)
def
test_sparse_not_supported
(
mock_vllm_config
):
"""Test that sparse attention is not supported on ROCm."""
...
...
@@ -328,8 +348,7 @@ def test_sparse_not_supported(mock_vllm_config):
with
pytest
.
raises
(
AssertionError
,
match
=
"Sparse MLA backend on ROCm only supports block size 1"
):
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
None
,
attn_selector_config
=
AttentionSelectorConfig
(
head_size
=
128
,
dtype
=
torch
.
float16
,
kv_cache_dtype
=
"auto"
,
...
...
@@ -338,3 +357,7 @@ def test_sparse_not_supported(mock_vllm_config):
has_sink
=
False
,
use_sparse
=
True
,
)
RocmPlatform
.
get_attn_backend_cls
(
selected_backend
=
None
,
attn_selector_config
=
attn_selector_config
)
tests/v1/attention/test_sparse_mla_backends.py
View file @
7e63ef82
...
...
@@ -22,15 +22,16 @@ from tests.v1.attention.utils import (
create_vllm_config
,
)
from
vllm
import
_custom_ops
as
ops
from
vllm.attention.ops
import
flashmla
from
vllm.config
import
set_current_vllm_config
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
cdiv
from
vllm.v1.attention.backends.mla.flashmla_sparse
import
(
FlashMLASparseBackend
,
triton_convert_req_index_to_global_index
,
)
from
vllm.v1.attention.backends.utils
import
split_prefill_chunks
from
vllm.v1.attention.ops
import
flashmla
from
...utils
import
models_path_prefix
SPARSE_BACKEND_BATCH_SPECS
=
{
...
...
@@ -125,8 +126,16 @@ def _quantize_dequantize_fp8_ds_mla(
reason
=
"FlashMLASparseBackend requires CUDA 9.0 or higher"
,
)
def
test_sparse_backend_decode_correctness
(
dist_init
,
batch_name
,
kv_cache_dtype
,
tensor_parallel_size
,
workspace_init
default_vllm_config
,
dist_init
,
batch_name
,
kv_cache_dtype
,
tensor_parallel_size
,
workspace_init
,
):
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"ROCm does not support fp8_ds_mla data type for kv cache."
)
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA is required for sparse MLA decode test"
)
...
...
@@ -295,7 +304,7 @@ def test_sparse_backend_decode_correctness(
positions
=
np
.
arange
(
starts
[
-
1
],
dtype
=
np
.
int32
)
-
np
.
repeat
(
starts
[:
-
1
],
seg_lengths
)
seq_lengths
=
np
.
asarray
(
common_attn_metadata
.
seq_lens
_
cpu
,
dtype
=
np
.
int32
)
seq_lengths
=
np
.
asarray
(
common_attn_metadata
.
seq_lens
.
cpu
()
,
dtype
=
np
.
int32
)
prefix_lengths
=
seq_lengths
-
seg_lengths
positions
+=
np
.
repeat
(
prefix_lengths
,
seg_lengths
)
...
...
tests/v1/attention/utils.py
View file @
7e63ef82
...
...
@@ -7,8 +7,6 @@ from dataclasses import dataclass
import
pytest
import
torch
from
vllm.attention.backends.abstract
import
AttentionImpl
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
...
...
@@ -20,10 +18,12 @@ from vllm.config import (
VllmConfig
,
)
from
vllm.config.model
import
ModelDType
from
vllm.v1.attention.backends.utils
import
(
from
vllm.v1.attention.backend
import
(
AttentionImpl
,
AttentionMetadataBuilder
,
CommonAttentionMetadata
,
)
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.kv_cache_interface
import
FullAttentionSpec
...
...
@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
@
dataclass
class
BackendConfig
:
name
:
str
env_vars
:
dict
comp_config
:
dict
# compilation config
attention_config
:
dict
comp_config
:
dict
specific_gpu_arch
:
tuple
|
None
=
None
...
...
@@ -259,10 +259,10 @@ full_cg_backend_configs = {
# FA3 on Hopper
"FA3"
:
BackendConfig
(
name
=
"FA3"
,
env_vars
=
{
"
VLLM_ATTENTION_BACKEND
"
:
"FLASH_ATTN"
,
"
VLLM_FLASH_ATTN_VERSION"
:
"3"
,
"
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
"
:
"
16
"
,
attention_config
=
{
"
backend
"
:
"FLASH_ATTN"
,
"
flash_attn_version"
:
3
,
"
flash_attn_max_num_splits_for_cuda_graph
"
:
16
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL"
,
...
...
@@ -272,9 +272,7 @@ full_cg_backend_configs = {
# FlashMLA on Hopper
"FlashMLA"
:
BackendConfig
(
name
=
"FlashMLA"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND"
:
"FLASHMLA"
,
},
attention_config
=
{
"backend"
:
"FLASHMLA"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -283,9 +281,7 @@ full_cg_backend_configs = {
# Cutlass MLA on Blackwell
"CutlassMLA"
:
BackendConfig
(
name
=
"CutlassMLA"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND"
:
"CUTLASS_MLA"
,
},
attention_config
=
{
"backend"
:
"CUTLASS_MLA"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -294,9 +290,7 @@ full_cg_backend_configs = {
# FlashInfer MLA on Blackwell
"FlashInferMLA"
:
BackendConfig
(
name
=
"FlashInferMLA"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND"
:
"FLASHINFER_MLA"
,
},
attention_config
=
{
"backend"
:
"FLASHINFER_MLA"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -305,9 +299,9 @@ full_cg_backend_configs = {
# FlashAttention MLA on Hopper
"FlashAttentionMLA"
:
BackendConfig
(
name
=
"FlashAttentionMLA"
,
env_vars
=
{
"
VLLM_ATTENTION_BACKEND
"
:
"FLASH_ATTN_MLA"
,
"
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
"
:
"
16
"
,
attention_config
=
{
"
backend
"
:
"FLASH_ATTN_MLA"
,
"
flash_attn_max_num_splits_for_cuda_graph
"
:
16
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_DECODE_ONLY"
,
...
...
@@ -317,10 +311,10 @@ full_cg_backend_configs = {
# FA2
"FA2"
:
BackendConfig
(
name
=
"FA2"
,
env_vars
=
{
"
VLLM_ATTENTION_BACKEND
"
:
"FLASH_ATTN"
,
"
VLLM_FLASH_ATTN_VERSION"
:
"2"
,
"
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
"
:
"
16
"
,
attention_config
=
{
"
backend
"
:
"FLASH_ATTN"
,
"
flash_attn_version"
:
2
,
"
flash_attn_max_num_splits_for_cuda_graph
"
:
16
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
...
...
@@ -329,7 +323,7 @@ full_cg_backend_configs = {
# Triton Attention
"TritonAttn"
:
BackendConfig
(
name
=
"TritonAttn"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND
"
:
"TRITON_ATTN"
},
attention_config
=
{
"backend
"
:
"TRITON_ATTN"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
...
...
@@ -337,14 +331,17 @@ full_cg_backend_configs = {
# FlashInfer
"FlashInfer"
:
BackendConfig
(
name
=
"FlashInfer"
,
env_vars
=
{
"VLLM_ATTENTION_BACKEND
"
:
"FLASHINFER"
},
attention_config
=
{
"backend
"
:
"FLASHINFER"
},
comp_config
=
{
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
,
},
),
"RocmAttn"
:
BackendConfig
(
name
=
"RocmAttn"
,
env_vars
=
{
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
:
"1"
},
attention_config
=
{
"backend"
:
"ROCM_ATTN"
,
"use_prefill_decode_attention"
:
True
,
},
comp_config
=
{
"cudagraph_mode"
:
"FULL"
,
},
...
...
tests/v1/core/test_kv_cache_utils.py
View file @
7e63ef82
...
...
@@ -1800,3 +1800,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
)
)
assert
block_hashes
[
1
]
==
expected_hash2
def test_auto_fit_max_model_len():
    """Check that max_model_len is auto-fitted when the original value is -1.

    Two scenarios are exercised with a freshly built config each time:
    ample KV-cache memory (max_model_len must stay at 1024) and limited
    memory (max_model_len must end up strictly between 0 and 1024).
    """

    def _fresh_auto_fit_config():
        # Build the config with a concrete length, then mark the original
        # request as -1 to simulate the user asking for auto-fit.
        cfg = ModelConfig(max_model_len=1024)
        cfg.original_max_model_len = -1
        return VllmConfig(model_config=cfg)

    vllm_config = _fresh_auto_fit_config()

    # 16 * 2 * 64 * 4 * 2 bytes == 16KB per block per layer
    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # Scenario 1: plenty of memory, so the length is expected to be unchanged.
    large_available_memory = mem_per_block_per_layer * 2 * 1024
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [large_available_memory]
    )
    assert vllm_config.model_config.max_model_len == 1024

    # Scenario 2: start again from an untouched config.
    vllm_config = _fresh_auto_fit_config()

    # Memory for 32 blocks across the 2 layers; per the original test's
    # note that corresponds to 32 * 16 = 512 tokens, i.e. less than 1024.
    limited_memory = mem_per_block_per_layer * 2 * 32
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [limited_memory]
    )

    # The length must have been reduced, but remain positive.
    assert vllm_config.model_config.max_model_len < 1024
    assert vllm_config.model_config.max_model_len > 0
def test_auto_fit_max_model_len_not_triggered():
    """Check that max_model_len is left alone when auto-fit was not requested.

    The config is built without touching original_max_model_len, and the
    test asserts that max_model_len is still 16 after the KV cache configs
    have been computed.
    """
    # original_max_model_len is deliberately not set to -1 here.
    vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))

    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        layer_name: new_kv_cache_spec() for layer_name in ("layer_1", "layer_2")
    }

    # Regular (non auto-fit) path: 32 blocks worth of memory for 2 layers.
    available_memory = mem_per_block_per_layer * 2 * 32
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [available_memory]
    )

    assert vllm_config.model_config.max_model_len == 16
tests/v1/core/test_kv_sharing.py
View file @
7e63ef82
...
...
@@ -11,7 +11,9 @@ pytestmark = pytest.mark.cpu_test
def
new_kv_cache_spec
():
return
FullAttentionSpec
(
16
,
1
,
1
,
torch
.
float32
,
False
)
return
FullAttentionSpec
(
block_size
=
16
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
)
def
test_initialize_kv_cache_for_kv_sharing_different_attn_groups
():
...
...
tests/v1/core/test_prefix_caching.py
View file @
7e63ef82
...
...
@@ -35,6 +35,7 @@ from vllm.v1.kv_cache_interface import (
FullAttentionSpec
,
KVCacheConfig
,
KVCacheGroupSpec
,
MambaSpec
,
SlidingWindowSpec
,
)
...
...
@@ -94,35 +95,105 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
kv_cache_groups
=
[
KVCacheGroupSpec
(
[
"layer"
],
FullAttentionSpec
(
block_size
,
1
,
1
,
torch
.
float32
),
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
),
)
],
)
def
make_kv_cache_config_hybrid_model
(
block_size
:
int
,
num_blocks
:
int
block_size
:
int
,
num_blocks
:
int
,
second_spec_type
:
str
=
"sliding_window"
)
->
KVCacheConfig
:
if
second_spec_type
==
"sliding_window"
:
second_spec
=
SlidingWindowSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
sliding_window
=
2
*
block_size
,
)
elif
second_spec_type
==
"mamba"
:
second_spec
=
MambaSpec
(
block_size
=
block_size
,
shapes
=
(
1
,
1
),
dtypes
=
(
torch
.
float32
,),
)
return
KVCacheConfig
(
num_blocks
=
num_blocks
,
kv_cache_tensors
=
[],
kv_cache_groups
=
[
KVCacheGroupSpec
(
[
"layer1"
],
FullAttentionSpec
(
block_size
,
1
,
1
,
torch
.
float32
),
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
),
),
KVCacheGroupSpec
(
[
"layer2"
],
SlidingWindowSpec
(
block_size
,
1
,
1
,
torch
.
float32
,
sliding_window
=
2
*
block_size
),
second_spec
,
),
KVCacheGroupSpec
(
[
"layer3"
],
second_spec
,
),
],
)
def
make_kv_cache_config_three_types
(
block_size
:
int
,
num_blocks
:
int
,
third_spec_type
:
str
=
"mamba"
)
->
KVCacheConfig
:
if
third_spec_type
==
"mamba"
:
third_spec
=
MambaSpec
(
block_size
=
block_size
,
shapes
=
(
1
,
1
),
dtypes
=
(
torch
.
float32
,),
)
elif
third_spec_type
==
"sliding_window"
:
third_spec
=
SlidingWindowSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
sliding_window
=
4
*
block_size
,
)
return
KVCacheConfig
(
num_blocks
=
num_blocks
,
kv_cache_tensors
=
[],
kv_cache_groups
=
[
KVCacheGroupSpec
(
[
"layer1"
],
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
),
),
KVCacheGroupSpec
(
[
"layer2"
],
SlidingWindowSpec
(
block_size
,
1
,
1
,
torch
.
float32
,
sliding_window
=
2
*
block_size
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
sliding_window
=
2
*
block_size
,
),
),
KVCacheGroupSpec
(
[
"layer3"
],
third_spec
,
),
],
)
...
...
@@ -406,6 +477,184 @@ def test_prefill_hybrid_model():
)
def _make_hybrid_kv_cache_config(
    block_size: int, num_blocks: int, spec_types: list[str]
) -> KVCacheConfig:
    """
    Build a KVCacheConfig with one KV cache group per requested spec type.

    Group ``i`` holds a single layer named ``f"layer{i}"``.

    Args:
        block_size: The block size for KV cache.
        num_blocks: The number of blocks in the KV cache.
        spec_types: List of spec type strings. Supported types:
            - "full": FullAttentionSpec
            - "sliding_window": SlidingWindowSpec with window=2*block_size
            - "sliding_window_large": SlidingWindowSpec with window=4*block_size
            - "mamba": MambaSpec

    Raises:
        KeyError: If an entry of ``spec_types`` is not one of the names above.
    """
    # Keyword arguments shared by all attention-style specs.
    attn_kwargs = {
        "block_size": block_size,
        "num_kv_heads": 1,
        "head_size": 1,
        "dtype": torch.float32,
    }

    # Spec name -> (spec class, constructor kwargs). Instances are created
    # per group below, so every group gets its own spec object.
    spec_recipes = {
        "full": (FullAttentionSpec, attn_kwargs),
        "sliding_window": (
            SlidingWindowSpec,
            {**attn_kwargs, "sliding_window": 2 * block_size},
        ),
        "sliding_window_large": (
            SlidingWindowSpec,
            {**attn_kwargs, "sliding_window": 4 * block_size},
        ),
        "mamba": (
            MambaSpec,
            {
                "block_size": block_size,
                "shapes": (1, 1),
                "dtypes": (torch.float32,),
            },
        ),
    }

    groups = []
    for idx, kind in enumerate(spec_types):
        spec_cls, spec_kwargs = spec_recipes[kind]
        groups.append(KVCacheGroupSpec([f"layer{idx}"], spec_cls(**spec_kwargs)))

    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=[],
        kv_cache_groups=groups,
    )
# Test cases covering various combinations of KV cache spec types:
# - Varying number of groups (2, 3, or 4)
# - 0, 1, or 2 full attention groups
# - Sliding window with different window sizes
# - Interleaved group IDs (full attn and other types mixed)
# - Mamba spec combinations
#
# Each entry is a pytest.param wrapping a list of spec-type names; the `id`
# is the label pytest shows for that parametrized case.
_HYBRID_MODEL_TEST_CASES = [
    # 2 groups: 1 full + 1 other
    pytest.param(["full", "sliding_window"], id="2g-full+sw"),
    pytest.param(["full", "mamba"], id="2g-full+mamba"),
    # 2 groups: 0 full (all other types)
    pytest.param(["sliding_window", "mamba"], id="2g-sw+mamba"),
    pytest.param(["sliding_window", "sliding_window_large"], id="2g-sw+sw_large"),
    # 3 groups: 1 full + 2 others (same type)
    pytest.param(["full", "sliding_window", "sliding_window"], id="3g-full+2sw"),
    pytest.param(["full", "mamba", "mamba"], id="3g-full+2mamba"),
    # 3 groups: 1 full + 2 others (different types)
    pytest.param(["full", "sliding_window", "mamba"], id="3g-full+sw+mamba"),
    pytest.param(
        ["full", "sliding_window", "sliding_window_large"],
        id="3g-full+sw+sw_large",
    ),
    # 3 groups: 2 full + 1 other
    pytest.param(["full", "full", "sliding_window"], id="3g-2full+sw"),
    pytest.param(["full", "full", "mamba"], id="3g-2full+mamba"),
    # 4 groups: interleaved (full, other, full, other)
    pytest.param(
        ["full", "sliding_window", "full", "sliding_window_large"],
        id="4g-interleaved-full+sw+sw_large",
    ),
    pytest.param(
        ["full", "mamba", "full", "mamba"],
        id="4g-interleaved-full+mamba",
    ),
    # 4 groups: interleaved with different sliding windows
    # NOTE(review): this spec list is identical to the
    # "4g-interleaved-full+sw+sw_large" case above, so the two ids exercise
    # the same combination -- confirm whether one was meant to differ.
    pytest.param(
        ["full", "sliding_window", "full", "sliding_window_large"],
        id="4g-interleaved-full+sw_mixed",
    ),
    # 4 groups: 0 full (all other types)
    pytest.param(
        ["sliding_window", "mamba", "sliding_window_large", "mamba"],
        id="4g-sw+mamba+sw_large+mamba",
    ),
    # 4 groups: 2 full + 2 others (grouped)
    pytest.param(
        ["full", "full", "sliding_window", "mamba"],
        id="4g-2full+sw+mamba",
    ),
]
@pytest.mark.parametrize("spec_types", _HYBRID_MODEL_TEST_CASES)
def test_prefill_hybrid_model_combinations(spec_types: list[str]):
    """
    Exercise prefix caching on hybrid models built from ``spec_types``.

    One KV cache group is created per entry of ``spec_types``. The test then:
    - schedules a first request and asserts nothing is reported as computed,
    - schedules a second request sharing a 3-block prefix and asserts that
      3 * block_size tokens are reported as computed for every group,
    - frees both requests.

    The parametrization covers:
    - Various combinations (full attn + other attn types)
    - Varying number of groups (2, 3, or 4)
    - 0, 1, or 2 full attention groups in the combination
    - Two sliding_window attn groups with different window sizes
    - Interleaved group IDs (full attn and other types alternating)
    - Mamba spec with other attention types
    """
    block_size = 16
    num_groups = len(spec_types)

    # 10 blocks per group so that allocation never runs short.
    manager = KVCacheManager(
        _make_hybrid_kv_cache_config(block_size, 10 * num_groups, spec_types),
        max_model_len=8192,
        enable_caching=True,
        hash_block_size=block_size,
    )
    hash_fn = sha256

    # Shared prefix: 3 complete blocks (48 tokens), block k filled with k.
    common_token_ids = []
    for token in range(3):
        common_token_ids.extend([token] * block_size)
    # 7 trailing tokens that do not fill a block.
    unique_token_ids = [3] * 7

    # --- First request: nothing is cached yet ---
    req0 = make_request(
        "0", common_token_ids + unique_token_ids, block_size, hash_fn
    )
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert len(req0.block_hashes) == 3
    assert not computed_blocks.blocks[0]  # No cache hit initially
    assert num_computed_tokens == 0

    # 55 == 48 shared tokens + 7 unique tokens.
    allocated = manager.allocate_slots(
        req0,
        55,
        len(computed_blocks.blocks[0]) * block_size,
        computed_blocks,
    )
    assert allocated is not None
    # One list of block ids per KV cache group.
    assert len(allocated.get_block_ids()) == num_groups

    # --- Second request: same prefix, different 5-token tail ---
    req1 = make_request("1", common_token_ids + [4] * 5, block_size, hash_fn)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    # The whole shared prefix is expected to be reported for all groups.
    assert num_computed_tokens == 3 * block_size
    assert len(computed_blocks.blocks) == num_groups

    # Only the tokens that were not reported as computed are requested.
    num_new_tokens = len(common_token_ids) + 5 - num_computed_tokens
    allocated = manager.allocate_slots(
        req1,
        num_new_tokens,
        num_computed_tokens,
        computed_blocks,
    )
    assert allocated is not None
    assert len(allocated.get_block_ids()) == num_groups

    manager.free(req0)
    manager.free(req1)
def
test_prefill_plp
():
"""Test prefill with APC and some prompt logprobs (plp) requests.
...
...
@@ -1356,6 +1605,69 @@ def test_kv_cache_events(blocks_to_cache: int):
assert
len
(
manager
.
block_pool
.
cached_block_hash_to_block
)
==
0
def test_null_parent_block_hash():
    """Check the BlockStored event when the physical parent is the null block.

    The block list passed to ``cache_full_blocks`` has ``pool.null_block`` at
    index ``num_cached_blocks - 1``. The test asserts that the single emitted
    event takes its parent hash from ``req.block_hashes`` rather than from
    the (hash-less) null block, and that the null block stays un-hashed.
    """
    block_size = 1
    num_cached_blocks = 2
    num_full_blocks = 4

    pool = BlockPool(
        num_gpu_blocks=8,
        enable_caching=True,
        hash_block_size=block_size,
        enable_kv_cache_events=True,
    )

    req = make_request(
        "req_null_parent",
        prompt_token_ids=[10, 11, 12, 13],
        block_size=block_size,
        hash_fn=sha256,
    )
    assert len(req.block_hashes) == num_full_blocks

    # Physical parent is `null_block` (no hash), while the logical parent hash
    # still exists in `request.block_hashes[num_cached_blocks - 1]`.
    assert pool.null_block.block_hash is None
    new_blocks = pool.get_new_blocks(num_full_blocks - 1)
    # Both slices are unpacked so `blocks` is a flat list of blocks. Without
    # the leading `*`, index 0 would hold a nested list instead of a block.
    blocks = [
        *new_blocks[: num_cached_blocks - 1],
        pool.null_block,  # physical parent
        *new_blocks[num_cached_blocks - 1 :],
    ]

    pool.cache_full_blocks(
        request=req,
        blocks=blocks,
        num_cached_blocks=num_cached_blocks,
        num_full_blocks=num_full_blocks,
        block_size=block_size,
        kv_cache_group_id=0,
    )

    events = pool.take_events()
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, BlockStored)

    # The parent hash must come from the request's logical hash chain.
    expected_parent = kv_cache_utils.maybe_convert_block_hash(
        req.block_hashes[num_cached_blocks - 1]
    )
    assert event.parent_block_hash == expected_parent
    assert event.parent_block_hash is not None

    # The event must list the hashes of the blocks from num_cached_blocks
    # up to num_full_blocks.
    expected_new_hashes = [
        kv_cache_utils.maybe_convert_block_hash(h)
        for h in req.block_hashes[num_cached_blocks:num_full_blocks]
    ]
    assert event.block_hashes == expected_new_hashes

    # Ensure we didn't accidentally assign a hash to the null block.
    assert pool.null_block.block_hash is None
    # Sanity check: newly cached physical blocks should have hashes assigned.
    assert blocks[num_cached_blocks].block_hash is not None
    assert blocks[num_full_blocks - 1].block_hash is not None
@
pytest
.
mark
.
parametrize
(
"blocks_to_cache"
,
[
2
,
3
,
10
])
def
test_kv_cache_events_with_lora
(
blocks_to_cache
:
int
):
"""Test BlockStored events contain correct lora_id when using LoRA requests."""
...
...
@@ -1553,15 +1865,20 @@ def test_different_block_size():
kv_cache_groups
=
[
KVCacheGroupSpec
(
[
"layer1"
],
FullAttentionSpec
(
block_size
*
2
,
1
,
1
,
torch
.
float16
),
FullAttentionSpec
(
block_size
=
block_size
*
2
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float16
,
),
),
KVCacheGroupSpec
(
[
"layer2"
],
SlidingWindowSpec
(
block_size
,
1
,
1
,
torch
.
float32
,
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
sliding_window
=
2
*
block_size
,
),
),
...
...
tests/v1/core/test_scheduler.py
View file @
7e63ef82
...
...
@@ -1264,10 +1264,11 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
assert
len
(
scheduler
.
waiting
)
==
0
@
pytest
.
mark
.
parametrize
(
"is_async"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"use_ec_connector, ec_role"
,
[(
False
,
None
),
(
True
,
"ec_consumer"
)]
)
def
test_kv_connector_handles_preemption
(
use_ec_connector
,
ec_role
):
def
test_kv_connector_handles_preemption
(
is_async
,
use_ec_connector
,
ec_role
):
"""
Test whether scheduler with KVConnector is able to handle
unable to allocate (run out of blocks in allocate_slots().
...
...
@@ -1280,7 +1281,9 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
NUM_MATCHED_NEW_TOKENS
=
BLOCK_SIZE
scheduler
=
create_scheduler
(
enable_prefix_caching
=
True
,
use_kv_connector
=
mock_kv
(
matched_tokens
=
NUM_MATCHED_NEW_TOKENS
,
is_async
=
False
),
use_kv_connector
=
mock_kv
(
matched_tokens
=
NUM_MATCHED_NEW_TOKENS
,
is_async
=
is_async
),
block_size
=
BLOCK_SIZE
,
num_blocks
=
NUM_BLOCKS
,
# encoder connector should not affect test results
...
...
@@ -1318,6 +1321,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
# All can be scheduled - 1st token.
output
=
scheduler
.
schedule
()
if
is_async
:
assert
len
(
scheduler
.
waiting
)
==
2
assert
scheduler
.
running
==
[]
_step_until_kv_transfer_finished
(
scheduler
,
req_ids
)
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# 2 remote kv cache hits.
...
...
@@ -1370,6 +1379,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
# Restarts the preempted request - generate 3rd token.
# This will have a local and remote cache hit.
output
=
scheduler
.
schedule
()
if
is_async
:
waiting_req_ids
=
[
req
.
request_id
for
req
in
scheduler
.
waiting
]
assert
len
(
waiting_req_ids
)
==
1
_step_until_kv_transfer_finished
(
scheduler
,
waiting_req_ids
)
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# 1 remote kv_cache hit!
...
...
@@ -1380,6 +1395,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
0
assert
output
.
scheduled_cached_reqs
.
num_reqs
==
1
assert
output
.
scheduled_new_reqs
==
[]
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
0
...
...
@@ -1392,6 +1409,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
,
)
assert
output
.
scheduled_cached_reqs
.
num_reqs
==
1
assert
output
.
scheduled_new_reqs
==
[]
assert
len
(
scheduler
.
running
)
==
1
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
0
...
...
@@ -1577,7 +1596,13 @@ def create_scheduler_with_priority(
kv_cache_tensors
=
[],
kv_cache_groups
=
[
KVCacheGroupSpec
(
[
"layer"
],
FullAttentionSpec
(
block_size
,
1
,
1
,
torch
.
float32
,
False
)
[
"layer"
],
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
),
)
],
)
...
...
@@ -2288,7 +2313,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
# 4th Schedule - this should trigger the resumption
output
=
scheduler
.
schedule
()
scheduled_cached_reqs
=
output
.
scheduled_cached_reqs
resumed_from_preemption
=
scheduled_cached_reqs
.
resumed_from_preemption
assert
len
(
output
.
scheduled_new_reqs
)
==
0
assert
scheduled_cached_reqs
.
num_reqs
==
1
...
...
@@ -2296,14 +2320,14 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
assert
len
(
scheduler
.
running
)
==
1
# Preempted request resumed in scheduled_cached_reqs
assert
len
(
resumed_from_preemption
)
==
1
assert
len
(
scheduled_cached_reqs
.
resumed_req_token_ids
)
==
1
assert
resumed_from_preemption
[
0
]
assert
len
(
scheduled_cached_reqs
.
resumed_req_ids
)
==
1
assert
len
(
scheduled_cached_reqs
.
all_token_ids
)
==
1
assert
scheduled_cached_reqs
.
req_ids
[
0
]
==
request_low
.
request_id
assert
scheduled_cached_reqs
.
resumed_req_token_ids
[
0
]
is
not
None
assert
request_low
.
request_id
in
scheduled_cached_reqs
.
resumed_req_ids
assert
request_low
.
request_id
in
scheduled_cached_reqs
.
all_token_ids
# Resumed tokens include 30 prompt tokens and 2 decoded tokens
assert
len
(
scheduled_cached_reqs
.
resumed_req_token_ids
[
0
])
==
32
assert
scheduled_cached_reqs
.
resumed_req_token_ids
[
0
][
31
]
==
100
assert
len
(
scheduled_cached_reqs
.
all_token_ids
[
request_low
.
request_id
])
==
32
assert
scheduled_cached_reqs
.
all_token_ids
[
request_low
.
request_id
][
31
]
==
100
@
pytest
.
mark
.
parametrize
(
...
...
@@ -3126,7 +3150,6 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
# 4th Schedule - this should trigger req_low resumption from waiting
output
=
scheduler
.
schedule
()
scheduled_cached_reqs
=
output
.
scheduled_cached_reqs
resumed_from_preemption
=
scheduled_cached_reqs
.
resumed_from_preemption
assert
len
(
output
.
scheduled_new_reqs
)
==
0
assert
scheduled_cached_reqs
.
num_reqs
==
1
...
...
@@ -3134,14 +3157,14 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
assert
len
(
scheduler
.
running
)
==
1
# Preempted request resumed in scheduled_cached_reqs
assert
len
(
resumed_from_preemption
)
==
1
assert
len
(
scheduled_cached_reqs
.
resumed_req_token_ids
)
==
1
assert
resumed_from_preemption
[
0
]
assert
len
(
scheduled_cached_reqs
.
resumed_req_ids
)
==
1
assert
len
(
scheduled_cached_reqs
.
all_token_ids
)
==
1
assert
scheduled_cached_reqs
.
req_ids
[
0
]
==
request_low
.
request_id
assert
scheduled_cached_reqs
.
resumed_req_token_ids
[
0
]
is
not
None
assert
request_low
.
request_id
in
scheduled_cached_reqs
.
resumed_req_ids
assert
request_low
.
request_id
in
scheduled_cached_reqs
.
all_token_ids
## Resumed tokens include 94 prompt tokens and 2 decoded tokens
assert
len
(
scheduled_cached_reqs
.
resumed_req_token_ids
[
0
])
==
96
assert
scheduled_cached_reqs
.
resumed_req_token_ids
[
0
][
95
]
==
100
assert
len
(
scheduled_cached_reqs
.
all_token_ids
[
request_low
.
request_id
])
==
96
assert
scheduled_cached_reqs
.
all_token_ids
[
request_low
.
request_id
][
95
]
==
100
assert
scheduler
.
running
[
0
].
request_id
==
request_low
.
request_id
assert
request_high
.
request_id
in
output
.
finished_req_ids
...
...
@@ -3330,3 +3353,28 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
# ==============================================================================
# EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests end
# ==============================================================================
def
test_prepend_skipped_requests_order
():
scheduler
=
create_scheduler
(
max_num_seqs
=
1
,
use_kv_connector
=
True
)
requests
=
create_requests
(
num_requests
=
4
)
for
request
in
requests
:
scheduler
.
add_request
(
request
)
# 4 requests waiting, capture their order
expected_waiting_reqs
=
list
(
scheduler
.
waiting
)
# simulate first 2 waiting requests are waiting for remote KVs
for
req
in
expected_waiting_reqs
[:
2
]:
req
.
status
=
RequestStatus
.
WAITING_FOR_REMOTE_KVS
# schedule step
# expect the first 2 waiting to be skipped, the third running,
# and the fourth waiting
scheduler
.
schedule
()
# pop the third request which is expected to be running
expected_waiting_reqs
.
pop
(
2
)
# verify waiting order is preserved
assert
list
(
scheduler
.
waiting
)
==
expected_waiting_reqs
tests/v1/core/test_single_type_kv_cache_manager.py
View file @
7e63ef82
...
...
@@ -21,13 +21,23 @@ from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowS
pytestmark
=
pytest
.
mark
.
cpu_test
def
get_sliding_window_manager
(
sliding_window_spec
,
block_pool
):
return
SlidingWindowManager
(
sliding_window_spec
,
block_pool
,
kv_cache_group_id
=
0
)
def
get_sliding_window_manager
(
sliding_window_spec
,
block_pool
,
enable_caching
=
True
):
return
SlidingWindowManager
(
sliding_window_spec
,
block_pool
,
enable_caching
=
enable_caching
,
kv_cache_group_id
=
0
,
)
def
get_chunked_local_attention_manager
(
chunked_local_attention_spec
,
block_pool
):
def
get_chunked_local_attention_manager
(
chunked_local_attention_spec
,
block_pool
,
enable_caching
=
True
):
return
ChunkedLocalAttentionManager
(
chunked_local_attention_spec
,
block_pool
,
kv_cache_group_id
=
0
chunked_local_attention_spec
,
block_pool
,
enable_caching
=
enable_caching
,
kv_cache_group_id
=
0
,
)
...
...
@@ -332,11 +342,53 @@ def test_get_num_blocks_to_allocate():
]
assert
(
manager
.
get_num_blocks_to_allocate
(
"1"
,
20
*
block_size
,
cached_blocks_1
)
==
20
manager
.
get_num_blocks_to_allocate
(
"1"
,
20
*
block_size
,
cached_blocks_1
,
0
)
==
20
)
assert
(
manager
.
get_num_blocks_to_allocate
(
"2"
,
20
*
block_size
,
cached_blocks_2
)
==
15
manager
.
get_num_blocks_to_allocate
(
"2"
,
20
*
block_size
,
cached_blocks_2
,
0
)
==
15
)
def
test_evictable_cached_blocks_not_double_allocated
():
block_size
=
2
sliding_window_length
=
2
*
block_size
sliding_window_spec
=
SlidingWindowSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
sliding_window
=
sliding_window_length
,
)
block_pool
=
BlockPool
(
num_gpu_blocks
=
100
,
enable_caching
=
True
,
hash_block_size
=
block_size
)
manager
=
get_sliding_window_manager
(
sliding_window_spec
,
block_pool
)
request_id
=
"req"
evictable_block
=
block_pool
.
blocks
[
1
]
# ref_cnt == 0, eviction candidate
num_blocks_to_allocate
=
manager
.
get_num_blocks_to_allocate
(
request_id
=
request_id
,
num_tokens
=
2
*
block_size
,
new_computed_blocks
=
[
evictable_block
],
total_computed_tokens
=
block_size
,
)
# Free capacity check should count evictable cached blocks, but allocation
# should only allocate the truly new block.
assert
num_blocks_to_allocate
==
2
manager
.
allocate_new_computed_blocks
(
request_id
,
[
evictable_block
],
num_local_computed_tokens
=
block_size
,
num_external_computed_tokens
=
0
,
)
new_blocks
=
manager
.
allocate_new_blocks
(
request_id
,
num_tokens
=
4
)
assert
len
(
new_blocks
)
==
1
assert
len
(
manager
.
req_to_blocks
[
request_id
])
==
2
def
test_chunked_local_attention_get_num_blocks_to_allocate
():
...
...
@@ -359,8 +411,10 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
]
assert
(
manager
.
get_num_blocks_to_allocate
(
"1"
,
20
*
block_size
,
cached_blocks_1
)
==
20
manager
.
get_num_blocks_to_allocate
(
"1"
,
20
*
block_size
,
cached_blocks_1
,
0
)
==
20
)
assert
(
manager
.
get_num_blocks_to_allocate
(
"2"
,
20
*
block_size
,
cached_blocks_2
)
==
15
manager
.
get_num_blocks_to_allocate
(
"2"
,
20
*
block_size
,
cached_blocks_2
,
0
)
==
15
)
tests/v1/core/utils.py
View file @
7e63ef82
...
...
@@ -142,7 +142,13 @@ def create_scheduler(
kv_cache_tensors
=
[],
kv_cache_groups
=
[
KVCacheGroupSpec
(
[
"layer"
],
FullAttentionSpec
(
block_size
,
1
,
1
,
torch
.
float32
,
False
)
[
"layer"
],
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
1
,
head_size
=
1
,
dtype
=
torch
.
float32
,
),
)
],
)
...
...
tests/v1/cudagraph/test_cudagraph_dispatch.py
View file @
7e63ef82
...
...
@@ -49,7 +49,10 @@ def _create_vllm_config(
mock_config
.
lora_config
=
None
# Mimic the behavior of VllmConfig.__post_init__()
if
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
:
compilation_config
.
set_splitting_ops_for_v1
()
compilation_config
.
set_splitting_ops_for_v1
(
all2all_backend
=
mock_config
.
parallel_config
.
all2all_backend
,
data_parallel_size
=
mock_config
.
parallel_config
.
data_parallel_size
,
)
# mimic VllmConfig.__post_init__
if
compilation_config
.
cudagraph_capture_sizes
:
...
...
tests/v1/cudagraph/test_cudagraph_mode.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
os
import
weakref
from
contextlib
import
ExitStack
...
...
@@ -13,26 +11,6 @@ from vllm import LLM
from
vllm.config
import
CompilationConfig
,
CompilationMode
from
vllm.platforms
import
current_platform
@
contextlib
.
contextmanager
def
temporary_environ
(
env_vars
):
"""
Temporarily set environment variables and restore them afterward.
We have to do this vs monkeypatch because monkeypatch doesn't work
with "module" scoped fixtures.
"""
original_env
=
{
k
:
os
.
environ
.
get
(
k
)
for
k
in
env_vars
}
try
:
os
.
environ
.
update
(
env_vars
)
yield
finally
:
for
k
,
v
in
original_env
.
items
():
if
v
is
None
:
os
.
environ
.
pop
(
k
,
None
)
else
:
os
.
environ
[
k
]
=
v
# test attention backend and cudagraph_mode combo
# (backend_name, cudagraph_mode, supported)
if
current_platform
.
is_rocm
():
...
...
@@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
):
pytest
.
skip
(
"Only Hopper GPUs support FA3 and FlashMLA"
)
env_vars
=
backend_config
s
[
backend_name
].
env_vars
attention_config
=
backend_config
.
attention_config
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
with
ExitStack
()
as
stack
:
if
not
supported
:
stack
.
enter_context
(
pytest
.
raises
(
Exception
))
...
...
@@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
trust_remote_code
=
True
,
gpu_memory_utilization
=
0.45
,
max_model_len
=
1024
,
attention_config
=
attention_config
,
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
cudagraph_mode
=
cudagraph_mode
),
...
...
@@ -122,9 +101,10 @@ combo_cases_2 = [
def
test_cudagraph_compilation_combo
(
backend_name
,
cudagraph_mode
,
compilation_mode
,
supported
):
env_vars
=
backend_configs
[
backend_name
].
env_vars
backend_config
=
backend_configs
[
backend_name
]
attention_config
=
backend_config
.
attention_config
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
with
ExitStack
()
as
stack
:
if
not
supported
:
stack
.
enter_context
(
pytest
.
raises
(
Exception
))
...
...
@@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo(
trust_remote_code
=
True
,
gpu_memory_utilization
=
0.45
,
max_model_len
=
1024
,
attention_config
=
attention_config
,
compilation_config
=
CompilationConfig
(
mode
=
compilation_mode
,
cudagraph_mode
=
cudagraph_mode
),
...
...
tests/v1/determinism/test_batch_invariance.py
View file @
7e63ef82
...
...
@@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
BACKENDS
,
)
def
test_v1_generation_is_deterministic_across_batch_sizes_with_needle
(
backend
,
monkeypatch
:
pytest
.
MonkeyPatch
backend
,
):
"""
Ensures that the same request (the 'needle' prompt) yields identical output
...
...
@@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
seed
=
int
(
os
.
getenv
(
"VLLM_TEST_SEED"
,
"12345"
))
random
.
seed
(
seed
)
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
attention_config
=
{
"backend"
:
backend
}
# Allow overrides from environment (useful for CI tuning)
# "facebook/opt-125m" is too small, doesn't reliably test determinism
model
=
resolve_model_name
(
backend
)
...
...
@@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
max_num_seqs
=
max_batch_size
,
gpu_memory_utilization
=
gpu_mem_util
,
max_model_len
=
max_model_len
,
attention_config
=
attention_config
,
)
# Baseline generation for the needle prompt alone.
...
...
@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
max_num_seqs
=
max_batch_size
,
gpu_memory_utilization
=
gpu_mem_util
,
max_model_len
=
max_model_len
,
attention_config
=
attention_config
,
)
mismatches
=
0
...
...
@@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
BACKENDS
,
)
def
test_logprobs_bitwise_batch_invariance_bs1_vs_bsN
(
backend
,
monkeypatch
:
pytest
.
MonkeyPatch
backend
,
):
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
seed
=
int
(
os
.
getenv
(
"VLLM_TEST_SEED"
,
"12345"
))
random
.
seed
(
seed
)
model_name
=
resolve_model_name
(
backend
)
...
...
@@ -188,12 +188,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
llm
=
LLM
(
model
=
model_name
,
tensor_parallel_size
=
tp_size
,
# enable_prefix_caching=False,
max_num_seqs
=
32
,
max_model_len
=
8192
,
dtype
=
"bfloat16"
,
# not everything is supported
gpu_memory_utilization
=
0.9
,
enforce_eager
=
IS_DEVICE_CAPABILITY_BELOW_90
,
attention_config
=
{
"backend"
:
backend
},
)
# Use more realistic prompts for better token generation
...
...
@@ -382,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
"backend"
,
BACKENDS
,
)
def
test_simple_generation
(
backend
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_simple_generation
(
backend
):
"""
Simple test that runs the model with a basic prompt and prints the output.
Useful for quick smoke testing and debugging.
"""
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
model
=
resolve_model_name
(
backend
)
llm
=
LLM
(
...
...
@@ -399,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
dtype
=
"bfloat16"
,
enable_prefix_caching
=
False
,
enforce_eager
=
IS_DEVICE_CAPABILITY_BELOW_90
,
attention_config
=
{
"backend"
:
backend
},
)
prompt
=
"the capital of france is"
...
...
@@ -445,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail(
The test will PASS if we detect differences (proving batch invariance matters).
The test will FAIL if everything matches (suggesting batch invariance isn't needed).
"""
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
# CRITICAL: Disable batch invariance for this test
monkeypatch
.
setenv
(
"VLLM_BATCH_INVARIANT"
,
"0"
)
monkeypatch
.
setattr
(
batch_invariant
,
"VLLM_BATCH_INVARIANT"
,
False
)
...
...
@@ -466,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
max_model_len
=
8192
,
dtype
=
"bfloat16"
,
enforce_eager
=
IS_DEVICE_CAPABILITY_BELOW_90
,
attention_config
=
{
"backend"
:
backend
},
)
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
...
...
@@ -650,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail(
@
skip_unsupported
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
])
def
test_decode_logprobs_match_prefill_logprobs
(
backend
,
monkeypatch
:
pytest
.
MonkeyPatch
backend
,
):
"""
Test that verifies decode logprobs match prefill logprobs.
...
...
@@ -665,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs(
This ensures that the logprobs from decode are consistent with what
we would get if we ran prefill on each prefix.
"""
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
seed
=
int
(
os
.
getenv
(
"VLLM_TEST_SEED"
,
"12345"
))
random
.
seed
(
seed
)
model_name
=
resolve_model_name
(
backend
)
...
...
@@ -690,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs(
max_model_len
=
8192
,
dtype
=
"bfloat16"
,
enforce_eager
=
IS_DEVICE_CAPABILITY_BELOW_90
,
attention_config
=
{
"backend"
:
backend
},
)
# Use a few test prompts
...
...
@@ -921,6 +919,7 @@ def LLM_with_max_seqs(
max_num_seqs
:
int
,
gpu_memory_utilization
:
float
,
max_model_len
:
int
,
attention_config
:
dict
|
None
=
None
,
)
->
LLM
:
"""
Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
...
...
@@ -935,6 +934,7 @@ def LLM_with_max_seqs(
tensor_parallel_size
=
int
(
os
.
getenv
(
"VLLM_TP_SIZE"
,
"1"
)),
enable_prefix_caching
=
False
,
enforce_eager
=
IS_DEVICE_CAPABILITY_BELOW_90
,
attention_config
=
attention_config
,
# Enable for MOE models
# enable_expert_parallel=True,
)
tests/v1/determinism/test_online_batch_invariance.py
View file @
7e63ef82
...
...
@@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process(
@
skip_unsupported
@
pytest
.
mark
.
parametrize
(
"backend"
,
BACKENDS
)
def
test_logprobs_bitwise_batch_invariance_bs1_vs_bsN
(
backend
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
backend
:
str
,
)
->
None
:
random
.
seed
(
int
(
os
.
getenv
(
"VLLM_TEST_SEED"
,
"12345"
)))
# Override backend for this test (and the RemoteOpenAIServer child process).
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
model_name
=
resolve_model_name
(
backend
)
prompts_all
=
[
_random_prompt
(
10
,
50
)
for
_
in
range
(
32
)]
...
...
@@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
server_args
:
list
[
str
]
=
[
"--max-model-len=8192"
,
"--max-num-seqs=32"
,
f
"--attention-backend=
{
backend
}
"
,
]
if
tp_size
:
server_args
+=
[
"-tp"
,
tp_size
]
...
...
tests/v1/determinism/test_rms_norm_batch_invariant.py
View file @
7e63ef82
...
...
@@ -21,7 +21,11 @@ from vllm.model_executor.layers.layernorm import RMSNorm
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"eps"
,
[
1e-6
,
1e-5
])
def
test_rms_norm_batch_invariant_vs_standard
(
batch_size
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
eps
:
float
default_vllm_config
,
batch_size
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
eps
:
float
,
):
"""
Compare batch-invariant Triton RMS norm against standard CUDA implementation.
...
...
@@ -68,7 +72,9 @@ def test_rms_norm_batch_invariant_vs_standard(
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
16
,
128
])
@
pytest
.
mark
.
parametrize
(
"seq_len"
,
[
1
,
32
,
512
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
2048
,
4096
])
def
test_rms_norm_3d_input
(
batch_size
:
int
,
seq_len
:
int
,
hidden_size
:
int
):
def
test_rms_norm_3d_input
(
default_vllm_config
,
batch_size
:
int
,
seq_len
:
int
,
hidden_size
:
int
):
"""
Test RMS norm with 3D input tensors (batch, seq_len, hidden_size).
...
...
@@ -107,7 +113,7 @@ def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):
@
skip_unsupported
def
test_rms_norm_numerical_stability
():
def
test_rms_norm_numerical_stability
(
default_vllm_config
):
"""
Test RMS norm numerical stability with extreme values.
...
...
@@ -167,7 +173,7 @@ def test_rms_norm_numerical_stability():
@
skip_unsupported
def
test_rms_norm_formula
():
def
test_rms_norm_formula
(
default_vllm_config
):
"""
Test that RMS norm follows the correct mathematical formula.
...
...
@@ -201,7 +207,7 @@ def test_rms_norm_formula():
@
skip_unsupported
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
128
,
1024
,
4096
,
16384
])
def
test_rms_norm_different_hidden_sizes
(
hidden_size
:
int
):
def
test_rms_norm_different_hidden_sizes
(
default_vllm_config
,
hidden_size
:
int
):
"""
Test RMS norm with various hidden sizes to ensure block size handling.
...
...
@@ -238,7 +244,7 @@ def test_rms_norm_different_hidden_sizes(hidden_size: int):
@
skip_unsupported
def
test_rms_norm_determinism
():
def
test_rms_norm_determinism
(
default_vllm_config
):
"""
Test that batch-invariant RMS norm produces deterministic results.
...
...
tests/v1/determinism/utils.py
View file @
7e63ef82
...
...
@@ -6,9 +6,9 @@ import random
import
pytest
import
torch
from
vllm.attention.utils.fa_utils
import
flash_attn_supports_mla
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
has_flashinfer
from
vllm.v1.attention.backends.fa_utils
import
flash_attn_supports_mla
skip_unsupported
=
pytest
.
mark
.
skipif
(
not
(
current_platform
.
is_cuda
()
and
current_platform
.
has_device_capability
(
80
)),
...
...
tests/v1/distributed/test_eagle_dp.py
View file @
7e63ef82
...
...
@@ -16,7 +16,12 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
@
pytest
.
mark
.
asyncio
async
def
test_run_eagle_dp
():
async
def
test_run_eagle_dp
(
monkeypatch
:
pytest
.
MonkeyPatch
):
# This test checks that running a model with and without eagle
# leads to identical tokens. This is only true in batch invariant mode
# (because the target model verifies all draft tokens in one big forward pass)
monkeypatch
.
setenv
(
"VLLM_BATCH_INVARIANT"
,
"1"
)
target_model
=
"meta-llama/Llama-3.1-8B-Instruct"
draft_model
=
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
...
...
@@ -29,6 +34,7 @@ async def test_run_eagle_dp():
data_parallel_backend
=
"mp"
,
# ray takes more time
trust_remote_code
=
True
,
max_model_len
=
16384
,
attention_config
=
{
"backend"
:
"FLASH_ATTN"
},
)
eagle_engine_args
=
replace
(
...
...
@@ -41,9 +47,10 @@ async def test_run_eagle_dp():
)
prompt
=
"This is a test of data parallel with eagle"
num_expected_tokens
=
100
# This test might be flaky, see
# https://github.com/vllm-project/vllm/issues/31913
num_expected_tokens
=
20
sampling_params
=
SamplingParams
(
min_tokens
=
num_expected_tokens
,
max_tokens
=
num_expected_tokens
,
ignore_eos
=
True
,
output_kind
=
RequestOutputKind
.
FINAL_ONLY
,
...
...
Prev
1
…
27
28
29
30
31
32
33
34
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment