Commit 7e63ef82 authored by zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -323,6 +323,7 @@ def test_prefill_split_across_ubatches(
num_tokens,
batch_spec.batch_size,
split_point=split_point,
num_ubatches=2,
)
assert ubatch_slices is not None and len(ubatch_slices) == 2
......
......@@ -98,6 +98,27 @@ REORDER_TEST_CASES = {
expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5],
expected_modified=True,
),
"new_request_single_token_prefill": ReorderTestCase(
requests=[
(100, 0),
(1, 0), # New request with only 1 token (STILL prefill)
(50, 100),
(1, 10),
],
# Only index 3 is a true decode (has num_computed_tokens > 0)
expected_order=[3, 2, 0, 1],
expected_modified=True,
),
"multiple_new_requests_single_token_prefill": ReorderTestCase(
requests=[
(1, 0), # New prefill (1 token, no computed)
(1, 0), # New prefill (1 token, no computed)
(1, 50),
(200, 0),
],
expected_order=[2, 1, 0, 3],
expected_modified=True,
),
}
......
......@@ -172,7 +172,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
)
# Call the function
result = make_local_attention_virtual_batches(
result, _ = make_local_attention_virtual_batches(
attn_chunk_size, common_attn_metadata, block_size
)
......
......@@ -18,15 +18,15 @@ from tests.v1.attention.utils import (
try_get_attention_backend,
)
from vllm import _custom_ops as ops
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.ops.flashmla import is_flashmla_dense_supported
from vllm.attention.utils.fa_utils import flash_attn_supports_mla
from vllm.config.vllm import set_current_vllm_config
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.attention.backend import CommonAttentionMetadata
from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
from vllm.v1.attention.backends.mla.common import QueryLenSupport
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
from vllm.v1.kv_cache_interface import FullAttentionSpec
BACKENDS_TO_TEST = [
......@@ -154,12 +154,12 @@ def create_and_prepopulate_kv_cache(
MLA KV cache tensor
"""
batch_size = len(kv_c_contexts)
seq_lens = common_attn_metadata.seq_lens_cpu
seq_lens = common_attn_metadata.seq_lens.cpu()
query_lens = (
common_attn_metadata.query_start_loc_cpu[1:]
- common_attn_metadata.query_start_loc_cpu[:-1]
)
context_lens = common_attn_metadata.num_computed_tokens_cpu
context_lens = seq_lens - query_lens
block_table = common_attn_metadata.block_table_tensor
slot_mapping = common_attn_metadata.slot_mapping
......@@ -394,7 +394,11 @@ def run_attention_backend(
@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-R1"])
@pytest.mark.parametrize("tensor_parallel_size", [1, 4, 8, 16])
def test_backend_correctness(
dist_init, batch_spec_name: str, model: str, tensor_parallel_size: int
default_vllm_config,
dist_init,
batch_spec_name: str,
model: str,
tensor_parallel_size: int,
):
"""
Test that all backends produce similar outputs to a reference implementation
......
......@@ -7,8 +7,9 @@ from unittest.mock import MagicMock, patch
import pytest
import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.attention.selector import AttentionSelectorConfig
# ROCm-specific attention backend selection tests
pytestmark = pytest.mark.skipif(
......@@ -94,26 +95,20 @@ def mock_on_gfx9():
None,
AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
),
# Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
(
{"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
None,
AttentionBackendEnum.ROCM_ATTN.get_path(),
),
# Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
# Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
(
{"VLLM_ROCM_USE_AITER": "1"},
"TRITON_ATTN",
AttentionBackendEnum.TRITON_ATTN.get_path(),
),
# Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# (explicitly disabled)
(
{"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"},
None,
AttentionBackendEnum.TRITON_ATTN.get_path(),
),
# Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
# Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
(
{"VLLM_ROCM_USE_AITER": "1"},
"ROCM_ATTN",
......@@ -150,8 +145,7 @@ def test_standard_attention_backend_selection(
# Get the backend class path
from vllm.platforms.rocm import RocmPlatform
backend_path = RocmPlatform.get_attn_backend_cls(
selected_backend=backend_enum,
attn_selector_config = AttentionSelectorConfig(
head_size=128,
dtype=torch.float16,
kv_cache_dtype="auto",
......@@ -160,6 +154,11 @@ def test_standard_attention_backend_selection(
has_sink=False,
use_sparse=False,
)
backend_path = RocmPlatform.get_attn_backend_cls(
selected_backend=backend_enum, attn_selector_config=attn_selector_config
)
assert backend_path == expected_backend_path
......@@ -273,8 +272,16 @@ def test_mla_backend_selection(
if should_raise:
with pytest.raises(ValueError):
RocmPlatform.get_attn_backend_cls(
selected_backend=backend_enum,
attn_selector_config = AttentionSelectorConfig(
head_size=128,
dtype=torch.float16,
kv_cache_dtype="auto",
block_size=block_size,
use_mla=True,
has_sink=False,
use_sparse=False,
)
attn_selector_config = AttentionSelectorConfig(
head_size=128,
dtype=torch.float16,
kv_cache_dtype="auto",
......@@ -283,9 +290,13 @@ def test_mla_backend_selection(
has_sink=False,
use_sparse=False,
)
backend_path = RocmPlatform.get_attn_backend_cls(
selected_backend=backend_enum,
attn_selector_config=attn_selector_config,
)
else:
backend_path = RocmPlatform.get_attn_backend_cls(
selected_backend=backend_enum,
attn_selector_config = AttentionSelectorConfig(
head_size=128,
dtype=torch.float16,
kv_cache_dtype="auto",
......@@ -294,6 +305,11 @@ def test_mla_backend_selection(
has_sink=False,
use_sparse=False,
)
backend_path = RocmPlatform.get_attn_backend_cls(
selected_backend=backend_enum, attn_selector_config=attn_selector_config
)
assert backend_path == expected_backend_path
......@@ -309,8 +325,7 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
match="only supported on gfx9",
),
):
RocmPlatform.get_attn_backend_cls(
selected_backend=AttentionBackendEnum.ROCM_AITER_FA,
attn_selector_config = AttentionSelectorConfig(
head_size=128,
dtype=torch.float16,
kv_cache_dtype="auto",
......@@ -320,6 +335,11 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config):
use_sparse=False,
)
RocmPlatform.get_attn_backend_cls(
selected_backend=AttentionBackendEnum.ROCM_AITER_FA,
attn_selector_config=attn_selector_config,
)
def test_sparse_not_supported(mock_vllm_config):
"""Test that sparse attention is not supported on ROCm."""
......@@ -328,8 +348,7 @@ def test_sparse_not_supported(mock_vllm_config):
with pytest.raises(
AssertionError, match="Sparse MLA backend on ROCm only supports block size 1"
):
RocmPlatform.get_attn_backend_cls(
selected_backend=None,
attn_selector_config = AttentionSelectorConfig(
head_size=128,
dtype=torch.float16,
kv_cache_dtype="auto",
......@@ -338,3 +357,7 @@ def test_sparse_not_supported(mock_vllm_config):
has_sink=False,
use_sparse=True,
)
RocmPlatform.get_attn_backend_cls(
selected_backend=None, attn_selector_config=attn_selector_config
)
......@@ -22,15 +22,16 @@ from tests.v1.attention.utils import (
create_vllm_config,
)
from vllm import _custom_ops as ops
from vllm.attention.ops import flashmla
from vllm.config import set_current_vllm_config
from vllm.model_executor.layers.linear import ColumnParallelLinear
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.mla.flashmla_sparse import (
FlashMLASparseBackend,
triton_convert_req_index_to_global_index,
)
from vllm.v1.attention.backends.utils import split_prefill_chunks
from vllm.v1.attention.ops import flashmla
from ...utils import models_path_prefix
SPARSE_BACKEND_BATCH_SPECS = {
......@@ -125,8 +126,16 @@ def _quantize_dequantize_fp8_ds_mla(
reason="FlashMLASparseBackend requires CUDA 9.0 or higher",
)
def test_sparse_backend_decode_correctness(
dist_init, batch_name, kv_cache_dtype, tensor_parallel_size, workspace_init
default_vllm_config,
dist_init,
batch_name,
kv_cache_dtype,
tensor_parallel_size,
workspace_init,
):
if current_platform.is_rocm():
pytest.skip("ROCm does not support fp8_ds_mla data type for kv cache.")
if not torch.cuda.is_available():
pytest.skip("CUDA is required for sparse MLA decode test")
......@@ -295,7 +304,7 @@ def test_sparse_backend_decode_correctness(
positions = np.arange(starts[-1], dtype=np.int32) - np.repeat(
starts[:-1], seg_lengths
)
seq_lengths = np.asarray(common_attn_metadata.seq_lens_cpu, dtype=np.int32)
seq_lengths = np.asarray(common_attn_metadata.seq_lens.cpu(), dtype=np.int32)
prefix_lengths = seq_lengths - seg_lengths
positions += np.repeat(prefix_lengths, seg_lengths)
......
......@@ -7,8 +7,6 @@ from dataclasses import dataclass
import pytest
import torch
from vllm.attention.backends.abstract import AttentionImpl
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import (
CacheConfig,
CompilationConfig,
......@@ -20,10 +18,12 @@ from vllm.config import (
VllmConfig,
)
from vllm.config.model import ModelDType
from vllm.v1.attention.backends.utils import (
from vllm.v1.attention.backend import (
AttentionImpl,
AttentionMetadataBuilder,
CommonAttentionMetadata,
)
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.kv_cache_interface import FullAttentionSpec
......@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
@dataclass
class BackendConfig:
name: str
env_vars: dict
comp_config: dict # compilation config
attention_config: dict
comp_config: dict
specific_gpu_arch: tuple | None = None
......@@ -259,10 +259,10 @@ full_cg_backend_configs = {
# FA3 on Hopper
"FA3": BackendConfig(
name="FA3",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_FLASH_ATTN_VERSION": "3",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
attention_config={
"backend": "FLASH_ATTN",
"flash_attn_version": 3,
"flash_attn_max_num_splits_for_cuda_graph": 16,
},
comp_config={
"cudagraph_mode": "FULL",
......@@ -272,9 +272,7 @@ full_cg_backend_configs = {
# FlashMLA on Hopper
"FlashMLA": BackendConfig(
name="FlashMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASHMLA",
},
attention_config={"backend": "FLASHMLA"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
......@@ -283,9 +281,7 @@ full_cg_backend_configs = {
# Cutlass MLA on Blackwell
"CutlassMLA": BackendConfig(
name="CutlassMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
},
attention_config={"backend": "CUTLASS_MLA"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
......@@ -294,9 +290,7 @@ full_cg_backend_configs = {
# FlashInfer MLA on Blackwell
"FlashInferMLA": BackendConfig(
name="FlashInferMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
},
attention_config={"backend": "FLASHINFER_MLA"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
......@@ -305,9 +299,9 @@ full_cg_backend_configs = {
# FlashAttention MLA on Hopper
"FlashAttentionMLA": BackendConfig(
name="FlashAttentionMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
attention_config={
"backend": "FLASH_ATTN_MLA",
"flash_attn_max_num_splits_for_cuda_graph": 16,
},
comp_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
......@@ -317,10 +311,10 @@ full_cg_backend_configs = {
# FA2
"FA2": BackendConfig(
name="FA2",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_FLASH_ATTN_VERSION": "2",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
attention_config={
"backend": "FLASH_ATTN",
"flash_attn_version": 2,
"flash_attn_max_num_splits_for_cuda_graph": 16,
},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
......@@ -329,7 +323,7 @@ full_cg_backend_configs = {
# Triton Attention
"TritonAttn": BackendConfig(
name="TritonAttn",
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
attention_config={"backend": "TRITON_ATTN"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
......@@ -337,14 +331,17 @@ full_cg_backend_configs = {
# FlashInfer
"FlashInfer": BackendConfig(
name="FlashInfer",
env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
attention_config={"backend": "FLASHINFER"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
),
"RocmAttn": BackendConfig(
name="RocmAttn",
env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
attention_config={
"backend": "ROCM_ATTN",
"use_prefill_decode_attention": True,
},
comp_config={
"cudagraph_mode": "FULL",
},
......
......@@ -1800,3 +1800,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
)
)
assert block_hashes[1] == expected_hash2
def test_auto_fit_max_model_len():
    """Check auto-fitting of max_model_len when original_max_model_len is -1.

    Two scenarios are exercised against ``get_kv_cache_configs``:

    * with a large memory budget, ``max_model_len`` must stay at 1024;
    * with a small memory budget, ``max_model_len`` must end up strictly
      between 0 and 1024.
    """

    def _auto_fit_config():
        # The "user passed -1" case is simulated by overriding
        # ``original_max_model_len`` on an otherwise regular config.
        cfg = ModelConfig(max_model_len=1024)
        cfg.original_max_model_len = -1
        return VllmConfig(model_config=cfg)

    roomy_config = _auto_fit_config()

    # 16 * 2 * 64 * 4 * 2 = 16384 bytes (16 KiB), used as the size of one
    # block for one layer.
    bytes_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    layer_specs = {name: new_kv_cache_spec() for name in ("layer_1", "layer_2")}

    # Scenario 1: 1024 blocks for each of the 2 layers -- plenty of memory,
    # so the configured length is expected to be kept.
    roomy_budget = bytes_per_block_per_layer * 2 * 1024
    get_kv_cache_configs(roomy_config, [layer_specs], [roomy_budget])
    assert roomy_config.model_config.max_model_len == 1024

    # Scenario 2: start again from a fresh config and offer only 32 blocks
    # per layer (32 * 16 = 512 tokens), which cannot hold 1024 tokens.
    tight_config = _auto_fit_config()
    tight_budget = bytes_per_block_per_layer * 2 * 32
    get_kv_cache_configs(tight_config, [layer_specs], [tight_budget])

    fitted_len = tight_config.model_config.max_model_len
    assert fitted_len < 1024
    assert fitted_len > 0
def test_auto_fit_max_model_len_not_triggered():
    """Check that max_model_len is left alone when auto-fit was not requested.

    ``original_max_model_len`` is never set to -1 here, so after
    ``get_kv_cache_configs`` runs the configured length must still be 16.
    """
    # Unlike the auto-fit test, ``original_max_model_len`` is not overridden.
    vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))

    bytes_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    layer_specs = {name: new_kv_cache_spec() for name in ("layer_1", "layer_2")}

    # 32 blocks for each of the 2 layers; the call is expected to succeed
    # without touching max_model_len.
    memory_budget = bytes_per_block_per_layer * 2 * 32
    get_kv_cache_configs(vllm_config, [layer_specs], [memory_budget])

    assert vllm_config.model_config.max_model_len == 16
......@@ -11,7 +11,9 @@ pytestmark = pytest.mark.cpu_test
def new_kv_cache_spec():
    """Return a minimal ``FullAttentionSpec`` shared by the tests in this file.

    The spec uses a block size of 16, one KV head, a head size of 1 and
    float32 storage. Arguments are passed by keyword so the call does not
    depend on the positional order of the spec's fields; the superseded
    positional form (which also passed a fifth positional ``False``) is
    dropped.
    """
    return FullAttentionSpec(
        block_size=16, num_kv_heads=1, head_size=1, dtype=torch.float32
    )
def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
......
......@@ -35,6 +35,7 @@ from vllm.v1.kv_cache_interface import (
FullAttentionSpec,
KVCacheConfig,
KVCacheGroupSpec,
MambaSpec,
SlidingWindowSpec,
)
......@@ -94,35 +95,105 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
kv_cache_groups=[
KVCacheGroupSpec(
["layer"],
FullAttentionSpec(block_size, 1, 1, torch.float32),
FullAttentionSpec(
block_size=block_size,
num_kv_heads=1,
head_size=1,
dtype=torch.float32,
),
)
],
)
def make_kv_cache_config_hybrid_model(
    block_size: int, num_blocks: int, second_spec_type: str = "sliding_window"
) -> KVCacheConfig:
    """Build a three-group KV cache config for hybrid-model tests.

    Group ``layer1`` uses full attention; groups ``layer2`` and ``layer3``
    share one spec object chosen by ``second_spec_type``.

    Args:
        block_size: Block size used for every spec.
        num_blocks: Number of blocks in the resulting config.
        second_spec_type: ``"sliding_window"`` (window of ``2 * block_size``)
            or ``"mamba"``.

    Returns:
        A ``KVCacheConfig`` with no KV cache tensors and three groups.

    Raises:
        ValueError: If ``second_spec_type`` is not one of the supported names.
    """
    if second_spec_type == "sliding_window":
        second_spec = SlidingWindowSpec(
            block_size=block_size,
            num_kv_heads=1,
            head_size=1,
            dtype=torch.float32,
            sliding_window=2 * block_size,
        )
    elif second_spec_type == "mamba":
        second_spec = MambaSpec(
            block_size=block_size,
            shapes=(1, 1),
            dtypes=(torch.float32,),
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError further
        # down when ``second_spec`` would otherwise never be assigned.
        raise ValueError(
            f"Unsupported second_spec_type: {second_spec_type!r} "
            "(expected 'sliding_window' or 'mamba')"
        )

    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer1"],
                FullAttentionSpec(
                    block_size=block_size,
                    num_kv_heads=1,
                    head_size=1,
                    dtype=torch.float32,
                ),
            ),
            KVCacheGroupSpec(
                ["layer2"],
                second_spec,
            ),
            KVCacheGroupSpec(
                ["layer3"],
                second_spec,
            ),
        ],
    )
def make_kv_cache_config_three_types(
    block_size: int, num_blocks: int, third_spec_type: str = "mamba"
) -> KVCacheConfig:
    """Build a KV cache config whose three groups use three different specs.

    Group ``layer1`` uses full attention, ``layer2`` uses a sliding window of
    ``2 * block_size``, and ``layer3`` uses the spec chosen by
    ``third_spec_type``.

    Args:
        block_size: Block size used for every spec.
        num_blocks: Number of blocks in the resulting config.
        third_spec_type: ``"mamba"`` or ``"sliding_window"`` (the latter with
            a window of ``4 * block_size``).

    Returns:
        A ``KVCacheConfig`` with no KV cache tensors and three groups.

    Raises:
        ValueError: If ``third_spec_type`` is not one of the supported names.
    """
    if third_spec_type == "mamba":
        third_spec = MambaSpec(
            block_size=block_size,
            shapes=(1, 1),
            dtypes=(torch.float32,),
        )
    elif third_spec_type == "sliding_window":
        third_spec = SlidingWindowSpec(
            block_size=block_size,
            num_kv_heads=1,
            head_size=1,
            dtype=torch.float32,
            sliding_window=4 * block_size,
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError further
        # down when ``third_spec`` would otherwise never be assigned.
        raise ValueError(
            f"Unsupported third_spec_type: {third_spec_type!r} "
            "(expected 'mamba' or 'sliding_window')"
        )

    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer1"],
                FullAttentionSpec(
                    block_size=block_size,
                    num_kv_heads=1,
                    head_size=1,
                    dtype=torch.float32,
                ),
            ),
            KVCacheGroupSpec(
                ["layer2"],
                SlidingWindowSpec(
                    block_size=block_size,
                    num_kv_heads=1,
                    head_size=1,
                    dtype=torch.float32,
                    sliding_window=2 * block_size,
                ),
            ),
            KVCacheGroupSpec(
                ["layer3"],
                third_spec,
            ),
        ],
    )
......@@ -406,6 +477,184 @@ def test_prefill_hybrid_model():
)
def _make_hybrid_kv_cache_config(
    block_size: int, num_blocks: int, spec_types: list[str]
) -> KVCacheConfig:
    """
    Assemble a KVCacheConfig with one group per entry of ``spec_types``.

    Group ``i`` is named ``layer{i}`` and receives a freshly built spec.

    Args:
        block_size: The block size for KV cache.
        num_blocks: The number of blocks in the KV cache.
        spec_types: List of spec type strings. Supported types:
            - "full": FullAttentionSpec
            - "sliding_window": SlidingWindowSpec with window=2*block_size
            - "sliding_window_large": SlidingWindowSpec with window=4*block_size
            - "mamba": MambaSpec
    """

    def _full():
        return FullAttentionSpec(
            block_size=block_size,
            num_kv_heads=1,
            head_size=1,
            dtype=torch.float32,
        )

    def _sliding(window_in_blocks):
        return SlidingWindowSpec(
            block_size=block_size,
            num_kv_heads=1,
            head_size=1,
            dtype=torch.float32,
            sliding_window=window_in_blocks * block_size,
        )

    def _mamba():
        return MambaSpec(
            block_size=block_size,
            shapes=(1, 1),
            dtypes=(torch.float32,),
        )

    # Factories rather than instances: a spec is only built when requested,
    # and every group gets its own object.
    factories = {
        "full": _full,
        "sliding_window": lambda: _sliding(2),
        "sliding_window_large": lambda: _sliding(4),
        "mamba": _mamba,
    }

    groups = []
    for index, spec_type in enumerate(spec_types):
        layer_names = [f"layer{index}"]
        groups.append(KVCacheGroupSpec(layer_names, factories[spec_type]()))

    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=[],
        kv_cache_groups=groups,
    )
# Test cases covering various combinations of KV cache spec types:
# - Varying number of groups (2, 3, or 4)
# - 0, 1, or 2 full attention groups
# - Sliding window with different window sizes
# - Interleaved group IDs (full attn and other types mixed)
# - Mamba spec combinations
# Parametrization table: each entry is the list of spec type names handed to
# the hybrid KV cache config builder, one name per KV cache group.
_HYBRID_MODEL_TEST_CASES = [
    # 2 groups: 1 full + 1 other
    pytest.param(["full", "sliding_window"], id="2g-full+sw"),
    pytest.param(["full", "mamba"], id="2g-full+mamba"),
    # 2 groups: 0 full (all other types)
    pytest.param(["sliding_window", "mamba"], id="2g-sw+mamba"),
    pytest.param(["sliding_window", "sliding_window_large"], id="2g-sw+sw_large"),
    # 3 groups: 1 full + 2 others (same type)
    pytest.param(["full", "sliding_window", "sliding_window"], id="3g-full+2sw"),
    pytest.param(["full", "mamba", "mamba"], id="3g-full+2mamba"),
    # 3 groups: 1 full + 2 others (different types)
    pytest.param(["full", "sliding_window", "mamba"], id="3g-full+sw+mamba"),
    pytest.param(
        ["full", "sliding_window", "sliding_window_large"],
        id="3g-full+sw+sw_large",
    ),
    # 3 groups: 2 full + 1 other
    pytest.param(["full", "full", "sliding_window"], id="3g-2full+sw"),
    pytest.param(["full", "full", "mamba"], id="3g-2full+mamba"),
    # 4 groups: interleaved (full, other, full, other); the sliding-window
    # case also covers two different window sizes.
    # NOTE: a second entry with the identical spec list (formerly under the
    # id "4g-interleaved-full+sw_mixed") was removed because it only re-ran
    # this same case.
    pytest.param(
        ["full", "sliding_window", "full", "sliding_window_large"],
        id="4g-interleaved-full+sw+sw_large",
    ),
    pytest.param(
        ["full", "mamba", "full", "mamba"],
        id="4g-interleaved-full+mamba",
    ),
    # 4 groups: 0 full (all other types)
    pytest.param(
        ["sliding_window", "mamba", "sliding_window_large", "mamba"],
        id="4g-sw+mamba+sw_large+mamba",
    ),
    # 4 groups: 2 full + 2 others (grouped)
    pytest.param(
        ["full", "full", "sliding_window", "mamba"],
        id="4g-2full+sw+mamba",
    ),
]
@pytest.mark.parametrize("spec_types", _HYBRID_MODEL_TEST_CASES)
def test_prefill_hybrid_model_combinations(spec_types: list[str]):
    """
    Exercise prefix caching on hybrid models built from ``spec_types``.

    One KV cache group is created per entry of ``spec_types``. A first
    request fills the cache; a second request sharing a three-block prefix
    must then report that prefix as already computed, and both allocations
    must return block ids for every group.

    The parametrization covers:
    - Various combinations (full attn + other attn types)
    - Varying number of groups (2, 3, or 4)
    - 0, 1, or 2 full attention groups in the combination
    - Two sliding_window attn groups with different window sizes
    - Interleaved group IDs (full attn and other types alternating)
    - Mamba spec with other attention types
    """
    block_size = 16
    group_count = len(spec_types)
    # 10 blocks per group so that every group can be served.
    manager = KVCacheManager(
        _make_hybrid_kv_cache_config(block_size, 10 * group_count, spec_types),
        max_model_len=8192,
        enable_caching=True,
        hash_block_size=block_size,
    )
    hash_fn = sha256

    # Shared prefix: three complete blocks (48 tokens), block k filled with k.
    shared_prefix = []
    for token in range(3):
        shared_prefix.extend([token] * block_size)
    first_prompt = shared_prefix + [3] * 7

    # --- Request 0: nothing is cached yet -------------------------------
    req0 = make_request("0", first_prompt, block_size, hash_fn)
    hit_blocks, hit_tokens = manager.get_computed_blocks(req0)
    assert len(req0.block_hashes) == 3
    assert not hit_blocks.blocks[0]  # No cache hit initially
    assert hit_tokens == 0

    allocated = manager.allocate_slots(
        req0, 55, len(hit_blocks.blocks[0]) * block_size, hit_blocks
    )
    assert allocated is not None
    # One list of block ids per group.
    assert len(allocated.get_block_ids()) == group_count

    # --- Request 1: shares the prefix and must hit the cache ------------
    req1 = make_request("1", shared_prefix + [4] * 5, block_size, hash_fn)
    hit_blocks, hit_tokens = manager.get_computed_blocks(req1)
    assert hit_tokens == 3 * block_size
    assert len(hit_blocks.blocks) == group_count

    # Only the tokens beyond the cached prefix still need slots.
    remaining_tokens = len(shared_prefix) + 5 - hit_tokens
    allocated = manager.allocate_slots(
        req1,
        remaining_tokens,
        hit_tokens,
        hit_blocks,
    )
    assert allocated is not None
    assert len(allocated.get_block_ids()) == group_count

    manager.free(req0)
    manager.free(req1)
def test_prefill_plp():
"""Test prefill with APC and some prompt logprobs (plp) requests.
......@@ -1356,6 +1605,69 @@ def test_kv_cache_events(blocks_to_cache: int):
assert len(manager.block_pool.cached_block_hash_to_block) == 0
def test_null_parent_block_hash():
    """Check the BlockStored event when the physical parent is the null block.

    The block list handed to ``cache_full_blocks`` has ``pool.null_block`` in
    the position just before the first newly cached block. The emitted event
    must still carry the logical parent hash taken from
    ``req.block_hashes``, and the null block must never receive a hash.
    """
    block_size = 1
    num_cached_blocks = 2
    num_full_blocks = 4

    pool = BlockPool(
        num_gpu_blocks=8,
        enable_caching=True,
        hash_block_size=block_size,
        enable_kv_cache_events=True,
    )

    req = make_request(
        "req_null_parent",
        prompt_token_ids=[10, 11, 12, 13],
        block_size=block_size,
        hash_fn=sha256,
    )
    assert len(req.block_hashes) == num_full_blocks

    # Physical parent is `null_block` (no hash), while the logical parent hash
    # still exists in `request.block_hashes[num_cached_blocks - 1]`.
    assert pool.null_block.block_hash is None
    new_blocks = pool.get_new_blocks(num_full_blocks - 1)
    # Both slices are unpacked so that `blocks` is a flat list of blocks;
    # without the leading `*` the first entry would be a nested list.
    blocks = [
        *new_blocks[: num_cached_blocks - 1],
        pool.null_block,  # physical parent
        *new_blocks[num_cached_blocks - 1 :],
    ]

    pool.cache_full_blocks(
        request=req,
        blocks=blocks,
        num_cached_blocks=num_cached_blocks,
        num_full_blocks=num_full_blocks,
        block_size=block_size,
        kv_cache_group_id=0,
    )

    events = pool.take_events()
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, BlockStored)

    # The event's parent hash is the logical parent, not the null block's.
    expected_parent = kv_cache_utils.maybe_convert_block_hash(
        req.block_hashes[num_cached_blocks - 1]
    )
    assert event.parent_block_hash == expected_parent
    assert event.parent_block_hash is not None

    expected_new_hashes = [
        kv_cache_utils.maybe_convert_block_hash(h)
        for h in req.block_hashes[num_cached_blocks:num_full_blocks]
    ]
    assert event.block_hashes == expected_new_hashes

    # Ensure we didn't accidentally assign a hash to the null block.
    assert pool.null_block.block_hash is None
    # Sanity check: newly cached physical blocks should have hashes assigned.
    assert blocks[num_cached_blocks].block_hash is not None
    assert blocks[num_full_blocks - 1].block_hash is not None
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
def test_kv_cache_events_with_lora(blocks_to_cache: int):
"""Test BlockStored events contain correct lora_id when using LoRA requests."""
......@@ -1553,15 +1865,20 @@ def test_different_block_size():
kv_cache_groups=[
KVCacheGroupSpec(
["layer1"],
FullAttentionSpec(block_size * 2, 1, 1, torch.float16),
FullAttentionSpec(
block_size=block_size * 2,
num_kv_heads=1,
head_size=1,
dtype=torch.float16,
),
),
KVCacheGroupSpec(
["layer2"],
SlidingWindowSpec(
block_size,
1,
1,
torch.float32,
block_size=block_size,
num_kv_heads=1,
head_size=1,
dtype=torch.float32,
sliding_window=2 * block_size,
),
),
......
......@@ -1264,10 +1264,11 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
assert len(scheduler.waiting) == 0
@pytest.mark.parametrize("is_async", [False, True])
@pytest.mark.parametrize(
"use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
)
def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role):
"""
Test whether scheduler with KVConnector is able to handle
unable to allocate (run out of blocks in allocate_slots().
......@@ -1280,7 +1281,9 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
scheduler = create_scheduler(
enable_prefix_caching=True,
use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False),
use_kv_connector=mock_kv(
matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async
),
block_size=BLOCK_SIZE,
num_blocks=NUM_BLOCKS,
# encoder connector should not affect test results
......@@ -1318,6 +1321,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
# All can be scheduled - 1st token.
output = scheduler.schedule()
if is_async:
assert len(scheduler.waiting) == 2
assert scheduler.running == []
_step_until_kv_transfer_finished(scheduler, req_ids)
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# 2 remote kv cache hits.
......@@ -1370,6 +1379,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
# Restarts the preempted request - generate 3rd token.
# This will have a local and remote cache hit.
output = scheduler.schedule()
if is_async:
waiting_req_ids = [req.request_id for req in scheduler.waiting]
assert len(waiting_req_ids) == 1
_step_until_kv_transfer_finished(scheduler, waiting_req_ids)
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# 1 remote kv_cache hit!
......@@ -1380,6 +1395,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 0
assert output.scheduled_cached_reqs.num_reqs == 1
assert output.scheduled_new_reqs == []
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 0
......@@ -1392,6 +1409,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
num_requests=0,
expected_num_scheduled_tokens=1,
)
assert output.scheduled_cached_reqs.num_reqs == 1
assert output.scheduled_new_reqs == []
assert len(scheduler.running) == 1
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 0
......@@ -1577,7 +1596,13 @@ def create_scheduler_with_priority(
kv_cache_tensors=[],
kv_cache_groups=[
KVCacheGroupSpec(
["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
["layer"],
FullAttentionSpec(
block_size=block_size,
num_kv_heads=1,
head_size=1,
dtype=torch.float32,
),
)
],
)
......@@ -2288,7 +2313,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
# 4th Schedule - this should trigger the resumption
output = scheduler.schedule()
scheduled_cached_reqs = output.scheduled_cached_reqs
resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption
assert len(output.scheduled_new_reqs) == 0
assert scheduled_cached_reqs.num_reqs == 1
......@@ -2296,14 +2320,14 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
assert len(scheduler.running) == 1
# Preempted request resumed in scheduled_cached_reqs
assert len(resumed_from_preemption) == 1
assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
assert resumed_from_preemption[0]
assert len(scheduled_cached_reqs.resumed_req_ids) == 1
assert len(scheduled_cached_reqs.all_token_ids) == 1
assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids
assert request_low.request_id in scheduled_cached_reqs.all_token_ids
# Resumed tokens include 30 prompt tokens and 2 decoded tokens
assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 32
assert scheduled_cached_reqs.resumed_req_token_ids[0][31] == 100
assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 32
assert scheduled_cached_reqs.all_token_ids[request_low.request_id][31] == 100
@pytest.mark.parametrize(
......@@ -3126,7 +3150,6 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
# 4th Schedule - this should trigger req_low resumption from waiting
output = scheduler.schedule()
scheduled_cached_reqs = output.scheduled_cached_reqs
resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption
assert len(output.scheduled_new_reqs) == 0
assert scheduled_cached_reqs.num_reqs == 1
......@@ -3134,14 +3157,14 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
assert len(scheduler.running) == 1
# Preempted request resumed in scheduled_cached_reqs
assert len(resumed_from_preemption) == 1
assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
assert resumed_from_preemption[0]
assert len(scheduled_cached_reqs.resumed_req_ids) == 1
assert len(scheduled_cached_reqs.all_token_ids) == 1
assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids
assert request_low.request_id in scheduled_cached_reqs.all_token_ids
## Resumed tokens include 94 prompt tokens and 2 decoded tokens
assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 96
assert scheduled_cached_reqs.resumed_req_token_ids[0][95] == 100
assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 96
assert scheduled_cached_reqs.all_token_ids[request_low.request_id][95] == 100
assert scheduler.running[0].request_id == request_low.request_id
assert request_high.request_id in output.finished_req_ids
......@@ -3330,3 +3353,28 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
# ==============================================================================
# EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests end
# ==============================================================================
def test_prepend_skipped_requests_order():
    """Skipped waiting requests must keep their relative order in the queue.

    Four requests are queued on a scheduler limited to one running sequence.
    The first two are marked as waiting for remote KVs, so a schedule step is
    expected to skip them, run the third, and leave the fourth waiting.
    """
    scheduler = create_scheduler(max_num_seqs=1, use_kv_connector=True)
    for new_request in create_requests(num_requests=4):
        scheduler.add_request(new_request)

    # Snapshot the waiting queue order before anything is scheduled.
    queued_before = list(scheduler.waiting)

    # Pretend the two requests at the head are blocked on remote KV transfer.
    for blocked in queued_before[:2]:
        blocked.status = RequestStatus.WAITING_FOR_REMOTE_KVS

    # One schedule step: head two skipped, third starts running, fourth waits.
    scheduler.schedule()

    # The third request is expected to be running, so drop it from the
    # expectation; everything else must appear in its original order.
    del queued_before[2]
    assert list(scheduler.waiting) == queued_before
......@@ -21,13 +21,23 @@ from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowS
pytestmark = pytest.mark.cpu_test
def get_sliding_window_manager(sliding_window_spec, block_pool):
    """Build a ``SlidingWindowManager`` bound to KV cache group 0.

    Args:
        sliding_window_spec: Spec forwarded as the manager's first argument.
        block_pool: Block pool forwarded as the manager's second argument.

    Returns:
        The newly constructed ``SlidingWindowManager``.
    """
    manager = SlidingWindowManager(
        sliding_window_spec,
        block_pool,
        kv_cache_group_id=0,
    )
    return manager
def get_sliding_window_manager(sliding_window_spec, block_pool, enable_caching=True):
    """Build a ``SlidingWindowManager`` bound to KV cache group 0.

    Args:
        sliding_window_spec: Spec forwarded as the manager's first argument.
        block_pool: Block pool forwarded as the manager's second argument.
        enable_caching: Forwarded to the manager unchanged; defaults to True.

    Returns:
        The newly constructed ``SlidingWindowManager``.
    """
    # Keyword options are collected first and passed in the same order as
    # before: enable_caching, then kv_cache_group_id.
    manager_options = dict(enable_caching=enable_caching, kv_cache_group_id=0)
    return SlidingWindowManager(sliding_window_spec, block_pool, **manager_options)
def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool):
def get_chunked_local_attention_manager(
chunked_local_attention_spec, block_pool, enable_caching=True
):
return ChunkedLocalAttentionManager(
chunked_local_attention_spec, block_pool, kv_cache_group_id=0
chunked_local_attention_spec,
block_pool,
enable_caching=enable_caching,
kv_cache_group_id=0,
)
......@@ -332,11 +342,53 @@ def test_get_num_blocks_to_allocate():
]
assert (
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
== 20
)
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
== 15
)
def test_evictable_cached_blocks_not_double_allocated():
    """An evictable cached block must not be allocated a second time.

    A block taken straight from the pool is offered to a sliding-window
    manager as an already-computed block for a two-block request. The block
    count reported up front is expected to be 2, while the later call to
    ``allocate_new_blocks`` is expected to hand out a single new block,
    leaving the request with two blocks in total.
    """
    block_size = 2
    spec = SlidingWindowSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=2 * block_size,
    )
    pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_sliding_window_manager(spec, pool)

    request_id = "req"
    # ref_cnt == 0, eviction candidate
    evictable_block = pool.blocks[1]

    required = manager.get_num_blocks_to_allocate(
        request_id=request_id,
        num_tokens=2 * block_size,
        new_computed_blocks=[evictable_block],
        total_computed_tokens=block_size,
    )
    # Free capacity check should count evictable cached blocks, but allocation
    # should only allocate the truly new block.
    assert required == 2

    manager.allocate_new_computed_blocks(
        request_id,
        [evictable_block],
        num_local_computed_tokens=block_size,
        num_external_computed_tokens=0,
    )
    freshly_allocated = manager.allocate_new_blocks(request_id, num_tokens=4)

    assert len(freshly_allocated) == 1
    assert len(manager.req_to_blocks[request_id]) == 2
def test_chunked_local_attention_get_num_blocks_to_allocate():
......@@ -359,8 +411,10 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
]
assert (
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
== 20
)
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
== 15
)
......@@ -142,7 +142,13 @@ def create_scheduler(
kv_cache_tensors=[],
kv_cache_groups=[
KVCacheGroupSpec(
["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
["layer"],
FullAttentionSpec(
block_size=block_size,
num_kv_heads=1,
head_size=1,
dtype=torch.float32,
),
)
],
)
......
......@@ -49,7 +49,10 @@ def _create_vllm_config(
mock_config.lora_config = None
# Mimic the behavior of VllmConfig.__post_init__()
if compilation_config.mode == CompilationMode.VLLM_COMPILE:
compilation_config.set_splitting_ops_for_v1()
compilation_config.set_splitting_ops_for_v1(
all2all_backend=mock_config.parallel_config.all2all_backend,
data_parallel_size=mock_config.parallel_config.data_parallel_size,
)
# mimic VllmConfig.__post_init__
if compilation_config.cudagraph_capture_sizes:
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import os
import weakref
from contextlib import ExitStack
......@@ -13,26 +11,6 @@ from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode
from vllm.platforms import current_platform
@contextlib.contextmanager
def temporary_environ(env_vars):
    """Apply environment variables for the duration of a ``with`` block.

    Every key in ``env_vars`` is set in ``os.environ`` on entry. On exit,
    including when the body raises, each key is put back to the value it
    had before, or removed if it was not set before.

    This is used instead of monkeypatch because monkeypatch doesn't work
    with "module" scoped fixtures.

    Args:
        env_vars: Mapping of variable name to the temporary value.
    """
    # Record the prior value of each key; None marks "was not set".
    saved = {}
    for name in env_vars:
        saved[name] = os.environ.get(name)

    try:
        os.environ.update(env_vars)
        yield
    finally:
        for name, previous in saved.items():
            if previous is not None:
                os.environ[name] = previous
                continue
            # The variable did not exist before, so remove it again.
            os.environ.pop(name, None)
# test attention backend and cudagraph_mode combo
# (backend_name, cudagraph_mode, supported)
if current_platform.is_rocm():
......@@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
env_vars = backend_configs[backend_name].env_vars
attention_config = backend_config.attention_config
with temporary_environ(env_vars), ExitStack() as stack:
with ExitStack() as stack:
if not supported:
stack.enter_context(pytest.raises(Exception))
......@@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
trust_remote_code=True,
gpu_memory_utilization=0.45,
max_model_len=1024,
attention_config=attention_config,
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
),
......@@ -122,9 +101,10 @@ combo_cases_2 = [
def test_cudagraph_compilation_combo(
backend_name, cudagraph_mode, compilation_mode, supported
):
env_vars = backend_configs[backend_name].env_vars
backend_config = backend_configs[backend_name]
attention_config = backend_config.attention_config
with temporary_environ(env_vars), ExitStack() as stack:
with ExitStack() as stack:
if not supported:
stack.enter_context(pytest.raises(Exception))
......@@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo(
trust_remote_code=True,
gpu_memory_utilization=0.45,
max_model_len=1024,
attention_config=attention_config,
compilation_config=CompilationConfig(
mode=compilation_mode, cudagraph_mode=cudagraph_mode
),
......
......@@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
BACKENDS,
)
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
backend, monkeypatch: pytest.MonkeyPatch
backend,
):
"""
Ensures that the same request (the 'needle' prompt) yields identical output
......@@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
attention_config = {"backend": backend}
# Allow overrides from environment (useful for CI tuning)
# "facebook/opt-125m" is too small, doesn't reliably test determinism
model = resolve_model_name(backend)
......@@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
attention_config=attention_config,
)
# Baseline generation for the needle prompt alone.
......@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
attention_config=attention_config,
)
mismatches = 0
......@@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
BACKENDS,
)
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
backend, monkeypatch: pytest.MonkeyPatch
backend,
):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
model_name = resolve_model_name(backend)
......@@ -188,12 +188,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
llm = LLM(
model=model_name,
tensor_parallel_size=tp_size,
# enable_prefix_caching=False,
max_num_seqs=32,
max_model_len=8192,
dtype="bfloat16", # not everything is supported
gpu_memory_utilization=0.9,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
# Use more realistic prompts for better token generation
......@@ -382,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
"backend",
BACKENDS,
)
def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
def test_simple_generation(backend):
"""
Simple test that runs the model with a basic prompt and prints the output.
Useful for quick smoke testing and debugging.
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
model = resolve_model_name(backend)
llm = LLM(
......@@ -399,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
dtype="bfloat16",
enable_prefix_caching=False,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
prompt = "the capital of france is"
......@@ -445,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail(
The test will PASS if we detect differences (proving batch invariance matters).
The test will FAIL if everything matches (suggesting batch invariance isn't needed).
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
# CRITICAL: Disable batch invariance for this test
monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False)
......@@ -466,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
max_model_len=8192,
dtype="bfloat16",
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
......@@ -650,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail(
@skip_unsupported
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
def test_decode_logprobs_match_prefill_logprobs(
backend, monkeypatch: pytest.MonkeyPatch
backend,
):
"""
Test that verifies decode logprobs match prefill logprobs.
......@@ -665,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs(
This ensures that the logprobs from decode are consistent with what
we would get if we ran prefill on each prefix.
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
model_name = resolve_model_name(backend)
......@@ -690,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs(
max_model_len=8192,
dtype="bfloat16",
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
# Use a few test prompts
......@@ -921,6 +919,7 @@ def LLM_with_max_seqs(
max_num_seqs: int,
gpu_memory_utilization: float,
max_model_len: int,
attention_config: dict | None = None,
) -> LLM:
"""
Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
......@@ -935,6 +934,7 @@ def LLM_with_max_seqs(
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
enable_prefix_caching=False,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config=attention_config,
# Enable for MOE models
# enable_expert_parallel=True,
)
......@@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process(
@skip_unsupported
@pytest.mark.parametrize("backend", BACKENDS)
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
backend: str, monkeypatch: pytest.MonkeyPatch
backend: str,
) -> None:
random.seed(int(os.getenv("VLLM_TEST_SEED", "12345")))
# Override backend for this test (and the RemoteOpenAIServer child process).
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
model_name = resolve_model_name(backend)
prompts_all = [_random_prompt(10, 50) for _ in range(32)]
......@@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
server_args: list[str] = [
"--max-model-len=8192",
"--max-num-seqs=32",
f"--attention-backend={backend}",
]
if tp_size:
server_args += ["-tp", tp_size]
......
......@@ -21,7 +21,11 @@ from vllm.model_executor.layers.layernorm import RMSNorm
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("eps", [1e-6, 1e-5])
def test_rms_norm_batch_invariant_vs_standard(
batch_size: int, hidden_size: int, dtype: torch.dtype, eps: float
default_vllm_config,
batch_size: int,
hidden_size: int,
dtype: torch.dtype,
eps: float,
):
"""
Compare batch-invariant Triton RMS norm against standard CUDA implementation.
......@@ -68,7 +72,9 @@ def test_rms_norm_batch_invariant_vs_standard(
@pytest.mark.parametrize("batch_size", [1, 16, 128])
@pytest.mark.parametrize("seq_len", [1, 32, 512])
@pytest.mark.parametrize("hidden_size", [2048, 4096])
def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):
def test_rms_norm_3d_input(
default_vllm_config, batch_size: int, seq_len: int, hidden_size: int
):
"""
Test RMS norm with 3D input tensors (batch, seq_len, hidden_size).
......@@ -107,7 +113,7 @@ def test_rms_norm_3d_input(batch_size: int, seq_len: int, hidden_size: int):
@skip_unsupported
def test_rms_norm_numerical_stability():
def test_rms_norm_numerical_stability(default_vllm_config):
"""
Test RMS norm numerical stability with extreme values.
......@@ -167,7 +173,7 @@ def test_rms_norm_numerical_stability():
@skip_unsupported
def test_rms_norm_formula():
def test_rms_norm_formula(default_vllm_config):
"""
Test that RMS norm follows the correct mathematical formula.
......@@ -201,7 +207,7 @@ def test_rms_norm_formula():
@skip_unsupported
@pytest.mark.parametrize("hidden_size", [128, 1024, 4096, 16384])
def test_rms_norm_different_hidden_sizes(hidden_size: int):
def test_rms_norm_different_hidden_sizes(default_vllm_config, hidden_size: int):
"""
Test RMS norm with various hidden sizes to ensure block size handling.
......@@ -238,7 +244,7 @@ def test_rms_norm_different_hidden_sizes(hidden_size: int):
@skip_unsupported
def test_rms_norm_determinism():
def test_rms_norm_determinism(default_vllm_config):
"""
Test that batch-invariant RMS norm produces deterministic results.
......
......@@ -6,9 +6,9 @@ import random
import pytest
import torch
from vllm.attention.utils.fa_utils import flash_attn_supports_mla
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer
from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
skip_unsupported = pytest.mark.skipif(
not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
......
......@@ -16,7 +16,12 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
@pytest.mark.asyncio
async def test_run_eagle_dp():
async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch):
# This test checks that running a model with and without eagle
# leads to identical tokens. This is only true in batch invariant mode
# (because the target model verifies all draft tokens in one big forward pass)
monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1")
target_model = "meta-llama/Llama-3.1-8B-Instruct"
draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
......@@ -29,6 +34,7 @@ async def test_run_eagle_dp():
data_parallel_backend="mp", # ray takes more time
trust_remote_code=True,
max_model_len=16384,
attention_config={"backend": "FLASH_ATTN"},
)
eagle_engine_args = replace(
......@@ -41,9 +47,10 @@ async def test_run_eagle_dp():
)
prompt = "This is a test of data parallel with eagle"
num_expected_tokens = 100
# This test might be flaky, see
# https://github.com/vllm-project/vllm/issues/31913
num_expected_tokens = 20
sampling_params = SamplingParams(
min_tokens=num_expected_tokens,
max_tokens=num_expected_tokens,
ignore_eos=True,
output_kind=RequestOutputKind.FINAL_ONLY,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment