# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility functions for attention-related v1 tests."""

from dataclasses import dataclass

import pytest
import torch

10
from vllm.attention.backends.abstract import AttentionImpl
11
from vllm.attention.backends.registry import AttentionBackendEnum
12
13
14
15
16
17
18
from vllm.config import (
    CacheConfig,
    CompilationConfig,
    DeviceConfig,
    LoadConfig,
    ModelConfig,
    ParallelConfig,
19
    RendererConfig,
20
21
22
    SchedulerConfig,
    VllmConfig,
)
23
from vllm.config.model import ModelDType
24
25
26
27
from vllm.v1.attention.backends.utils import (
    AttentionMetadataBuilder,
    CommonAttentionMetadata,
)
28
29
30
31
32
33
from vllm.v1.kv_cache_interface import FullAttentionSpec


@dataclass
class BatchSpec:
    """Specification for a batch configuration (workload shape only).

    Each request in the batch is described by one entry in ``seq_lens`` and
    the entry at the same index in ``query_lens``; the two lists must have
    the same length.
    """

    # Per-request sequence length. create_common_attn_metadata treats
    # seq_lens[i] - query_lens[i] as the already-computed context length.
    seq_lens: list[int]
    # Per-request number of query tokens in this batch.
    query_lens: list[int]

    # Human-readable label for the batch.
    name: str = "unnamed"

    @property
    def batch_size(self) -> int:
        """Number of requests in the batch."""
        return len(self.seq_lens)

    def __post_init__(self) -> None:
        """Check that both per-request lists describe the same requests."""
        assert len(self.seq_lens) == len(self.query_lens), (
            f"seq_lens has {len(self.seq_lens)} entries but query_lens has "
            f"{len(self.query_lens)}"
        )

    def compute_num_tokens(self) -> int:
        """Return the total number of query tokens across all requests."""
        return sum(self.query_lens)


def create_common_attn_metadata(
    batch_spec: BatchSpec,
    block_size: int,
    device: torch.device,
    max_block_idx: int = 1000,
    arange_block_indices: bool = False,
) -> CommonAttentionMetadata:
    """Create CommonAttentionMetadata describing the given BatchSpec.

    Args:
        batch_spec: Per-request sequence and query lengths. Must contain at
            least one request, because the maximum lengths are taken.
        block_size: Number of tokens per KV-cache block; used to size the
            block table (ceil(max seq len / block_size) columns).
        device: Device on which the device-side tensors are created.
        max_block_idx: Exclusive upper bound for the random block-table and
            slot-mapping entries. Only used when ``arange_block_indices`` is
            False.
        arange_block_indices: If True, fill the block table with consecutive
            indices ``0 .. batch_size * max_blocks - 1`` and the slot mapping
            with ``0 .. num_tokens - 1`` instead of random values.

    Returns:
        A CommonAttentionMetadata with ``causal=True``.
    """
    num_reqs = batch_spec.batch_size
    num_tokens = batch_spec.compute_num_tokens()

    # Query start locations: a leading 0 followed by the running sum of the
    # per-request query lengths (num_reqs + 1 entries in total).
    query_start_loc = torch.zeros(num_reqs + 1, dtype=torch.int32, device=device)
    query_start_loc[1:] = torch.tensor(
        batch_spec.query_lens, dtype=torch.int32, device=device
    ).cumsum(0)
    query_start_loc_cpu = query_start_loc.cpu()

    # Sequence lengths, on the target device and as a CPU copy.
    seq_lens = torch.tensor(batch_spec.seq_lens, dtype=torch.int32, device=device)
    seq_lens_cpu = seq_lens.cpu()
    max_seq_len = int(seq_lens_cpu.max())

    # Computed tokens (context length) per request: everything in the
    # sequence that is not part of this batch's query.
    context_lens = [
        seq_len - query_len
        for seq_len, query_len in zip(batch_spec.seq_lens, batch_spec.query_lens)
    ]
    num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)

    # Block table and slot mapping. Every row of the block table is wide
    # enough to hold the longest sequence.
    max_blocks = (max(batch_spec.seq_lens) + block_size - 1) // block_size
    if arange_block_indices:
        block_table_tensor = torch.arange(
            num_reqs * max_blocks, dtype=torch.int32, device=device
        ).view(num_reqs, max_blocks)
        slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device)
    else:
        # The block table is drawn before the slot mapping so the RNG stream
        # is consumed in a fixed order. Both are sampled from
        # [0, max_block_idx).
        block_table_tensor = torch.randint(
            0,
            max_block_idx,
            (num_reqs, max_blocks),
            dtype=torch.int32,
            device=device,
        )
        slot_mapping = torch.randint(
            0, max_block_idx, (num_tokens,), dtype=torch.int64, device=device
        )

    max_query_len = max(batch_spec.query_lens)

    return CommonAttentionMetadata(
        query_start_loc=query_start_loc,
        query_start_loc_cpu=query_start_loc_cpu,
        seq_lens=seq_lens,
        seq_lens_cpu=seq_lens_cpu,
        num_computed_tokens_cpu=num_computed_tokens_cpu,
        num_reqs=num_reqs,
        num_actual_tokens=num_tokens,
        max_query_len=max_query_len,
        max_seq_len=max_seq_len,
        block_table_tensor=block_table_tensor,
        slot_mapping=slot_mapping,
        causal=True,
    )


122
def try_get_attention_backend(
    backend: AttentionBackendEnum,
) -> tuple[type[AttentionMetadataBuilder], type[AttentionImpl]]:
    """Try to get the attention backend classes, skipping the test if not found.

    Args:
        backend: The backend enum member to resolve.

    Returns:
        A ``(builder_cls, impl_cls)`` pair obtained from the backend class.

    An ImportError raised while resolving the backend is turned into a pytest
    skip whose message includes the backend name and the error.
    """
    try:
        backend_class = backend.get_class()
        return backend_class.get_builder_cls(), backend_class.get_impl_cls()
    except ImportError as e:
        pytest.skip(f"{backend.name} not available: {e}")
        # pytest.skip() raises, so control never gets here; the explicit raise
        # makes it clear that no code path falls through without returning.
        raise AssertionError("unreachable") from None
132
133


134
def create_standard_kv_cache_spec(vllm_config: VllmConfig) -> FullAttentionSpec:
    """Create a FullAttentionSpec from the given VllmConfig.

    The block size comes from ``vllm_config.cache_config``; the number of KV
    heads (queried with the parallel config), head size, dtype and sliding
    window come from ``vllm_config.model_config``.
    """
    model_config = vllm_config.model_config
    return FullAttentionSpec(
        block_size=vllm_config.cache_config.block_size,
        num_kv_heads=model_config.get_num_kv_heads(vllm_config.parallel_config),
        head_size=model_config.get_head_size(),
        dtype=model_config.dtype,
        sliding_window=model_config.get_sliding_window(),
    )


147
148
149
150
def create_vllm_config(
    model_name: str = "meta-llama/Meta-Llama-3-8B",
    tensor_parallel_size: int = 1,
    max_model_len: int = 1024,
    dtype: ModelDType | torch.dtype = "auto",
    num_gpu_blocks: int = 1000,
    block_size: int = 16,
    max_num_seqs: int = 256,
    max_num_batched_tokens: int = 8192,
    enable_chunked_prefill: bool = True,
    add_mock_model_methods: bool = True,
    hf_config_override: dict | None = None,
) -> VllmConfig:
    """Create a VllmConfig for testing with reasonable defaults.

    Args:
        model_name: Model identifier, used for both the model and tokenizer.
        tensor_parallel_size: Forwarded to ParallelConfig.
        max_model_len: Forwarded to ModelConfig.
        dtype: Forwarded to ModelConfig.
        num_gpu_blocks: Value assigned to ``cache_config.num_gpu_blocks``.
        block_size: KV-cache block size passed to CacheConfig.
        max_num_seqs: Forwarded to SchedulerConfig.
        max_num_batched_tokens: Forwarded to SchedulerConfig.
        enable_chunked_prefill: Forwarded to SchedulerConfig.
        add_mock_model_methods: If True, attach stub per-layer query methods
            to the model config (see the comment in the body).
        hf_config_override: Optional mapping passed to
            ``model_config.hf_config.update``; skipped when None or empty.

    Returns:
        A VllmConfig assembled from the sub-configs created here.
    """
    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        trust_remote_code=False,
        dtype=dtype,
        seed=0,
        max_model_len=max_model_len,
    )

    cache_config = CacheConfig(
        block_size=block_size,
        cache_dtype="auto",
        swap_space=0,
    )
    # Set cache blocks for testing
    #   (these may be set during initialization normally)
    cache_config.num_gpu_blocks = num_gpu_blocks
    cache_config.num_cpu_blocks = 0

    parallel_config = ParallelConfig(
        tensor_parallel_size=tensor_parallel_size,
    )

    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        enable_chunked_prefill=enable_chunked_prefill,
        max_model_len=model_config.max_model_len,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )

    device_config = DeviceConfig()
    load_config = LoadConfig()
    compilation_config = CompilationConfig()

    if add_mock_model_methods:
        # Add mock methods to satisfy backends that need them.
        # This is a workaround because tests don't build full, real models,
        # but some backends expect to query the model for layer-specific
        # parameters.
        import types

        def bind(func):
            """Bind ``func`` as a method of this particular model_config."""
            return types.MethodType(func, model_config)

        # A single layer, no sliding window, zero soft cap, and the
        # 1/sqrt(head_size) softmax scale for every layer index.
        model_config.get_num_layers = bind(lambda self: 1)
        model_config.get_sliding_window_for_layer = bind(lambda self, i: None)
        model_config.get_logits_soft_cap_for_layer = bind(lambda self, i: 0.0)
        model_config.get_sm_scale_for_layer = bind(
            lambda self, i: 1.0 / self.get_head_size() ** 0.5
        )

    if hf_config_override:
        model_config.hf_config.update(hf_config_override)

    return VllmConfig(
        model_config=model_config,
        renderer_config=RendererConfig(model_config=model_config),
        cache_config=cache_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        device_config=device_config,
        load_config=load_config,
        compilation_config=compilation_config,
    )


230
231
232
233
234
235
236
237
def create_dummy_kv_cache(
    block_size: int,
    num_kv_heads: int,
    head_size: int,
    dtype: torch.dtype,
    device: torch.device,
    num_blocks: int = 100,
) -> torch.Tensor:
    """Create a dummy KV cache tensor for testing.

    Args:
        block_size: Number of token slots per block.
        num_kv_heads: Number of KV heads.
        head_size: Size of each head.
        dtype: Element type of the cache tensor.
        device: Device on which the tensor is allocated.
        num_blocks: Number of blocks in the cache.

    Returns:
        A tensor of shape
        ``(num_blocks, 2, block_size, num_kv_heads, head_size)`` filled with
        samples from ``torch.randn``; the size-2 dimension holds K and V.
    """
    return torch.randn(
        num_blocks,
        2,  # K and V
        block_size,
        num_kv_heads,
        head_size,
        dtype=dtype,
        device=device,
    )
249
250
251
252
253
254
255


@dataclass
class BackendConfig:
    name: str
    env_vars: dict
    comp_config: dict  # compilation config
256
    specific_gpu_arch: tuple | None = None
257
258
259
260
261


# Define all backend configurations of full cudagraph to be tested.
# Keys match each entry's ``name``; entries without ``specific_gpu_arch`` are
# not tied to a particular GPU architecture.
full_cg_backend_configs: dict[str, BackendConfig] = {
    # FA3 on Hopper
    "FA3": BackendConfig(
        name="FA3",
        env_vars={
            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
            "VLLM_FLASH_ATTN_VERSION": "3",
            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
        },
        comp_config={
            "cudagraph_mode": "FULL",
        },
        specific_gpu_arch=(9, 0),
    ),
    # FlashMLA on Hopper
    "FlashMLA": BackendConfig(
        name="FlashMLA",
        env_vars={
            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
        },
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
        specific_gpu_arch=(9, 0),
    ),
    # Cutlass MLA on Blackwell
    "CutlassMLA": BackendConfig(
        name="CutlassMLA",
        env_vars={
            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
        },
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
        specific_gpu_arch=(10, 0),
    ),
    # FlashInfer MLA on Blackwell
    "FlashInferMLA": BackendConfig(
        name="FlashInferMLA",
        env_vars={
            "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
        },
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
        specific_gpu_arch=(10, 0),
    ),
    # FlashAttention MLA on Hopper
    "FlashAttentionMLA": BackendConfig(
        name="FlashAttentionMLA",
        env_vars={
            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
        },
        comp_config={
            "cudagraph_mode": "FULL_DECODE_ONLY",
        },
        specific_gpu_arch=(9, 0),
    ),
    # FA2
    "FA2": BackendConfig(
        name="FA2",
        env_vars={
            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
            "VLLM_FLASH_ATTN_VERSION": "2",
            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
        },
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
    ),
    # Triton Attention
    "TritonAttn": BackendConfig(
        name="TritonAttn",
        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
    ),
    # FlashInfer
    "FlashInfer": BackendConfig(
        name="FlashInfer",
        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
    ),
    # ROCm attention (configured through VLLM_V1_USE_PREFILL_DECODE_ATTENTION
    # rather than VLLM_ATTENTION_BACKEND)
    "RocmAttn": BackendConfig(
        name="RocmAttn",
        env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
        comp_config={
            "cudagraph_mode": "FULL",
        },
    ),
}