# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility functions for attention-related v1 tests."""

import types
from dataclasses import dataclass

import pytest
import torch

from vllm.config import (
    CacheConfig,
    CompilationConfig,
    DeviceConfig,
    LoadConfig,
    ModelConfig,
    ParallelConfig,
    SchedulerConfig,
    VllmConfig,
)
from vllm.config.model import ModelDType
from vllm.v1.attention.backend import (
    AttentionImpl,
    AttentionMetadataBuilder,
    AttentionType,
    CommonAttentionMetadata,
)
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.kv_cache_interface import EncoderOnlyAttentionSpec, FullAttentionSpec


@dataclass
class BatchSpec:
    """Specification for a batch configuration (workload shape only).

    ``seq_lens`` and ``query_lens`` are index-aligned: entry ``i`` of each
    list describes the same request, so both lists must have equal length.
    """

    # Per-request sequence length.
    seq_lens: list[int]
    # Per-request number of query tokens.
    query_lens: list[int]

    name: str = "unnamed"

    @property
    def batch_size(self) -> int:
        """Number of requests in the batch."""
        return len(self.seq_lens)

    def __post_init__(self) -> None:
        """Check that the two per-request lists are index-aligned."""
        assert len(self.seq_lens) == len(self.query_lens), (
            "seq_lens and query_lens must have the same length, got "
            f"{len(self.seq_lens)} and {len(self.query_lens)}"
        )

    def compute_num_tokens(self) -> int:
        """Return the total number of query tokens across the batch."""
        return sum(self.query_lens)


def create_common_attn_metadata(
    batch_spec: BatchSpec,
    block_size: int,
    device: torch.device,
    max_block_idx: int = 1000,
    arange_block_indices: bool = False,
) -> CommonAttentionMetadata:
    """Create CommonAttentionMetadata for the workload described by a BatchSpec.

    Args:
        batch_spec: Per-request sequence and query lengths.
        block_size: Tokens per block; used to size the block table so that it
            can hold the longest sequence in the batch.
        device: Device on which the device-side tensors are created.
        max_block_idx: Exclusive upper bound for the randomly drawn block
            table and slot mapping entries. Unused when
            ``arange_block_indices`` is True.
        arange_block_indices: If True, fill the block table and the slot
            mapping with consecutive indices instead of random ones.

    Returns:
        A CommonAttentionMetadata with ``causal=True`` describing the batch.
    """
    num_reqs = batch_spec.batch_size
    num_tokens = batch_spec.compute_num_tokens()

    # Query start locations: a leading 0 followed by the running sum of the
    # per-request query lengths.
    query_start_loc = torch.zeros(num_reqs + 1, dtype=torch.int32, device=device)
    query_start_loc[1:] = torch.tensor(
        batch_spec.query_lens, dtype=torch.int32, device=device
    ).cumsum(0)
    query_start_loc_cpu = query_start_loc.cpu()

    # Sequence lengths, on the device and mirrored on the CPU.
    seq_lens = torch.tensor(batch_spec.seq_lens, dtype=torch.int32, device=device)
    seq_lens_cpu = seq_lens.cpu()
    max_seq_len = int(seq_lens_cpu.max())

    # Computed tokens (context length of each sequence): everything in the
    # sequence that is not part of the current query.
    context_lens = [
        seq_len - query_len
        for seq_len, query_len in zip(batch_spec.seq_lens, batch_spec.query_lens)
    ]
    num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)

    # Block table and slot mapping. The table has one row per request and
    # enough columns (ceil division) for the longest sequence.
    max_blocks = (max(batch_spec.seq_lens) + block_size - 1) // block_size
    if arange_block_indices:
        num_blocks = num_reqs * max_blocks
        block_table_tensor = torch.arange(
            num_blocks, dtype=torch.int32, device=device
        ).view(num_reqs, max_blocks)
        slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device)
    else:
        # Keep the draw order (block table first, then slot mapping) so that
        # results stay reproducible under a fixed seed.
        block_table_tensor = torch.randint(
            0,
            max_block_idx,
            (num_reqs, max_blocks),
            dtype=torch.int32,
            device=device,
        )
        slot_mapping = torch.randint(
            0, max_block_idx, (num_tokens,), dtype=torch.int64, device=device
        )

    max_query_len = max(batch_spec.query_lens)

    return CommonAttentionMetadata(
        query_start_loc=query_start_loc,
        query_start_loc_cpu=query_start_loc_cpu,
        seq_lens=seq_lens,
        _seq_lens_cpu=seq_lens_cpu,
        _num_computed_tokens_cpu=num_computed_tokens_cpu,
        num_reqs=num_reqs,
        num_actual_tokens=num_tokens,
        max_query_len=max_query_len,
        max_seq_len=max_seq_len,
        block_table_tensor=block_table_tensor,
        slot_mapping=slot_mapping,
        causal=True,
    )


122
def try_get_attention_backend(
    backend: AttentionBackendEnum,
) -> tuple[type[AttentionMetadataBuilder], type[AttentionImpl]]:
    """Return the backend's builder and impl classes, or skip if unavailable.

    Args:
        backend: The attention backend to resolve.

    Returns:
        A ``(builder_cls, impl_cls)`` pair obtained from the backend class's
        ``get_builder_cls()`` and ``get_impl_cls()``.

    If resolving the backend raises ImportError, the calling test is skipped
    via ``pytest.skip`` instead of failing.
    """
    try:
        backend_class = backend.get_class()
        return backend_class.get_builder_cls(), backend_class.get_impl_cls()
    except ImportError as e:
        pytest.skip(f"{backend.name} not available: {e}")
        # pytest.skip() raises, so this line never runs; it only makes it
        # explicit that this path does not fall through and return None.
        raise AssertionError("unreachable") from None


134
135
136
137
138
139
140
141
142
143
144
145
def try_backend_includes_kv_cache_update(
    backend: AttentionBackendEnum,
) -> bool:
    """Return the backend's ``forward_includes_kv_cache_update`` flag.

    Args:
        backend: The attention backend to query.

    Returns:
        The value of ``forward_includes_kv_cache_update`` on the backend
        class.

    If resolving the backend raises ImportError, the calling test is skipped
    via ``pytest.skip`` instead of failing.
    """
    try:
        backend_class = backend.get_class()
        return backend_class.forward_includes_kv_cache_update
    except ImportError as e:
        pytest.skip(f"{backend.name} not available: {e}")
        # pytest.skip() raises, so this line never runs; it only makes it
        # explicit that this path does not fall through and return None.
        raise AssertionError("unreachable") from None


146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def create_standard_kv_cache_spec(
    vllm_config: VllmConfig,
    attn_type: AttentionType = AttentionType.DECODER,
) -> FullAttentionSpec | EncoderOnlyAttentionSpec:
    """Create an AttentionSpec from VllmConfig.

    Args:
        vllm_config: Source of the block size, KV head count, head size,
            dtype and (for full attention) sliding window.
        attn_type: Attention type the spec is created for.

    Returns:
        An EncoderOnlyAttentionSpec for encoder-only attention (no KV
        cache), and a FullAttentionSpec otherwise.
    """
    model_config = vllm_config.model_config

    # Values shared by both spec types.
    block_size = vllm_config.cache_config.block_size
    num_kv_heads = model_config.get_num_kv_heads(vllm_config.parallel_config)
    head_size = model_config.get_head_size()
    dtype = model_config.dtype

    if attn_type == AttentionType.ENCODER_ONLY:
        return EncoderOnlyAttentionSpec(
            block_size=block_size,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            dtype=dtype,
        )

    # The sliding window is only queried and passed for the full spec.
    return FullAttentionSpec(
        block_size=block_size,
        num_kv_heads=num_kv_heads,
        head_size=head_size,
        dtype=dtype,
        sliding_window=model_config.get_sliding_window(),
    )


175
176
177
178
def create_vllm_config(
    model_name: str = "meta-llama/Meta-Llama-3-8B",
    tensor_parallel_size: int = 1,
    max_model_len: int = 1024,
    dtype: ModelDType | torch.dtype = "auto",
    num_gpu_blocks: int = 1000,
    block_size: int = 16,
    max_num_seqs: int = 256,
    max_num_batched_tokens: int = 8192,
    enable_chunked_prefill: bool = True,
    add_mock_model_methods: bool = True,
    hf_config_override: dict | None = None,
) -> VllmConfig:
    """Create a VllmConfig for testing with reasonable defaults.

    Args:
        model_name: Used as both the model and the tokenizer name.
        tensor_parallel_size: Forwarded to ParallelConfig.
        max_model_len: Forwarded to ModelConfig; the scheduler then uses the
            resulting ``model_config.max_model_len``.
        dtype: Forwarded to ModelConfig.
        num_gpu_blocks: Assigned to ``cache_config.num_gpu_blocks`` after
            construction (``num_cpu_blocks`` is set to 0).
        block_size: Forwarded to CacheConfig.
        max_num_seqs: Forwarded to SchedulerConfig.
        max_num_batched_tokens: Forwarded to SchedulerConfig.
        enable_chunked_prefill: Forwarded to SchedulerConfig.
        add_mock_model_methods: If True, attach stub per-layer query methods
            to the model config (see the comment in the body).
        hf_config_override: If non-empty, merged into
            ``model_config.hf_config`` via ``update()``.

    Returns:
        A VllmConfig assembled from the individual configs.
    """
    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        trust_remote_code=False,
        dtype=dtype,
        seed=0,
        max_model_len=max_model_len,
    )

    cache_config = CacheConfig(
        block_size=block_size,
        cache_dtype="auto",
    )
    # Set cache blocks for testing
    #   (these may be set during initialization normally)
    cache_config.num_gpu_blocks = num_gpu_blocks
    cache_config.num_cpu_blocks = 0

    parallel_config = ParallelConfig(
        tensor_parallel_size=tensor_parallel_size,
    )

    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        enable_chunked_prefill=enable_chunked_prefill,
        max_model_len=model_config.max_model_len,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )

    device_config = DeviceConfig()
    load_config = LoadConfig()
    compilation_config = CompilationConfig()

    if add_mock_model_methods:
        # Add mock methods to satisfy backends that need them.
        # This is a workaround because tests don't build full, real models,
        # but some backends expect to query the model for layer-specific
        # parameters.
        mock_methods = {
            "get_num_layers": lambda self: 1,
            "get_sliding_window_for_layer": lambda self, i: None,
            "get_logits_soft_cap_for_layer": lambda self, i: 0.0,
            # Use the bound instance rather than closing over the local
            # variable, so the stub follows whichever object it is bound to.
            "get_sm_scale_for_layer": (
                lambda self, i: 1.0 / self.get_head_size() ** 0.5
            ),
        }
        for method_name, func in mock_methods.items():
            setattr(model_config, method_name, types.MethodType(func, model_config))

    if hf_config_override:
        model_config.hf_config.update(hf_config_override)

    return VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        device_config=device_config,
        load_config=load_config,
        compilation_config=compilation_config,
    )


256
257
258
259
260
261
262
263
def create_dummy_kv_cache(
    block_size: int,
    num_kv_heads: int,
    head_size: int,
    dtype: torch.dtype,
    device: torch.device,
    num_blocks: int = 100,
) -> torch.Tensor:
    """Create a dummy KV cache tensor for testing.

    Args:
        block_size: Size of the third dimension of the cache.
        num_kv_heads: Size of the fourth dimension of the cache.
        head_size: Size of the last dimension of the cache.
        dtype: Data type of the returned tensor.
        device: Device on which the tensor is created.
        num_blocks: Size of the leading dimension of the cache.

    Returns:
        A tensor of shape
        ``(num_blocks, 2, block_size, num_kv_heads, head_size)`` filled with
        samples from ``torch.randn``.
    """
    # The dimension of size 2 holds K and V.
    cache_shape = (num_blocks, 2, block_size, num_kv_heads, head_size)
    return torch.randn(cache_shape, dtype=dtype, device=device)


@dataclass
class BackendConfig:
    name: str
280
281
    attention_config: dict
    comp_config: dict
282
    specific_gpu_arch: tuple | None = None
283
284
285
286
287


# Define all backend configurations of full cudagraph to be tested.
# Each entry is keyed by the same string as its BackendConfig.name.
full_cg_backend_configs = {
    # FA3 on Hopper
    "FA3": BackendConfig(
        name="FA3",
        attention_config={
            "backend": "FLASH_ATTN",
            "flash_attn_version": 3,
            "flash_attn_max_num_splits_for_cuda_graph": 16,
        },
        comp_config={
            "cudagraph_mode": "FULL",
        },
        specific_gpu_arch=(9, 0),
    ),
    # FlashMLA on Hopper
    "FlashMLA": BackendConfig(
        name="FlashMLA",
        attention_config={"backend": "FLASHMLA"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
        specific_gpu_arch=(9, 0),
    ),
    # Cutlass MLA on Blackwell
    "CutlassMLA": BackendConfig(
        name="CutlassMLA",
        attention_config={"backend": "CUTLASS_MLA"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
        specific_gpu_arch=(10, 0),
    ),
    # FlashInfer MLA on Blackwell
    "FlashInferMLA": BackendConfig(
        name="FlashInferMLA",
        attention_config={"backend": "FLASHINFER_MLA"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
        specific_gpu_arch=(10, 0),
    ),
    # FlashAttention MLA on Hopper
    "FlashAttentionMLA": BackendConfig(
        name="FlashAttentionMLA",
        attention_config={
            "backend": "FLASH_ATTN_MLA",
            "flash_attn_max_num_splits_for_cuda_graph": 16,
        },
        comp_config={
            "cudagraph_mode": "FULL_DECODE_ONLY",
        },
        specific_gpu_arch=(9, 0),
    ),
    # FA2
    "FA2": BackendConfig(
        name="FA2",
        attention_config={
            "backend": "FLASH_ATTN",
            "flash_attn_version": 2,
            "flash_attn_max_num_splits_for_cuda_graph": 16,
        },
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
    ),
    # Triton Attention
    "TritonAttn": BackendConfig(
        name="TritonAttn",
        attention_config={"backend": "TRITON_ATTN"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
    ),
    # FlashInfer
    "FlashInfer": BackendConfig(
        name="FlashInfer",
        attention_config={"backend": "FLASHINFER"},
        comp_config={
            "cudagraph_mode": "FULL_AND_PIECEWISE",
        },
    ),
    # ROCm Attention
    "RocmAttn": BackendConfig(
        name="RocmAttn",
        attention_config={
            "backend": "ROCM_ATTN",
            "use_prefill_decode_attention": True,
        },
        comp_config={
            "cudagraph_mode": "FULL",
        },
    ),
}