abstract.py 9 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
from abc import ABC, abstractmethod
4
from contextlib import contextmanager
5
from dataclasses import dataclass, fields
6
7
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional,
                    Protocol, Set, Tuple, Type, TypeVar)
8
9
10

import torch

11
12
from vllm.multimodal import MultiModalPlaceholderMap

13
if TYPE_CHECKING:
14
15
16
    from vllm.worker.model_runner_base import (ModelRunnerBase,
                                               ModelRunnerInputBase,
                                               ModelRunnerInputBuilderBase)
17

18

19
20
21
22
23
24
25
26
27
28
29
30
31
class AttentionType:
    """
    Attention type.
    Use string to be compatible with `torch.compile`.
    """
    # Decoder attention between previous layer Q/K/V
    DECODER = "decoder"
    # Encoder attention between previous layer Q/K/V for encoder-decoder
    ENCODER = "encoder"
    # Encoder attention between previous layer Q/K/V
    ENCODER_ONLY = "encoder_only"
    # Attention between dec. Q and enc. K/V for encoder-decoder
    ENCODER_DECODER = "encoder_decoder"
32
33


34
35
class AttentionBackend(ABC):
    """Abstract class for attention backends."""
36
37
38
39
    # For some attention backends, we allocate an output tensor before
    # calling the custom op. When piecewise cudagraph is enabled, this
    # makes sure the output tensor is allocated inside the cudagraph.
    accept_output_buffer: bool = False
40

41
42
43
44
45
    @staticmethod
    @abstractmethod
    def get_name() -> str:
        raise NotImplementedError

46
47
48
49
50
51
52
    @staticmethod
    @abstractmethod
    def get_impl_cls() -> Type["AttentionImpl"]:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
53
    def get_metadata_cls() -> Type["AttentionMetadata"]:
54
55
        raise NotImplementedError

56
57
58
59
60
    @staticmethod
    @abstractmethod
    def get_state_cls() -> Type["AttentionState"]:
        raise NotImplementedError

61
62
63
64
    @classmethod
    def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
        return cls.get_metadata_cls()(*args, **kwargs)

65
66
67
68
69
    @staticmethod
    @abstractmethod
    def get_builder_cls() -> Type["AttentionMetadataBuilder"]:
        raise NotImplementedError

70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
    @staticmethod
    @abstractmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def swap_blocks(
        src_kv_cache: torch.Tensor,
        dst_kv_cache: torch.Tensor,
85
        src_to_dst: torch.Tensor,
86
87
88
89
90
91
92
    ) -> None:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def copy_blocks(
        kv_caches: List[torch.Tensor],
93
        src_to_dists: torch.Tensor,
94
95
96
    ) -> None:
        raise NotImplementedError

97
98
99
    def advance_step(self, model_input: "ModelRunnerInputBase",
                     sampled_token_ids: Optional[torch.Tensor],
                     block_size: int, num_seqs: int, num_queries: int) -> None:
100
101
        raise NotImplementedError

102
103

@dataclass
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class AttentionMetadata:
    """Attention metadata for prefill and decode batched together."""
    # Total number of prefill requests.
    num_prefills: int
    # Number of prefill tokens.
    num_prefill_tokens: int
    # Number of decode tokens. Note that it is equivalent to the number of
    # decode requests.
    num_decode_tokens: int
    # (num_tokens,). The indices of the token slots that input tokens will be
    # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size
    # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot
    # in block 0, and 1st slot in block 1, respectively.
    slot_mapping: torch.Tensor

119
120
121
122
123
124
125
126
127
    # The index maps that relate multi-modal embeddings to the corresponding
    # placeholders.
    #
    # N.B. These aren't really related to attention and don't belong on this
    # type -- this is just a temporary solution to make them available to
    # `model_executable`.
    multi_modal_placeholder_index_maps: Optional[Dict[
        str, MultiModalPlaceholderMap.IndexMap]]

128
129
130
131
    # Enable/disable KV scales calculation. This is so that we can disable the
    # calculation until after prefill and cuda graph capture.
    enable_kv_scales_calculation: bool

132
133
134
135
136
137
138
139
140
141
142
143
144
    @property
    @abstractmethod
    def prefill_metadata(self) -> Optional["AttentionMetadata"]:
        """Return the attention metadata that's required to run prefill
        attention."""
        pass

    @property
    @abstractmethod
    def decode_metadata(self) -> Optional["AttentionMetadata"]:
        """Return the attention metadata that's required to run decode
        attention."""
        pass
145

146
147
148
    def asdict_zerocopy(self,
                        skip_fields: Optional[Set[str]] = None
                        ) -> Dict[str, Any]:
149
        """Similar to dataclasses.asdict, but avoids deepcopying."""
150
151
        if skip_fields is None:
            skip_fields = set()
152
153
154
155
        # Note that if we add dataclasses as fields, they will need
        # similar handling.
        return {
            field.name: getattr(self, field.name)
156
            for field in fields(self) if field.name not in skip_fields
157
158
159
        }


160
T = TypeVar("T", bound=AttentionMetadata)
161
162


163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
class AttentionState(ABC, Generic[T]):
    """Holds attention backend-specific objects reused during the
    lifetime of the model runner."""

    @abstractmethod
    def __init__(self, runner: "ModelRunnerBase"):
        ...

    @abstractmethod
    @contextmanager
    def graph_capture(self, max_batch_size: int):
        """Context manager used when capturing CUDA graphs."""
        yield

    @abstractmethod
    def graph_clone(self, batch_size: int) -> "AttentionState[T]":
        """Clone attention state to save in CUDA graph metadata."""
        ...

    @abstractmethod
183
184
185
186
    def graph_capture_get_metadata_for_batch(
            self,
            batch_size: int,
            is_encoder_decoder_model: bool = False) -> T:
187
188
189
190
        """Get attention metadata for CUDA graph capture of batch_size."""
        ...

    @abstractmethod
191
192
193
194
    def get_graph_input_buffers(
            self,
            attn_metadata: T,
            is_encoder_decoder_model: bool = False) -> Dict[str, Any]:
195
196
197
198
        """Get attention-specific input buffers for CUDA graph capture."""
        ...

    @abstractmethod
199
200
201
202
203
    def prepare_graph_input_buffers(
            self,
            input_buffers: Dict[str, Any],
            attn_metadata: T,
            is_encoder_decoder_model: bool = False) -> None:
204
205
206
207
208
209
210
211
212
        """In-place modify input buffers dict for CUDA graph replay."""
        ...

    @abstractmethod
    def begin_forward(self, model_input: "ModelRunnerInputBase") -> None:
        """Prepare state for forward pass."""
        ...


213
214
215
216
class AttentionMetadataBuilder(ABC, Generic[T]):
    """Abstract class for attention metadata builders."""

    @abstractmethod
217
    def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None:
218
219
220
221
222
223
        """Create the builder, remember some configuration and parameters."""
        raise NotImplementedError

    @abstractmethod
    def prepare(self) -> None:
        """Prepare for one batch."""
224
225
226
        raise NotImplementedError

    @abstractmethod
227
228
    def build(self, seq_lens: List[int], query_lens: List[int],
              cuda_graph_pad_size: int, batch_size: int) -> T:
229
230
231
232
        """Build attention metadata with on-device tensors."""
        raise NotImplementedError


233
234
class AttentionLayer(Protocol):

235
    _q_scale: torch.Tensor
236
237
238
239
    _k_scale: torch.Tensor
    _v_scale: torch.Tensor
    _k_scale_float: float
    _v_scale_float: float
240
241
242
243
244
245
246
247
248
249
250
251

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        ...


252
class AttentionImpl(ABC, Generic[T]):
253
254
255
256
257
258
259
260
261
262

    @abstractmethod
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: Optional[int] = None,
        alibi_slopes: Optional[List[float]] = None,
        sliding_window: Optional[int] = None,
263
        kv_cache_dtype: str = "auto",
264
        blocksparse_params: Optional[Dict[str, Any]] = None,
265
        logits_soft_cap: Optional[float] = None,
266
        attn_type: str = AttentionType.DECODER,
267
268
269
270
271
272
    ) -> None:
        raise NotImplementedError

    @abstractmethod
    def forward(
        self,
273
        layer: AttentionLayer,
274
275
276
277
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
278
        attn_metadata: T,
279
        output: Optional[torch.Tensor] = None,
280
281
    ) -> torch.Tensor:
        raise NotImplementedError
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297


class MLAAttentionImpl(AttentionImpl[T], Generic[T]):

    @abstractmethod
    def forward(
        self,
        layer: AttentionLayer,
        hidden_states_or_cq: torch.Tensor,
        kv_c_normed: torch.Tensor,
        k_pe: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: T,
        output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        raise NotImplementedError
298
299
300
301


def is_quantized_kv_cache(kv_cache_dtype: str) -> bool:
    return kv_cache_dtype != "auto"