# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch

from vllm.logger import init_logger
8
from vllm.platforms import current_platform
9
10
11
12
13
14

logger = init_logger(__name__)

# intel_extension_for_pytorch is imported on a best-effort basis: a failed
# import is only reported at debug level. In that case the name `ipex` is
# never bound, so any op below that references it raises NameError when called.
try:
    import intel_extension_for_pytorch as ipex
except ImportError as import_err:
    logger.debug("Import error msg: %s", import_err.msg)
16
17
18
19
20


class ipex_ops:
    @staticmethod
    def _reshape_activation_tensor(
21
22
        x: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
23
24
25
26
27
28
29
30
        num = x.size(0)
        d = x.size(1) // 2
        x = x.reshape(num, 2, d)
        x1, x2 = torch.chunk(x, chunks=2, dim=1)
        x1 = x1.reshape(num, d)
        x2 = x2.reshape(num, d)
        return x1, x2

31
    @staticmethod
    def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        """Invoke IPEX's ``silu_and_mul`` on ``x`` with ``out``.

        The wrapper takes ``(out, x)`` but the IPEX function is called as
        ``(x, out)``. Nothing is returned; ``out`` is presumably the result
        buffer filled by IPEX (confirm against the IPEX API).
        """
        fused_op = ipex.llm.functional.silu_and_mul
        fused_op(x, out)
34

35
    @staticmethod
    def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        """Invoke IPEX's ``gelu_and_mul`` on ``x`` with ``out``.

        The wrapper takes ``(out, x)`` but the IPEX function is called as
        ``(x, out)``. Nothing is returned; ``out`` is presumably the result
        buffer filled by IPEX (confirm against the IPEX API).
        """
        fused_op = ipex.llm.functional.gelu_and_mul
        fused_op(x, out)
38

39
    @staticmethod
    def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        """Invoke IPEX's ``gelu_and_mul`` on ``x`` with ``out``.

        The wrapper takes ``(out, x)`` but the IPEX function is called as
        ``(x, out)``. Nothing is returned.
        """
        # NOTE(review): this calls the plain `gelu_and_mul` IPEX entry point
        # and passes no argument selecting a tanh approximation -- confirm
        # that IPEX applies the GELU variant this op's name promises.
        fused_op = ipex.llm.functional.gelu_and_mul
        fused_op(x, out)
42

43
    @staticmethod
44
45
    def gelu_fast(x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.gelu(x)
46

47
    @staticmethod
48
49
    def gelu_new(x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.gelu(x)
50

51
52
53
    @staticmethod
    def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
        """Invoke IPEX's ``gelu_quick`` on ``x`` with ``out``.

        The wrapper takes ``(out, x)`` but the IPEX function is called as
        ``(x, out)``. Nothing is returned; ``out`` is presumably the result
        buffer filled by IPEX (confirm against the IPEX API).
        """
        quick_op = ipex.llm.functional.gelu_quick
        quick_op(x, out)
54

55
    @staticmethod
    def paged_attention_v1(
        out: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        num_kv_heads: int,
        scale: float,
        block_tables: torch.Tensor,
        context_lens: torch.Tensor,
        block_size: int,
        max_context_len: int,
        alibi_slopes: torch.Tensor | None,
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
        tp_rank: int = 0,
        blocksparse_local_blocks: int = 0,
        blocksparse_vert_stride: int = 0,
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> None:
        """Forward a paged-attention call to IPEX's single-query KV kernel.

        ``k_scale``, ``v_scale``, ``tp_rank`` and every ``blocksparse_*``
        parameter are accepted but never passed on to IPEX.

        Raises:
            AssertionError: if ``kv_cache_dtype`` is anything but ``"auto"``.
        """
        assert kv_cache_dtype == "auto"
        # Ratio of heads in `out` (its dim 1) to KV heads; the kernel takes
        # this ratio instead of the raw KV head count.
        queries_per_kv_head = out.size(1) // num_kv_heads
        attend = ipex.llm.modules.PagedAttention.single_query_kv_attention
        attend(
            out,
            query.contiguous(),
            # The key cache is handed over viewed with the value cache's shape.
            key_cache.view_as(value_cache),
            value_cache,
            queries_per_kv_head,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )
93

94
    @staticmethod
    def paged_attention_v2(
        out: torch.Tensor,
        exp_sum: torch.Tensor,
        max_logits: torch.Tensor,
        tmp_out: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        num_kv_heads: int,
        scale: float,
        block_tables: torch.Tensor,
        context_lens: torch.Tensor,
        block_size: int,
        max_context_len: int,
        alibi_slopes: torch.Tensor | None,
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
        tp_rank: int = 0,
        blocksparse_local_blocks: int = 0,
        blocksparse_vert_stride: int = 0,
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> None:
        """Forward a paged-attention call to IPEX's single-query KV kernel.

        ``exp_sum``, ``max_logits`` and ``tmp_out`` are accepted but unused
        here, as are ``k_scale``, ``v_scale``, ``tp_rank`` and every
        ``blocksparse_*`` parameter.

        Raises:
            AssertionError: if ``kv_cache_dtype`` is anything but ``"auto"``.
        """
        assert kv_cache_dtype == "auto"
        # Ratio of heads in `out` (its dim 1) to KV heads; the kernel takes
        # this ratio instead of the raw KV head count.
        queries_per_kv_head = out.size(1) // num_kv_heads
        attend = ipex.llm.modules.PagedAttention.single_query_kv_attention
        attend(
            out,
            query.contiguous(),
            # The key cache is handed over viewed with the value cache's shape.
            key_cache.view_as(value_cache),
            value_cache,
            queries_per_kv_head,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )
135

136
    @staticmethod
    def rotary_embedding(
        positions: torch.Tensor,  # [batch_size, seq_len]
        query: torch.Tensor,  # [batch_size, seq_len, num_heads*head_size]
        key: torch.Tensor,  # [batch_size, seq_len, num_kv_heads*head_size]
        head_size: int,
        cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
        is_neox: bool,
    ) -> None:
        """Call IPEX's batched rotary-embedding function.

        All arguments are forwarded unchanged; the rotary dimension is taken
        from the second axis of ``cos_sin_cache`` and appended as the final
        argument. Nothing is returned, so ``query`` and ``key`` are presumably
        updated in place by IPEX (confirm against the IPEX API).
        """
        rotary_dim = cos_sin_cache.shape[1]
        apply_rope = ipex.llm.functional.rotary_embedding_batched
        apply_rope(
            positions,
            query,
            key,
            head_size,
            cos_sin_cache,
            is_neox,
            rotary_dim,
        )
149

150
    @staticmethod
    def rms_norm(
        input: torch.Tensor, weight: torch.Tensor, epsilon: float
    ) -> torch.Tensor:
        """Run the ``torch_ipex.rms_norm_vllm`` op and return its output.

        Args:
            input: Tensor to normalize; a contiguous version is passed to
                the op.
            weight: Weight tensor forwarded to the op.
            epsilon: Epsilon value forwarded to the op.

        Returns:
            A freshly allocated tensor shaped like ``input`` that was handed
            to the op as its first (output) argument.
        """
        normed = torch.empty_like(input)
        rms_norm_op = torch.ops.torch_ipex.rms_norm_vllm
        rms_norm_op(normed, input.contiguous(), weight, epsilon)
        return normed
157

158
    @staticmethod
    def fused_add_rms_norm(
        input: torch.Tensor,
        residual: torch.Tensor,
        weight: torch.Tensor,
        epsilon: float,
    ) -> None:
        """Run the ``torch_ipex.fused_add_rms_norm_vllm`` op.

        All four arguments are forwarded unchanged and nothing is returned,
        so results are presumably written into ``input`` / ``residual`` by
        the op (confirm against the IPEX kernel).
        """
        fused_op = torch.ops.torch_ipex.fused_add_rms_norm_vllm
        fused_op(input, residual, weight, epsilon)
166

167
    @staticmethod
    def varlen_attention(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        out: torch.Tensor,
        seqlen_q: torch.Tensor,
        seqlen_k: torch.Tensor,
        alibi_slopes: torch.Tensor | None,
        max_seqlen_q: int,
        max_seqlen_k: int,
        pdropout: float,
        softmax_scale: float,
        zero_tensors: bool,
        is_causal: bool,
        return_softmax: bool,
        gen_: torch.Generator,
        window_size_left: float,
        window_size_right: float,
        logits_soft_cap: float,
    ) -> None:
        """Dispatch to IPEX's ``varlen_attention`` for the installed build.

        When ``ipex.__version__`` ends with ``"cpu"`` the shorter argument
        list is used (no alibi slopes, window sizes or soft cap) and those
        features are rejected up front. Otherwise (XPU build) the full
        argument list is passed.

        Raises:
            ValueError: on a CPU build when ``logits_soft_cap`` is non-zero.
            AssertionError: on a CPU build when ``alibi_slopes`` is given or
                either window size is not negative.
        """
        is_cpu_build = ipex.__version__.endswith("cpu")
        if is_cpu_build:
            # Validate before touching any tensor so unsupported requests
            # fail without doing work.
            if logits_soft_cap != 0.0:
                raise ValueError("IPEX CPU does not support logits_soft_cap")
            assert alibi_slopes is None
            assert window_size_left < 0 and window_size_right < 0

        attention_kernel = ipex.llm.functional.varlen_attention
        # Leading arguments shared by both builds.
        leading_args = (
            query.contiguous(),
            key.contiguous(),
            value.contiguous(),
            out,
            seqlen_q.int(),
            seqlen_k.int(),
        )
        # Arguments shared by both builds that follow the (XPU-only) slopes.
        shared_args = (
            max_seqlen_q,
            max_seqlen_k,
            pdropout,
            softmax_scale,
            zero_tensors,
            is_causal,
            return_softmax,
            gen_,
        )
        if is_cpu_build:
            attention_kernel(*leading_args, *shared_args)
        else:  # XPU build
            attention_kernel(
                *leading_args,
                alibi_slopes,
                *shared_args,
                window_size_left,
                window_size_right,
                logits_soft_cap,
            )
230

231
    @staticmethod
    def reshape_and_cache(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
    ) -> None:
        """Forward to IPEX ``PagedAttention.reshape_and_cache``.

        Only the five tensors are passed on; ``k_scale`` and ``v_scale`` are
        accepted but unused.

        Raises:
            AssertionError: if ``kv_cache_dtype`` is anything but ``"auto"``.
        """
        assert kv_cache_dtype == "auto"
        store_kv = ipex.llm.modules.PagedAttention.reshape_and_cache
        store_kv(key, value, key_cache, value_cache, slot_mapping)
246

247
248
249
250
251
252
253
254
    @staticmethod
    def reshape_and_cache_flash(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: torch.Tensor | None = None,
        v_scale: torch.Tensor | None = None,
        k_scale_float: float = 1.0,
        v_scale_float: float = 1.0,
    ) -> None:
        """Forward to IPEX ``PagedAttention.reshape_and_cache_flash``.

        The float scales (``k_scale_float`` / ``v_scale_float``) are the ones
        handed to IPEX; the tensor-valued ``k_scale`` / ``v_scale`` are
        accepted but unused.
        """
        store_kv = ipex.llm.modules.PagedAttention.reshape_and_cache_flash
        store_kv(
            key,
            value,
            key_cache,
            value_cache,
            slot_mapping,
            kv_cache_dtype,
            k_scale_float,
            v_scale_float,
        )
270
271
272
273
274
275
276
277
278

    @staticmethod
    def flash_attn_varlen_func(
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        cu_seqlens_q: torch.Tensor,
        max_seqlen_q: int,
        max_seqlen_k: int,
        softmax_scale: float | None = None,
        causal: bool = False,
        out: torch.Tensor | None = None,
        block_table: torch.Tensor | None = None,
        alibi_slopes: torch.Tensor | None = None,
        window_size: list[int] | None = None,
        softcap: float | None = 0.0,
        seqused_k: torch.Tensor | None = None,
        cu_seqlens_k: torch.Tensor | None = None,
        # passed in qwen vl
        dropout_p: float = 0.0,
        # The following parameters are not used in ipex kernel currently,
        # we keep API compatible to CUDA's.
        scheduler_metadata=None,
        fa_version: int = 2,
        q_descale=None,
        k_descale=None,
        v_descale=None,
        num_splits=0,
        s_aux: torch.Tensor | None = None,
    ):
        """Variable-length attention entry point with a CUDA-compatible API.

        With a ``block_table`` the call goes to IPEX
        ``PagedAttention.flash_attn_varlen_func`` and its result is returned.
        Without one it goes through ``ipex_ops.varlen_attention`` and ``out``
        is returned.

        Args:
            out: Optional output tensor; when omitted, an uninitialized
                tensor with ``q``'s shape, dtype and device is allocated.
            window_size: Optional two-element ``[left, right]`` window;
                ``None`` is treated as ``(-1, -1)``.
            cu_seqlens_k: Required when ``block_table`` is ``None``.
            seqused_k: Forwarded only on the ``block_table`` path.

        Raises:
            AssertionError: if ``window_size`` does not have two elements, or
                if both ``block_table`` and ``cu_seqlens_k`` are ``None``.
        """
        if out is None:
            out = torch.empty(q.shape, dtype=q.dtype, device=q.device)

        window_left: int
        window_right: int
        if window_size is None:
            window_left, window_right = -1, -1
        else:
            assert len(window_size) == 2
            window_left, window_right = window_size[0], window_size[1]

        if block_table is not None:
            # NOTE(review): softmax_scale is forwarded as-is on this path,
            # including its default of None -- confirm IPEX accepts that.
            paged_attention = ipex.llm.modules.PagedAttention.flash_attn_varlen_func
            return paged_attention(
                out,
                q.contiguous(),
                k,
                v,
                cu_seqlens_q,
                seqused_k,
                max_seqlen_q,
                max_seqlen_k,
                softmax_scale,
                causal,
                block_table,
                alibi_slopes,
                sink=s_aux,
                softcap=softcap,
                window_size_left=window_left,
                window_size_right=window_right,
                k_scale=1.0,
                v_scale=1.0,
            )

        assert cu_seqlens_k is not None, (
            "cu_seqlens_k can't be None when calling varlen_attention."
        )
        if softmax_scale is None:
            # Default scale: 1 / sqrt(size of q's last dimension).
            softmax_scale = q.shape[-1] ** (-0.5)
        # NOTE(review): on this path `alibi_slopes`, `dropout_p` and `softcap`
        # are not forwarded; None, 0.0 and -1 are passed in their place --
        # confirm this is intended.
        ipex_ops.varlen_attention(
            q.contiguous(),
            k.contiguous(),
            v.contiguous(),
            out,
            cu_seqlens_q,
            cu_seqlens_k,
            None,  # alibi_slopes
            max_seqlen_q,
            max_seqlen_k,
            0.0,  # pdropout
            softmax_scale,
            False,  # zero_tensors
            causal,
            False,  # return_softmax
            None,  # gen_
            window_left,
            window_right,
            -1,  # logits_soft_cap
        )
        return out
357
358
359

    @staticmethod
    def get_scheduler_metadata(
        batch_size,
        max_seqlen_q,
        max_seqlen_k,
        num_heads_q,
        num_heads_kv,
        headdim,
        cache_seqlens: torch.Tensor,
        qkv_dtype=torch.bfloat16,
        headdim_v=None,
        cu_seqlens_q: torch.Tensor | None = None,
        cu_seqlens_k_new: torch.Tensor | None = None,
        cache_leftpad: torch.Tensor | None = None,
        page_size: int | None = None,
        max_seqlen_k_new=0,
        causal=False,
        window_size=(-1, -1),  # -1 means infinite context window
        has_softcap=False,
        num_splits=0,  # Can be tuned for speed
        pack_gqa=None,  # Can be tuned for speed
        sm_margin=0,  # Can be tuned if some SMs are used for communication
    ) -> None:
        """Placeholder: log a one-time warning and return ``None``.

        None of the arguments are read; the signature exists so callers can
        invoke it like an implemented backend.
        """
        not_implemented_msg = (
            "get_scheduler_metadata is not implemented for ipex_ops, returning None."
        )
        logger.warning_once(not_implemented_msg)
        return None

386
    @staticmethod
    def swap_blocks(
        src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
    ) -> None:
        """Forward ``src``, ``dst`` and ``block_mapping`` to ``torch.xpu.swap_blocks``."""
        xpu_swap = torch.xpu.swap_blocks  # type: ignore
        xpu_swap(src, dst, block_mapping)
391
392
393
394

    @staticmethod
    def scaled_fp8_quant(
        input: torch.Tensor,
        scale: torch.Tensor | None = None,
        num_token_padding: int | None = None,
        scale_ub: torch.Tensor | None = None,
        use_per_token_if_dynamic: bool = False,
        output: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Quantize a 2-D input tensor to FP8, returning the result and its scale.

        The signature allows for static quantization (caller-supplied scale)
        and dynamic quantization (scale computed here), but on the XPU
        platform only dynamic per-tensor quantization is supported, so
        ``scale`` must be omitted and ``use_per_token_if_dynamic`` must be
        false. The output can optionally be padded along its first dimension
        for downstream kernels that benefit from padding.

        Args:
            input: The input tensor to be quantized to FP8; must be 2-D
                (batch and token dimensions already flattened).
            scale: Optional scaling factor for the FP8 quantization; must be
                None on XPU.
            num_token_padding: If specified, pad the first dimension
                of the output to at least this value.
            scale_ub: Optional upper bound for scaling factor in dynamic
                per token case; not used by this implementation.
            use_per_token_if_dynamic: Whether to do per_tensor or per_token
                in the dynamic quantization case; must be False on XPU.
            output: Optional preallocated FP8 output tensor. It cannot be
                combined with ``num_token_padding``.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
                scaling factor.
        """
        # This code assumes batch_dim and num_tokens are flattened
        assert input.ndim == 2
        fp8_dtype: torch.dtype = current_platform.fp8_dtype()

        if output is not None:
            assert num_token_padding is None, (
                "padding not supported if output passed in"
            )
            assert output.dtype == fp8_dtype
        else:
            num_rows, num_cols = input.shape
            if num_token_padding:
                # Grow (never shrink) the first dimension to the padding target.
                num_rows = max(num_token_padding, num_rows)
            output = torch.empty(
                (num_rows, num_cols), device=input.device, dtype=fp8_dtype
            )

        assert scale is None, "only dynamic fp8 quantization supported on XPU"
        assert not use_per_token_if_dynamic, (
            "per token dynamic fp8 quantization not supported on XPU"
        )

        # Single-element float32 scale buffer passed to the op alongside output.
        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
        quantize_op = torch.ops.torch_ipex.dynamic_scaled_fp8_quant
        quantize_op(output, input, scale)

        return output, scale