"examples/vscode:/vscode.git/clone" did not exist on "3408e471597e7a36ca79fab5fc849f4fb5576df8"
matcher_utils.py 14.8 KB
Newer Older
1
2
3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
4
from typing import Any
5
6
7
8
9

import torch
from torch._higher_order_ops import auto_functionalized
from torch._ops import OpOverload

10
from vllm._aiter_ops import rocm_aiter_ops
11
from vllm.config import get_current_vllm_config
12
from vllm.model_executor.layers.activation import SiluAndMul
13
14
15
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import (
16
    GroupShape,
17
18
    QuantKey,
    _normalize_quant_group_shape,
19
20
    kFp8Dynamic64Sym,
    kFp8Dynamic128Sym,
21
22
23
    kFp8DynamicTensorSym,
    kFp8DynamicTokenSym,
    kFp8StaticTensorSym,
24
    kNvfp4Dynamic,
25
)
26
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
27
28
29
30
from vllm.platforms import current_platform

# Default overloads of the `_C` RMSNorm custom ops: the plain variant and the
# variant that also takes a residual (see MatcherFusedAddRMSNorm).
RMS_OP = torch.ops._C.rms_norm.default
RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default

# Default overloads of the two rotary-embedding custom ops; MatcherRotaryEmbedding
# picks one of them based on its `use_flashinfer` flag.
ROTARY_OP = torch.ops._C.rotary_embedding.default
FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default
33
34
35
36
37
38
39
40

# Maps a quantization scheme (QuantKey) to the custom op overload that
# MatcherQuantFP8 looks up when it is not matching ROCm aiter ops.
QUANT_OPS: dict[QuantKey, OpOverload] = {
    kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
    kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
    kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
}

# The FP4 op is registered only on CUDA and only when the `_C` extension
# actually exposes `scaled_fp4_quant`.
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
    QUANT_OPS[kNvfp4Dynamic] = torch.ops._C.scaled_fp4_quant.default  # noqa: E501

# Both per-group FP8 keys share one op; the group size is supplied as an
# argument when the op is invoked.
if current_platform.is_cuda():
    QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501
    QUANT_OPS[kFp8Dynamic64Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501

47
48
# Default overload of the `_C.silu_and_mul` custom op, used by MatcherSiluAndMul.
SILU_MUL_OP = torch.ops._C.silu_and_mul.default

49
50

class MatcherCustomOp(ABC):
    """Base class for matchers that dispatch to a custom or a native forward.

    The implementation is chosen once, at construction time: calling the
    instance runs ``forward_custom`` when ``enabled`` is true and
    ``forward_native`` otherwise. The class also offers small helpers for
    allocating uninitialized tensors on the configured device.
    """

    def __init__(self, enabled: bool) -> None:
        """Capture dtype/device from the current vLLM config and bind ``forward``.

        Args:
            enabled: When true, calls are routed to ``forward_custom``;
                otherwise to ``forward_native``.
        """
        config = get_current_vllm_config()
        # Either config section may be missing; fall back to None in that case.
        self.model_dtype = config.model_config.dtype if config.model_config else None
        self.device = config.device_config.device if config.device_config else None

        self.enabled = enabled
        self.forward = self.forward_custom if enabled else self.forward_native

    @abstractmethod
    def forward_custom(self, *args: Any, **kwargs: Any) -> Any:
        """Implementation used when the matcher is enabled."""
        pass

    @abstractmethod
    def forward_native(self, *args: Any, **kwargs: Any) -> Any:
        """Implementation used when the matcher is not enabled."""
        pass

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """Forward all arguments to the implementation bound in ``__init__``."""
        return self.forward(*args, **kwargs)

    def empty(self, *args: Any, **kwargs: Any) -> torch.Tensor:
        """Return an uninitialized tensor with the model dtype on ``self.device``."""
        return torch.empty(*args, dtype=self.model_dtype, device=self.device, **kwargs)

    def empty_int64(self, *args: Any, **kwargs: Any) -> torch.Tensor:
        """Return an uninitialized ``torch.int64`` tensor on ``self.device``."""
        return torch.empty(*args, dtype=torch.int64, device=self.device, **kwargs)

    def empty_f32(self, *args: Any, **kwargs: Any) -> torch.Tensor:
        """Return an uninitialized ``torch.float32`` tensor on ``self.device``."""
        return torch.empty(*args, dtype=torch.float32, device=self.device, **kwargs)

    def inputs(self) -> list[torch.Tensor]:
        """Utility for inputs to the pattern"""
        raise NotImplementedError


84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
class MatcherRotaryEmbedding(MatcherCustomOp):
    """Matcher for rotary embedding, via a custom op or ``RotaryEmbedding``."""

    def __init__(
        self,
        is_neox: bool,
        head_size: int,
        num_heads: int,
        num_kv_heads: int,
        use_flashinfer: bool = False,
        enabled: bool | None = None,
    ) -> None:
        """Store the head geometry and select the rotary op.

        Args:
            is_neox: Forwarded unchanged to the rotary implementation.
            head_size: Size of one head; also used as the rotary dimension.
            num_heads: Number of query heads.
            num_kv_heads: Number of key/value heads.
            use_flashinfer: Select ``FLASHINFER_ROTARY_OP`` instead of
                ``ROTARY_OP`` as the custom op.
            enabled: Whether to use the custom op; when ``None`` this defaults
                to ``RotaryEmbedding.enabled()``.
        """
        if enabled is None:
            enabled = RotaryEmbedding.enabled()

        super().__init__(enabled)
        self.is_neox = is_neox
        self.head_size = head_size
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.q_size = self.num_heads * self.head_size
        self.kv_size = self.num_kv_heads * self.head_size
        self.rotary_dim = head_size
        if use_flashinfer:
            self.rotary_op = FLASHINFER_ROTARY_OP
        else:
            self.rotary_op = ROTARY_OP

    def inputs(self) -> list[torch.Tensor]:
        """Return uninitialized ``[positions, query, key, cos_sin_cache]`` tensors."""
        positions = self.empty_int64(5)
        query = self.empty(5, self.q_size)
        key = self.empty(5, self.kv_size)
        cos_sin_cache = self.empty(4096, self.rotary_dim)
        return [positions, query, key, cos_sin_cache]

    def forward_custom(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None,
        cos_sin_cache: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Run the selected rotary op through ``auto_functionalized``.

        Returns:
            ``(query_out, key_out)``; ``key_out`` is ``None`` when the
            functionalized result has no third element.
        """
        result = auto_functionalized(
            self.rotary_op,
            positions=positions,
            query=query,
            key=key,
            head_size=self.head_size,
            cos_sin_cache=cos_sin_cache,
            is_neox=self.is_neox,
        )
        # Element 0 is skipped; elements 1 and 2 are taken as the updated
        # query and (if present) key.
        query_out = result[1]
        key_out = result[2] if len(result) > 2 else None
        return query_out, key_out

    def forward_native(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None,
        cos_sin_cache: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Apply rotary embedding via ``RotaryEmbedding.forward_static``."""
        result: tuple[torch.Tensor, torch.Tensor | None] = (
            RotaryEmbedding.forward_static(
                positions,
                query,
                key,
                self.head_size,
                self.rotary_dim,
                cos_sin_cache,
                self.is_neox,
            )
        )
        return result
156
157


158
class MatcherRMSNorm(MatcherCustomOp):
    """Matcher for RMSNorm, via a custom op, a ROCm aiter op, or ``RMSNorm``."""

    def __init__(
        self,
        epsilon: float,
        enabled: bool | None = None,
        match_rocm_aiter: bool = False,
    ) -> None:
        """Store epsilon and select the RMSNorm op.

        Args:
            epsilon: Epsilon value passed to the norm implementation.
            enabled: Whether to use the custom op; when ``None`` this defaults
                to ``RMSNorm.enabled()``.
            match_rocm_aiter: When true, use the op returned by
                ``rocm_aiter_ops.get_rmsnorm_op()`` instead of ``RMS_OP``.
        """
        if enabled is None:
            enabled = RMSNorm.enabled()

        super().__init__(enabled)
        self.epsilon = epsilon
        self._rmsnorm_op = RMS_OP
        self.match_rocm_aiter = match_rocm_aiter

        if match_rocm_aiter:
            self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_op()

    def inputs(self) -> list[torch.Tensor]:
        """Return uninitialized ``[input, weight]`` tensors.

        The input uses the model dtype when the custom op is enabled and
        float32 otherwise; the weight always uses the model dtype.
        """
        input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
        weight = self.empty(16)
        return [input, weight]

    def forward_rocm_aiter(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
    ) -> torch.Tensor:
        """Call the selected op directly with aiter-style keyword arguments."""
        return self._rmsnorm_op(
            x=input,
            weight=weight,
            variance_epsilon=self.epsilon,
        )

    def forward_custom(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
    ) -> torch.Tensor:
        """Run the RMSNorm custom op and return its output tensor.

        Delegates to ``forward_rocm_aiter`` when ``match_rocm_aiter`` is set;
        otherwise invokes the op through ``auto_functionalized`` with a freshly
        allocated result buffer shaped like ``input``.
        """
        if self.match_rocm_aiter:
            return self.forward_rocm_aiter(input, weight)

        result = torch.empty_like(input)
        # The first element of the functionalized result is discarded; the
        # second is the filled result tensor.
        _, result = auto_functionalized(
            self._rmsnorm_op,
            result=result,
            input=input,
            weight=weight,
            epsilon=self.epsilon,
        )

        return result

    def forward_native(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
    ) -> torch.Tensor:
        """Compute RMSNorm via ``RMSNorm.forward_static`` over the last dimension."""
        return RMSNorm.forward_static(
            input, self.epsilon, input.size(-1), self.model_dtype, weight
        )


class MatcherFusedAddRMSNorm(MatcherCustomOp):
    """Matcher for RMSNorm with a residual input.

    Dispatches to the fused-add custom op, a ROCm aiter op, or
    ``RMSNorm.forward_static`` with a residual argument.
    """

    def __init__(
        self,
        epsilon: float,
        enabled: bool | None = None,
        match_rocm_aiter: bool = False,
    ) -> None:
        """Store epsilon and select the fused-add RMSNorm op.

        Args:
            epsilon: Epsilon value passed to the norm implementation.
            enabled: Whether to use the custom op; when ``None`` this defaults
                to ``RMSNorm.enabled()``.
            match_rocm_aiter: When true, use the op returned by
                ``rocm_aiter_ops.get_rmsnorm_fused_add_op()`` instead of
                ``RMS_ADD_OP``.
        """
        if enabled is None:
            enabled = RMSNorm.enabled()

        super().__init__(enabled)
        self.epsilon = epsilon
        self.match_rocm_aiter = match_rocm_aiter

        self._rmsnorm_op = RMS_ADD_OP

        if match_rocm_aiter:
            self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_fused_add_op()

    def inputs(self) -> list[torch.Tensor]:
        """Return uninitialized ``[input, weight, residual]`` tensors.

        The input uses the model dtype when the custom op is enabled and
        float32 otherwise; weight and residual always use the model dtype.
        """
        input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
        weight = self.empty(16)
        residual = self.empty(5, 16)
        return [input, weight, residual]

    def forward_rocm_aiter(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        residual: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Call the selected op directly with aiter-style keyword arguments."""
        return self._rmsnorm_op(  # type: ignore[no-any-return]
            x=input, residual=residual, weight=weight, variance_epsilon=self.epsilon
        )

    def forward_custom(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        residual: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run the fused-add RMSNorm op and return ``(result, residual)``.

        Delegates to ``forward_rocm_aiter`` when ``match_rocm_aiter`` is set;
        otherwise invokes the op through ``auto_functionalized``.
        """
        if self.match_rocm_aiter:
            return self.forward_rocm_aiter(input, weight, residual)

        # The first element of the functionalized result is discarded; the
        # remaining two are returned as the normalized output and the residual.
        _, result, residual = auto_functionalized(
            self._rmsnorm_op,
            input=input,
            residual=residual,
            weight=weight,
            epsilon=self.epsilon,
        )

        return result, residual

    def forward_native(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        residual: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Compute the norm via ``RMSNorm.forward_static``, passing the residual."""
        result: tuple[torch.Tensor, torch.Tensor] = RMSNorm.forward_static(
            input, self.epsilon, input.size(-1), self.model_dtype, weight, residual
        )
        return result
285
286
287


class MatcherQuantFP8(MatcherCustomOp):
    """Matcher for FP8 quantization.

    Dispatches to a quantization custom op (from ``QUANT_OPS`` or a ROCm aiter /
    triton op) when enabled, and to a ``QuantFP8`` layer otherwise.
    """

    def __init__(
        self,
        quant_key: QuantKey,
        enabled: bool | None = None,
        has_col_major_scales: bool = False,
        is_e8m0: bool = False,
        match_rocm_aiter: bool = False,
    ) -> None:
        """Select the quantization op for ``quant_key`` and build the native layer.

        Args:
            quant_key: Quantization scheme to match.
            enabled: Whether to use the custom op; when ``None`` this defaults
                to ``QuantFP8.enabled()``.
            has_col_major_scales: Forwarded to ``QuantFP8`` as
                ``column_major_scales`` and used to request a transposed scale
                buffer on the per-group custom path.
            is_e8m0: Forwarded to ``QuantFP8`` as ``use_ue8m0`` and to the
                per-group op as ``scale_ue8m0``.
            match_rocm_aiter: Match ROCm aiter ops instead of ``QUANT_OPS``.

        Raises:
            AssertionError: If the scheme is not supported by the selected
                backend (per-tensor or non-128 group size with aiter; unknown
                key, non-FP8 dtype, or a second scale otherwise).
        """
        if enabled is None:
            enabled = QuantFP8.enabled()

        super().__init__(enabled)
        self.quant_key = quant_key
        self.has_col_major_scales = has_col_major_scales
        self.is_e8m0 = is_e8m0
        self.match_rocm_aiter = match_rocm_aiter

        if match_rocm_aiter:
            assert not quant_key.scale.group_shape.is_per_tensor(), (
                "ROCm aiter fusion pass does not support per tensor quantization"
            )
            if quant_key.scale.group_shape.is_per_token():
                self.QUANT_OP = rocm_aiter_ops.get_per_token_quant_op()
            else:
                assert quant_key.scale.group_shape.col == 128, (
                    "ROCm aiter fusion pass currently supports "
                    "quantization operation with group_size 128"
                )
                # Group quantization: aiter op on fnuz platforms, triton op
                # otherwise.
                if current_platform.is_fp8_fnuz():
                    self.QUANT_OP = rocm_aiter_ops.get_group_quant_op()
                else:
                    self.QUANT_OP = (
                        torch.ops.vllm.triton_per_token_group_quant_fp8.default
                    )

        else:
            assert quant_key in QUANT_OPS, (
                f"unsupported quantization scheme {quant_key}"
            )
            self.QUANT_OP = QUANT_OPS[quant_key]

            # The original message was cut off mid-sentence; name the class so
            # the failure is self-explanatory.
            assert quant_key.dtype == current_platform.fp8_dtype(), (
                "Only QuantFP8 supported by MatcherQuantFP8"
            )
            assert quant_key.scale2 is None

        self.quant_fp8 = QuantFP8(
            quant_key.scale.static,
            quant_key.scale.group_shape,
            column_major_scales=has_col_major_scales,
            use_ue8m0=is_e8m0,
            compile_native=False,
        )

    def forward_rocm_aiter(
        self,
        input: torch.Tensor,
        scale: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Call the ROCm-path op directly.

        Per-token schemes pass ``x``, ``quant_dtype`` and ``scale`` by keyword;
        every other scheme passes the input and the group column size
        positionally.
        """
        quant_key_group_shape = self.quant_key.scale.group_shape
        if quant_key_group_shape == GroupShape.PER_TOKEN:
            return self.QUANT_OP(  # type: ignore[no-any-return]
                x=input,
                quant_dtype=self.quant_key.dtype,
                scale=scale,
            )
        else:
            return self.QUANT_OP(input, quant_key_group_shape.col)  # type: ignore[no-any-return]

    def forward_custom(
        self,
        input: torch.Tensor,
        scale: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Quantize ``input`` with the selected custom op.

        Args:
            input: Tensor to quantize.
            scale: Required for static schemes; must be ``None`` for per-group
                and dynamic schemes, where a scale buffer is allocated here.

        Returns:
            ``(quantized, scale)``.
        """
        if self.match_rocm_aiter:
            return self.forward_rocm_aiter(input, scale)

        result = torch.empty(
            input.shape, device=input.device, dtype=self.quant_key.dtype
        )

        if self.quant_key.scale.group_shape.is_per_group():
            assert scale is None
            scale = self.make_scale(input, transposed=self.has_col_major_scales)

            # The op is given the representable range of the target dtype.
            finfo = torch.finfo(self.quant_key.dtype)
            fp8_min = finfo.min
            fp8_max = finfo.max

            _, result, scale = auto_functionalized(
                self.QUANT_OP,
                input=input,
                output_q=result,
                output_s=scale,
                group_size=self.quant_key.scale.group_shape[1],
                eps=1e-10,
                fp8_min=fp8_min,
                fp8_max=fp8_max,
                scale_ue8m0=self.is_e8m0,
            )
            return result, scale

        if self.quant_key.scale.static:
            assert scale is not None
            # Static: the caller-provided scale is passed in and returned as is.
            _, result = auto_functionalized(
                self.QUANT_OP, result=result, input=input, scale=scale
            )
            return result, scale
        else:
            assert scale is None
            scale = self.make_scale(input)
            _, result, scale = auto_functionalized(
                self.QUANT_OP, result=result, input=input, scale=scale, scale_ub=None
            )
            return result, scale

    def forward_native(
        self,
        input: torch.Tensor,
        scale: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Quantize ``input`` by calling the ``QuantFP8`` layer built in ``__init__``."""
        return self.quant_fp8(input, scale)  # type: ignore[no-any-return]

    def make_scale(self, input: torch.Tensor, transposed: bool = False) -> torch.Tensor:
        """Allocate an uninitialized float32 scale buffer for ``input``.

        The buffer has one entry per quantization group along the first two
        dimensions of ``input``. With ``transposed=True`` the storage is
        allocated with the two dimensions swapped and then permuted back, so
        the returned tensor has the same logical shape but different strides.
        """
        normalized_group_shape = _normalize_quant_group_shape(
            input, self.quant_key.scale.group_shape
        )
        scale_shape = (
            input.shape[0] // normalized_group_shape[0],
            input.shape[1] // normalized_group_shape[1],
        )
        if transposed:
            scale_shape = tuple(reversed(scale_shape))
            return torch.empty(
                scale_shape, device=input.device, dtype=torch.float32
            ).permute(-1, -2)

        return torch.empty(scale_shape, device=input.device, dtype=torch.float32)

    def inputs(self) -> list[torch.Tensor]:
        """Return ``[input]``, plus a ``(1, 1)`` float32 scale for static schemes."""
        input = self.empty(5, 16)
        if self.quant_key.scale.static:
            return [input, self.empty_f32(1, 1)]

        return [input]
433
434
435


class MatcherSiluAndMul(MatcherCustomOp):
    """Matcher for SiLU-and-mul, via ``SILU_MUL_OP`` or ``SiluAndMul``."""

    def __init__(self, enabled: bool | None = None) -> None:
        """Initialize the matcher.

        Args:
            enabled: Whether to use the custom op; when ``None`` this defaults
                to ``SiluAndMul.enabled()``.
        """
        if enabled is None:
            enabled = SiluAndMul.enabled()
        super().__init__(enabled)

    def inputs(self) -> list[torch.Tensor]:
        """Return a single uninitialized ``(5, 4)`` input tensor in a list."""
        input = self.empty(5, 4)
        return [input]

    def forward_custom(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """Run ``SILU_MUL_OP`` through ``auto_functionalized``.

        The output buffer matches ``x`` in dtype and device, with the last
        dimension halved.
        """
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        result = auto_functionalized(SILU_MUL_OP, result=out, input=x)
        # Element 1 of the functionalized result is the filled output tensor.
        return result[1]

    def forward_native(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """Compute the result via ``SiluAndMul.forward_native``."""
        return SiluAndMul.forward_native(x)