common.py 3.21 KB
Newer Older
raojy's avatar
raojy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from collections.abc import Callable, Iterable
from typing import Any, NamedTuple

import pytest
import regex as re

from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum


class Matches(NamedTuple):
    """Integer counts, one per fusion kind; every field defaults to 0.

    The field names are the same strings used as keys of
    FUSION_LOG_PATTERNS in this file.
    NOTE(review): presumably each value is the number of matches the
    corresponding log pattern is expected to report -- confirm in the tests.
    """

    # simple pointwise
    aiter_rms_quant_fusion: int = 0
    rms_quant_fusion: int = 0
    act_quant_fusion: int = 0
    norm_rope_fusion: int = 0
    attn_quant_fusion: int = 0
    # distributed
    ar_rms_fusion: int = 0
    sequence_parallel: int = 0
    async_tp: int = 0


class ModelFusionInfo(NamedTuple):
    """A model name together with its Matches factory and optional overrides."""

    model_name: str
    matches: Callable[[int], Matches]
    """Given number of hidden layers, produces the matches object"""
    # NOTE: this default is a single dict object shared by every instance
    # that omits model_kwargs; treat it as read-only.
    model_kwargs: dict[str, Any] = {}
    # The default maps n to {"num_hidden_layers": n}.
    hf_overrides: Callable[[int], dict] = lambda n: {"num_hidden_layers": n}


class AttentionBackendCase(NamedTuple):
    """An attention backend paired with the extra model kwargs it needs."""

    backend: AttentionBackendEnum
    # NOTE: this default is a single dict object shared by every instance
    # that omits model_kwargs; treat it as read-only.
    model_kwargs: dict[str, Any] = {}
    """Additional args required for attn+quant fusion"""


def is_blackwell():
    """Return whether we are running on Blackwell; a lot of tests depend on it.

    Delegates to ``current_platform.is_device_capability_family(100)`` on
    every call, so the platform is queried lazily rather than at import time.
    """
    return current_platform.is_device_capability_family(100)


def custom_ops_combos(*custom_ops: str) -> Iterable[str]:
    """Yield every on/off assignment of the given custom ops.

    Each yielded string joins one ``-op`` or ``+op`` token per op with commas,
    keeping the argument order. The ``-`` variant comes before ``+`` and the
    last op varies fastest. With no ops, a single empty string is yielded.
    """
    # One ("-op", "+op") pair per op; the Cartesian product picks one of each.
    toggles = ((f"-{name}", f"+{name}") for name in custom_ops)
    yield from map(",".join, itertools.product(*toggles))


# Quick inline validation, executed when this module is imported: for each op
# the "-" variant precedes the "+" variant, and the last op varies fastest.
assert list(custom_ops_combos("silu_and_mul")) == ["-silu_and_mul", "+silu_and_mul"]
assert list(custom_ops_combos("quant_fp8", "rms_norm")) == [
    "-quant_fp8,-rms_norm",
    "-quant_fp8,+rms_norm",
    "+quant_fp8,-rms_norm",
    "+quant_fp8,+rms_norm",
]


def has_cuda_graph_wrapper_metadata() -> bool:
    """Report whether ``torch._inductor.utils`` has ``CUDAGraphWrapperMetadata``.

    Returns:
        True when the module imports and exposes the attribute. False when
        the attribute is missing or the module cannot be imported at all, so
        callers using this in a skip condition skip instead of erroring.
    """
    from importlib import import_module

    try:
        module = import_module("torch._inductor.utils")
    except ImportError:
        # A torch build without this module cannot provide the attribute.
        return False
    return hasattr(module, "CUDAGraphWrapperMetadata")


# Two pytest parameters for a boolean flag:
#   True  (id "inductor_partition") - skipped when
#         has_cuda_graph_wrapper_metadata() returns False;
#   False (id "dynamo_partition")   - carries no skip mark.
# The skip condition is evaluated once, when this module is imported.
INDUCTOR_GRAPH_PARTITION = [
    pytest.param(
        True,
        marks=pytest.mark.skipif(
            not has_cuda_graph_wrapper_metadata(),
            reason="torch version does not support Inductor partition",
        ),
        id="inductor_partition",
    ),
    pytest.param(False, id="dynamo_partition"),
]

# Compiled regexes keyed by Matches field name; group 1 of each pattern
# captures the integer count reported in the matching log line.
# The "." in every file name is escaped so it matches a literal dot only,
# not an arbitrary character.
FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
    "aiter_rms_quant_fusion": re.compile(
        r"RocmAiterRMSNormQuantFusionPass Replaced (\d+) patterns"
    ),
    "rms_quant_fusion": re.compile(
        r"rms_quant_fusion\.py:\d+] Replaced (\d+) patterns"
    ),
    "act_quant_fusion": re.compile(
        r"act_quant_fusion\.py:\d+] Replaced (\d+) patterns"
    ),
    "norm_rope_fusion": re.compile(
        r"qk_norm_rope_fusion\.py:\d+] Fused QK Norm\+RoPE on (\d+) sites"
    ),
    "attn_quant_fusion": re.compile(
        r"attn_quant_fusion\.py:\d+] Fused quant onto (\d+) attention nodes"
    ),
    "ar_rms_fusion": re.compile(
        r"allreduce_rms_fusion\.py:\d+] Replaced (\d+) patterns"
    ),
    "sequence_parallel": re.compile(
        r"sequence_parallelism\.py:\d+] Replaced (\d+) patterns"
    ),
    "async_tp": re.compile(r"collective_fusion\.py:\d+] Replaced (\d+) patterns"),
}