# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
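
"""End-to-end full-graph compilation tests.

Each test builds an LLM under a particular vLLM compilation configuration,
compiles the full model graph, and runs a short greedy-generation smoke check.
"""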

import tempfile
from pathlib import Path
from typing import Any

import pytest
import torch

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from ...utils import create_new_process_for_each_test


def models_list(*, all: bool = True, keywords: list[str] | None = None):
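    """Return a list of (model name, extra model kwargs) pairs to test.

    With all=True, the base list is extended with quantized models whose
    quantization method is supported on the current platform. If keywords is
    given, only models whose name contains one of the keywords are kept.
    """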
    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
        ("facebook/opt-125m", {}),
        (
            "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
            {"dtype": torch.float16},
        ),
        ("meta-llama/Llama-3.2-1B-Instruct", {}),
    ]

    if all:
        TEST_MODELS.extend(
            [
                ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
                (
                    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
                    {"dtype": torch.float16},
                ),
            ]
        )

        # TODO: figure out why this fails.
        if False and is_quant_method_supported("gguf"):  # noqa: SIM223
            TEST_MODELS.append(
                ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"})
            )

        if is_quant_method_supported("gptq"):
            TEST_MODELS.append(
                ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"})
            )

        if is_quant_method_supported("gptq_marlin"):
            TEST_MODELS.append(
                (
                    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
                    {"quantization": "gptq_marlin"},
                )
            )

        if not current_platform.is_rocm() and is_quant_method_supported("awq"):
            TEST_MODELS.append(
                ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {"quantization": "AWQ"})
            )

    if keywords is None:
        return TEST_MODELS

    # filter by keywords
    return [model for model in TEST_MODELS if any(kw in model[0] for kw in keywords)]


@pytest.mark.parametrize(
    "compilation_mode",
    [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
)
@pytest.mark.parametrize("model, model_kwargs", models_list(all=True))
@create_new_process_for_each_test()
def test_full_graph(
    monkeypatch: pytest.MonkeyPatch,
    model: str,
    model_kwargs: dict[str, Any],
    compilation_mode: int,
):
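    """Smoke-test that each model compiles and generates in full-graph mode."""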
    if (
        "w8a8" in model or "w8w8" in model
    ) and current_platform.has_device_capability((10, 0)):
        # int8 removed on Blackwell:
        pytest.skip("int8 support removed on Blackwell")

    with monkeypatch.context():
        print(f"MODEL={model}")

        run_model(compilation_mode, model, **model_kwargs)


# TODO(luka) add other supported compilation config scenarios here
@pytest.mark.parametrize(
    "compilation_config, model, model_kwargs",
    [
        # additional compile sizes, only some of the models
        (
            CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
            *model_info,
        )
        for model_info in models_list(all=False)
    ]
    + [
        # RMSNorm + quant fusion, only 8-bit quant models
        (
            CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
                custom_ops=["+rms_norm"],
                pass_config=PassConfig(
                    fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
                ),
            ),
            *model_info,
        )
        for model_info in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
    ]
    + [
        # Test depyf integration works
        (
            CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
                debug_dump_path=Path(tempfile.gettempdir()),
            ),
            "facebook/opt-125m",
            {},
        ),
    ]
    + [
        # graph inductor partition
        (
            CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
                # inductor graph partition uses
                # torch._C.Tag.cudagraph_unsafe to specify splitting ops
                use_inductor_graph_partition=True,
                cudagraph_mode=CUDAGraphMode.PIECEWISE,
                compile_sizes=[1, 2],
            ),
            *model_info,
        )
        for model_info in models_list(all=False)
        if is_torch_equal_or_newer("2.9.0.dev")
    ]
    + [
        # Test get_raw_stream patch with compile_sizes
        # This tests that TorchInductor autotune works correctly with get_raw_stream
        # patch in torch 2.9 and without patch in torch 2.10+
        (
            CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
                compile_sizes=[1, 2],  # Triggers autotune which uses get_raw_stream
                cudagraph_mode=CUDAGraphMode.NONE,
            ),
            "facebook/opt-125m",
            {},
        ),
    ],
)
# only test some of the models
@create_new_process_for_each_test()
def test_custom_compile_config(
    compilation_config: CompilationConfig,
    model: str,
    model_kwargs: dict[str, Any],
):
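    """Run models under hand-picked CompilationConfig scenarios: extra compile
    sizes, RMSNorm+quant fusion passes, depyf debug dumps, inductor graph
    partition (torch >= 2.9), and autotuning via compile_sizes.
    """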
    if (
        "w8a8" in model or "w8w8" in model
    ) and current_platform.has_device_capability((10, 0)):
        # int8 removed on Blackwell:
        pytest.skip("int8 support removed on Blackwell")

    if compilation_config.use_inductor_graph_partition and not is_torch_equal_or_newer(
        "2.9.0.dev"
    ):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

    print(f"MODEL={model}")
    run_model(compilation_config, model, **model_kwargs)


@pytest.mark.parametrize(
    "compilation_mode",
    [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
)
@pytest.mark.parametrize(
    "model, backend",
    [
        ("Qwen/Qwen2-0.5B", None),  # Standard attention model
        (
            "deepseek-ai/DeepSeek-V2-Lite",
            AttentionBackendEnum.FLASHINFER_MLA,
        ),  # MLA (Multi-head Latent Attention) model
    ],
)
def test_fp8_kv_scale_compile(
    compilation_mode: int,
    model: str,
    backend: AttentionBackendEnum | None,
):
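    """Check that runtime-calculated FP8 KV-cache scales work under
    compilation, for both a standard-attention model and an MLA model.
    """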
    model_kwargs = {
        "quantization": "fp8",
        "kv_cache_dtype": "fp8_e4m3",
        "calculate_kv_scales": True,
        "max_model_len": 512,
    }
    if backend:
        model_kwargs["attention_config"] = {"backend": backend.name}

    run_model(compilation_mode, model, **model_kwargs)


def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
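    """Build an LLM with the given compilation mode or config and run a short
    greedy-generation smoke test, printing the outputs.
    """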
    compilation_config = (
        compile_config
        if isinstance(compile_config, CompilationConfig)
        else CompilationConfig(mode=compile_config)
    )

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
    # Allow override from model_kwargs
    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}

    # No cudagraphs by default
    if compilation_config.cudagraph_mode is None:
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    llm = LLM(
        model=model,
        compilation_config=compilation_config,
        **model_kwargs,
    )
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")