test_config.py 16.5 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import copy
4
import logging
5
from contextlib import nullcontext
6
from unittest.mock import patch
7

8
import pytest
9
from pydantic import ValidationError
10
11

from vllm.compilation.counter import compilation_counter
12
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
13
from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig
14
from vllm.config.compilation import CompilationMode, PassConfig
15
from vllm.engine.arg_utils import EngineArgs
16
from vllm.logger import _print_warning_once
17
from vllm.platforms import current_platform
18
19
20
21
from vllm.utils.torch_utils import (
    _is_torch_equal_or_newer,
    is_torch_equal,
)
22

23
24
25
# This import automatically registers `torch.ops.silly.attention`
from . import silly_attention  # noqa: F401

26
27

def test_version():
    """Exercise the private torch version comparison helper.

    Every case is compared against the same "2.8.0.dev" target.
    """
    target = "2.8.0.dev"

    # Nightly, source-build and release version strings that must all be
    # treated as equal to or newer than the target.
    at_or_above = (
        "2.8.0.dev20250624+cu128",
        "2.8.0a0+gitc82a174",
        "2.8.0",
        "2.8.1",
    )
    for installed in at_or_above:
        assert _is_torch_equal_or_newer(installed, target)

    # An older release must be rejected.
    assert not _is_torch_equal_or_newer("2.7.1", target)
34
35


36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def test_get_raw_stream_patch():
    """Check the `builtins.get_raw_stream` patch on torch 2.9.0 / 2.9.1.

    On any other torch version this test makes no assertion.
    """
    import builtins

    # Record whether the attribute is present before looking at the version.
    patched = hasattr(builtins, "get_raw_stream")

    if not is_torch_equal("2.9.0") and not is_torch_equal("2.9.1"):
        return

    # For torch 2.9.x, the patch should be applied
    assert patched, "get_raw_stream should be patched for torch 2.9.x"

    raw_stream_fn = builtins.get_raw_stream  # type: ignore[attr-defined]
    assert callable(raw_stream_fn)

    # The patched attribute must be the very function exported by torch._C.
    from torch._C import _cuda_getCurrentRawStream

    assert raw_stream_fn is _cuda_getCurrentRawStream


59
60
61
62
63
64
65
66
67
68
69
70
71
72
def test_copy_pass():
    """A deep copy of FixFunctionalizationPass keeps its compilation settings."""
    vllm_config = VllmConfig()
    cloned_pass = copy.deepcopy(FixFunctionalizationPass(vllm_config))

    expected = vllm_config.compilation_config
    actual = cloned_pass.compilation_config

    # Compare the same two fields, in the same order, as separate checks.
    for field_name in ("use_inductor_graph_partition", "splitting_ops"):
        assert getattr(actual, field_name) == getattr(expected, field_name)


73
74
75
76
77
78
79
80
def test_custom_op():
    """`custom_ops` accepts '+'/'-' prefixed names and rejects a bare name."""
    # Prefixed entries: construction must succeed.
    well_formed = ["+quant_fp8", "-silu_and_mul"]
    CompilationConfig(custom_ops=well_formed)

    # An entry without a prefix is reported as invalid syntax.
    malformed = ["quant_fp8"]
    with pytest.raises(ValueError, match="Invalid syntax '"):
        CompilationConfig(custom_ops=malformed)


81
82
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
    """Load a model with VLLM_DISABLE_COMPILE_CACHE set and check the counter.

    The compilation counter is expected to report no updated cache entries
    and no saved compiled artifacts.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)

    compilation_config = {
        "cudagraph_mode": CUDAGraphMode.NONE,  # speed things up a bit
    }
    with (
        compilation_counter.expect(
            num_cache_entries_updated=0, num_compiled_artifacts_saved=0
        ),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config=compilation_config,
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


109
110
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
@pytest.mark.parametrize(
    "cudagraph_mode,num_cudagraph_captured",
    [
        (CUDAGraphMode.NONE, 0),
        (CUDAGraphMode.FULL_DECODE_ONLY, 1),
        (CUDAGraphMode.PIECEWISE, 13),
        (CUDAGraphMode.FULL_AND_PIECEWISE, 14),
    ],
)
def test_use_cudagraphs(
    vllm_runner, monkeypatch, cudagraph_mode, num_cudagraph_captured
):
    """Load a model under each cudagraph mode and check the capture counters.

    A single capture size (100) is configured; the expected number of
    captured cudagraphs per mode comes from the parametrization.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    compilation_config = {
        "cudagraph_capture_sizes": [100],
        "cudagraph_mode": cudagraph_mode,
    }
    # The runner is expected to trigger capture once unless cudagraphs are off.
    num_gpu_runner_capture_triggers = 1 if cudagraph_mode != CUDAGraphMode.NONE else 0
    with (
        compilation_counter.expect(
            num_graphs_seen=1,
            num_gpu_runner_capture_triggers=num_gpu_runner_capture_triggers,
            num_cudagraph_captured=num_cudagraph_captured,
        ),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config=compilation_config,
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass
145
146
147
148


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_stock_torch_compile(vllm_runner, monkeypatch):
    """Load a model in STOCK_TORCH_COMPILE mode.

    The compilation counter is expected to report stock_torch_compile_count=1.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        compilation_counter.expect(stock_torch_compile_count=1),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_no_compilation(vllm_runner, monkeypatch):
    """Load a model with CompilationMode.NONE.

    The compilation counter is expected to report no graphs seen and no
    stock torch.compile calls.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    with (
        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config={"mode": CompilationMode.NONE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_enforce_eager(vllm_runner, monkeypatch):
    """Load a model with enforce_eager=True.

    The compilation counter is expected to report no graphs seen and no
    stock torch.compile calls.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
        ) as _,
    ):
        pass
196
197
198
199
200


def test_splitting_ops_dynamic():
    """Check the `splitting_ops` / `cudagraph_mode` values VllmConfig ends up with.

    Scenarios, in order: default config, inductor graph partition with
    user-provided splitting ops, the attention+quant fusion pass, the fusion
    pass combined with explicit attention splitting ops (rejected), and the
    fusion pass combined with inductor graph partition.
    """
    # Default config
    config = VllmConfig()
    # The default config is expected to end up in FULL_AND_PIECEWISE cudagraph
    # mode, with `splitting_ops_contain_attention()` reporting true.
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
    assert config.compilation_config.splitting_ops_contain_attention()

    # When use_inductor_graph_partition=True
    config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            use_inductor_graph_partition=True,
            splitting_ops=["vllm::unified_attention"],
        )
    )
    # with inductor partition we use splitting_ops directly for
    # partition rules
    assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]

    # When attn_fusion pass enabled.
    config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        )
    )
    assert config.compilation_config.splitting_ops == []
    # cudagraph mode also falls back to FULL
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL

    # splitting_ops can not contain attention ops when attn_fusion
    # pass enabled.
    with pytest.raises(ValidationError):
        VllmConfig(
            compilation_config=CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
                pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
                custom_ops=["+quant_fp8"],
                cudagraph_mode=CUDAGraphMode.PIECEWISE,
                # work around for accessing all attention ops
                splitting_ops=CompilationConfig()._attention_ops,
            )
        )

    # When both use_inductor_graph_partition and attn_fusion pass enabled.
    config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            use_inductor_graph_partition=True,
            pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        )
    )
    # With inductor graph partition, attn_fusion and splitting_ops
    # work together. Default splitting_ops include attention ops.
    assert config.compilation_config.splitting_ops_contain_attention()
    # fuse_attn_quant is directly supported under
    # use_inductor_graph_partition=True, and cudagraph_mode
    # is unchanged.
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
262
263


264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
def test_moe_splitting_ops_deepep_ht_inductor_partition():
    """User-provided splitting_ops survive with DeepEP HT, dp=8 and inductor partition.

    The list passed in already names the attention op and both MoE forward
    ops; the resolved config must report exactly that list.
    """
    # Kept as a tuple so the expectation cannot be affected if the config
    # were to modify the list it receives.
    expected_ops = (
        "vllm::unified_attention",
        "vllm::moe_forward",
        "vllm::moe_forward_shared",
    )

    parallel = ParallelConfig(
        all2all_backend="deepep_high_throughput",
        data_parallel_size=8,
    )
    compilation = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        use_inductor_graph_partition=True,
        splitting_ops=list(expected_ops),
    )
    config = VllmConfig(parallel_config=parallel, compilation_config=compilation)

    assert config.compilation_config.splitting_ops == list(expected_ops)


290
def test_should_split():
    """Check `should_split` name matching for aten ops and a custom op.

    Each case builds a single `call_function` FX node and asks whether it
    matches a list of op names, given either as an overload packet
    ("ns::op") or as a specific overload ("ns::op.overload").
    """
    import torch

    from vllm.compilation.partition_rules import should_split

    graph = torch.fx.Graph()
    node = torch.fx.Node(
        graph=graph,
        name="dummy_node",
        op="call_function",
        target=torch.ops.aten.add.default,
        args=(),
        kwargs={},
    )

    # supports OpOverloadPacket
    splitting_ops = ["aten::add"]
    assert should_split(node, splitting_ops)

    # supports OpOverload
    splitting_ops = ["aten::add.default"]
    assert should_split(node, splitting_ops)

    # a different overload of the same op must not match
    splitting_ops = ["aten::add.Tensor"]
    assert not should_split(node, splitting_ops)

    q, k, v, out = [torch.randn(1)] * 4

    # supports custom ops as OpOverloadPacket
    node = torch.fx.Node(
        graph=graph,
        name="dummy_node",
        op="call_function",
        target=torch.ops.silly.attention,
        args=(q, k, v, out),
        kwargs={},
    )

    splitting_ops = ["silly::attention"]
    assert should_split(node, splitting_ops)

    # supports custom ops as OpOverload
    node = torch.fx.Node(
        graph=graph,
        name="dummy_node",
        op="call_function",
        target=torch.ops.silly.attention.default,
        args=(q, k, v, out),
        kwargs={},
    )

    splitting_ops = ["silly::attention"]
    assert should_split(node, splitting_ops)

    splitting_ops = ["silly::attention.default"]
    assert should_split(node, splitting_ops)
347
348
349
350
351
352
353
354
355
356
357


@pytest.mark.skipif(
    not current_platform.support_static_graph_mode(),
    reason="Skip if not cudagraph mode supported",
)
@pytest.mark.parametrize(
    (
        "cudagraph_capture_sizes",
        "max_cudagraph_capture_size",
        "tp_size",
        "enable_sp",
        "max_num_batched_tokens",
        "cudagraph_mode",
        "expected_max_size",
    ),
    [
        (None, None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
        ([1, 2, 4], 4, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
        (
            [1, 2, 4],
            8,
            1,
            False,
            2048,
            CUDAGraphMode.FULL_AND_PIECEWISE,
            ValidationError,
        ),
        ([1, 256], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
        ([], None, 1, False, 2048, CUDAGraphMode.NONE, 0),
        (None, 0, 1, False, 2048, CUDAGraphMode.NONE, 0),
        # truncated to nearest multiple of 8 or 16
        (None, 257, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
        # max from list
        ([1, 2, 4, 15], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 15),
        # filtered out 15 due to SP
        ([1, 2, 4, 15], None, 2, True, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
        # limited by the max_tokens
        ([1, 2, 4, 15], None, 1, False, 8, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
        # the list should contain at least 1 element when use cudagraph
        ([], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
        # the max capturing size should be >= 1 when use cudagraph
        (None, 0, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
    ],
)
def test_cudagraph_sizes_post_init(
    cudagraph_capture_sizes,
    max_cudagraph_capture_size,
    tp_size,
    enable_sp,
    max_num_batched_tokens,
    cudagraph_mode,
    expected_max_size,
):
    """Check the resolved `max_cudagraph_capture_size` of an engine config.

    `expected_max_size` is either the expected integer, or the
    `ValidationError` class when building the config is expected to fail.
    """
    # The expectation is either an int or the exception class itself, so
    # an identity check tells the two apart.
    if expected_max_size is ValidationError:
        ctx = pytest.raises(expected_max_size)
    else:
        ctx = nullcontext()

    with (
        ctx,
        # Report `tp_size` devices so the requested tensor parallel size
        # is accepted regardless of the host.
        patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
    ):
        compilation_config = CompilationConfig(
            cudagraph_capture_sizes=cudagraph_capture_sizes,
            max_cudagraph_capture_size=max_cudagraph_capture_size,
            pass_config=PassConfig(
                enable_sp=enable_sp,
                fuse_norm_quant=True,
                fuse_act_quant=True,
                eliminate_noops=True,
            ),
            cudagraph_mode=cudagraph_mode,
        )
        engine_args = EngineArgs(
            model="facebook/opt-125m",
            tensor_parallel_size=tp_size,
            max_num_seqs=min(max_num_batched_tokens, 128),
            max_num_batched_tokens=max_num_batched_tokens,
            compilation_config=compilation_config,
        )
        vllm_config = engine_args.create_engine_config()

        assert (
            vllm_config.compilation_config.max_cudagraph_capture_size
            == expected_max_size
        )
433
434


435
def test_cached_compilation_config(default_vllm_config):
    """Check that `set_current_vllm_config` supersedes a cached compilation config.

    The default compilation config is cached first; a config enabling the
    `+quant_fp8` custom op is then made current, and the code generated for a
    compiled `QuantFP8` layer must call the custom quantization op.
    """
    import torch
    from torch._inductor.utils import run_and_get_code

    from vllm.config import get_cached_compilation_config, set_current_vllm_config
    from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
    from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape

    dtype = torch.bfloat16
    device = torch.device("cuda:0")
    batch_size, num_qo_heads, head_size = 8, 16, 128

    # access and cache default compilation config
    # default compilation config does not contain +quant_fp8 custom op. If this is
    # used, the generated code would use inductor-generated triton kernel instead
    # of the custom op `torch.ops._C.static_scaled_fp8_quant`.
    get_cached_compilation_config()

    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=["+quant_fp8"],
        )
    )

    # set_current_vllm_config should clear cached compilation config and
    # use the new compilation_config in vllm_config
    with set_current_vllm_config(vllm_config):
        query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR)
        query_quant = torch.compile(query_quant)

        _q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
        query = torch.randn(
            batch_size, num_qo_heads * head_size, dtype=dtype, device=device
        )

        _, code = run_and_get_code(query_quant, query, _q_scale)

    # Join the generated sources so a single substring search covers them.
    code = " ".join(code)
    assert "torch.ops._C.static_scaled_fp8_quant.default(" in code