# test_config.py 7.36 KB
# Newer Older
1
2
3
4
5
6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

import vllm
from vllm.compilation.counter import compilation_counter
7
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
8
9
10
11
from vllm.utils import _is_torch_equal_or_newer


def test_version():
    """Check ``_is_torch_equal_or_newer`` against the target ``"2.8.0.dev"``.

    A dated dev build, an alpha build with a git hash, the plain release and
    a later patch release must all compare as equal-or-newer, while
    ``"2.7.1"`` must not.
    """
    assert _is_torch_equal_or_newer("2.8.0.dev20250624+cu128", "2.8.0.dev")
    assert _is_torch_equal_or_newer("2.8.0a0+gitc82a174", "2.8.0.dev")
    assert _is_torch_equal_or_newer("2.8.0", "2.8.0.dev")
    assert _is_torch_equal_or_newer("2.8.1", "2.8.0.dev")
    assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
17
18


19
20
21
22
23
def test_use_cudagraphs_dynamic(monkeypatch):
    """Check that the default ``use_cudagraph`` follows ``VLLM_USE_V1``.

    The test requires V1 to be active on entry. A default ``VllmConfig`` is
    then expected to have ``use_cudagraph`` set; once ``VLLM_USE_V1`` is set
    to ``"0"``, a freshly built default config is expected to have it unset.

    Args:
        monkeypatch: pytest fixture used to set the environment variable.
    """
    assert vllm.envs.VLLM_USE_V1
    vllm_config = VllmConfig()
    assert vllm_config.compilation_config.use_cudagraph

    monkeypatch.setenv("VLLM_USE_V1", "0")
    # Build a new config so the changed environment is taken into account.
    vllm_config = VllmConfig()
    assert not vllm_config.compilation_config.use_cudagraph


29
30
31
32
33
34
35
36
def test_custom_op():
    """Check how ``CompilationConfig`` validates ``custom_ops`` entries."""
    # proper syntax: both entries carry a "+" or "-" prefix
    _ = CompilationConfig(custom_ops=["+quant_fp8", "-silu_and_mul"])

    # an entry without a prefix must be rejected with a ValueError
    with pytest.raises(ValueError, match="Invalid syntax '"):
        _ = CompilationConfig(custom_ops=["quant_fp8"])


37
38
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
    """Check that no cache activity is counted when the compile cache is off.

    With ``VLLM_DISABLE_COMPILE_CACHE`` set to ``val``, loading the model is
    expected to leave ``num_cache_entries_updated`` and
    ``num_compiled_artifacts_saved`` at zero.

    Args:
        vllm_runner: fixture used as a context manager to load the model.
        monkeypatch: pytest fixture used to set environment variables.
        val: value assigned to ``VLLM_DISABLE_COMPILE_CACHE``.
    """
    assert vllm.envs.VLLM_USE_V1

    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)

    compilation_config = {
        "use_cudagraph": False,  # speed things up a bit
    }
    with (
        compilation_counter.expect(
            num_cache_entries_updated=0, num_compiled_artifacts_saved=0
        ),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config=compilation_config,
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


67
68
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
    """Check the CUDA graph counters with ``use_cudagraph`` on and off.

    One graph is expected to be seen in both cases. Capture triggers and
    captured graphs are expected only when ``enabled`` is true.

    Args:
        vllm_runner: fixture used as a context manager to load the model.
        monkeypatch: pytest fixture used to set environment variables.
        enabled: value passed as ``use_cudagraph`` in the compilation config.
    """
    assert vllm.envs.VLLM_USE_V1

    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    compilation_config = {
        "cudagraph_capture_sizes": [100],
        "use_cudagraph": enabled,
    }
    with (
        compilation_counter.expect(
            num_graphs_seen=1,
            num_gpu_runner_capture_triggers=1 if enabled else 0,
            # NOTE(review): 13 is presumably the number of graph pieces for
            # this model times the single capture size above - confirm.
            num_cudagraph_captured=13 if enabled else 0,
        ),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config=compilation_config,
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass
94
95
96
97
98
99


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_dynamo_as_is(vllm_runner, monkeypatch):
    """Check that compilation level 1 bumps ``dynamo_as_is_count`` once.

    Args:
        vllm_runner: fixture used as a context manager to load the model.
        monkeypatch: pytest fixture used to set environment variables.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        compilation_counter.expect(dynamo_as_is_count=1),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config={"level": 1},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_no_compilation(vllm_runner, monkeypatch):
    """Check that compilation level 0 leaves the compilation counters at zero.

    Both ``num_graphs_seen`` and ``dynamo_as_is_count`` are expected to stay
    unchanged while the model is loaded.

    Args:
        vllm_runner: fixture used as a context manager to load the model.
        monkeypatch: pytest fixture used to set environment variables.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    with (
        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config={"level": 0},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_enforce_eager(vllm_runner, monkeypatch):
    """Check that ``enforce_eager=True`` leaves the compilation counters at zero.

    Both ``num_graphs_seen`` and ``dynamo_as_is_count`` are expected to stay
    unchanged while the model is loaded.

    Args:
        vllm_runner: fixture used as a context manager to load the model.
        monkeypatch: pytest fixture used to set environment variables.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
        ) as _,
    ):
        pass
145
146
147
148
149


def test_splitting_ops_dynamic():
    """Check how ``splitting_ops`` and ``cudagraph_mode`` are resolved.

    Scenarios covered, in order:

    * default config: ``FULL_AND_PIECEWISE`` mode and splitting ops that
      contain attention;
    * ``use_inductor_graph_partition=True`` (torch >= 2.9 only): the
      user-supplied ``splitting_ops`` end up empty;
    * attention fusion pass with ``PIECEWISE`` requested: ``splitting_ops``
      end up empty and the mode becomes ``FULL``;
    * attention fusion pass with attention ops passed as ``splitting_ops``:
      an ``AssertionError`` is raised;
    * graph partition plus attention fusion (torch >= 2.9 only):
      ``splitting_ops`` end up empty and the mode stays ``PIECEWISE``.
    """
    # Default config
    config = VllmConfig()
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
    assert config.compilation_config.splitting_ops_contain_attention()

    # When use_inductor_graph_partition=True
    if _is_torch_equal_or_newer("2.9.0.dev"):
        # inductor graph partition is only available in PyTorch 2.9+.
        # this is a fast config check so we are not using pytest.skip.
        config = VllmConfig(
            compilation_config=CompilationConfig(
                use_inductor_graph_partition=True, splitting_ops=["silly_attention"]
            )
        )
        # should ignore splitting_ops
        assert config.compilation_config.splitting_ops == []

    # When attn_fusion pass enabled.
    config = VllmConfig(
        compilation_config=CompilationConfig(
            pass_config={"enable_attn_fusion": True, "enable_noop": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        )
    )
    assert config.compilation_config.splitting_ops == []
    # cudagraph mode also falls back to FULL
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL

    # splitting_ops can not contain attention ops when attn_fusion
    # pass enabled.
    with pytest.raises(AssertionError):
        # Only the raised error matters here, so the result is discarded.
        _ = VllmConfig(
            compilation_config=CompilationConfig(
                pass_config={"enable_attn_fusion": True, "enable_noop": True},
                custom_ops=["+quant_fp8"],
                cudagraph_mode=CUDAGraphMode.PIECEWISE,
                # workaround for accessing all attention ops
                splitting_ops=CompilationConfig()._attention_ops,
            )
        )

    # When both use_inductor_graph_partition and attn_fusion pass enabled.
    if _is_torch_equal_or_newer("2.9.0.dev"):
        config = VllmConfig(
            compilation_config=CompilationConfig(
                use_inductor_graph_partition=True,
                pass_config={"enable_attn_fusion": True, "enable_noop": True},
                custom_ops=["+quant_fp8"],
                cudagraph_mode=CUDAGraphMode.PIECEWISE,
            )
        )
        assert config.compilation_config.splitting_ops == []
        # enable_attn_fusion is directly supported under
        # use_inductor_graph_partition=True, and cudagraph_mode
        # is unchanged.
        assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE