"lib/kvbm-engine/src/collectives/stub.rs" did not exist on "cf79c4fc8fa43aa9391fce7584e9f16012633211"
test_full_cudagraph.py 5.49 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
import contextlib
import os
5
import weakref
6
7
8

import pytest

9
from tests.utils import wait_for_gpu_memory_to_clear
10
from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
11
12
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig
13
from vllm.platforms import current_platform
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


@contextlib.contextmanager
def temporary_environ(env_vars):
    """
    Context manager that applies ``env_vars`` to ``os.environ`` for the
    duration of the ``with`` body and puts the previous state back on exit.

    Variables that had no value beforehand are removed again; variables
    that did have a value get it restored. Restoration also happens when
    the body raises.

    This is used instead of pytest's monkeypatch because monkeypatch
    doesn't work with "module" scoped fixtures.
    """
    # Snapshot the prior value (None == "was not set") of every key we are
    # about to overwrite.
    saved = {}
    for name in env_vars:
        saved[name] = os.environ.get(name)

    try:
        os.environ.update(env_vars)
        yield
    finally:
        for name, previous in saved.items():
            if previous is not None:
                os.environ[name] = previous
            else:
                # The variable did not exist before; make sure it is gone.
                os.environ.pop(name, None)


35
36
37
# Parameter sets for the class-scoped ``llm_pair`` fixture. Each entry wraps a
# single ``(model_name, backend_config)`` tuple, which the fixture unpacks
# from ``request.param``.

# deepseek-ai/DeepSeek-V2-Lite with MLA
MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
test_params_full_cudagraph = [
    pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[name]))
    for name in MLA_backends
]

# Qwen/Qwen2-1.5B-Instruct with other backends
other_backend_configs = [
    backend_configs[name] for name in backend_configs
    if name not in MLA_backends
]
test_params_full_cudagraph.extend(
    pytest.param(("Qwen/Qwen2-1.5B-Instruct", config))
    for config in other_backend_configs)


@pytest.fixture(scope="class")
def llm_pair(request):
    """
    Build the pair of engines compared by the tests: one LLM using the
    backend's full-cudagraph compilation config and one using
    ``cudagraph_mode="PIECEWISE"`` as the reference.

    ``request.param`` is a ``(model, backend_config)`` tuple. From
    ``backend_config`` this fixture reads ``specific_gpu_arch`` (required
    device capability, falsy when there is no requirement), ``env_vars``
    (extra environment variables) and ``comp_config`` (keyword arguments
    for ``CompilationConfig``).

    Yields:
        ``(full, piecewise)`` as ``weakref.proxy`` objects.

    After the last test of the class, both engines are dropped and the
    fixture calls ``wait_for_gpu_memory_to_clear`` for device 0 before
    the next parametrization starts.
    """
    model, backend_config = request.param

    # Dynamically skip test if GPU capability is not met
    required_arch = backend_config.specific_gpu_arch
    if required_arch and \
        required_arch != current_platform.get_device_capability():
        if required_arch == (9, 0):
            pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
        elif required_arch == (10, 0):
            pytest.skip("Only Blackwell GPUs support Cutlass MLA")
        else:
            # Any other unmet requirement must also skip; otherwise the
            # test would go on to run on hardware the backend does not
            # target.
            pytest.skip(f"Requires GPU capability {required_arch}")

    env_vars = {
        "VLLM_USE_V1": "1",
        # Force native sampler to avoid potential nondeterminism in FlashInfer
        # when per-request generators are not used in V1.
        "VLLM_USE_FLASHINFER_SAMPLER": "0",
        **backend_config.env_vars,
    }

    # Everything except the compilation config is identical for both
    # engines, so that cudagraph mode is the only difference under test.
    common_kwargs = dict(
        model=model,
        gpu_memory_utilization=0.43,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=128,
        generation_config="vllm",
        seed=42,
    )

    with temporary_environ(env_vars):
        full = LLM(
            compilation_config=CompilationConfig(
                **backend_config.comp_config),
            **common_kwargs,
        )
        piecewise = LLM(
            compilation_config=CompilationConfig(cudagraph_mode="PIECEWISE"),
            **common_kwargs,
        )

    # PyTest caches the fixture values so we use weakref.proxy to enable GC
    yield weakref.proxy(full), weakref.proxy(piecewise)
    del full
    del piecewise

    wait_for_gpu_memory_to_clear(
        devices=[0],
        threshold_ratio=0.1,
    )


106
@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True)
class TestFullCUDAGraph:
    """
    Use a class such that an llm pair is constructed once for all
    batch_size/max_tokens combinations and released immediately after.

    Module-scope fixtures would stick around the whole time,
    meaning there would be multiple LLM instances hogging memory simultaneously.
    """

    @pytest.mark.parametrize(("batch_size", "max_tokens"), [
        (1, 10),
        (7, 10),
        (16, 10),
        (25, 10),
        (32, 10),
        (45, 10),
        (64, 10),
        (123, 10),
        (8, 5),
        (8, 30),
    ])
    def test_full_cudagraph(self, batch_size, max_tokens,
                            llm_pair: tuple[LLM, LLM]):
        """
        Test various batch sizes and max_tokens to ensure that the
        full cudagraph compilation works for padded cases too.

        The same prompts are generated with the piecewise engine and the
        full-cudagraph engine, and the output texts are compared
        pairwise, ignoring case.
        """
        full_cudagraph_llm, piecewise_llm = llm_pair

        prompts = ["the quick brown fox"] * batch_size
        # Use purely greedy decoding to avoid top-p truncation sensitivity
        # that can amplify tiny numeric differences across runtimes.
        sampling_params = SamplingParams(temperature=0.0,
                                         max_tokens=max_tokens,
                                         top_p=1.0)

        piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
        full_responses = full_cudagraph_llm.generate(prompts, sampling_params)

        # zip() stops at the shorter sequence, so check the lengths first;
        # otherwise a missing response would go unnoticed.
        assert len(piecewise_responses) == len(full_responses)

        # Check that all responses are the same
        for piecewise_res, full_res in zip(piecewise_responses,
                                           full_responses):
            assert piecewise_res.outputs[0].text.lower() == \
                full_res.outputs[0].text.lower()
152
153


154
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_full_cudagraph_with_invalid_backend():
    """
    Constructing an LLM with ``cudagraph_mode="FULL"`` while the attention
    backend is forced to FLEX_ATTENTION is expected to raise RuntimeError.
    """
    env_vars = {
        "VLLM_USE_V1": "1",
        # Flex_Attention is not supported with full cuda graph
        "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
    }
    with temporary_environ(env_vars), pytest.raises(RuntimeError):
        LLM(model="Qwen/Qwen2-1.5B-Instruct",
            compilation_config=CompilationConfig(cudagraph_mode="FULL"))