# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from functools import partial
5
6
7
8
9
from unittest.mock import patch

import pytest

from vllm import LLM
10
from vllm.utils.mem_constants import GiB_bytes
11
12
13
14
from vllm.v1.core.kv_cache_utils import (
    generate_scheduler_kv_cache_config,
    get_kv_cache_configs,
)
15
from vllm.v1.engine.core import EngineCore as V1EngineCore
16

17
from ..utils import create_new_process_for_each_test
18
19
20
21
22
23
from .registry import (
    _TRANSFORMERS_BACKEND_MODELS,
    AUTO_EXAMPLE_MODELS,
    HF_EXAMPLE_MODELS,
    HfExampleModels,
)
24
from .utils import dummy_hf_overrides
25

26
27
28
29
30
31
32
# This minimal list of model architectures is smaller than the total list of
# supported models. The intention is that in the "typical" regression testing
# scenario, we only test initializing these models. This subset was chosen
# to include representative examples of model varieties/workloads (conditional
# generation, sequence classification, causal LM, ranking, chat, reward model,
# multimodal, geospatial, voice, embedding, MTP).
#
# NOTE: the order of the entries is the order in which the parametrized
# "small subset" test cases are generated, so keep it stable.
MINIMAL_MODEL_ARCH_LIST = [
    "LlavaForConditionalGeneration",
    "Llama4ForConditionalGeneration",
    "BertForSequenceClassification",
    "Gemma3nForCausalLM",
    "JinaVLForRanking",
    "InternVLChatModel",
    "InternLM2ForRewardModel",
    "TransformersMultiModalForCausalLM",
    "PrithviGeoSpatialMAE",
    "UltravoxModel",
    "DeepSeekMTPModel",
    "XLMRobertaModel",
]

# This list is the complement of the minimal list above. The intention is that
# this list of models is only tested in a "special case" i.e. most PRs should
# not test these models.
#
# The set difference is sorted before use: iteration order of a set of strings
# changes between interpreter runs (hash randomization), and this value feeds
# `pytest.mark.parametrize`, where a stable order keeps test IDs/ordering
# reproducible across runs and across parallel workers.
# NOTE(review): assumes the architecture names are plain strings (they are
# passed to tests annotated `model_arch: str`) - confirm against the registry.
OTHER_MODEL_ARCH_LIST = sorted(
    set(HF_EXAMPLE_MODELS.get_supported_archs()) - set(MINIMAL_MODEL_ARCH_LIST)
)
53

54

55
@create_new_process_for_each_test()
def can_initialize(
    model_arch: str, monkeypatch: pytest.MonkeyPatch, EXAMPLE_MODELS: HfExampleModels
):
    """Check that ``LLM`` can be constructed for the example model of an arch.

    The model info for ``model_arch`` is looked up in ``EXAMPLE_MODELS`` and
    an ``LLM`` is built with ``load_format="dummy"`` and the HF overrides
    produced by ``dummy_hf_overrides``. While the ``LLM`` is being built,
    ``EngineCore._initialize_kv_caches`` is replaced by a local stub so that
    ``model.forward()`` is not called.

    The reason for using create_new_process_for_each_test is to avoid
    the WARNING:
        "We must use the 'spawn' multiprocessing start method. Overriding
        VLLM_WORKER_MULTIPROC_METHOD to 'spawn'."
    The spawn process causes the _initialize_kv_caches_v1 function below to
    become ineffective.

    Args:
        model_arch: Architecture name to look up in ``EXAMPLE_MODELS``.
        monkeypatch: pytest fixture; used for environment changes that are
            scoped to the ``LLM`` construction.
        EXAMPLE_MODELS: Registry that provides the per-architecture model
            info (default checkpoint, tokenizer, dtype, ...).

    The test is skipped (via ``pytest.skip`` or the ``on_fail="skip"`` checks)
    for architectures that cannot be exercised in the current environment.
    """
    model_info = EXAMPLE_MODELS.get_hf_info(model_arch)
    # Both checks are asked to skip the test rather than fail it.
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(
        on_fail="skip",
        check_max_version=False,
        check_version_reason="vllm",
    )

    # `use_original_num_layers` is read defensively: not every model info
    # object is guaranteed to carry it, so default to False.
    hf_overrides_fn = partial(
        dummy_hf_overrides,
        model_arch=model_arch,
        exist_overrides=model_info.hf_overrides,
        use_original_num_layers=getattr(model_info, "use_original_num_layers", False),
    )

    # Avoid calling model.forward()
    def _initialize_kv_caches_v1(self, vllm_config):
        """Stand-in for ``EngineCore._initialize_kv_caches``.

        Builds the KV-cache configs from the executor's specs using a fixed
        10 GiB memory figure, then writes the resulting block count and
        (when any cache group exists) the smallest group block size into
        ``vllm_config.cache_config``.
        """
        kv_cache_specs = self.model_executor.get_kv_cache_specs()
        kv_cache_configs = get_kv_cache_configs(
            vllm_config,
            kv_cache_specs,
            [10 * GiB_bytes],
        )
        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)

        vllm_config.cache_config.num_gpu_blocks = scheduler_kv_cache_config.num_blocks
        kv_cache_groups = scheduler_kv_cache_config.kv_cache_groups
        if kv_cache_groups:
            vllm_config.cache_config.block_size = min(
                g.kv_cache_spec.block_size for g in kv_cache_groups
            )

        vllm_config.validate_block_size()
        return scheduler_kv_cache_config

    if model_arch == "MiniMaxVL01ForConditionalGeneration":
        pytest.skip(
            "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
        )

    if model_arch == "MoonshotKimiaForCausalLM":
        pytest.skip(
            "Kimi-Audio requires SpeechToTextConfig "
            "which is not configured in test environment"
        )

    if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
        # Imported lazily: only these architectures need the device query.
        from vllm.platforms import current_platform

        capability = current_platform.get_device_capability()
        if capability and capability.major < 9:
            pytest.skip(
                "DeepseekV32 requires Hopper (9.0+) or Blackwell (10.0+) "
                "for FLASHMLA_SPARSE backend. Current device has compute "
                f"capability {capability.major}.{capability.minor}"
            )

    with (
        patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
        monkeypatch.context() as m,
    ):
        # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
        # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
        # L4 supports FA3.
        # Step1ForCausalLM requires TRITON_ATTN for use_alibi_sqrt support.
        attention_config = (
            {"backend": "TRITON_ATTN"}
            if model_arch in ("GptOssForCausalLM", "Step1ForCausalLM")
            else None
        )
        if model_arch == "WhisperForConditionalGeneration":
            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

        # Only pass `enable_prefix_caching` when it has to be turned off;
        # otherwise leave the LLM default untouched.
        kwargs = {}
        if not model_info.enable_prefix_caching:
            kwargs["enable_prefix_caching"] = False

        # Speculative decoding is configured only when the model info names
        # a speculative model.
        speculative_config = (
            {
                "model": model_info.speculative_model,
                "method": model_info.speculative_method,
                "num_speculative_tokens": 1,
            }
            if model_info.speculative_model
            else None
        )
        model_impl = (
            "transformers" if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm"
        )

        LLM(
            model_info.default,
            tokenizer=model_info.tokenizer,
            tokenizer_mode=model_info.tokenizer_mode,
            revision=model_info.revision,
            enforce_eager=model_info.enforce_eager,
            skip_tokenizer_init=model_info.require_embed_inputs,
            enable_prompt_embeds=model_info.require_embed_inputs,
            enable_mm_embeds=model_info.require_embed_inputs,
            dtype=model_info.dtype,
            speculative_config=speculative_config,
            trust_remote_code=model_info.trust_remote_code,
            max_model_len=model_info.max_model_len,
            max_num_batched_tokens=model_info.max_num_batched_tokens,
            # these tests seem to produce leftover memory
            gpu_memory_utilization=0.80,
            load_format="dummy",
            model_impl=model_impl,
            hf_overrides=hf_overrides_fn,
            max_num_seqs=model_info.max_num_seqs,
            attention_config=attention_config,
            **kwargs,
        )
174
175


176
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
def test_can_initialize_small_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
    """Test initializing small subset of supported models

    Runs once per architecture in ``MINIMAL_MODEL_ARCH_LIST``, looking the
    model info up in ``HF_EXAMPLE_MODELS``.
    """
    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)


@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
def test_can_initialize_large_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
    """Test initializing large subset of supported models

    This test covers the complement of the tests covered in the "small subset"
    test. Runs once per architecture in ``OTHER_MODEL_ARCH_LIST``, looking the
    model info up in ``HF_EXAMPLE_MODELS``.
    """
    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)


192
193
@pytest.mark.parametrize("model_arch", AUTO_EXAMPLE_MODELS.get_supported_archs())
def test_implicit_converted_models(model_arch: str, monkeypatch: pytest.MonkeyPatch):
    """Test initializing the architectures registered in AUTO_EXAMPLE_MODELS

    Runs once per architecture reported by
    ``AUTO_EXAMPLE_MODELS.get_supported_archs()`` and uses that same registry
    for the model info lookup.
    """
    can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)