# test_pipeline_parallel.py
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
10
import json
11
import os
12
from dataclasses import dataclass
13
from typing import Literal, NamedTuple, Optional
14

15
16
import pytest

17
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
18
from vllm.logger import init_logger
19
from vllm.transformers_utils.config import get_config
20

21
from ..models.registry import HF_EXAMPLE_MODELS
22
from ..utils import compare_two_settings, create_new_process_for_each_test
23

24
25
logger = init_logger("test_pipeline_parallel")

26
27
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

28

29
30
31
32
33
34
class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by a test case.

    Being a ``NamedTuple``, an instance can be unpacked positionally as
    ``(tp_size, pp_size, eager_mode)``.
    """

    # Tensor-parallel degree.
    tp_size: int
    # Pipeline-parallel degree.
    pp_size: int
    # Whether the run is launched in eager mode.
    eager_mode: bool


35
36
class PPTestOptions(NamedTuple):
    """Per-model options that control how a test case is run.

    Being a ``NamedTuple``, an instance can be unpacked positionally as
    ``(multi_node_only, load_format)``.
    """

    # When True, the test is skipped unless running in multi-node mode.
    multi_node_only: bool
    # Value forwarded as ``--load-format``; ``None`` omits the flag.
    load_format: Optional[str] = None
38
39


40
41
@dataclass
class PPTestSettings:
    """Test matrix for a single model.

    Every entry of ``parallel_setups`` is combined with every entry of
    ``distributed_backends`` by :meth:`iter_params`.
    """

    # Parallelism configurations to exercise.
    parallel_setups: list[ParallelSetup]
    # Values passed as ``--distributed-executor-backend``.
    distributed_backends: list[str]
    # Runner passed as ``--runner`` unless it is "auto".
    runner: RunnerOption
    # Options shared by every generated test case.
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the exhaustive matrix.

        Five parallel setups (base and doubled TP/PP sizes, with and without
        eager mode) are crossed with both the "mp" and "ray" backends.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True),
            ],
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the minimal matrix.

        A single eager-mode setup at the base TP/PP sizes, run with the "mp"
        backend only.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True),
            ],
            distributed_backends=["mp"],
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    def iter_params(self, model_id: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is ``(model_id, parallel_setup, backend, runner,
        test_options)``, matching the argument names used by the
        ``pytest.mark.parametrize`` decorators in this module.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend in self.distributed_backends:
                yield (model_id, parallel_setup, backend, self.runner, opts)
108
109


110
111
112
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

113
# yapf: disable
114
115
# Maps a model ID to the PP test matrix used for text generation.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "zai-org/chatglm3-6b": PPTestSettings.fast(),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "pfnet/plamo-2-1b": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
    # Tests TransformersForCausalLM
    "hmellor/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-only]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

178
179
# Maps a model ID to the PP test matrix used for embedding (pooling runner).
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
        load_format="dummy", runner="pooling"
    ),
}

187
188
# Maps a model ID to the PP test matrix used for multimodal generation.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "zai-org/glm-4v-9b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
    "AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(),
    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}
# yapf: enable

211
# NOTE: You can update this on your local machine to run specific tests
212
# Only models listed here are parametrized into the tests below; entries of
# the *_MODELS dicts that are not listed are filtered out at collection time.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "hmellor/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
    "deepseek-ai/DeepSeek-V2-Lite-Chat",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
]


231
def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Compare a pipeline-parallel run against a tensor-parallel-only run.

    Two argument lists are built from a shared base: one with
    ``--pipeline-parallel-size`` and the requested backend, one with only
    ``--tensor-parallel-size`` on the "mp" backend. Both are handed to
    ``compare_two_settings``.

    Args:
        model_id: Model to look up in ``HF_EXAMPLE_MODELS`` and to run.
        parallel_setup: TP size, PP size and eager-mode flag.
        distributed_backend: Executor backend for the PP run.
        runner: Passed as ``--runner`` unless it is "auto".
        test_options: Multi-node restriction and load format.
        num_gpus_available: GPU count; the test is skipped when it is
            smaller than ``tp_size * pp_size``.
        method: Forwarded to ``compare_two_settings``.
        is_multimodal: When dummy weights are used, selects whether the
            size-reducing overrides are nested under ``text_config``.
    """
    tp_size, pp_size, eager_mode = parallel_setup
    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Work on a copy: the overrides are extended below, and the dict held by
    # the registry entry must not be modified as a side effect.
    hf_overrides = dict(model_info.hf_overrides or {})
    hf_config = get_config(model_id, trust_remote_code)
    skip_tokenizer_init = model_info.skip_tokenizer_init
    max_num_seqs = model_info.max_num_seqs

    dtype = "float16"
    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
        dtype = "bfloat16"

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
    if skip_tokenizer_init:
        common_args.append("--skip-tokenizer-init")
    if max_num_seqs:
        # NOTE(review): this repeats --max-num-seqs after the default "8"
        # above; presumably the later occurrence takes effect - confirm.
        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])

    if distributed_backend == "ray":
        # For V1, test Ray Compiled Graph for all the tests
        pp_env = {
            "VLLM_USE_V1": "1",
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
    elif distributed_backend == "mp":
        pp_env = {
            "VLLM_USE_V1": "1",
        }
    else:
        pp_env = None

    tp_env = {
        "VLLM_USE_V1": "1",
    }

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    compare_two_settings(model_id,
                         pp_args,
                         tp_args,
                         pp_env,
                         tp_env,
                         method=method)
366
367
368


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner",
     "test_options"),
    [
        # Filter on the model ID before expanding its parameter matrix.
        params for model_id, settings in TEXT_GENERATION_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Compare PP against TP-only output for text generation models.

    Currently skipped unconditionally; the comparison below is kept so the
    test can be re-enabled by removing the skip.
    """
    pytest.skip("Skipping the test until V1 passes it.")
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                runner,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=False)
394
395
396


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner",
     "test_options"),
    [
        # Filter on the model ID before expanding its parameter matrix.
        params for model_id, settings in EMBEDDING_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Compare PP against TP-only output for embedding models.

    Currently skipped unconditionally; the comparison below is kept so the
    test can be re-enabled by removing the skip.
    """
    pytest.skip("Skipping the test until V1 passes it.")
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                runner,
                test_options,
                num_gpus_available,
                method="encode",
                is_multimodal=False)
422
423
424


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner",
     "test_options"),
    [
        # Filter on the model ID before expanding its parameter matrix.
        params for model_id, settings in MULTIMODAL_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Compare PP against TP-only output for multimodal models.

    Currently skipped unconditionally; the comparison below is kept so the
    test can be re-enabled by removing the skip.
    """
    pytest.skip("Skipping the test until V1 passes it.")
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                runner,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=True)