test_pipeline_parallel.py 15.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
10
import json
11
import os
12
from dataclasses import dataclass
13
from typing import Literal, NamedTuple, Optional
14

15
16
import pytest

17
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
18
from vllm.logger import init_logger
19
from vllm.transformers_utils.config import get_config
20

21
from ..models.registry import HF_EXAMPLE_MODELS
22
from ..utils import compare_two_settings, create_new_process_for_each_test
23

24
25
# Module-level logger for this test file.
logger = init_logger("test_pipeline_parallel")

# True only when the environment variable VLLM_MULTI_NODE is exactly "1";
# an unset variable falls back to "0".
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"

28

29
30
31
32
33
34
class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by a test case."""

    # Value passed to `--tensor-parallel-size`.
    tp_size: int
    # Value passed to `--pipeline-parallel-size`.
    pp_size: int
    # When True, `--enforce-eager` is added to the server arguments.
    eager_mode: bool


35
36
class PPTestOptions(NamedTuple):
    """Model-level options shared by every parallel setup of one model."""

    # When True, the case is skipped unless VLLM_MULTI_NODE is enabled.
    multi_node_only: bool
    # Value forwarded to `--load-format`; None leaves the flag unset.
    load_format: Optional[str] = None
38
39


40
41
@dataclass
class PPTestSettings:
    """Parallel setups, backends and options to exercise for one model.

    Attributes:
        parallel_setups: TP/PP/eager combinations to run.
        distributed_backends: Executor backends each setup is run with.
        runner: Runner option; forwarded as `--runner` unless it is "auto".
        test_options: Model-level options shared by all generated cases.
    """

    parallel_setups: list[ParallelSetup]
    distributed_backends: list[str]
    runner: RunnerOption
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the exhaustive settings.

        Produces five setups derived from ``tp_base`` / ``pp_base`` (base
        sizes, doubled PP with and without eager mode, doubled TP with and
        without eager mode), each run on both the "mp" and "ray" backends.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True),
            ],
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the minimal settings.

        Produces a single eager-mode setup at the base TP/PP sizes, run on
        the "mp" backend only.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True),
            ],
            distributed_backends=["mp"],
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    def iter_params(self, model_id: str):
        """Yield one pytest parameter tuple per (setup, backend) pair.

        Each tuple is ``(model_id, parallel_setup, backend, runner,
        test_options)``; setups vary slowest, backends fastest.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend in self.distributed_backends:
                yield (model_id, parallel_setup, backend, self.runner, opts)
108
109


110
111
112
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

113
# yapf: disable
114
115
# Maps text-generation model IDs to the settings used to parametrize
# `test_tp_language_generation`.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "zai-org/chatglm3-6b": PPTestSettings.fast(),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "pfnet/plamo-2-1b": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
    # Tests TransformersForCausalLM
    "hmellor/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-only]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

178
179
# Maps embedding (pooling-runner) model IDs to the settings used to
# parametrize `test_tp_language_embedding`.
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
        load_format="dummy", runner="pooling"
    ),
}

187
188
# Maps multimodal model IDs to the settings used to parametrize
# `test_tp_multimodal_generation`.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "zai-org/glm-4v-9b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
    "AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(),
    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}
# yapf: enable

211
# NOTE: You can update this on your local machine to run specific tests
212
# Model IDs that are actually parametrized into tests; entries of the model
# dictionaries that are not listed here are filtered out.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "hmellor/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
    "deepseek-ai/DeepSeek-V2-Lite-Chat",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
]


231
def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Compare a pipeline-parallel run of a model against a TP-only run.

    Builds two server argument lists that share the same common flags: one
    with ``--pipeline-parallel-size``/``--tensor-parallel-size`` on the
    requested backend, and one with tensor parallelism only on the "mp"
    backend. Both are handed to ``compare_two_settings``.

    Args:
        model_id: Model identifier looked up in ``HF_EXAMPLE_MODELS``.
        parallel_setup: TP size, PP size and eager-mode flag for the PP run.
        distributed_backend: Executor backend for the PP run.
        runner: Runner option; passed as ``--runner`` unless it is "auto".
        test_options: Multi-node restriction and optional load format.
        num_gpus_available: Number of GPUs available to the test.
        method: Forwarded to ``compare_two_settings``.
        is_multimodal: When True, dummy-weight size overrides are nested
            under ``text_config`` instead of applied at the top level.

    The test is skipped (via ``pytest.skip``) when there are fewer than
    ``tp_size * pp_size`` GPUs, when the "mp" backend is requested in a
    multi-node run, or when a multi-node-only model runs single-node.
    """
    (
        tp_size,
        pp_size,
        eager_mode,
    ) = parallel_setup

    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Work on a copy: the overrides are extended below, and updating the
    # registry's own dict in place would leak into later lookups of the
    # same model.
    hf_overrides = dict(model_info.hf_overrides or {})
    hf_config = get_config(model_id, trust_remote_code)
    skip_tokenizer_init = model_info.skip_tokenizer_init
    max_num_seqs = model_info.max_num_seqs

    dtype = "float16"
    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
        dtype = "bfloat16"

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
    if skip_tokenizer_init:
        common_args.append("--skip-tokenizer-init")
    if max_num_seqs:
        # NOTE(review): this repeats `--max-num-seqs` after the default of
        # "8" above and relies on the CLI parser honoring the last
        # occurrence of a repeated flag.
        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])

    if distributed_backend == "ray":
        # For V1, test Ray Compiled Graph for all the tests
        pp_env = {
            "VLLM_USE_V1": "1",
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
    elif distributed_backend == "mp":
        pp_env = {
            "VLLM_USE_V1": "1",
        }
    else:
        pp_env = None

    tp_env = {
        "VLLM_USE_V1": "1",
    }

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    compare_two_settings(model_id,
                         pp_args,
                         tp_args,
                         pp_env,
                         tp_env,
                         method=method)
366
367
368


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner",
     "test_options"),
    [
        # Filter on the model first so iter_params() only runs for
        # models that are actually selected in TEST_MODELS.
        params for model_id, settings in TEXT_GENERATION_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for text-generation models."""
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                runner,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=False)
393
394
395


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner",
     "test_options"),
    [
        # Filter on the model first so iter_params() only runs for
        # models that are actually selected in TEST_MODELS.
        params for model_id, settings in EMBEDDING_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for embedding models via "encode"."""
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                runner,
                test_options,
                num_gpus_available,
                method="encode",
                is_multimodal=False)
420
421
422


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner",
     "test_options"),
    [
        # Filter on the model first so iter_params() only runs for
        # models that are actually selected in TEST_MODELS.
        params for model_id, settings in MULTIMODAL_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for multimodal generation models."""
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                runner,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=True)