# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
10
import json
11
import os
12
from dataclasses import dataclass
13
from typing import Literal, NamedTuple, Optional
14

15
16
import pytest

17
from vllm.config import TaskOption
18
19
from vllm.logger import init_logger

20
from ..models.registry import HF_EXAMPLE_MODELS
21
from ..utils import compare_two_settings, create_new_process_for_each_test
22

23
24
logger = init_logger("test_pipeline_parallel")

25
26
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

27

28
29
30
31
32
33
34
35
36
37
38
39
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin ``VLLM_USE_V1`` to ``"0"`` for every test in this module.

    Rationale recorded by the original author: PP falls back to V0 by
    default, so the TP baseline would run with V1 while the PP engine runs
    with V0, and the two give divergent results with dummy weights. This
    fixture can be removed once V1 is enabled by default for PP.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")


40
41
42
43
44
45
46
class ParallelSetup(NamedTuple):
    """One parallel layout to test; unpacked positionally by ``_compare_tp``."""
    # Value passed to --tensor-parallel-size (for both the PP and TP runs).
    tp_size: int
    # Value passed to --pipeline-parallel-size for the PP run.
    pp_size: int
    # When true, --enforce-eager is added to the server arguments.
    eager_mode: bool
    # When true, --enable-chunked-prefill is added to the server arguments.
    chunked_prefill: bool


47
48
class PPTestOptions(NamedTuple):
    """Per-model options shared by every parallel setup of that model.

    Unpacked positionally by ``_compare_tp`` as
    ``multi_node_only, load_format``, so the field order matters.
    """
    # When true, the test is skipped unless VLLM_MULTI_NODE is enabled.
    multi_node_only: bool
    # Forwarded as --load-format when set; "dummy" additionally makes
    # _compare_tp shrink the model config to avoid OOM.
    load_format: Optional[str] = None
50
51


52
53
@dataclass
class PPTestSettings:
    """Test matrix for a single model.

    Every entry of ``parallel_setups`` is combined with every
    ``(distributed_backend, vllm_major_version)`` pair, where the pairs are
    formed by zipping the two lists element-wise (see ``iter_params``).
    """
    parallel_setups: list[ParallelSetup]
    # NOTE: the length of distributed_backends and
    # vllm_major_versions should be the same, and they
    # are first zipped together to iterate over all
    # test settings.
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
    task: TaskOption
    test_options: PPTestOptions

    def __post_init__(self):
        """Reject settings whose backend and version lists differ in length.

        Raises:
            ValueError: if ``distributed_backends`` and
                ``vllm_major_versions`` do not have the same length; zipping
                them would otherwise silently drop the extra entries.
        """
        if len(self.distributed_backends) != len(self.vllm_major_versions):
            raise ValueError(
                f"Length mismatch: distributed_backends "
                f"({len(self.distributed_backends)}) != "
                f"vllm_major_versions ({len(self.vllm_major_versions)})")

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        task: TaskOption = "auto",
        load_format: Optional[str] = None,
    ):
        """Build the exhaustive matrix.

        Produces five parallel setups derived from ``tp_base``/``pp_base``
        (mixing doubled TP or PP sizes, eager mode and chunked prefill), each
        paired with the "mp" and "ray" backends on both V0 and V1.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "mp", "ray", "ray"],
            vllm_major_versions=["0", "1", "0", "1"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
        """Build the minimal matrix.

        Produces a single eager-mode setup (``tp_base`` x ``pp_base``, no
        chunked prefill) on the "mp" backend with V0 only.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            vllm_major_versions=["0"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    def iter_params(self, model_id: str):
        """Yield one pytest parameter tuple per test case for ``model_id``.

        Each tuple is ``(model_id, parallel_setup, backend,
        vllm_major_version, task, test_options)``, matching the argument
        names used by the ``parametrize`` decorators in this module.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            # Backends and versions are paired element-wise, not crossed.
            for backend, vllm_major_version in zip(self.distributed_backends,
                                                   self.vllm_major_versions):
                yield (model_id, parallel_setup, backend, vllm_major_version,
                       self.task, opts)
142
143


144
145
146
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# memory. The values displayed here are only a rough indicator of the size of
# the model.

147
# yapf: disable
148
149
# Maps model ID -> PPTestSettings for text-generation models.
# Only models that are also listed in TEST_MODELS are actually parametrized.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "THUDM/chatglm3-6b": PPTestSettings.fast(),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
    # Tests TransformersForCausalLM
    "ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-decoder]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

211
212
# Maps model ID -> PPTestSettings for models exercised through "encode".
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
}

218
219
# Maps model ID -> PPTestSettings for multimodal generation models.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "THUDM/glm-4v-9b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
    # [Encoder-decoder]
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
}
# yapf: enable

244
# NOTE: You can update this on your local machine to run specific tests
245
# Allow-list of model IDs: the parametrize decorators below only generate
# cases for models from the settings dicts that also appear here.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "ArthurZ/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
]


263
def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Compare a pipeline-parallel run against a tensor-parallel-only baseline.

    Builds two argument lists that share ``common_args``: one adding
    ``--pipeline-parallel-size``/``--tensor-parallel-size`` with the requested
    backend, and a baseline with only ``--tensor-parallel-size`` on the "mp"
    backend. Both are handed, with their environments, to
    ``compare_two_settings``.

    Args:
        model_id: Model ID looked up in ``HF_EXAMPLE_MODELS``.
        parallel_setup: TP/PP sizes plus eager-mode and chunked-prefill flags.
        distributed_backend: Executor backend for the PP run ("mp" or "ray").
        vllm_major_version: "0" or "1"; exported as ``VLLM_USE_V1``.
        task: Forwarded as ``--task`` unless it is "auto".
        test_options: ``multi_node_only`` and ``load_format`` for this model.
        num_gpus_available: Number of GPUs available to the test.
        method: Forwarded to ``compare_two_settings``.
        is_multimodal: When dummy weights are used, selects whether the
            shrunken config is nested under ``text_config`` or applied at the
            top level.

    The test is skipped (via ``pytest.skip`` or the registry's
    ``on_fail="skip"`` checks) when the transformers version or online
    availability check fails, when fewer than ``tp_size * pp_size`` GPUs are
    available, when running multi-node with the "mp" backend, or when the
    model is multi-node-only and ``VLLM_MULTI_NODE`` is not enabled.

    Raises:
        Exception: whatever ``compare_two_settings`` raises, except for
            V0 Ray Compiled Graph runs, whose failures are only logged.
    """
    (
        tp_size,
        pp_size,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup

    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Shallow-copy so the dummy-weight overrides below do not leak into the
    # shared registry entry held by HF_EXAMPLE_MODELS.
    hf_overrides = dict(model_info.hf_overrides or {})

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if task != "auto":
        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    # Tracked explicitly: pp_env is also set for the "mp" backend, so
    # "pp_env is not None" cannot be used to detect the Ray Compiled Graph
    # case when deciding whether a failure may be tolerated.
    testing_ray_compiled_graph = False

    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
    if distributed_backend == "ray" and (vllm_major_version == "1"
                                         or specific_case):
        # For V1, test Ray Compiled Graph for all the tests
        # For V0, test Ray Compiled Graph for a subset of the tests
        pp_env = {
            "VLLM_USE_V1": vllm_major_version,
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
        testing_ray_compiled_graph = True
    elif distributed_backend == "mp":
        # Both V0/V1 of multiprocessing executor support PP
        pp_env = {
            "VLLM_USE_V1": vllm_major_version,
        }
    else:
        pp_env = None

    tp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_id,
                             pp_args,
                             tp_args,
                             pp_env,
                             tp_env,
                             method=method)
    except Exception:
        if testing_ray_compiled_graph and vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
        else:
            raise
404
405
406


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "task", "test_options"),
    [
        params for model_id, settings in TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for text-generation models.

    Parametrized over every case produced by ``iter_params`` for the entries
    of ``TEXT_GENERATION_MODELS`` that are also listed in ``TEST_MODELS``.
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=False)
433
434
435


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "task", "test_options"),
    [
        params for model_id, settings in EMBEDDING_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for embedding models via ``encode``.

    Parametrized over every case produced by ``iter_params`` for the entries
    of ``EMBEDDING_MODELS`` that are also listed in ``TEST_MODELS``.
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
                method="encode",
                is_multimodal=False)
462
463
464


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "task", "test_options"),
    [
        params for model_id, settings in MULTIMODAL_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for multimodal generation models.

    Parametrized over every case produced by ``iter_params`` for the entries
    of ``MULTIMODAL_MODELS`` that are also listed in ``TEST_MODELS``. Passes
    ``is_multimodal=True`` so dummy-weight config overrides are nested under
    ``text_config``.
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=True)