test_pipeline_parallel.py 17 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
3
4
5
6
7
8
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
9
import json
10
import os
11
from dataclasses import dataclass
12
from typing import Literal, NamedTuple, Optional
13

14
15
import pytest

16
from vllm.config import TaskOption
17
18
from vllm.logger import init_logger

19
from ..models.registry import HF_EXAMPLE_MODELS
20
from ..utils import compare_two_settings, create_new_process_for_each_test
21

22
23
# Module-level logger; _compare_tp uses it to record Ray Compiled Graph
# failures that are tolerated instead of failing the test.
logger = init_logger("test_pipeline_parallel")

24
25
# True only when the environment variable VLLM_MULTI_NODE is exactly "1";
# unset or any other value means the tests treat the run as single-node.
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"

26

27
28
29
30
31
32
33
34
35
36
37
38
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """
    Force VLLM_USE_V1=0 for every test in this module.

    PP currently falls back to V0 by default, so the TP baseline would
    otherwise run with V1 while the PP engine runs with V0, and the two
    produce divergent results with dummy weights. This fixture can be
    removed once V1 is enabled by default for PP.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")


39
40
41
42
43
44
45
class ParallelSetup(NamedTuple):
    """One parallelism / execution-mode combination to launch the server with.

    `_compare_tp` unpacks this tuple positionally, so the field order is
    part of the contract.
    """
    # Passed as --tensor-parallel-size (to both the PP run and the TP baseline)
    tp_size: int
    # Passed as --pipeline-parallel-size (PP run only)
    pp_size: int
    # When True, --enforce-eager is added to the server arguments
    eager_mode: bool
    # When True, --enable-chunked-prefill is added to the server arguments
    chunked_prefill: bool


46
47
class PPTestOptions(NamedTuple):
    """Per-model options that do not affect the parallel layout.

    `_compare_tp` unpacks this tuple positionally, so the field order is
    part of the contract.
    """
    # When True, the test is skipped unless VLLM_MULTI_NODE is enabled
    multi_node_only: bool
    # Forwarded as --load-format when set; "dummy" additionally makes
    # _compare_tp shrink the model via hf_overrides to avoid OOM
    load_format: Optional[str] = None
49
50


51
52
@dataclass
class PPTestSettings:
    """Test matrix for a single model.

    Every entry of `parallel_setups` is combined with every
    (distributed backend, vLLM major version) pair; see `iter_params`.
    """
    parallel_setups: list[ParallelSetup]
    # NOTE: the length of distributed_backends and
    # vllm_major_versions should be the same, and they
    # are first zipped together to iterate over all
    # test settings.
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
    task: TaskOption
    test_options: PPTestOptions

    def __post_init__(self):
        """Reject settings whose backend and version lists cannot be zipped.

        Raises:
            ValueError: if `distributed_backends` and `vllm_major_versions`
                differ in length.
        """
        if len(self.distributed_backends) != len(self.vllm_major_versions):
            raise ValueError(
                f"Length mismatch: distributed_backends "
                f"({len(self.distributed_backends)}) != "
                f"vllm_major_versions ({len(self.vllm_major_versions)})")

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        task: TaskOption = "auto",
        load_format: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the broad matrix: five parallel setups x three backends.

        The setups scale `tp_base` / `pp_base` by 1x and 2x and mix eager
        mode with chunked prefill. Each setup is run with ("mp", V0),
        ("ray", V0) and ("ray", V1).

        Args:
            tp_base: Base tensor-parallel size.
            pp_base: Base pipeline-parallel size.
            multi_node_only: Stored in the resulting `PPTestOptions`.
            task: Task stored on the settings object.
            load_format: Stored in the resulting `PPTestOptions`.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            # only ray is supported for V1
            distributed_backends=["mp", "ray", "ray"],
            vllm_major_versions=["0", "0", "1"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the minimal matrix: one eager-mode setup on ("mp", V0).

        Args:
            tp_base: Tensor-parallel size of the single setup.
            pp_base: Pipeline-parallel size of the single setup.
            task: Task stored on the settings object.
            multi_node_only: Stored in the resulting `PPTestOptions`.
            load_format: Stored in the resulting `PPTestOptions`.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            vllm_major_versions=["0"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    def iter_params(self, model_id: str):
        """Yield one parametrization tuple per (setup, backend/version) pair.

        Each tuple is `(model_id, parallel_setup, backend,
        vllm_major_version, task, test_options)`, matching the argument
        names used by the `pytest.mark.parametrize` decorators in this
        module. Setups form the outer loop; the zipped backend/version
        pairs form the inner loop.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend, vllm_major_version in zip(self.distributed_backends,
                                                   self.vllm_major_versions):
                yield (model_id, parallel_setup, backend, vllm_major_version,
                       self.task, opts)
142
143


144
145
146
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

147
# yapf: disable
148
149
# Text-generation models mapped to their PP test matrix. Only the entries
# that are also listed in TEST_MODELS are turned into test cases.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "THUDM/chatglm3-6b": PPTestSettings.fast(),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
    # Tests TransformersForCausalLM
    "ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-decoder]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

211
212
# Embedding models mapped to their PP test matrix. Only the entries that
# are also listed in TEST_MODELS are turned into test cases.
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
}

218
219
# Multimodal models mapped to their PP test matrix. Only the entries that
# are also listed in TEST_MODELS are turned into test cases.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "THUDM/glm-4v-9b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
    # [Encoder-decoder]
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
}
# yapf: enable

243
# NOTE: You can update this on your local machine to run specific tests
244
# Allow-list of model IDs: the parametrize decorators in this module only
# generate cases for models that appear both here and in one of the
# *_MODELS tables.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "ArthurZ/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
]


262
def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Run one PP configuration against a TP-only baseline of the same model.

    Builds two server argument lists (pipeline-parallel and tensor-parallel
    only) from a shared set of common arguments and hands them, together
    with their environment overrides, to `compare_two_settings`.

    The test is skipped when the model's transformers-version / online
    availability checks ask for it, when fewer than `tp_size * pp_size`
    GPUs are available, when the "mp" backend is requested in a multi-node
    run, or when a multi-node-only case runs outside a multi-node setting.

    Args:
        model_id: Model to look up in `HF_EXAMPLE_MODELS` and to serve.
        parallel_setup: TP/PP sizes plus eager-mode and chunked-prefill flags.
        distributed_backend: Executor backend for the PP run.
        vllm_major_version: "0" or "1"; exported as `VLLM_USE_V1`.
        task: Passed as `--task` unless it is "auto".
        test_options: Multi-node restriction and optional load format.
        num_gpus_available: Number of GPUs usable by this test.
        method: Forwarded to `compare_two_settings`.
        is_multimodal: When dummy weights are used, selects whether the
            size-reducing overrides are applied at the top level or nested
            under "text_config".

    Raises:
        Exception: Whatever `compare_two_settings` raises, except for V0
            runs that exercise Ray Compiled Graph, whose failures are only
            logged.
    """
    (
        tp_size,
        pp_size,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup

    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Copy before modifying: the dict belongs to the shared registry entry,
    # and updating it in place would leak the dummy-weight overrides into
    # every later lookup of the same model.
    hf_overrides = dict(model_info.hf_overrides or {})

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if task != "auto":
        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
    if distributed_backend == "ray" and (vllm_major_version == "1"
                                         or specific_case):
        # For V1, test Ray Compiled Graph for all the tests
        # For V0, test Ray Compiled Graph for a subset of the tests
        pp_env = {
            "VLLM_USE_V1": vllm_major_version,
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        # NOTE: this is appended before pp_args/tp_args are built, so the
        # flag ends up in both argument lists.
        common_args.append("--disable-frontend-multiprocessing")
    else:
        pp_env = None

    tp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_id,
                             pp_args,
                             tp_args,
                             pp_env,
                             tp_env,
                             method=method)
    except Exception:
        # pp_env is only populated on the Ray Compiled Graph path above
        testing_ray_compiled_graph = pp_env is not None
        if testing_ray_compiled_graph and vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
        else:
            raise
398
399
400


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "task", "test_options"),
    [
        # Filter on the model first so iter_params is not run for models
        # that are excluded by TEST_MODELS.
        params for model_id, settings in TEXT_GENERATION_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Check PP against the TP-only baseline for text-generation models.

    Delegates to `_compare_tp` with `method="generate"` and
    `is_multimodal=False`.
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=False)
427
428
429


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "task", "test_options"),
    [
        # Filter on the model first so iter_params is not run for models
        # that are excluded by TEST_MODELS.
        params for model_id, settings in EMBEDDING_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Check PP against the TP-only baseline for embedding models.

    Delegates to `_compare_tp` with `method="encode"` and
    `is_multimodal=False`.
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
                method="encode",
                is_multimodal=False)
456
457
458


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "task", "test_options"),
    [
        # Filter on the model first so iter_params is not run for models
        # that are excluded by TEST_MODELS.
        params for model_id, settings in MULTIMODAL_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Check PP against the TP-only baseline for multimodal models.

    Delegates to `_compare_tp` with `method="generate"` and
    `is_multimodal=True`, so dummy-weight size overrides are nested under
    "text_config".
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=True)