test_pipeline_parallel.py 14.8 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
10

11
import json
12
import os
13
from dataclasses import dataclass
14
from typing import Literal, NamedTuple
15

16
17
import pytest

18
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
19
from vllm.logger import init_logger
20
from vllm.transformers_utils.config import get_config
21

22
from ..models.registry import HF_EXAMPLE_MODELS
23
from ..utils import compare_two_settings, create_new_process_for_each_test
24

25
26
# Logger for this test module.
logger = init_logger("test_pipeline_parallel")

# True only when the VLLM_MULTI_NODE environment variable is exactly "1";
# unset (defaulting to "0") or any other value means single-node mode.
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

29

30
31
32
33
34
35
class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by a test case.

    Being a ``NamedTuple``, an instance unpacks positionally as
    ``(tp_size, pp_size, eager_mode)``.
    """

    # Tensor-parallel size, forwarded as --tensor-parallel-size.
    tp_size: int
    # Pipeline-parallel size, forwarded as --pipeline-parallel-size.
    pp_size: int
    # When True, --enforce-eager is added to the server arguments.
    eager_mode: bool


36
37
class PPTestOptions(NamedTuple):
    multi_node_only: bool
38
    load_format: str | None = None
39
40


41
42
@dataclass
class PPTestSettings:
    """Everything needed to expand one model into parametrized test cases.

    ``iter_params`` yields one case per (parallel setup, distributed backend)
    pair; ``runner`` and ``test_options`` are shared by all of those cases.
    """

    parallel_setups: list[ParallelSetup]
    distributed_backends: list[str]
    runner: RunnerOption
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: str | None = None,
    ) -> "PPTestSettings":
        """Build the thorough matrix: five parallel setups on "mp" and "ray".

        The setups cover the base layout, a doubled pipeline-parallel size
        and a doubled tensor-parallel size; the two doubled layouts are run
        both with and without eager mode.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=False),
                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=False),
                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=True),
                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=False),
                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=True),
            ],
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=PPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
            ),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: str | None = None,
    ) -> "PPTestSettings":
        """Build the minimal matrix: one eager-mode setup on "mp" only."""
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=True),
            ],
            distributed_backends=["mp"],
            runner=runner,
            test_options=PPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
            ),
        )

    def iter_params(self, model_id: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is ``(model_id, parallel_setup, backend, runner,
        test_options)``, matching the argument names used by the
        ``pytest.mark.parametrize`` decorators in this module. Setups vary
        in the outer loop and backends in the inner loop.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend in self.distributed_backends:
                yield (model_id, parallel_setup, backend, self.runner, opts)
98
99


100
101
102
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU memory.
# The values displayed here are only a rough indicator of the size of the model.

103
104
# Maps a text-generation model ID to its test settings. Only entries whose
# ID is also listed in TEST_MODELS are turned into test cases.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "zai-org/chatglm3-6b": PPTestSettings.fast(),
    "CohereLabs/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "pfnet/plamo-2-1b": PPTestSettings.fast(),
    "pfnet/plamo-3-nict-2b-base": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
    # Tests TransformersForCausalLM
    "hmellor/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
        multi_node_only=True, load_format="dummy"
    ),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-only]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

170
171
# Maps an embedding (pooling-runner) model ID to its test settings. Only
# entries whose ID is also listed in TEST_MODELS are turned into test cases.
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
        load_format="dummy", runner="pooling"
    ),
}

179
180
# Maps a multimodal model ID to its test settings. Only entries whose ID is
# also listed in TEST_MODELS are turned into test cases.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "zai-org/glm-4v-9b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
    "AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(),
    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}

202
# NOTE: You can update this on your local machine to run specific tests
203
# Allow-list of model IDs that are actually run. A model is parametrized
# only when it appears both here and in one of the *_MODELS dicts.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "hmellor/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
    "deepseek-ai/DeepSeek-V2-Lite-Chat",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
]


222
def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Run one model with and without pipeline parallelism and compare.

    Two argument lists are built from a shared base: one adds
    ``--pipeline-parallel-size`` and uses ``distributed_backend``; the other
    is the tensor-parallel-only baseline, always on the "mp" backend. Both
    are handed to ``compare_two_settings`` together with ``method``.

    The test is skipped (via ``pytest.skip`` or the registry's
    ``on_fail="skip"`` checks) when:

    * the installed transformers version does not suit the model,
    * the model is not available online (unless ``load_format == "dummy"``),
    * fewer than ``tp_size * pp_size`` GPUs are available,
    * running multi-node with the "mp" backend, or
    * the model is multi-node only and this is not a multi-node run.

    Args:
        model_id: Model ID looked up in ``HF_EXAMPLE_MODELS``.
        parallel_setup: Tensor/pipeline sizes and the eager-mode flag.
        distributed_backend: Executor backend used for the pipeline run.
        runner: Forwarded as ``--runner`` unless it is "auto".
        test_options: Multi-node restriction and optional load format.
        num_gpus_available: Number of GPUs visible to the test.
        method: Passed through to ``compare_two_settings``.
        is_multimodal: When True, the dummy-weight size overrides are nested
            under ``text_config`` instead of applied at the top level.
    """
    tp_size, pp_size, eager_mode = parallel_setup
    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Work on a copy: the overrides are extended below, and the dict returned
    # by the registry belongs to the shared model entry.
    hf_overrides = dict(model_info.hf_overrides)
    hf_config = get_config(model_id, trust_remote_code)
    require_embed_inputs = model_info.require_embed_inputs
    max_num_seqs = model_info.max_num_seqs
    enable_prefix_caching = model_info.enable_prefix_caching

    # Default to float16; fall back to bfloat16 for model types listed as
    # not supporting float16.
    dtype = "float16"
    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
        dtype = "bfloat16"

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip(
            "Skipping multi-node pipeline parallel test for "
            "multiprocessing distributed backend"
        )
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,
        "--max-model-len",
        "2048",
        # The registry value, when set, takes the place of the default of 8.
        # The flag is emitted once rather than relying on a repeated flag
        # overriding the earlier one.
        "--max-num-seqs",
        str(max_num_seqs) if max_num_seqs else "8",
    ]
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
    if not enable_prefix_caching:
        common_args.append("--no-enable-prefix-caching")
    if require_embed_inputs:
        common_args.extend(
            [
                "--skip-tokenizer-init",
                "--enable-prompt-embeds",
                "--enable-mm-embeds",
            ]
        )

    # Extra environment is only needed for the Ray backend.
    pp_env: dict[str, str] | None = None
    if distributed_backend == "ray":
        # Test Ray Compiled Graph for all the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        # NOTE: this is appended to common_args, so it applies to both the
        # pipeline-parallel run and the baseline run built below.
        common_args.append("--disable-frontend-multiprocessing")

    tp_env = None

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method)
356
357
358


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
        params
        for model_id, settings in TEXT_GENERATION_MODELS.items()
        # Filter before expanding so unselected models are never iterated.
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
):
    """Compare pipeline-parallel and baseline runs for text-generation models.

    Cases come from the TEXT_GENERATION_MODELS entries that are also listed
    in TEST_MODELS; the comparison uses the "generate" method.
    """
    _compare_tp(
        model_id,
        parallel_setup,
        distributed_backend,
        runner,
        test_options,
        num_gpus_available,
        method="generate",
        is_multimodal=False,
    )
386
387
388


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
        params
        for model_id, settings in EMBEDDING_MODELS.items()
        # Filter before expanding so unselected models are never iterated.
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
):
    """Compare pipeline-parallel and baseline runs for embedding models.

    Cases come from the EMBEDDING_MODELS entries that are also listed in
    TEST_MODELS; the comparison uses the "encode" method.
    """
    _compare_tp(
        model_id,
        parallel_setup,
        distributed_backend,
        runner,
        test_options,
        num_gpus_available,
        method="encode",
        is_multimodal=False,
    )
416
417
418


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
        params
        for model_id, settings in MULTIMODAL_MODELS.items()
        # Filter before expanding so unselected models are never iterated.
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
):
    """Compare pipeline-parallel and baseline runs for multimodal models.

    Cases come from the MULTIMODAL_MODELS entries that are also listed in
    TEST_MODELS; the comparison uses the "generate" method, and dummy-weight
    size overrides are nested under ``text_config``.
    """
    _compare_tp(
        model_id,
        parallel_setup,
        distributed_backend,
        runner,
        test_options,
        num_gpus_available,
        method="generate",
        is_multimodal=True,
    )