# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
10

11
import json
12
import os
13
from dataclasses import dataclass
14
from typing import Literal, NamedTuple
15

16
17
import pytest

18
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
19
from vllm.logger import init_logger
20
from vllm.transformers_utils.config import get_config
21

22
from ..models.registry import HF_EXAMPLE_MODELS
23
from ..utils import compare_two_settings, create_new_process_for_each_test
24

25
26
# Module-level logger for this test file.
logger = init_logger("test_pipeline_parallel")

# True only when the VLLM_MULTI_NODE environment variable is exactly "1";
# unset or any other value means single-node mode.
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

29

30
31
32
33
34
35
class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by a test case."""

    # Tensor-parallel degree, forwarded as --tensor-parallel-size.
    tp_size: int
    # Pipeline-parallel degree, forwarded as --pipeline-parallel-size.
    pp_size: int
    # When True, --enforce-eager is added to the server arguments.
    eager_mode: bool


36
37
class PPTestOptions(NamedTuple):
    multi_node_only: bool
38
    load_format: str | None = None
39
40


41
42
@dataclass
class PPTestSettings:
    """Test matrix for a single model.

    Every combination of ``parallel_setups`` x ``distributed_backends``
    becomes one parametrized test case (see ``iter_params``).
    """

    parallel_setups: list[ParallelSetup]
    distributed_backends: list[str]
    runner: RunnerOption
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: str | None = None,
    ) -> "PPTestSettings":
        """Build the exhaustive matrix: five TP/PP setups on "mp" and "ray".

        Args:
            tp_base: Baseline tensor-parallel size; some setups double it.
            pp_base: Baseline pipeline-parallel size; some setups double it.
            multi_node_only: Stored in ``PPTestOptions.multi_node_only``.
            runner: Runner option stored on the settings object.
            load_format: Stored in ``PPTestOptions.load_format``.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=False),
                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=False),
                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=True),
                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=False),
                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=True),
            ],
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=PPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
            ),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: str | None = None,
    ) -> "PPTestSettings":
        """Build the minimal matrix: one eager-mode setup on "mp" only.

        Args:
            tp_base: Tensor-parallel size of the single setup.
            pp_base: Pipeline-parallel size of the single setup.
            runner: Runner option stored on the settings object.
            multi_node_only: Stored in ``PPTestOptions.multi_node_only``.
            load_format: Stored in ``PPTestOptions.load_format``.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=True),
            ],
            distributed_backends=["mp"],
            runner=runner,
            test_options=PPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
            ),
        )

    def iter_params(self, model_id: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is ``(model_id, parallel_setup, backend, runner,
        test_options)``; backends vary fastest.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend in self.distributed_backends:
                yield (model_id, parallel_setup, backend, self.runner, opts)
98
99


100
101
102
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

# Maps each text-generation model ID to the settings used to test it.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "zai-org/chatglm3-6b": PPTestSettings.fast(),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "pfnet/plamo-2-1b": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
    # Tests TransformersForCausalLM
    "hmellor/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
        multi_node_only=True, load_format="dummy"
    ),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-only]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

169
170
# Maps each embedding model ID to its test settings; all of them use the
# "pooling" runner.
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
        load_format="dummy", runner="pooling"
    ),
}

178
179
# Maps each multimodal model ID to the settings used to test it.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "zai-org/glm-4v-9b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
    "AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(),
    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}

201
# NOTE: You can update this on your local machine to run specific tests
# Only model IDs listed here are turned into parametrized test cases; every
# other entry in the model tables is filtered out.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "hmellor/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
    "deepseek-ai/DeepSeek-V2-Lite-Chat",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
]


221
def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
) -> None:
    """Compare a pipeline-parallel run of a model against a TP-only run.

    Two argument lists are built from the same common flags: one adds
    ``--pipeline-parallel-size`` and uses ``distributed_backend``, the other
    keeps only tensor parallelism on the "mp" backend. Both are handed to
    ``compare_two_settings``.

    Args:
        model_id: Model to look up in ``HF_EXAMPLE_MODELS`` and to test.
        parallel_setup: Tensor/pipeline parallel sizes and eager-mode flag.
        distributed_backend: Executor backend for the pipeline-parallel run.
        runner: Forwarded as ``--runner`` unless it is "auto".
        test_options: Multi-node restriction and optional load format.
        num_gpus_available: GPU count; the test is skipped when it is below
            ``tp_size * pp_size``.
        method: Forwarded unchanged to ``compare_two_settings``.
        is_multimodal: When True, the dummy-weight size overrides are nested
            under ``text_config`` instead of being applied at the top level.
    """
    tp_size, pp_size, eager_mode = parallel_setup
    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Work on a copy: the overrides are updated in place below, and the
    # registry entry is shared with every other test that looks it up.
    hf_overrides = dict(model_info.hf_overrides or {})
    hf_config = get_config(model_id, trust_remote_code)
    skip_tokenizer_init = model_info.skip_tokenizer_init
    max_num_seqs = model_info.max_num_seqs

    dtype = "float16"
    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
        dtype = "bfloat16"

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip(
            "Skipping multi-node pipeline parallel test for "
            "multiprocessing distributed backend"
        )
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
    if skip_tokenizer_init:
        common_args.append("--skip-tokenizer-init")
    if max_num_seqs:
        # NOTE(review): this repeats --max-num-seqs after the default of 8
        # above; presumably the later occurrence wins when the arguments are
        # parsed - confirm.
        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])

    # Only the Ray backend needs extra environment variables.
    pp_env = None
    if distributed_backend == "ray":
        # For V1, test Ray Compiled Graph for all the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")

    tp_env = None

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method)
348
349
350


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
        # Filter on the model ID first so settings of unselected models are
        # never expanded; the resulting list is the same.
        params
        for model_id, settings in TEXT_GENERATION_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
) -> None:
    """Run the PP-vs-TP comparison for text-generation models ("generate")."""
    _compare_tp(
        model_id=model_id,
        parallel_setup=parallel_setup,
        distributed_backend=distributed_backend,
        runner=runner,
        test_options=test_options,
        num_gpus_available=num_gpus_available,
        method="generate",
        is_multimodal=False,
    )
378
379
380


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
        # Filter on the model ID first so settings of unselected models are
        # never expanded; the resulting list is the same.
        params
        for model_id, settings in EMBEDDING_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
) -> None:
    """Run the PP-vs-TP comparison for embedding models ("encode")."""
    _compare_tp(
        model_id=model_id,
        parallel_setup=parallel_setup,
        distributed_backend=distributed_backend,
        runner=runner,
        test_options=test_options,
        num_gpus_available=num_gpus_available,
        method="encode",
        is_multimodal=False,
    )
408
409
410


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
        # Filter on the model ID first so settings of unselected models are
        # never expanded; the resulting list is the same.
        params
        for model_id, settings in MULTIMODAL_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
) -> None:
    """Run the PP-vs-TP comparison for multimodal models ("generate")."""
    _compare_tp(
        model_id=model_id,
        parallel_setup=parallel_setup,
        distributed_backend=distributed_backend,
        runner=runner,
        test_options=test_options,
        num_gpus_available=num_gpus_available,
        method="generate",
        is_multimodal=True,
    )