test_pipeline_parallel.py 15.4 KB
Newer Older
1
2
3
4
5
6
7
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers on a node other than the head node, which can cause the test
 to fail.
"""
8
import os
9
from dataclasses import dataclass
10
from typing import List, Literal, NamedTuple, Optional
11

12
13
import pytest

14
from vllm.config import TaskOption
15
16
from vllm.logger import init_logger

17
from ..utils import compare_two_settings, fork_new_process_for_each_test
18

19
20
# Module-level logger; used below to record Ray ADAG failures that are
# deliberately not allowed to fail the test.
logger = init_logger("test_pipeline_parallel")

21
22
# True only when the VLLM_MULTI_NODE environment variable is exactly "1"
# (an unset variable is treated as "0").
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

23

24
25
26
27
28
29
30
class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by a pipeline-parallel test.

    `_compare_tp` unpacks this tuple positionally, so the field order is
    significant.
    """
    # Passed as the value of `--tensor-parallel-size`.
    tp_size: int
    # Passed as the value of `--pipeline-parallel-size`.
    pp_size: int
    # When true, `--enforce-eager` is added to the server arguments.
    eager_mode: bool
    # When true, `--enable-chunked-prefill` is added to the server arguments.
    chunked_prefill: bool


31
32
33
34
class PPTestOptions(NamedTuple):
    """Per-model options that are independent of the parallel layout.

    `_compare_tp` unpacks this tuple positionally, so the field order is
    significant. The last two fields are optional and default to `None`.
    """
    # Skip the test unless VLLM_MULTI_NODE is enabled.
    multi_node_only: bool
    # When true, `--trust-remote-code` is added to the server arguments.
    trust_remote_code: bool
    # When set, forwarded as the value of `--tokenizer-mode`.
    tokenizer_mode: Optional[str]
    # When set, forwarded as the value of `--load-format`.
    load_format: Optional[str] = None
    # When set, forwarded verbatim as the value of `--hf-overrides`.
    hf_overrides: Optional[str] = None
37
38


39
40
41
42
@dataclass
class PPTestSettings:
    """Test matrix for one model.

    A settings object combines a list of parallel setups with a list of
    distributed backends; `iter_params` expands their cross product into
    pytest parameter tuples.
    """
    parallel_setups: List[ParallelSetup]
    distributed_backends: List[str]
    task: TaskOption
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        task: TaskOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
        load_format: Optional[str] = None,
        hf_overrides: Optional[str] = None,
    ):
        """Build the thorough matrix: five setups on both "mp" and "ray".

        The setups cover the base TP/PP sizes, doubled PP size and doubled
        TP size, mixing eager mode and chunked prefill. All remaining
        keyword arguments are stored unchanged in `PPTestOptions`.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "ray"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode,
                                       load_format=load_format,
                                       hf_overrides=hf_overrides),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
        load_format: Optional[str] = None,
        hf_overrides: Optional[str] = None,
    ):
        """Build the minimal matrix: one eager-mode setup on "mp" only.

        The single setup uses the base TP/PP sizes without chunked prefill.
        All remaining keyword arguments are stored unchanged in
        `PPTestOptions`.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode,
                                       load_format=load_format,
                                       hf_overrides=hf_overrides),
        )

    def iter_params(self, model_name: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is `(model_name, parallel_setup, distributed_backend,
        task, test_options)`. Setups form the outer loop and backends the
        inner loop.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (model_name, parallel_setup, distributed_backend,
                       self.task, opts)
125
126


127
128
129
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

130
# yapf: disable
131
132
# Maps each text-generation model to its pipeline-parallel test settings.
# Commented-out entries are kept as a record of models that are covered by
# another entry or cannot be tested yet.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
    "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-2b": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-12b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    # TODO: Implement PP
    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'),  # noqa: E501
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
    # [Encoder-only]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

192
193
# Maps each embedding model to its pipeline-parallel test settings.
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
}

199
200
# Maps each multimodal model to its pipeline-parallel test settings.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True),
    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
    # [Encoder-decoder]
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
}
# yapf: enable

224
# NOTE: You can update this on your local machine to run specific tests
225
# Only models listed here are expanded into test cases; the parametrize
# lists below filter the per-category model tables by membership in this
# list.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Meta-Llama-3-8B",
    "ibm/PowerLM-3b",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3-vision-128k-instruct",
    "fixie-ai/ultravox-v0_3",
]


240
241
242
243
def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
):
    """Compare a pipeline-parallel run against a tensor-parallel-only run.

    Builds two argument lists that share the same common options: one with
    pipeline parallelism on `distributed_backend`, and a reference one with
    tensor parallelism only on the "mp" backend. Both are handed to
    `compare_two_settings`.

    The test is skipped when there are fewer than `tp_size * pp_size` GPUs,
    when running multi-node with the "mp" backend, or when the model is
    multi-node-only and VLLM_MULTI_NODE is not enabled.

    Failures are re-raised, except for the Ray ADAG configuration, where
    they are only logged.
    """
    (
        tp_size,
        pp_size,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup
    (
        multi_node_only,
        trust_remote_code,
        tokenizer_mode,
        load_format,
        hf_overrides,
    ) = test_options

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if task != "auto":
        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", hf_overrides])

    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
            and chunked_prefill):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of aDAG issue.
        # NOTE: this is appended to the shared list, so it applies to both
        # the PP run and the TP reference run built below.
        common_args.append("--disable-frontend-multiprocessing")
    else:
        pp_env = None

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_name,
                             pp_args,
                             tp_args,
                             pp_env,
                             method=method)
    except Exception:
        if pp_env is None:
            raise
        else:
            # Ray ADAG tests are flaky, so we don't want to fail the test
            logger.exception("Ray ADAG tests failed")
345
346
347


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        params for model_name, settings in TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for text-generation models.

    Parametrized over every entry of TEXT_GENERATION_MODELS that is also
    listed in TEST_MODELS; uses the "generate" method.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="generate")


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        params for model_name, settings in EMBEDDING_MODELS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for embedding models.

    Parametrized over every entry of EMBEDDING_MODELS that is also listed
    in TEST_MODELS; uses the "encode" method.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="encode")


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        params for model_name, settings in MULTIMODAL_MODELS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for multimodal models.

    Parametrized over every entry of MULTIMODAL_MODELS that is also listed
    in TEST_MODELS; uses the "generate" method.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="generate")