test_pipeline_parallel.py 14.3 KB
Newer Older
1
2
3
4
5
6
7
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers on a node other than the head node, which can cause the test
 to fail.
"""
8
import os
9
from dataclasses import dataclass
10
from typing import List, Literal, NamedTuple, Optional
11

12
13
import pytest

14
from vllm.config import TaskOption
15
16
from vllm.logger import init_logger

17
from ..utils import compare_two_settings, fork_new_process_for_each_test
18

19
20
# Module-level logger; used to record Ray ADAG failures that are tolerated
# rather than re-raised.
logger = init_logger("test_pipeline_parallel")

21
22
# True only when the environment variable VLLM_MULTI_NODE is exactly "1";
# an unset variable falls back to "0" and therefore yields False.
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"

23

24
25
26
27
28
29
30
class ParallelSetup(NamedTuple):
    """One parallelism configuration to exercise in a test.

    Field order matters: instances are tuple-unpacked positionally by the
    comparison helper in this module.
    """
    # Value passed to ``--tensor-parallel-size``.
    tp_size: int
    # Value passed to ``--pipeline-parallel-size``.
    pp_size: int
    # When true, ``--enforce-eager`` is added to the command line.
    eager_mode: bool
    # When true, ``--enable-chunked-prefill`` is added to the command line.
    chunked_prefill: bool


31
32
33
34
35
36
class PPTestOptions(NamedTuple):
    """Per-model options that apply to every parallel setup of that model.

    Field order matters: instances are tuple-unpacked positionally by the
    comparison helper in this module.
    """
    # When true, the test is skipped unless VLLM_MULTI_NODE is enabled.
    multi_node_only: bool
    # When true, ``--trust-remote-code`` is added to the command line.
    trust_remote_code: bool
    # When set (truthy), forwarded as ``--tokenizer-mode <value>``.
    tokenizer_mode: Optional[str]


37
38
39
40
@dataclass
class PPTestSettings:
    """Everything needed to parametrize the tests for a single model.

    The cross product of ``parallel_setups`` and ``distributed_backends`` is
    expanded by :meth:`iter_params`; ``task`` and ``test_options`` are shared
    by every combination.
    """
    parallel_setups: List[ParallelSetup]
    distributed_backends: List[str]
    task: TaskOption
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        task: TaskOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the thorough settings: five setups on both backends.

        The setups scale either the pipeline-parallel size or the
        tensor-parallel size to twice its base value, and mix eager mode
        and chunked prefill. Each setup is run with the "mp" and the "ray"
        distributed backends.

        Args:
            tp_base: Base tensor-parallel size.
            pp_base: Base pipeline-parallel size.
            multi_node_only: Stored in the resulting ``PPTestOptions``.
            task: Task option shared by all generated parameters.
            trust_remote_code: Stored in the resulting ``PPTestOptions``.
            tokenizer_mode: Stored in the resulting ``PPTestOptions``.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "ray"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the minimal settings: one eager-mode setup on "mp" only.

        Args:
            tp_base: Tensor-parallel size of the single setup.
            pp_base: Pipeline-parallel size of the single setup.
            task: Task option shared by all generated parameters.
            multi_node_only: Stored in the resulting ``PPTestOptions``.
            trust_remote_code: Stored in the resulting ``PPTestOptions``.
            tokenizer_mode: Stored in the resulting ``PPTestOptions``.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode),
        )

    def iter_params(self, model_name: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is ``(model_name, parallel_setup, distributed_backend,
        task, test_options)``. Setups form the outer loop and backends the
        inner loop.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (model_name, parallel_setup, distributed_backend,
                       self.task, opts)
115
116


117
118
119
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# memory. The values shown here are only a rough indicator of the model size.

120
121
122
123
# yapf: disable
# Maps model name -> PPTestSettings for the language-generation tests.
# Insertion order determines the order in which parameters are generated.
GENERATION_MODEL_SETTINGS = {
    # [DETAILED TESTS]
    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True),  # noqa: E501
    # [FAST TESTS]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
    "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-2b": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-12b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
    "core42/jais-13b-chat": PPTestSettings.fast(),
    # TODO: Implement PP
    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
    # FIXME: Cannot load tokenizer in latest transformers version
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}

# Maps model name -> PPTestSettings for the language-embedding tests.
EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
}

# Maps model name -> PPTestSettings for the multimodal-generation tests.
# Insertion order determines the order in which parameters are generated.
MULTIMODAL_MODEL_SETTINGS = {
    # [FAST TESTS]
    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}

# Placeholder table for conditional-generation models; currently empty
# because its only candidate entry is commented out pending PP support.
CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable

213
# NOTE: You can update this on your local machine to run specific tests
214
# Allow-list of model names: a settings-table entry is only parametrized
# into a test when its key also appears here.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "meta-llama/Meta-Llama-3-8B",
    "ibm/PowerLM-3b",
    "microsoft/Phi-3-mini-4k-instruct",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3-vision-128k-instruct",
    "fixie-ai/ultravox-v0_3",
]


229
230
231
232
def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
):
    """Compare a pipeline-parallel setting against a TP-only baseline.

    Builds two command-line argument lists that share the same common
    options: one with pipeline parallelism on ``distributed_backend`` and
    one with tensor parallelism only on the "mp" backend. Both are handed
    to ``compare_two_settings``.

    The test is skipped when:
      * fewer than ``tp_size * pp_size`` GPUs are available,
      * running multi-node with the "mp" backend, or
      * the model is multi-node-only and multi-node mode is off.

    Args:
        model_name: Model name forwarded to ``compare_two_settings``.
        parallel_setup: TP/PP sizes plus eager-mode and chunked-prefill flags.
        distributed_backend: Executor backend for the pipeline-parallel run.
        task: Forwarded as ``--task`` unless it is "auto".
        test_options: Multi-node, remote-code and tokenizer-mode options.
        num_gpus_available: Number of GPUs the test may use.
        method: Forwarded to ``compare_two_settings`` as ``method``.

    Raises:
        Exception: Whatever ``compare_two_settings`` raises, unless the Ray
            ADAG environment is active, in which case it is only logged.
    """
    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
    multi_node_only, trust_remote_code, tokenizer_mode = test_options

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if task != "auto":
        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])

    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
            and chunked_prefill):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of aDAG issue.
        # NOTE: appended to common_args, so it applies to both runs.
        common_args.append("--disable-frontend-multiprocessing")
    else:
        pp_env = None

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_name,
                             pp_args,
                             tp_args,
                             pp_env,
                             method=method)
    except Exception:
        if pp_env is None:
            raise
        else:
            # Ray ADAG tests are flaky, so we don't want to fail the test
            logger.exception("Ray ADAG tests failed")
319
320
321


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        # Filter on the model name before expanding its parameters so that
        # unselected models never have iter_params() evaluated.
        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
        if model_name in TEST_MODELS
        for params in settings.iter_params(model_name)
    ],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
):
    """Run the PP-vs-TP comparison for language models via "generate".

    Parametrized over every GENERATION_MODEL_SETTINGS entry whose model name
    is listed in TEST_MODELS.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="generate")


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        # Filter on the model name before expanding its parameters so that
        # unselected models never have iter_params() evaluated.
        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
        if model_name in TEST_MODELS
        for params in settings.iter_params(model_name)
    ],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
):
    """Run the PP-vs-TP comparison for embedding models via "encode".

    Parametrized over every EMBEDDING_MODEL_SETTINGS entry whose model name
    is listed in TEST_MODELS.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="encode")


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        # Filter on the model name before expanding its parameters so that
        # unselected models never have iter_params() evaluated.
        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
        if model_name in TEST_MODELS
        for params in settings.iter_params(model_name)
    ],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
):
    """Run the PP-vs-TP comparison for multimodal models via "generate".

    Parametrized over every MULTIMODAL_MODEL_SETTINGS entry whose model name
    is listed in TEST_MODELS.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="generate")