"tests/vscode:/vscode.git/clone" did not exist on "f533b5837fa67f53957a12387a01067f9edef0d8"
test_pipeline_parallel.py 14.4 KB
Newer Older
1
2
3
4
5
6
7
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
8
import os
9
from dataclasses import dataclass
10
from typing import List, Literal, NamedTuple, Optional
11

12
13
import pytest

14
from vllm.config import TaskOption
15
16
from vllm.logger import init_logger

17
from ..utils import compare_two_settings, fork_new_process_for_each_test
18

19
20
# Module-level logger; `_compare_tp` uses it to record Ray ADAG failures that
# are tolerated rather than re-raised.
logger = init_logger("test_pipeline_parallel")

21
22
# True only when the environment variable VLLM_MULTI_NODE is exactly "1";
# unset or any other value means the suite runs in single-node mode.
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"

23

24
25
26
27
28
29
30
class ParallelSetup(NamedTuple):
    """One parallelism configuration a model is launched with.

    Field order is part of the contract: `_compare_tp` unpacks instances
    positionally as ``tp_size, pp_size, eager_mode, chunked_prefill``.
    """
    tp_size: int  # forwarded as --tensor-parallel-size
    pp_size: int  # forwarded as --pipeline-parallel-size
    eager_mode: bool  # when true, --enforce-eager is added
    chunked_prefill: bool  # when true, --enable-chunked-prefill is added


31
32
33
34
35
36
class PPTestOptions(NamedTuple):
    """Per-model switches that are independent of the parallel layout.

    Field order is part of the contract: `_compare_tp` unpacks instances
    positionally as ``multi_node_only, trust_remote_code, tokenizer_mode``.
    """
    multi_node_only: bool  # test is skipped unless VLLM_MULTI_NODE is set
    trust_remote_code: bool  # when true, --trust-remote-code is added
    tokenizer_mode: Optional[str]  # if truthy, forwarded via --tokenizer-mode


37
38
39
40
@dataclass
class PPTestSettings:
    """Bundle of everything needed to parametrize the tests for one model.

    Instances are normally built through the `detailed` or `fast` factories
    and expanded into pytest parameter tuples with `iter_params`.
    """
    # Parallel layouts to run; each one is crossed with every backend below.
    parallel_setups: List[ParallelSetup]
    # Values passed to --distributed-executor-backend for the PP run.
    distributed_backends: List[str]
    # Forwarded as --task unless it is "auto".
    task: TaskOption
    # Layout-independent options shared by every generated parameter tuple.
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the exhaustive settings: five layouts on both backends.

        The layouts scale either TP or PP by two relative to the given bases
        and mix eager mode and chunked prefill. All arguments are
        keyword-only.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "ray"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ) -> "PPTestSettings":
        """Build the minimal settings: one eager layout on the "mp" backend.

        All arguments are keyword-only.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode),
        )

    def iter_params(self, model_name: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is ``(model_name, parallel_setup, distributed_backend,
        task, test_options)``, matching the argument names used by the
        ``pytest.mark.parametrize`` decorators in this module. Setups form
        the outer loop and backends the inner loop.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (model_name, parallel_setup, distributed_backend,
                       self.task, opts)
115
116


117
118
119
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# memory. The values shown here are only a rough indicator of the model size.

120
121
122
123
# yapf: disable
# Maps a text-generation model name to its PP test settings. Only models that
# also appear in TEST_MODELS are turned into test cases. Commented-out entries
# are kept as a record of models that are skipped and why.
GENERATION_MODEL_SETTINGS = {
    # [DETAILED TESTS]
    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True),  # noqa: E501
    # [FAST TESTS]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
    "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-2b": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-12b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
    "core42/jais-13b-chat": PPTestSettings.fast(),
    # TODO: Implement PP
    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
    # FIXME: Cannot load tokenizer in latest transformers version
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}

# Maps an embedding model name to its PP test settings; consumed by
# `test_tp_language_embedding`, which filters the keys through TEST_MODELS.
EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
}

# Maps a multimodal model name to its PP test settings; consumed by
# `test_tp_multimodal_generation`, which filters the keys through TEST_MODELS.
MULTIMODAL_MODEL_SETTINGS = {
    # [FAST TESTS]
    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}

# Currently empty: its only candidate entry stays commented out until pipeline
# parallelism is implemented for that model (see the TODO below).
CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable

214
# NOTE: You can update this on your local machine to run specific tests
# Only models named here are expanded into test cases; every other key in the
# *_MODEL_SETTINGS tables is filtered out by the parametrize comprehensions.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "meta-llama/Meta-Llama-3-8B",
    "ibm/PowerLM-3b",
    "microsoft/Phi-3-mini-4k-instruct",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3-vision-128k-instruct",
    "fixie-ai/ultravox-v0_3",
]


230
231
232
233
def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
):
    """Compare a pipeline-parallel launch against a TP-only launch.

    Builds two CLI argument lists that share the same common flags: one with
    pipeline parallelism on the requested backend, and one with tensor
    parallelism only on the "mp" backend. Both are handed to
    `compare_two_settings`.

    Args:
        model_name: Model identifier passed through to the comparison helper.
        parallel_setup: TP/PP sizes plus eager-mode and chunked-prefill flags.
        distributed_backend: Executor backend used for the PP launch.
        task: Forwarded as ``--task`` unless it is ``"auto"``.
        test_options: Multi-node gating, remote-code trust, tokenizer mode.
        num_gpus_available: GPU count used to decide whether to skip.
        method: Forwarded to `compare_two_settings` as ``method=``.

    The test is skipped (via ``pytest.skip``) when there are too few GPUs,
    when running multi-node with the "mp" backend, or when a multi-node-only
    model is run outside multi-node mode. Exceptions from the comparison are
    re-raised, except for the Ray ADAG subset, where they are only logged.
    """
    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
    multi_node_only, trust_remote_code, tokenizer_mode = test_options

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if task != "auto":
        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])

    pp_env: Optional[dict]
    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
            and chunked_prefill):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of aDAG issue.
        # NOTE: this is appended before pp_args/tp_args are built, so both
        # launches receive the flag.
        common_args.append("--disable-frontend-multiprocessing")
    else:
        pp_env = None

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        # pp_env is the fourth positional argument; presumably it is the
        # environment for the PP launch -- confirm against ..utils.
        compare_two_settings(model_name,
                             pp_args,
                             tp_args,
                             pp_env,
                             method=method)
    except Exception:
        if pp_env is None:
            raise
        else:
            # Ray ADAG tests are flaky, so we don't want to fail the test
            logger.exception("Ray ADAG tests failed")
320
321
322


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        # Filter on the model first so unselected models are never expanded.
        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
        if model_name in TEST_MODELS
        for params in settings.iter_params(model_name)
    ],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for text-generation models.

    Parametrized over every (setup, backend) combination of each model in
    GENERATION_MODEL_SETTINGS that is also listed in TEST_MODELS.
    `num_gpus_available` is not defined in this file; presumably it is a
    pytest fixture provided elsewhere.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="generate")


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        # Filter on the model first so unselected models are never expanded.
        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
        if model_name in TEST_MODELS
        for params in settings.iter_params(model_name)
    ],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for embedding models.

    Parametrized over every (setup, backend) combination of each model in
    EMBEDDING_MODEL_SETTINGS that is also listed in TEST_MODELS. Uses the
    "encode" method. `num_gpus_available` is not defined in this file;
    presumably it is a pytest fixture provided elsewhere.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="encode")


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
        # Filter on the model first so unselected models are never expanded.
        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
        if model_name in TEST_MODELS
        for params in settings.iter_params(model_name)
    ],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for multimodal generation models.

    Parametrized over every (setup, backend) combination of each model in
    MULTIMODAL_MODEL_SETTINGS that is also listed in TEST_MODELS.
    `num_gpus_available` is not defined in this file; presumably it is a
    pytest fixture provided elsewhere.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                task,
                test_options,
                num_gpus_available,
                method="generate")