test_pipeline_parallel.py 20.8 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers on a node other than the head node, which can cause the test
 to fail.
"""
10
import json
11
import os
12
from dataclasses import dataclass
13
from typing import Literal, NamedTuple, Optional
14

15
16
import pytest

17
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
18
from vllm.logger import init_logger
19
from vllm.transformers_utils.config import get_config
20

21
from ..models.registry import HF_EXAMPLE_MODELS
zhuwenwen's avatar
zhuwenwen committed
22
from ..utils import compare_two_settings, create_new_process_for_each_test, models_path_prefix
23

24
25
# Logger used to report tolerated failures in this test module.
logger = init_logger("test_pipeline_parallel")

# True only when the environment variable VLLM_MULTI_NODE is exactly "1".
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"

28

29
30
31
32
33
34
35
36
37
38
39
40
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """
    Set ``VLLM_USE_V1=0`` in the environment for every test in this module.

    Rationale carried over from the original note: PP falls back to V0
    by default, so without this the TP baseline would run on V1 while
    the PP engine runs on V0, and the two diverge with dummy weights.
    This can be removed once V1 is enabled by default for PP.
    """
    env_name, v0_flag = "VLLM_USE_V1", "0"
    monkeypatch.setenv(env_name, v0_flag)


41
42
43
44
45
46
47
class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by the comparison tests."""

    # Value passed to --tensor-parallel-size.
    tp_size: int
    # Value passed to --pipeline-parallel-size.
    pp_size: int
    # When true, --enforce-eager is added to the server arguments.
    eager_mode: bool
    # When true, --enable-chunked-prefill is added to the server arguments.
    chunked_prefill: bool


48
49
class PPTestOptions(NamedTuple):
    """Per-model options that control whether and how a test case runs."""

    # When true, the case is skipped unless running in multi-node mode.
    multi_node_only: bool
    # Forwarded as --load-format when set; "dummy" additionally shrinks
    # the model through HF config overrides.
    load_format: Optional[str] = None
51
52


53
54
@dataclass
class PPTestSettings:
    """Test matrix for one model.

    Each entry of ``parallel_setups`` is combined with every
    ``(distributed_backend, vllm_major_version)`` pair to produce the
    parametrized test cases (see ``iter_params``).
    """

    parallel_setups: list[ParallelSetup]
    # NOTE: the length of distributed_backends and
    # vllm_major_versions should be the same, and they
    # are first zipped together to iterate over all
    # test settings.
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
    runner: RunnerOption
    test_options: PPTestOptions

    def __post_init__(self):
        """Reject settings whose backend and version lists cannot be zipped.

        Raises:
            ValueError: if ``distributed_backends`` and
                ``vllm_major_versions`` differ in length.
        """
        num_backends = len(self.distributed_backends)
        num_versions = len(self.vllm_major_versions)
        if num_backends != num_versions:
            raise ValueError(
                f"Length mismatch: distributed_backends "
                f"({num_backends}) != "
                f"vllm_major_versions ({num_versions})")

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: Optional[str] = None,
    ):
        """Build the broad matrix: five parallel setups x four backends.

        The setups scale ``tp_base`` / ``pp_base`` by a factor of two and
        mix eager mode with chunked prefill; each one is run with the
        "mp" and "ray" backends on both V0 and V1.
        """
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "mp", "ray", "ray"],
            vllm_major_versions=["0", "1", "0", "1"],
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
        """Build the minimal matrix: one eager setup on the "mp" backend.

        The single case runs on V1 when ``runner == "pooling"`` and on V0
        otherwise.
        """
        vllm_major_versions = ["1"] if runner == "pooling" else ["0"]

        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            vllm_major_versions=vllm_major_versions,
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )

    def iter_params(self, model_id: str):
        """Yield one parameter tuple per (setup, backend/version) pair.

        Each tuple is ``(model_id, parallel_setup, backend,
        vllm_major_version, runner, test_options)``, matching the
        argument names used by the parametrized tests in this module.
        """
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend, vllm_major_version in zip(self.distributed_backends,
                                                   self.vllm_major_versions):
                yield (model_id, parallel_setup, backend, vllm_major_version,
                       self.runner, opts)
145
146


147
148
149
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

150
# yapf: disable
151
152
# Text-generation models and their PP test settings. Keys are the model
# names joined onto models_path_prefix; insertion order follows the inner
# literal.
TEXT_GENERATION_MODELS = {
    os.path.join(models_path_prefix, name): settings
    for name, settings in {
        # [Decoder-only]
        # Uses Llama
        # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
        "Snowflake/snowflake-arctic-instruct":
            PPTestSettings.fast(load_format="dummy"),
        "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
        "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
        "bigscience/bloomz-1b1": PPTestSettings.fast(),
        "zai-org/chatglm3-6b": PPTestSettings.fast(),
        "CohereForAI/c4ai-command-r-v01":
            PPTestSettings.fast(load_format="dummy"),
        "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
        "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
        "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
        "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
        "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
        "tiiuae/falcon-7b": PPTestSettings.fast(),
        "google/gemma-1.1-2b-it": PPTestSettings.fast(),
        "google/gemma-2-9b": PPTestSettings.fast(),
        "gpt2": PPTestSettings.fast(),
        "bigcode/starcoder": PPTestSettings.fast(),
        "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
        "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
        "ibm/PowerLM-3b": PPTestSettings.fast(),
        "ibm/PowerMoE-3b": PPTestSettings.fast(),
        # Uses Llama
        # "internlm/internlm-chat-7b": PPTestSettings.fast(),
        "internlm/internlm2-chat-7b": PPTestSettings.fast(),
        "inceptionai/jais-13b-chat": PPTestSettings.fast(),
        "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
        "pfnet/plamo-2-1b": PPTestSettings.fast(),
        "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
        # Tests TransformersForCausalLM
        "hmellor/Ilama-3.2-1B": PPTestSettings.fast(),
        "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
        "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
        # Uses Llama
        # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
        "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
        "mistralai/Mixtral-8x7B-Instruct-v0.1":
            PPTestSettings.fast(load_format="dummy"),
        "mosaicml/mpt-7b": PPTestSettings.fast(),
        "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
        "allenai/OLMo-1B-hf": PPTestSettings.fast(),
        "allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
        "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
        "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
        "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
        "adept/persimmon-8b-chat": PPTestSettings.fast(),
        "microsoft/phi-2": PPTestSettings.fast(),
        "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
        "microsoft/Phi-3.5-MoE-instruct":
            PPTestSettings.detailed(multi_node_only=True,
                                    load_format="dummy"),
        "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
        "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
        "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
        "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
        "bigcode/starcoder2-3b": PPTestSettings.fast(),
        "upstage/solar-pro-preview-instruct":
            PPTestSettings.fast(load_format="dummy"),
        # FIXME: Cannot load tokenizer in latest transformers version.
        # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
        # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
        # [Encoder-only]
        # TODO: Implement PP
        # "facebook/bart-base": PPTestSettings.fast(),
    }.items()
}

215
216
# Pooling/embedding models and their PP test settings. Keys are the model
# names joined onto models_path_prefix.
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    os.path.join(models_path_prefix, name): settings
    for name, settings in {
        # [Text-only]
        "intfloat/e5-mistral-7b-instruct":
            PPTestSettings.fast(runner="pooling"),
        # TODO: re-enable when
        # https://github.com/vllm-project/vllm/issues/23883 is fixed
        # "BAAI/bge-multilingual-gemma2":
        #     PPTestSettings.fast(runner="pooling"),
        "Qwen/Qwen2.5-Math-RM-72B":
            PPTestSettings.fast(load_format="dummy", runner="pooling"),
    }.items()
}

226
227
# Multimodal generation models and their PP test settings. Keys are the
# model names joined onto models_path_prefix.
MULTIMODAL_MODELS = {
    os.path.join(models_path_prefix, name): settings
    for name, settings in {
        # [Decoder-only]
        "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
        "facebook/chameleon-7b": PPTestSettings.fast(),
        "adept/fuyu-8b": PPTestSettings.fast(),
        "zai-org/glm-4v-9b": PPTestSettings.fast(),
        "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
        "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
        "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
        "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
        "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
        "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
        "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
        "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
        "AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(),
        "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
        "mistralai/Pixtral-12B-2409":
            PPTestSettings.fast(load_format="dummy"),
        "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
        "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
        "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
        "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
        # [Encoder-decoder]
        # TODO: Implement PP
        # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
    }.items()
}
# yapf: enable

253
# NOTE: You can update this on your local machine to run specific tests
254
# Subset of the model tables that is actually parametrized into tests.
# An entry with no matching key in a model table selects nothing.
TEST_MODELS = [
    os.path.join(models_path_prefix, name) for name in (
        # [LANGUAGE GENERATION]
        "microsoft/Phi-3.5-MoE-instruct",
        "meta-llama/Llama-3.2-1B-Instruct",
        "hmellor/Ilama-3.2-1B",
        "ibm/PowerLM-3b",
        "deepseek-ai/DeepSeek-V2-Lite-Chat",
        # [LANGUAGE EMBEDDING]
        "intfloat/e5-mistral-7b-instruct",
        "BAAI/bge-multilingual-gemma2",
        # [MULTIMODAL GENERATION]
        "OpenGVLab/InternVL2-1B",
        "microsoft/Phi-3.5-vision-instruct",
        "fixie-ai/ultravox-v0_5-llama-3_2-1b",
        # [LANGUAGE GENERATION - HYBRID ARCH]
        "ai21labs/Jamba-tiny-dev",
    )
]


273
def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Compare a pipeline-parallel run against a TP-only baseline.

    Two argument lists are built on top of a shared set of options: the
    PP list adds ``--pipeline-parallel-size`` / ``--tensor-parallel-size``
    with the requested executor backend, while the baseline list only
    sets ``--tensor-parallel-size`` and always uses the "mp" backend.
    Both are handed to ``compare_two_settings``.

    The test is skipped when the transformers version check or (for
    non-dummy weights) the online-availability check fails, when fewer
    than ``tp_size * pp_size`` GPUs are available, when the "mp" backend
    is requested in multi-node mode, or when a multi-node-only case runs
    outside multi-node mode.

    Args:
        model_id: Key used for the registry lookup, the config load and
            the comparison.
        parallel_setup: TP/PP sizes plus eager-mode and chunked-prefill
            switches.
        distributed_backend: Executor backend for the PP run.
        vllm_major_version: "0" or "1"; exported as ``VLLM_USE_V1``.
        runner: Forwarded as ``--runner`` unless it is "auto".
        test_options: Multi-node restriction and optional load format.
        num_gpus_available: Number of GPUs visible to the test.
        method: Forwarded to ``compare_two_settings``.
        is_multimodal: When true, dummy-weight size overrides are nested
            under ``text_config`` instead of applied at the top level.

    Raises:
        Exception: whatever ``compare_two_settings`` raises, except for
            V0 Ray Compiled Graph runs, whose failures are only logged.
    """
    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    # Work on a copy: the overrides added below must not be written back
    # into the dict owned by the shared HF_EXAMPLE_MODELS entry.
    hf_overrides = dict(model_info.hf_overrides or {})
    hf_config = get_config(model_id, trust_remote_code)
    skip_tokenizer_init = model_info.skip_tokenizer_init
    max_num_seqs = model_info.max_num_seqs

    dtype = "float16"
    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
        dtype = "bfloat16"

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
    if skip_tokenizer_init:
        common_args.append("--skip-tokenizer-init")
    if max_num_seqs:
        # Appended after the default "--max-num-seqs 8" above, so the
        # flag appears twice in the argument list.
        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])

    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
    testing_ray_compiled_graph = False
    if distributed_backend == "ray" and (vllm_major_version == "1"
                                         or specific_case):
        # For V1, test Ray Compiled Graph for all the tests
        # For V0, test Ray Compiled Graph for a subset of the tests
        pp_env = {
            "VLLM_USE_V1": vllm_major_version,
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
        testing_ray_compiled_graph = True
    elif distributed_backend == "mp":
        # Both V0/V1 of multiprocessing executor support PP
        pp_env = {
            "VLLM_USE_V1": vllm_major_version,
        }
    else:
        pp_env = None

    tp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_id,
                             pp_args,
                             tp_args,
                             pp_env,
                             tp_env,
                             method=method)
    except Exception:
        if testing_ray_compiled_graph and vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
        else:
            raise
426
427
428


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "runner", "test_options"),
    [
        # Filter on TEST_MODELS first so parameters are only expanded for
        # the selected models.
        params for model_id, settings in TEXT_GENERATION_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for text-generation models.

    ``num_gpus_available`` is presumably a fixture supplied outside this
    file (verify against the conftest).
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                runner,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=False)
455
456
457


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "runner", "test_options"),
    [
        # Filter on TEST_MODELS first so parameters are only expanded for
        # the selected models.
        params for model_id, settings in EMBEDDING_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for embedding models via "encode".

    ``num_gpus_available`` is presumably a fixture supplied outside this
    file (verify against the conftest).
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                runner,
                test_options,
                num_gpus_available,
                method="encode",
                is_multimodal=False)
484
485
486


@pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
     "runner", "test_options"),
    [
        # Filter on TEST_MODELS first so parameters are only expanded for
        # the selected models.
        params for model_id, settings in MULTIMODAL_MODELS.items()
        if model_id in TEST_MODELS
        for params in settings.iter_params(model_id)
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
):
    """Run the PP-vs-TP comparison for multimodal generation models.

    ``num_gpus_available`` is presumably a fixture supplied outside this
    file (verify against the conftest).
    """
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
                vllm_major_version,
                runner,
                test_options,
                num_gpus_available,
                method="generate",
                is_multimodal=True)