test_pipeline_parallel.py 11.2 KB
Newer Older
1
2
3
4
5
6
7
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
8
import os
9
10
from dataclasses import dataclass
from typing import List, NamedTuple, Optional
11

12
13
import pytest

14
15
from vllm.logger import init_logger

16
from ..utils import compare_two_settings, fork_new_process_for_each_test
17

18
19
logger = init_logger("test_pipeline_parallel")

20
21
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

22

23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
class ParallelSetup(NamedTuple):
    tp_size: int
    pp_size: int
    eager_mode: bool
    chunked_prefill: bool


@dataclass
class PPTestSettings:
    parallel_setups: List[ParallelSetup]
    distributed_backends: List[str]
    trust_remote_code: bool
    tokenizer_mode: Optional[str]

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=False,
                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "ray"],
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
                              eager_mode=True,
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )

    def iter_params(self, model_name: str):
        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (model_name, parallel_setup, distributed_backend,
                       self.trust_remote_code, self.tokenizer_mode)


# yapf: disable
GENERATION_MODEL_SETTINGS = {
    # [DETAILED TESTS]
    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
    # [FAST TESTS]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    # TODO: Test on larger GPU
    # "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
    # TODO: Test on larger GPU
    # "databricks/dbrx-instruct": PPTestSettings.fast(),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-2b": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-12b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
    "core42/jais-13b-chat": PPTestSettings.fast(),
    # TODO: Implement PP
    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    # FIXME: https://github.com/vllm-project/vllm/issues/8553
    # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
    # FIXME: Cannot load tokenizer in latest transformers version
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}

EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    # Uses Llama
    # "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
}

MULTIMODAL_MODEL_SETTINGS = {
    # [FAST TESTS]
    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}

CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable

MODEL_SETTINGS = {
    **GENERATION_MODEL_SETTINGS,
    **EMBEDDING_MODEL_SETTINGS,
    **MULTIMODAL_MODEL_SETTINGS,
}

# You can update this on your local machine to run specific tests
TEST_MODELS = [
    "meta-llama/Meta-Llama-3-8B",
    "facebook/chameleon-7b",
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3-vision-128k-instruct",
    "mistralai/Pixtral-12B-2409",
    "fixie-ai/ultravox-v0_3",
]


212
@pytest.mark.parametrize(
213
214
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
215
    [
216
217
218
        params for model_name, settings in MODEL_SETTINGS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
219
220
    ],
)
221
@fork_new_process_for_each_test
222
223
224
225
226
227
228
229
def test_compare_tp(model_name: str, parallel_setup: ParallelSetup,
                    distributed_backend: str, trust_remote_code: bool,
                    tokenizer_mode: Optional[str], num_gpus_available):
    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup

    if num_gpus_available < tp_size:
        pytest.skip(f"Need at least {tp_size} GPUs to run the test")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
230
231
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
232

233
    common_args = [
234
235
        # use half precision for speed and memory savings in CI environment
        "--dtype",
236
        "float16",
237
        "--max-model-len",
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])

    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
            and chunked_prefill):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of aDAG issue.
        common_args.append("--disable-frontend-multiprocessing")
    else:
        pp_env = None

    pp_args = [
        *common_args,
267
        "--pipeline-parallel-size",
268
        str(pp_size),
269
        "--tensor-parallel-size",
270
        str(tp_size),
271
        "--distributed-executor-backend",
272
        distributed_backend,
273
    ]
274
275
276
277
278
279
280

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
281
        *common_args,
282
        "--tensor-parallel-size",
283
        str(tp_size),
284
285
286
287
        "--distributed-executor-backend",
        "mp",
    ]

288
    try:
289
        compare_two_settings(model_name, pp_args, tp_args, pp_env)
290
291
292
293
294
295
    except Exception:
        if pp_env is None:
            raise
        else:
            # Ray ADAG tests are flaky, so we don't want to fail the test
            logger.exception("Ray ADAG tests failed")