# test_pipeline_parallel.py
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers on a node other than the head node, which can cause the test
 to fail.
"""
8
9
import os

10
import pytest
11
12
from packaging import version
from transformers import __version__ as transformers_version
13

14
15
from vllm.logger import init_logger

16
from ..utils import compare_two_settings, fork_new_process_for_each_test
17

18
19
# Module-level logger used to report tolerated failures in this test file.
logger = init_logger("test_pipeline_parallel")

# True only when the environment variable VLLM_MULTI_NODE is exactly "1";
# an unset variable falls back to "0".
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"

22

23
24
25
26
27
28
29
30
31
32
33
34
35
36
@pytest.mark.parametrize(
    ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
     "MODEL_NAME, DIST_BACKEND"),
    [
        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        # NOTE: InternVL2 multi-node tests are flaky,
        # use mp backend to skip the multi-node tests
        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
        (1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp")
    ],
)
@fork_new_process_for_each_test
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
                    TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
    """Run a pipeline-parallel setup against a tensor-parallel baseline.

    Two CLI argument lists are assembled for ``MODEL_NAME`` and handed to
    ``compare_two_settings``: one with the requested pipeline/tensor
    parallel sizes and executor backend, and a baseline that uses only
    tensor parallelism on the "mp" backend. The truthy/falsy flags
    ``CHUNKED_PREFILL``, ``EAGER_MODE`` and ``TRUST_REMOTE_CODE`` add the
    matching switch to both lists.

    The case is skipped when running multi-node with the "mp" backend, and
    for Qwen2-VL models when the installed transformers is older than
    4.45.0.dev0. Failures are re-raised unless the Ray ADAG environment was
    enabled, in which case they are only logged.
    """
    if DIST_BACKEND == "mp" and VLLM_MULTI_NODE:
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

    # Qwen2-VL needs transformers>=4.45.0; skip on older installs.
    if "Qwen2-VL" in MODEL_NAME:
        installed = version.parse(transformers_version)
        if installed < version.parse("4.45.0.dev0"):
            pytest.skip("This test requires transformers>=4.45.0")

    # Options common to both runs: half precision keeps CI fast and light
    # on memory.
    shared_args = ["--dtype", "float16", "--max-model-len", "8192"]

    pp_args = shared_args + [
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
        DIST_BACKEND,
    ]

    # Baseline without pipeline parallelism.
    # NOTE: the baseline always uses the mp backend. PP tests might span
    #  multiple nodes, and ray might place every worker on a node other
    #  than the head node, which can cause the test to fail.
    tp_args = shared_args + [
        "--tensor-parallel-size",
        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
        "--distributed-executor-backend",
        "mp",
    ]

    # Ray ADAG is exercised only for a subset of the parametrized cases.
    use_adag = (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
                and CHUNKED_PREFILL)

    # Each enabled switch is appended to both argument lists, in this order.
    optional_flags = (
        (CHUNKED_PREFILL, "--enable-chunked-prefill"),
        (EAGER_MODE, "--enforce-eager"),
        (TRUST_REMOTE_CODE, "--trust-remote-code"),
        # Temporary. Currently when zeromq + SPMD is used, it does not
        # properly terminate because of aDAG issue.
        (use_adag, "--disable-frontend-multiprocessing"),
    )
    for enabled, flag in optional_flags:
        if enabled:
            pp_args.append(flag)
            tp_args.append(flag)

    pp_env = {
        "VLLM_USE_RAY_COMPILED_DAG": "1",
        "VLLM_USE_RAY_SPMD_WORKER": "1",
        "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
    } if use_adag else None

    try:
        compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
    except Exception:
        if pp_env is None:
            raise
        # Ray ADAG tests are flaky, so we don't want to fail the test
        logger.exception("Ray ADAG tests failed")