test_pipeline_parallel.py 4.5 KB
Newer Older
1
2
3
4
5
6
7
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
8
9
import os

10
11
import pytest

12
from ..utils import compare_two_settings, fork_new_process_for_each_test
13

14
15
# True only when the VLLM_MULTI_NODE environment variable is exactly "1";
# an unset variable falls back to "0" and therefore yields False.
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"

16

17
@pytest.mark.parametrize(
    ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
     "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), [
         (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
         (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
         (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
         (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
         (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
         (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
         (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
         (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
         (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
         (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
         (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
         (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
         (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
         (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
         (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
         (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
         (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
         (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
         (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
         (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
     ])
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
                    DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL):
    """Compare a pipeline-parallel setup against a tensor-parallel-only one.

    Two argument lists are built and handed to ``compare_two_settings``:
    one with ``--pipeline-parallel-size PP_SIZE`` on the ``DIST_BACKEND``
    executor, and a reference one without pipeline parallelism on the
    "mp" executor. Both lists use the same ``--dtype`` so that the
    parallelism layout is the only thing that differs between them.

    Args:
        TP_SIZE: Tensor-parallel size for the pipeline-parallel setting.
        PP_SIZE: Pipeline-parallel size for the pipeline-parallel setting.
        EAGER_MODE: Truthy to pass ``--enforce-eager`` to both settings.
        CHUNKED_PREFILL: Truthy to pass ``--enable-chunked-prefill`` to
            both settings.
        MODEL_NAME: Model identifier forwarded to ``compare_two_settings``.
        DIST_BACKEND: Value for ``--distributed-executor-backend`` in the
            pipeline-parallel setting ("ray" or "mp" in the parameter list).
        USE_RAY_ADAG: When true, sets the Ray compiled-DAG environment
            variables for the pipeline-parallel setting; requires
            ``DIST_BACKEND == "ray"``.
        USE_RAY_ADAG_NCCL: Written as "0"/"1" into
            ``VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL`` when
            ``USE_RAY_ADAG`` is true.
    """
    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

    # use half precision for speed and memory savings in CI environment.
    # The same dtype is shared by both settings: the original code used
    # float16 for one side and bfloat16 for the other, which makes the
    # comparison depend on precision rather than on parallelism alone.
    dtype_args = ["--dtype", "float16"]

    pp_args = [
        *dtype_args,
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
        DIST_BACKEND,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *dtype_args,
        "--tensor-parallel-size",
        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
        "--distributed-executor-backend",
        "mp",
    ]

    # Optional engine flags are always applied to both settings.
    if CHUNKED_PREFILL:
        pp_args.append("--enable-chunked-prefill")
        tp_args.append("--enable-chunked-prefill")
    if EAGER_MODE:
        pp_args.append("--enforce-eager")
        tp_args.append("--enforce-eager")

    # Extra environment for the pipeline-parallel setting only.
    # NOTE(review): assumes the 4th positional argument of
    # compare_two_settings is the env for the first argument list - confirm.
    pp_env = None
    if USE_RAY_ADAG:
        assert DIST_BACKEND == "ray", (
            "Ray ADAG is only supported with Ray distributed backend")
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
            str(int(USE_RAY_ADAG_NCCL)),
        }

    compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
91
92
93
94
95
96
97
98
99


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    "FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    """Compare pipeline-parallel runs with and without ``--enforce-eager``.

    The attention backend under test is selected by writing ``ATTN_BACKEND``
    into the ``VLLM_ATTENTION_BACKEND`` environment variable. Two argument
    lists that differ only by the trailing ``--enforce-eager`` flag are then
    passed to ``compare_two_settings``.

    Args:
        PP_SIZE: Value for ``--pipeline-parallel-size``.
        MODEL_NAME: Model identifier forwarded to ``compare_two_settings``.
        ATTN_BACKEND: Value stored in ``VLLM_ATTENTION_BACKEND``.
    """
    # NOTE(review): this mutates os.environ directly; presumably the
    # fork_new_process_for_each_test decorator keeps the change from leaking
    # into other tests - confirm against its implementation.
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

    # use half precision for speed and memory savings in CI environment
    graph_mode_args = [
        "--dtype", "float16",
        "--pipeline-parallel-size", str(PP_SIZE),
        "--distributed-executor-backend", "mp",
    ]
    eager_mode_args = [*graph_mode_args, "--enforce-eager"]

    compare_two_settings(MODEL_NAME, eager_mode_args, graph_mode_args)