# test_pipeline_parallel.py
import pytest

3
from ..utils import compare_two_settings
4
5


6
@pytest.mark.parametrize(
    "TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME", [
        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B"),
        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B"),
        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B"),
        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B"),
        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"),
    ])
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
    """Compare a pipeline-parallel setup with a tensor-parallel-only baseline.

    Two command-line argument lists are built for the same model and passed
    to ``compare_two_settings``:

    * ``pp_args`` -- pipeline parallelism of ``PP_SIZE`` combined with tensor
      parallelism of ``TP_SIZE``, using the ``ray`` executor backend.
    * ``tp_args`` -- tensor parallelism only, using the ``mp`` executor
      backend.

    Args:
        TP_SIZE: Tensor-parallel size for the pipeline-parallel run.
        PP_SIZE: Pipeline-parallel size for the pipeline-parallel run.
        EAGER_MODE: Truthy to add ``--enforce-eager`` to both runs.
        CHUNKED_PREFILL: Truthy to add ``--enable-chunked-prefill`` to both
            runs.
        MODEL_NAME: Model identifier forwarded to ``compare_two_settings``.
    """
    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
        "ray",
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--tensor-parallel-size",
        # We only use 2 GPUs in the CI.
        # NOTE(review): max() makes the baseline use at least TP=2 (it is a
        # floor, not a cap) -- confirm this matches the "2 GPUs" intent.
        str(max(TP_SIZE, 2)),
        "--distributed-executor-backend",
        "mp",
    ]

    # Optional flags are applied to both runs so the only difference between
    # them is the parallelism layout and executor backend.
    if CHUNKED_PREFILL:
        pp_args.append("--enable-chunked-prefill")
        tp_args.append("--enable-chunked-prefill")
    if EAGER_MODE:
        pp_args.append("--enforce-eager")
        tp_args.append("--enforce-eager")

    compare_two_settings(MODEL_NAME, pp_args, tp_args)