# test_pp_cudagraph.py
1
2
# SPDX-License-Identifier: Apache-2.0

3
4
5
6
import os

import pytest

7
from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
8
9
10


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, os.path.join(models_path_prefix, "JackFram/llama-160m")),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    # "FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    """Run the same pipeline-parallel setup with and without eager mode.

    Two argument lists are built that differ only by ``--enforce-eager`` and
    are handed to ``compare_two_settings`` together with the model path.

    Args:
        PP_SIZE: Value passed to ``--pipeline-parallel-size``.
        MODEL_NAME: Model path, built from ``models_path_prefix``.
        ATTN_BACKEND: Value written to ``VLLM_ATTENTION_BACKEND``.

    NOTE(review): ``compare_two_settings`` is defined outside this file;
    presumably it checks that both settings produce matching results --
    confirm against ``..utils``.
    """
    cudagraph_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--distributed-executor-backend",
        "mp",
    ]
    # The variable is set directly and not restored here.
    # NOTE(review): presumably ``fork_new_process_for_each_test`` keeps this
    # from leaking into other tests -- confirm.
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

    # Same arguments plus --enforce-eager, used as the first setting.
    eager_args = cudagraph_args + ["--enforce-eager"]

    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)