test_pp_cudagraph.py 1.16 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
from __future__ import annotations
3

4
from typing import TYPE_CHECKING
5

zhuwenwen's avatar
zhuwenwen committed
6
import os
7
8
import pytest

zhuwenwen's avatar
zhuwenwen committed
9
from ..utils import compare_two_settings, create_new_process_for_each_test, models_path_prefix
10

11
12
if TYPE_CHECKING:
    from typing_extensions import LiteralString
13
14
15


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
16
    (2, os.path.join(models_path_prefix, "JackFram/llama-160m")),
17
18
19
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
20
    # "FLASHINFER",
21
])
22
@create_new_process_for_each_test()
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def test_pp_cudagraph(
    monkeypatch: pytest.MonkeyPatch,
    PP_SIZE: int,
    MODEL_NAME: str,
    ATTN_BACKEND: LiteralString,
):
    """Check that eager and non-eager launches agree under pipeline parallelism.

    Two argument lists are handed to ``compare_two_settings`` for the same
    model: a baseline list and the same list with ``--enforce-eager``
    appended. The attention backend is selected through the
    ``VLLM_ATTENTION_BACKEND`` environment variable for the duration of the
    comparison only.

    Args:
        monkeypatch: pytest fixture used to scope the environment change.
        PP_SIZE: value passed to ``--pipeline-parallel-size``.
        MODEL_NAME: model identifier/path forwarded to the comparison helper.
        ATTN_BACKEND: value written to ``VLLM_ATTENTION_BACKEND``.
    """
    # Flags shared by both launches. Half precision keeps the run fast and
    # light on memory in the CI environment.
    graph_mode_flags = [
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--distributed-executor-backend",
        "mp",
    ]
    # The eager variant differs from the baseline by exactly one flag.
    eager_mode_flags = [*graph_mode_flags, "--enforce-eager"]

    # Scope the env override to this test so it is undone when the context
    # exits, even if the comparison fails.
    with monkeypatch.context() as scoped_env:
        scoped_env.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
        # NOTE(review): compare_two_settings is imported from ..utils;
        # presumably it runs the model once per argument list and asserts the
        # results match — confirm against the helper.
        compare_two_settings(MODEL_NAME, eager_mode_flags, graph_mode_flags)