# test_integration.py
"""Tests which cover integration of the speculative decoding framework with
other features, e.g. cuda graphs.
"""

import pytest
6
import os
7

8
from .conftest import run_equality_correctness_test
9
from ...utils import models_path_prefix
10

11
# Path of the "JackFram/llama-68m" checkpoint, resolved under models_path_prefix.
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
12
13
14
15
16
17
18
19
20
21


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [
        {
            # Spec decode needs the v2 block manager.
            "use_v2_block_manager": True,
            # Keep eager mode off so that cuda graphs are allowed.
            "enforce_eager": False,
            "model_name": os.path.join(models_path_prefix,
                                       "JackFram/llama-68m"),
        },
    ],
)
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        {
            # The draft model is the same checkpoint as the target model.
            "speculative_model": os.path.join(models_path_prefix,
                                              "JackFram/llama-68m"),
            "num_speculative_tokens": 5,
        },
    ],
)
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [32])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
                                per_test_common_llm_kwargs,
                                baseline_llm_kwargs, test_llm_kwargs,
                                batch_size: int, output_len: int, seed: int):
    """Check spec decode equality with cuda graphs enabled.

    The target and draft models are the same llama-68m checkpoint and
    ``enforce_eager`` is False. The comparison itself is delegated to
    ``run_equality_correctness_test``, called with ``temperature=0.0``.
    """
    # Kwarg layers, in the positional order the helper is called with.
    llm_kwarg_layers = (
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        test_llm_kwargs,
    )
    run_equality_correctness_test(
        vllm_runner,
        *llm_kwarg_layers,
        batch_size,
        max_output_len=output_len,
        seed=seed,
        temperature=0.0,
    )
53
54
55
56
57


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [
        {
            "model_name": os.path.join(models_path_prefix,
                                       "JackFram/llama-160m"),
            # Eager mode skips cuda graph recording, keeping the test fast.
            "enforce_eager": True,
            # Spec decode needs the v2 block manager.
            "use_v2_block_manager": True,
        },
    ],
)
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        {
            "speculative_model": os.path.join(
                models_path_prefix,
                "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
            "num_speculative_tokens": 5,
        },
    ],
)
# Draft-model quantization settings under test, in order:
#   "gptq"   - quantization specified explicitly,
#   "marlin" - GPTQ-based draft model explicitly told to use marlin,
#   None     - quantization not specified explicitly.
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {"speculative_model_quantization": quantization}
        for quantization in ("gptq", "marlin", None)
    ],
)
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
                                               per_test_common_llm_kwargs,
                                               baseline_llm_kwargs,
                                               test_llm_kwargs,
                                               batch_size: int, seed: int):
    """Check spec decode with different draft model quantization configs.

    Each parametrized ``test_llm_kwargs`` sets a different
    ``speculative_model_quantization`` value. The comparison is delegated to
    ``run_equality_correctness_test`` with a fixed ``max_output_len`` of 32
    and ``temperature=0.0``.
    """
    # Kwarg layers, in the positional order the helper is called with.
    llm_kwarg_layers = (
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        test_llm_kwargs,
    )
    run_equality_correctness_test(
        vllm_runner,
        *llm_kwarg_layers,
        batch_size,
        max_output_len=32,
        seed=seed,
        temperature=0.0,
    )