# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests which cover integration of the speculative decoding framework with
other features, e.g. cuda graphs.
"""

import pytest
8
import os
9

10
from .conftest import run_equality_correctness_test
11
from ...utils import models_path_prefix
12

13
14
15

# Set at import time, i.e. before any test in this module runs.
# NOTE(review): the consumer of LLAMA_NN is not visible in this file --
# confirm what reads it and why "0" is required for these tests.
os.environ["LLAMA_NN"] = "0"

# Path of the llama-68m checkpoint joined onto models_path_prefix; used as
# the target model by test_mqa_scorer.
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Verify equality when cuda graphs allowed.
        "enforce_eager": False,
        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
    }])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        {
            # Identical models: the draft model path equals the target's.
            "speculative_config": {
                "model": os.path.join(models_path_prefix,
                                      "JackFram/llama-68m"),
                "num_speculative_tokens": 5,
            },
        },
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [32])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
                                per_test_common_llm_kwargs,
                                baseline_llm_kwargs, test_llm_kwargs,
                                batch_size: int, output_len: int, seed: int):
    """Verify spec decode equality when cuda graphs are enabled.

    The engine is configured with ``enforce_eager=False`` and a speculative
    config whose draft model path is the same as the target model path,
    proposing 5 speculative tokens. All arguments are forwarded unchanged to
    ``run_equality_correctness_test`` with ``temperature=0.0``.

    Args:
        vllm_runner: Runner fixture passed through to the helper
            (presumably provided by a conftest -- not visible here).
        common_llm_kwargs: Engine kwargs from the first parametrization.
        per_test_common_llm_kwargs: Kwargs carrying the speculative config.
        baseline_llm_kwargs: Extra kwargs for the baseline run (empty here).
        test_llm_kwargs: Extra kwargs for the test run (empty here).
        batch_size: Forwarded to the helper as the batch size.
        output_len: Forwarded to the helper as ``max_output_len``.
        seed: Forwarded to the helper as ``seed``.
    """
    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  max_output_len=output_len,
                                  seed=seed,
                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
    }])
# NOTE(review): an empty argvalues list makes pytest generate no test cases
# for this function (by default it is reported as skipped). The sibling tests
# use [{}] for unused kwargs -- confirm whether disabling this test is
# intentional before switching to [{}].
@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        # Explicitly specify draft model quantization
        {
            "speculative_config": {
                "model":
                os.path.join(models_path_prefix,
                             "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": "gptq",
            },
        },
        # Explicitly specify GPTQ-based draft model to use marlin quantization
        {
            "speculative_config": {
                "model":
                os.path.join(models_path_prefix,
                             "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": "marlin",
            },
        },
        # Do not explicitly specify draft model quantization
        {
            "speculative_config": {
                "model":
                os.path.join(models_path_prefix,
                             "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": None,
            },
        },
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
                                               per_test_common_llm_kwargs,
                                               baseline_llm_kwargs,
                                               test_llm_kwargs,
                                               batch_size: int, seed: int):
    """Verify spec decode works well with draft model quantization configs.

    Each ``test_llm_kwargs`` case uses the same GPTQ 4-bit TinyLlama draft
    model and differs only in the ``quantization`` entry of the speculative
    config: ``"gptq"``, ``"marlin"``, or ``None``. All arguments are forwarded
    unchanged to ``run_equality_correctness_test`` with a fixed
    ``max_output_len`` of 32 and ``temperature=0.0``.

    Args:
        vllm_runner: Runner fixture passed through to the helper
            (presumably provided by a conftest -- not visible here).
        common_llm_kwargs: Engine kwargs (llama-160m target, eager mode).
        per_test_common_llm_kwargs: Parametrized from an empty list; see the
            review note on its decorator.
        baseline_llm_kwargs: Extra kwargs for the baseline run (empty here).
        test_llm_kwargs: Kwargs carrying the speculative config under test.
        batch_size: Forwarded to the helper as the batch size.
        seed: Forwarded to the helper as ``seed``.
    """
    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  max_output_len=32,
                                  seed=seed,
                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": MAIN_MODEL,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "speculative_config": {
        "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
        "num_speculative_tokens": 3,
        "disable_mqa_scorer": True,
    },
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
                    output_len: int, seed: int):
    """Verify that speculative decoding generates the same output
    with batch expansion scorer and mqa scorer.

    The test-side speculative config sets ``disable_mqa_scorer=True`` and
    proposes 3 speculative tokens from a llama-68m draft model; the baseline
    side adds no extra kwargs. All arguments are forwarded unchanged to
    ``run_equality_correctness_test`` with ``temperature=0.0``.

    Args:
        vllm_runner: Runner fixture passed through to the helper
            (presumably provided by a conftest -- not visible here).
        common_llm_kwargs: Engine kwargs (MAIN_MODEL target, eager mode).
        per_test_common_llm_kwargs: Extra shared kwargs (empty here).
        baseline_llm_kwargs: Extra kwargs for the baseline run (empty here).
        test_llm_kwargs: Kwargs carrying the speculative config under test.
        batch_size: Forwarded to the helper as the batch size (1 and 5).
        output_len: Forwarded to the helper as ``max_output_len``.
        seed: Forwarded to the helper as ``seed``.
    """
    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  max_output_len=output_len,
                                  seed=seed,
                                  temperature=0.0)