test_chat.py 5.72 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import weakref
4

5
6
7
import pytest

from vllm import LLM
8
from vllm.distributed import cleanup_dist_env_and_memory
9
from vllm.sampling_params import SamplingParams
10

11
from ..openai.test_vision import TEST_IMAGE_ASSETS
12
13


14
15
16
17
@pytest.fixture(scope="function")
def text_llm():
    """Yield a weak proxy to a text-only ``LLM`` and clean up afterwards.

    The test receives only a ``weakref.proxy``; the single strong reference
    stays in this fixture so that deleting it during teardown allows the
    object to be garbage collected despite pytest caching the fixture value.
    """
    engine_kwargs = {
        "model": "meta-llama/Llama-3.2-1B-Instruct",
        "enforce_eager": True,
        "seed": 0,
    }
    engine = LLM(**engine_kwargs)
    engine_proxy = weakref.proxy(engine)

    yield engine_proxy

    # Drop the only strong reference before running the cleanup helper.
    del engine

    cleanup_dist_env_and_memory()


27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
@pytest.fixture(scope="function")
def llm_for_failure_test():
    """
    Fixture for testing issue #26081.

    Builds the LLM with a small ``max_model_len`` (128) so that length
    errors are easy to trigger, yields a weak proxy to it, and runs the
    cleanup helper on teardown.
    """
    engine_kwargs = {
        "model": "meta-llama/Llama-3.2-1B-Instruct",
        "enforce_eager": True,
        "seed": 0,
        "max_model_len": 128,
        "disable_log_stats": True,
    }
    engine = LLM(**engine_kwargs)
    # pytest caches the fixture value, so hand out a weak proxy and keep the
    # only strong reference here to enable garbage collection on teardown.
    engine_proxy = weakref.proxy(engine)

    yield engine_proxy

    del engine

    cleanup_dist_env_and_memory()


50
def test_chat(text_llm):
    """A single system + user conversation produces exactly one output."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Explain the concept of entropy."},
    ]

    results = text_llm.chat(conversation)

    assert len(results) == 1


60
def test_multi_chat(text_llm):
    """A batch of two conversations produces exactly two outputs."""
    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # Each conversation gets its own system message followed by one user turn.
    conversations = [
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    results = text_llm.chat(conversations)

    assert len(results) == 2


80
81
82
83
@pytest.fixture(scope="function")
def vision_llm():
    """Yield a weak proxy to a Phi-3.5-vision ``LLM`` and clean up afterwards.

    The engine is configured with ``limit_mm_per_prompt={"image": 2}``.
    Only a ``weakref.proxy`` is handed to the test; the strong reference is
    deleted on teardown to enable garbage collection despite pytest caching
    the fixture value.
    """
    engine_kwargs = {
        "model": "microsoft/Phi-3.5-vision-instruct",
        "max_model_len": 4096,
        "max_num_seqs": 5,
        "enforce_eager": True,
        "trust_remote_code": True,
        "limit_mm_per_prompt": {"image": 2},
        "seed": 0,
    }
    engine = LLM(**engine_kwargs)
    engine_proxy = weakref.proxy(engine)

    yield engine_proxy

    # Drop the only strong reference before running the cleanup helper.
    del engine

    cleanup_dist_env_and_memory()


101
102
103
@pytest.mark.parametrize(
    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """A single user message carrying several images yields one output.

    The message content holds one ``image_url`` part per entry in
    ``image_urls`` followed by a single text part.
    """
    messages = [
        {
            "role": "user",
            "content": [
                *(
                    {"type": "image_url", "image_url": {"url": image_url}}
                    for image_url in image_urls
                ),
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]
    outputs = vision_llm.chat(messages)
    # Exactly one conversation was submitted, so exactly one output is
    # expected. The previous `>= 0` check could never fail.
    assert len(outputs) == 1
119
120


121
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    LLM.chat() should not add special tokens when using chat templates.

    Verifies that the tokenized llama chat prompt starts with one BOS token
    and that the second token is not another BOS.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello!"},
    ]

    results = text_llm.chat(conversation)
    assert len(results) == 1

    token_ids = results[0].prompt_token_ids
    assert token_ids is not None

    tokenizer = text_llm.get_tokenizer()
    bos_id = tokenizer.bos_token_id

    # The prompt must open with BOS, and only one of them.
    assert token_ids[0] == bos_id
    assert token_ids[1] != bos_id, "Double BOS"
141
142
143
144
145
146
147
148
149
150
151
152
153


@pytest.fixture(scope="function")
def thinking_llm():
    """Yield a weak proxy to a Qwen3-0.6B ``LLM`` and clean up afterwards.

    Only a ``weakref.proxy`` is handed to the test; the strong reference is
    deleted on teardown to enable garbage collection despite pytest caching
    the fixture value.
    """
    engine_kwargs = {
        "model": "Qwen/Qwen3-0.6B",
        "max_model_len": 4096,
        "enforce_eager": True,
        "seed": 0,
    }
    engine = LLM(**engine_kwargs)
    engine_proxy = weakref.proxy(engine)

    yield engine_proxy

    # Drop the only strong reference before running the cleanup helper.
    del engine

    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """``chat_template_kwargs`` passed to ``chat`` changes the rendered prompt.

    The ``<think>`` token must be absent from the prompt when thinking is
    enabled and present when it is disabled.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "What is 1+1?"},
    ]
    template_kwargs = {"enable_thinking": enable_thinking}

    results = thinking_llm.chat(conversation, chat_template_kwargs=template_kwargs)
    assert len(results) == 1

    token_ids = results[0].prompt_token_ids
    assert token_ids is not None

    vocab = thinking_llm.get_tokenizer().get_vocab()
    think_id = vocab["<think>"]
    think_in_prompt = think_id in token_ids

    if enable_thinking:
        assert not think_in_prompt
    else:
        # The chat template includes dummy thinking process
        assert think_in_prompt
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212


def test_chat_batch_failure_cleanup(llm_for_failure_test):
    """
    Checks that when a batched llm.chat() call fails part-way
    (e.g., because one prompt is invalid), the requests that
    had already been enqueued are aborted and do not leak
    into the queue seen by later calls.
    (Fixes Issue #26081)
    """
    engine = llm_for_failure_test
    sampling_params = SamplingParams(temperature=0, max_tokens=10)

    ok_conversation = [{"role": "user", "content": "Hello"}]
    oversized_text = "This is a very long text to test the error " * 50
    oversized_conversation = [{"role": "user", "content": oversized_text}]

    # The invalid conversation comes last, after two valid ones.
    failing_batch = [ok_conversation, ok_conversation, oversized_conversation]
    followup_batch = [ok_conversation, ok_conversation]

    with pytest.raises(ValueError, match="longer than the maximum model length"):
        engine.chat(failing_batch, sampling_params=sampling_params)

    # A later call must return only its own outputs and leave nothing pending.
    followup_outputs = engine.chat(followup_batch, sampling_params=sampling_params)
    assert len(followup_outputs) == len(followup_batch)
    assert engine.llm_engine.get_num_unfinished_requests() == 0