test_chat.py 5.75 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import weakref
4

5
6
import pytest

7
from tests.entrypoints.openai.chat_completion.test_vision import TEST_IMAGE_ASSETS
8
from vllm import LLM
9
from vllm.distributed import cleanup_dist_env_and_memory
10
from vllm.sampling_params import SamplingParams
11
12


13
14
15
16
@pytest.fixture(scope="function")
def text_llm():
    """
    Per-test fixture yielding a text-only Llama-3.2-1B-Instruct engine.

    The test receives a ``weakref.proxy`` rather than the engine itself:
    pytest caches the fixture value, and handing out only a weak reference
    lets the engine be garbage-collected once the local name is deleted
    during teardown.
    """
    engine_kwargs = {
        "model": "meta-llama/Llama-3.2-1B-Instruct",
        "enforce_eager": True,
        "seed": 0,
    }
    engine = LLM(**engine_kwargs)

    yield weakref.proxy(engine)

    # Teardown: drop the only strong reference first, then run the shared
    # cleanup helper from vllm.distributed.
    del engine
    cleanup_dist_env_and_memory()


26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
@pytest.fixture(scope="function")
def llm_for_failure_test():
    """
    Per-test engine used by the regression test for issue #26081.

    ``max_model_len`` is deliberately tiny (128) so that an over-long
    prompt triggers a length error without needing a huge input.
    The engine is handed out as a ``weakref.proxy`` because pytest caches
    the fixture value; a weak reference keeps that cache from blocking
    garbage collection.
    """
    engine_kwargs = {
        "model": "meta-llama/Llama-3.2-1B-Instruct",
        "enforce_eager": True,
        "seed": 0,
        "max_model_len": 128,
        "disable_log_stats": True,
    }
    engine = LLM(**engine_kwargs)

    yield weakref.proxy(engine)

    # Teardown: release the strong reference before the shared cleanup.
    del engine
    cleanup_dist_env_and_memory()


49
def test_chat(text_llm):
    """
    A single conversation passed to ``chat`` produces exactly one output.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": "Explain the concept of entropy."}

    results = text_llm.chat([system_turn, user_turn])

    assert len(results) == 1


59
def test_multi_chat(text_llm):
    """
    A batch of two conversations passed to ``chat`` produces two outputs.
    """
    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # Each conversation gets its own system-message dict so the two
    # conversations share no mutable state.
    conversations = [
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    results = text_llm.chat(conversations)

    assert len(results) == 2


79
80
81
82
@pytest.fixture(scope="function")
def vision_llm():
    """
    Per-test fixture yielding a Phi-3.5-vision-instruct engine.

    The engine is configured to accept up to two images per prompt.
    It is exposed through ``weakref.proxy`` because pytest caches the
    fixture value; the weak reference lets the engine be garbage-collected
    once the local name is deleted during teardown.
    """
    engine_kwargs = {
        "model": "microsoft/Phi-3.5-vision-instruct",
        "max_model_len": 4096,
        "max_num_seqs": 5,
        "enforce_eager": True,
        "trust_remote_code": True,
        "limit_mm_per_prompt": {"image": 2},
        "seed": 0,
    }
    engine = LLM(**engine_kwargs)

    yield weakref.proxy(engine)

    # Teardown: release the strong reference before the shared cleanup.
    del engine
    cleanup_dist_env_and_memory()


100
101
102
@pytest.mark.parametrize(
    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """
    A single user turn carrying two images plus a text question yields
    exactly one output from ``chat``.

    ``image_urls`` is parametrized indirectly, so the asset names are
    routed through an ``image_urls`` fixture defined outside this file
    (presumably resolving them to usable URLs -- confirm in conftest).
    """
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in image_urls
    ]
    messages = [
        {
            "role": "user",
            "content": [
                *image_parts,
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]
    outputs = vision_llm.chat(messages)
    # One conversation in -> one output out. The previous `>= 0` check was
    # always true and therefore could never fail.
    assert len(outputs) == 1
118
119


120
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    The chat template already supplies BOS, so ``LLM.chat()`` must not
    add special tokens on top of it. Verify that the llama chat prompt
    starts with one BOS token and that the second token is not BOS.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello!"},
    ]

    results = text_llm.chat(conversation)
    assert len(results) == 1

    token_ids = results[0].prompt_token_ids
    assert token_ids is not None

    tokenizer = text_llm.get_tokenizer()
    bos_id = tokenizer.bos_token_id

    # Exactly one leading BOS: position 0 is BOS, position 1 is not.
    assert token_ids[0] == bos_id
    assert token_ids[1] != bos_id, "Double BOS"
140
141
142
143
144
145
146
147
148
149
150
151
152


@pytest.fixture(scope="function")
def thinking_llm():
    """
    Per-test fixture yielding a Qwen3-0.6B engine.

    The engine is exposed through ``weakref.proxy`` because pytest caches
    the fixture value; the weak reference lets the engine be
    garbage-collected once the local name is deleted during teardown.
    """
    engine_kwargs = {
        "model": "Qwen/Qwen3-0.6B",
        "max_model_len": 4096,
        "enforce_eager": True,
        "seed": 0,
    }
    engine = LLM(**engine_kwargs)

    yield weakref.proxy(engine)

    # Teardown: release the strong reference before the shared cleanup.
    del engine
    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """
    ``chat_template_kwargs`` passed to ``chat`` must reach the chat
    template: toggling ``enable_thinking`` changes whether the ``<think>``
    token shows up in the rendered prompt.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "What is 1+1?"},
    ]
    template_kwargs = {"enable_thinking": enable_thinking}

    results = thinking_llm.chat(
        conversation,
        chat_template_kwargs=template_kwargs,
    )
    assert len(results) == 1

    prompt_token_ids = results[0].prompt_token_ids
    assert prompt_token_ids is not None

    vocab = thinking_llm.get_tokenizer().get_vocab()
    think_id = vocab["<think>"]

    if not enable_thinking:
        # With thinking disabled the chat template inserts a dummy
        # thinking section, so the token is part of the prompt.
        assert think_id in prompt_token_ids
    else:
        assert think_id not in prompt_token_ids
183
184
185
186
187
188
189
190
191
192
193
194
195
196


def test_chat_batch_failure_cleanup(llm_for_failure_test):
    """
    Regression test for issue #26081.

    When a batched ``chat`` call fails part-way through (here because one
    conversation exceeds the context length), the requests enqueued before
    the failure must be aborted so that they do not leak into the queue
    seen by later calls.
    """
    engine = llm_for_failure_test

    short_conversation = [{"role": "user", "content": "Hello"}]
    oversized_conversation = [
        {
            "role": "user",
            "content": "This is a very long text to test the error " * 50,
        }
    ]

    # The invalid conversation comes last, so the valid ones are already
    # enqueued by the time the error is raised.
    failing_batch = [short_conversation] * 2 + [oversized_conversation]
    healthy_batch = [short_conversation] * 2

    params = SamplingParams(temperature=0, max_tokens=10)

    with pytest.raises(ValueError, match="maximum context length is"):
        engine.chat(failing_batch, sampling_params=params)

    # Nothing from the failed batch may remain pending.
    assert engine.llm_engine.get_num_unfinished_requests() == 0

    # A follow-up batch must return only its own results.
    followup_outputs = engine.chat(healthy_batch, sampling_params=params)
    assert len(followup_outputs) == len(healthy_batch)
    assert engine.llm_engine.get_num_unfinished_requests() == 0