test_chat.py 4.58 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import weakref
4

5
6
7
import pytest

from vllm import LLM
8
from vllm.distributed import cleanup_dist_env_and_memory
9

10
from ..openai.test_vision import TEST_IMAGE_ASSETS
11
12


13
14
15
16
17
18
19
@pytest.fixture(scope="function")
def text_llm():
    """Yield a weak proxy to a text-only ``LLM`` built fresh for each test.

    After the test finishes, this fixture drops its strong reference to the
    engine and calls ``cleanup_dist_env_and_memory()``.
    """
    engine = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        seed=0,
    )

    # pytest caches the fixture value, so tests are handed only a weak
    # proxy; that keeps the `del` below able to release the engine for
    # garbage collection.
    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()


def test_chat(text_llm):
    """A single conversation passed to ``chat`` yields exactly one output."""
    user_prompt = "Explain the concept of entropy."
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": user_prompt}

    results = text_llm.chat([system_turn, user_turn])

    assert len(results) == 1


44
def test_multi_chat(text_llm):
    """A batch of two conversations passed to ``chat`` yields two outputs."""
    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # Each conversation shares the same system turn and differs only in the
    # user prompt.
    conversations = [
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    results = text_llm.chat(conversations)

    assert len(results) == 2


76
77
78
79
@pytest.fixture(scope="function")
def vision_llm():
    """Yield a weak proxy to a vision ``LLM`` built fresh for each test.

    The engine is configured to accept up to two images per prompt. After
    the test finishes, this fixture drops its strong reference to the engine
    and calls ``cleanup_dist_env_and_memory()``.
    """
    engine = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
        seed=0,
    )

    # pytest caches the fixture value, so tests are handed only a weak
    # proxy; that keeps the `del` below able to release the engine for
    # garbage collection.
    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
                         indirect=True)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """A single user turn carrying several images yields exactly one output.

    ``image_urls`` is parametrized with ``indirect=True``, so the two asset
    entries are routed through an ``image_urls`` fixture (not defined in the
    visible part of this file) before reaching the test.
    """
    # One `image_url` content part per image, followed by the text question.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        },
    } for image_url in image_urls]

    messages = [{
        "role":
        "user",
        "content": [
            *image_parts,
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    outputs = vision_llm.chat(messages)

    # One conversation in, one output out. The previous `>= 0` check was
    # vacuous: a length is never negative, so it could not fail.
    assert len(outputs) == 1
119
120


121
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    LLM.chat() should not add special tokens when using chat templates.
    The llama chat prompt must start with exactly one BOS token.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello!"},
    ]

    results = text_llm.chat(conversation)
    assert len(results) == 1

    token_ids = results[0].prompt_token_ids
    assert token_ids is not None

    bos_id = text_llm.get_tokenizer().bos_token_id

    # The prompt starts with BOS, and BOS is not repeated right after it.
    assert token_ids[0] == bos_id
    assert token_ids[1] != bos_id, "Double BOS"
147
148
149
150
151
152
153
154
155
156
157
158
159


@pytest.fixture(scope="function")
def thinking_llm():
    """Yield a weak proxy to a Qwen3 ``LLM`` built fresh for each test.

    After the test finishes, this fixture drops its strong reference to the
    engine and calls ``cleanup_dist_env_and_memory()``.
    """
    engine = LLM(
        model="Qwen/Qwen3-0.6B",
        max_model_len=4096,
        enforce_eager=True,
        seed=0,
    )

    # pytest caches the fixture value, so tests are handed only a weak
    # proxy; that keeps the `del` below able to release the engine for
    # garbage collection.
    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """``chat_template_kwargs`` passed to ``chat`` must reach the chat template.

    The check looks for the ``<think>`` token in the rendered prompt: it is
    expected only when ``enable_thinking`` is False.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "What is 1+1?"},
    ]
    template_kwargs = {"enable_thinking": enable_thinking}

    results = thinking_llm.chat(
        conversation,
        chat_template_kwargs=template_kwargs,
    )
    assert len(results) == 1

    token_ids = results[0].prompt_token_ids
    assert token_ids is not None

    think_id = thinking_llm.get_tokenizer().get_vocab()["<think>"]

    # With thinking disabled, the chat template includes a dummy thinking
    # process, so `<think>` shows up in the prompt; with thinking enabled
    # the prompt must not contain it.
    expect_think_in_prompt = not enable_thinking
    assert (think_id in token_ids) == expect_think_in_prompt