# test_chat.py
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import weakref
4

5
6
7
import pytest

from vllm import LLM
8
from vllm.distributed import cleanup_dist_env_and_memory
9
10
11
12

from ..openai.test_vision import TEST_IMAGE_URLS


13
14
15
16
17
18
19
@pytest.fixture(scope="function")
def text_llm():
    """Provide a text-only ``LLM`` to one test and clean up afterwards.

    Yields:
        A ``weakref.proxy`` wrapping an ``LLM`` built for
        ``meta-llama/Llama-3.2-1B-Instruct`` with ``enforce_eager=True``
        and ``seed=0``.
    """
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              seed=0)

    yield weakref.proxy(llm)

    # Drop the only strong reference held here before running cleanup.
    del llm

    cleanup_dist_env_and_memory()


def test_chat(text_llm):
    """Check that a single conversation passed to ``chat`` gives one output."""
    prompt1 = "Explain the concept of entropy."
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": prompt1
        },
    ]

    outputs = text_llm.chat(messages)
    assert len(outputs) == 1


44
def test_multi_chat(text_llm):
    """Check that a batch of two conversations gives two outputs."""
    prompt1 = "Explain the concept of entropy."
    prompt2 = "Explain what among us is."

    conversation1 = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": prompt1
        },
    ]

    conversation2 = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": prompt2
        },
    ]

    # A list of conversations is submitted as one batched chat call.
    messages = [conversation1, conversation2]

    outputs = text_llm.chat(messages)
    assert len(outputs) == 2


76
77
78
79
@pytest.fixture(scope="function")
def vision_llm():
    """Provide a vision-capable ``LLM`` to one test and clean up afterwards.

    Yields:
        A ``weakref.proxy`` wrapping an ``LLM`` built for
        ``microsoft/Phi-3.5-vision-instruct`` with the keyword arguments
        listed below (at most two images per prompt).
    """
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
        seed=0,
    )

    yield weakref.proxy(llm)

    # Drop the only strong reference held here before running cleanup.
    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """Check that one user turn carrying several images gives one output.

    The message content holds one ``image_url`` part per entry in
    ``image_urls``, followed by a single text part.
    """
    messages = [{
        "role":
        "user",
        "content": [
            *({
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            } for image_url in image_urls),
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]
    outputs = vision_llm.chat(messages)
    # One conversation in, one output out. The previous ``>= 0`` check
    # could never fail because ``len`` is never negative.
    assert len(outputs) == 1
118
119


120
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    LLM.chat() should not add special tokens when using chat templates.
    Check we get a single BOS token for llama chat.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": "Hello!"
        },
    ]
    outputs = text_llm.chat(messages)
    assert len(outputs) == 1

    prompt_token_ids = outputs[0].prompt_token_ids
    assert prompt_token_ids is not None
    # Guard the index accesses below so a short prompt fails with a clear
    # message rather than an IndexError.
    assert len(prompt_token_ids) >= 2, "Prompt too short to check BOS"

    bos_token = text_llm.get_tokenizer().bos_token_id

    # Ensure we have a single BOS
    assert prompt_token_ids[0] == bos_token
    assert prompt_token_ids[1] != bos_token, "Double BOS"
146
147
148
149
150
151
152
153
154
155
156
157
158


@pytest.fixture(scope="function")
def thinking_llm():
    """Provide a ``Qwen/Qwen3-0.6B`` ``LLM`` to one test and clean up after.

    Yields:
        A ``weakref.proxy`` wrapping an ``LLM`` built with
        ``max_model_len=4096``, ``enforce_eager=True`` and ``seed=0``.
    """
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="Qwen/Qwen3-0.6B",
        max_model_len=4096,
        enforce_eager=True,
        seed=0,
    )

    yield weakref.proxy(llm)

    # Drop the only strong reference held here before running cleanup.
    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """Check that ``chat_template_kwargs`` reaches the chat template.

    The ``<think>`` token must appear in the rendered prompt only when
    ``enable_thinking`` is false.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": "What is 1+1?"}
    template_kwargs = {"enable_thinking": enable_thinking}

    outputs = thinking_llm.chat(
        [system_turn, user_turn],
        chat_template_kwargs=template_kwargs,
    )
    assert len(outputs) == 1

    prompt_token_ids = outputs[0].prompt_token_ids
    assert prompt_token_ids is not None

    vocab = thinking_llm.get_tokenizer().get_vocab()
    think_id = vocab["<think>"]

    if not enable_thinking:
        # The chat template includes dummy thinking process
        assert think_id in prompt_token_ids
    else:
        assert think_id not in prompt_token_ids