test_chat.py 4.65 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import weakref
4

zhuwenwen's avatar
zhuwenwen committed
5
import os
6
7
8
import pytest

from vllm import LLM
9
from vllm.distributed import cleanup_dist_env_and_memory
10
11

from ..openai.test_vision import TEST_IMAGE_URLS
zhuwenwen's avatar
zhuwenwen committed
12
from ...utils import models_path_prefix
13
14


15
16
17
18
@pytest.fixture(scope="function")
def text_llm():
    """Yield a weak proxy to a text-only ``LLM`` for a single test.

    The model directory is ``meta-llama/Llama-3.2-1B-Instruct`` joined onto
    ``models_path_prefix``. Once the test has finished, the only strong
    reference to the engine is deleted and
    ``cleanup_dist_env_and_memory()`` is called.
    """
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Llama-3.2-1B-Instruct")
    engine = LLM(model=model_path, enforce_eager=True, seed=0)

    # pytest caches the value a fixture yields, so hand out a weak proxy;
    # that keeps ``engine`` as the sole strong reference and lets the
    # ``del`` below make the object eligible for garbage collection.
    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()


def test_chat(text_llm):
    """A single system+user conversation passed to ``chat`` gives one output."""
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": "Explain the concept of entropy."}

    outputs = text_llm.chat([system_turn, user_turn])

    assert len(outputs) == 1


46
def test_multi_chat(text_llm):
    """A list of two conversations passed to ``chat`` gives two outputs."""
    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # Each conversation is the same system turn followed by one user prompt.
    conversations = [
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    outputs = text_llm.chat(conversations)

    assert len(outputs) == 2


78
79
80
81
@pytest.fixture(scope="function")
def vision_llm():
    """Yield a weak proxy to a multi-image capable ``LLM`` for a single test.

    The model directory is ``microsoft/Phi-3.5-vision-instruct`` joined onto
    ``models_path_prefix``; at most two images are allowed per prompt.
    Once the test has finished, the only strong reference to the engine is
    deleted and ``cleanup_dist_env_and_memory()`` is called.
    """
    engine_kwargs = dict(
        model=os.path.join(models_path_prefix,
                           "microsoft/Phi-3.5-vision-instruct"),
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
        seed=0,
    )
    engine = LLM(**engine_kwargs)

    # pytest caches the value a fixture yields, so hand out a weak proxy;
    # that keeps ``engine`` as the sole strong reference and lets the
    # ``del`` below make the object eligible for garbage collection.
    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """A single user turn carrying several images plus text gives one output.

    Args:
        vision_llm: The vision-capable engine fixture.
        image_urls: URLs to attach to the user turn, one ``image_url``
            content part per URL, followed by a single text part.
    """
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for image_url in image_urls]
    text_part = {"type": "text", "text": "What's in this image?"}

    messages = [{
        "role": "user",
        "content": [*image_parts, text_part],
    }]

    outputs = vision_llm.chat(messages)

    # One conversation goes in, so exactly one output is expected. The
    # previous ``>= 0`` check could never fail because a length is never
    # negative.
    assert len(outputs) == 1
120
121


122
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    LLM.chat() should not add special tokens on top of the chat template.
    Check that the llama chat prompt starts with exactly one BOS token.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello!"},
    ]

    outputs = text_llm.chat(conversation)
    assert len(outputs) == 1

    token_ids = outputs[0].prompt_token_ids
    assert token_ids is not None

    bos_id = text_llm.get_tokenizer().bos_token_id

    # The first token must be BOS and the second must not be, otherwise
    # BOS was inserted twice.
    assert token_ids[0] == bos_id
    assert token_ids[1] != bos_id, "Double BOS"
148
149
150
151
152
153
154
155
156
157
158
159
160


@pytest.fixture(scope="function")
def thinking_llm():
    """Yield a weak proxy to a Qwen3 ``LLM`` for a single test.

    The model directory is ``Qwen/Qwen3-0.6B`` joined onto
    ``models_path_prefix``, the same way the other engine fixtures in this
    module resolve their models. Once the test has finished, the only
    strong reference to the engine is deleted and
    ``cleanup_dist_env_and_memory()`` is called.
    """
    # NOTE(review): this fixture previously passed the bare hub id
    # "Qwen/Qwen3-0.6B"; it now goes through ``models_path_prefix`` for
    # consistency with ``text_llm`` / ``vision_llm``. Confirm the model is
    # available under that prefix.
    llm = LLM(
        model=os.path.join(models_path_prefix, "Qwen/Qwen3-0.6B"),
        max_model_len=4096,
        enforce_eager=True,
        seed=0,
    )

    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """``chat_template_kwargs`` passed to ``chat`` affect the rendered prompt.

    The ``<think>`` token must appear in the prompt token ids only when
    ``enable_thinking`` is False.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "What is 1+1?"},
    ]
    template_kwargs = {"enable_thinking": enable_thinking}

    outputs = thinking_llm.chat(
        conversation,
        chat_template_kwargs=template_kwargs,
    )
    assert len(outputs) == 1

    prompt_token_ids = outputs[0].prompt_token_ids
    assert prompt_token_ids is not None

    vocab = thinking_llm.get_tokenizer().get_vocab()
    think_id = vocab["<think>"]

    # With thinking disabled the chat template includes a dummy thinking
    # process, so the token is present; with thinking enabled it is absent.
    think_in_prompt = think_id in prompt_token_ids
    assert think_in_prompt is (not enable_thinking)