test_tokenization.py 10.6 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import pytest
5
import pytest_asyncio
6
7
import requests

8
from tests.utils import RemoteOpenAIServer
9
from vllm.tokenizers import get_tokenizer
10
11

# any model with a chat template should work here
12
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
13
14
15


@pytest.fixture(scope="module")
def server():
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` once per module.

    The server handle is yielded from inside the ``with`` block, so the
    context manager is exited when the fixture is torn down.
    """
    # use half precision for speed and memory savings in CI environment
    launch_args = ["--dtype", "bfloat16"]
    launch_args += ["--max-model-len", "8192"]
    launch_args += ["--enforce-eager"]
    launch_args += ["--max-num-seqs", "128"]
    # Required by the tokenizer_info tests in this module.
    launch_args += ["--enable-tokenizer-info-endpoint"]

    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server


33
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str):
    """Return the tokenizer identifier, which is the model name unchanged."""
    resolved_name = model_name
    return resolved_name
36
37


38
39
40
41
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client produced by ``server.get_async_client()``."""
    client_context = server.get_async_client()
    async with client_context as api_client:
        yield api_client
42
43
44
45


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Compare the ``tokenize`` endpoint with local encoding of a plain prompt.

    The comparison runs with ``add_special_tokens`` both disabled and enabled.
    """
    reference = get_tokenizer(tokenizer_name=tokenizer_name)
    text = "vllm1 This is a test prompt."

    for with_special in (False, True):
        expected_ids = reference.encode(text, add_special_tokens=with_special)

        payload = {
            "add_special_tokens": with_special,
            "model": model_name,
            "prompt": text,
        }
        reply = requests.post(server.url_for("tokenize"), json=payload)
        reply.raise_for_status()

        body = reply.json()
        assert body["tokens"] == expected_ids
        assert body["count"] == len(expected_ids)
        assert body["max_model_len"] == 8192
        # Token strings are only expected when explicitly requested.
        assert body["token_strs"] is None
76
77
78
79


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Compare chat tokenization with a locally rendered chat template.

    All combinations of ``add_generation_prompt``, ``add_special_tokens`` and
    ``continue_final_message`` are exercised, except a generation prompt
    together with continuing the final message, which is never sent.
    """
    reference = get_tokenizer(tokenizer_name=tokenizer_name)

    for gen_prompt in (False, True):
        # Continuing the final message is only tried without a generation prompt.
        continue_choices = (False,) if gen_prompt else (False, True)

        for with_special in (False, True):
            for continue_last in continue_choices:
                messages = [
                    {"role": "user", "content": "Hi there!"},
                    {"role": "assistant", "content": "Nice to meet you!"},
                    {"role": "user", "content": "Can I ask a question? vllm1"},
                ]
                if continue_last:
                    # The final message becomes a partial assistant turn.
                    messages.append({"role": "assistant", "content": "Sure,"})

                rendered = reference.apply_chat_template(
                    add_generation_prompt=gen_prompt,
                    continue_final_message=continue_last,
                    conversation=messages,
                    tokenize=False,
                )
                expected_ids = reference.encode(
                    rendered, add_special_tokens=with_special
                )

                payload = {
                    "add_generation_prompt": gen_prompt,
                    "continue_final_message": continue_last,
                    "add_special_tokens": with_special,
                    "messages": messages,
                    "model": model_name,
                }
                reply = requests.post(server.url_for("tokenize"), json=payload)
                reply.raise_for_status()

                body = reply.json()
                assert body["tokens"] == expected_ids
                assert body["count"] == len(expected_ids)
                assert body["max_model_len"] == 8192
                assert body["token_strs"] is None
129
130


131
132
133
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat_with_tools(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Compare chat tokenization with tools against a local template render.

    The same ``tools`` list is passed to ``apply_chat_template`` and sent in
    the request. A generation prompt combined with continuing the final
    message is never sent.
    """
    reference = get_tokenizer(tokenizer_name=tokenizer_name)

    for gen_prompt in (False, True):
        # Continuing the final message is only tried without a generation prompt.
        continue_choices = (False,) if gen_prompt else (False, True)

        for with_special in (False, True):
            weather_parameters = {
                "type": "object",
                "properties": {"location": {"type": "string"}},
            }
            tools = [
                {
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "parameters": weather_parameters,
                    },
                }
            ]

            for continue_last in continue_choices:
                messages = [
                    {
                        "role": "user",
                        "content": "What's the weather like in Paris today?",
                    }
                ]
                if continue_last:
                    # The final message becomes a partial assistant turn.
                    messages.append({"role": "assistant", "content": "Sure,"})

                rendered = reference.apply_chat_template(
                    add_generation_prompt=gen_prompt,
                    continue_final_message=continue_last,
                    conversation=messages,
                    tools=tools,
                    tokenize=False,
                )
                expected_ids = reference.encode(
                    rendered, add_special_tokens=with_special
                )

                payload = {
                    "add_generation_prompt": gen_prompt,
                    "continue_final_message": continue_last,
                    "add_special_tokens": with_special,
                    "messages": messages,
                    "model": model_name,
                    "tools": tools,
                }
                reply = requests.post(server.url_for("tokenize"), json=payload)
                reply.raise_for_status()

                body = reply.json()
                assert body["tokens"] == expected_ids
                assert body["count"] == len(expected_ids)
                assert body["max_model_len"] == 8192
                assert body["token_strs"] is None


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_with_return_token_strs(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Check that ``return_token_strs`` adds token strings to the response.

    The expected ids come from encoding the prompt locally with special
    tokens, and the expected strings from ``convert_ids_to_tokens``.
    """
    reference = get_tokenizer(tokenizer_name=tokenizer_name)

    text = "This is a token_strs test prompt! vllm1"
    payload = {"prompt": text, "model": model_name, "return_token_strs": True}
    reply = requests.post(server.url_for("tokenize"), json=payload)
    reply.raise_for_status()

    # The request does not set add_special_tokens; the reference encodes
    # with special tokens enabled.
    expected_ids = reference.encode(text, add_special_tokens=True)
    expected_strs = reference.convert_ids_to_tokens(expected_ids)

    body = reply.json()
    assert body["tokens"] == expected_ids
    assert body["count"] == len(expected_ids)
    assert body["max_model_len"] == 8192
    assert body["token_strs"] == expected_strs
229
230


231
232
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_detokenize(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Check that ``detokenize`` turns locally encoded ids back into the prompt."""
    reference = get_tokenizer(tokenizer_name=tokenizer_name)

    text = "This is a test prompt. vllm1"
    # Encoded without special tokens; the response must equal the prompt exactly.
    token_ids = reference.encode(text, add_special_tokens=False)

    payload = {"model": model_name, "tokens": token_ids}
    reply = requests.post(server.url_for("detokenize"), json=payload)
    reply.raise_for_status()

    expected_body = {"prompt": text}
    assert reply.json() == expected_body
253
254
255
256
257


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenizer_info_basic(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Check that ``tokenizer_info`` reports a non-empty ``tokenizer_class``."""
    reply = requests.get(server.url_for("tokenizer_info"))
    reply.raise_for_status()

    info = reply.json()
    assert "tokenizer_class" in info
    tokenizer_class = info["tokenizer_class"]
    assert isinstance(tokenizer_class, str)
    assert tokenizer_class


@pytest.mark.asyncio
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
    """Check the JSON types of the known ``tokenizer_info`` fields.

    Fields that are missing from the response or set to ``None`` are not
    checked.
    """
    reply = requests.get(server.url_for("tokenizer_info"))
    reply.raise_for_status()
    result = reply.json()

    bool_fields = (
        "add_bos_token",
        "add_prefix_space",
        "clean_up_tokenization_spaces",
        "split_special_tokens",
    )
    str_fields = (
        "bos_token",
        "eos_token",
        "pad_token",
        "unk_token",
        "chat_template",
        "errors",
    )
    field_types = {
        **dict.fromkeys(bool_fields, bool),
        **dict.fromkeys(str_fields, str),
        "model_max_length": int,
        "additional_special_tokens": list,
        "added_tokens_decoder": dict,
    }

    for field, expected_type in field_types.items():
        if field not in result:
            continue
        value = result[field]
        if value is None:
            continue
        assert isinstance(value, expected_type), (
            f"{field} should be {expected_type.__name__}"
        )
301
302
303
304


@pytest.mark.asyncio
async def test_tokenizer_info_consistency_with_tokenize(
    server: RemoteOpenAIServer,
):
    """Check ``tokenizer_info`` and ``tokenize`` agree on length limits.

    The comparison only happens when both endpoints report a truthy limit.
    """
    info_reply = requests.get(server.url_for("tokenizer_info"))
    info_reply.raise_for_status()
    info = info_reply.json()

    tokenize_payload = {"model": MODEL_NAME, "prompt": "Hello world!"}
    tokenize_reply = requests.post(
        server.url_for("tokenize"), json=tokenize_payload
    )
    tokenize_reply.raise_for_status()
    tokenize_result = tokenize_reply.json()

    info_max_len = info.get("model_max_length")
    tokenize_max_len = tokenize_result.get("max_model_len")
    if not (info_max_len and tokenize_max_len):
        # Nothing to compare when either side omits its limit.
        return

    assert info_max_len >= tokenize_max_len, (
        "Info max length should be >= tokenize max length"
    )
323
324
325
326
327
328
329
330
331
332


@pytest.mark.asyncio
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
    """Check any reported chat template is a string with non-blank content.

    A missing or falsy ``chat_template`` is accepted without further checks.
    """
    reply = requests.get(server.url_for("tokenizer_info"))
    reply.raise_for_status()

    template = reply.json().get("chat_template")
    if not template:
        return

    assert isinstance(template, str), "Chat template should be a string"
    assert template.strip(), "Chat template should not be empty"