test_tokenization.py 11.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import pytest
5
import pytest_asyncio
6
7
8
9
10
11
12
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
13
# Passed to RemoteOpenAIServer in the `server` fixture and, through the
# parametrized tests, to get_tokenizer as the local reference tokenizer.
MODEL_NAME = "Qwen/Qwen3-0.6B"
14
15
16


@pytest.fixture(scope="module")
def server():
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` shared by this module.

    Yields:
        The object produced by entering the ``RemoteOpenAIServer`` context
        manager. The context is exited once the module's tests are done.
    """
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        # The tokenize tests assert that the server reports this same value.
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # Presumably exposes the "tokenizer_info" route that the
        # test_tokenizer_info_* tests query -- confirm against the server CLI.
        "--enable-tokenizer-info-endpoint",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


34
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str) -> str:
    """Return the tokenizer identifier for a test: the model name, unchanged.

    Args:
        model_name: Model identifier supplied by the test's parametrization.

    Returns:
        ``model_name`` itself.
    """
    return model_name
37
38


39
40
41
42
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from ``server`` for the span of one test.

    The client is used as an async context manager, so its exit hook runs
    after the test that requested this fixture has finished.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
43
44
45
46


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Compare the ``tokenize`` endpoint with a local tokenizer on a plain prompt.

    The prompt is tokenized once without and once with special tokens; for
    each case the returned ids, count and ``max_model_len`` are checked.
    """
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
    # Loop-invariant, so it is defined once rather than on every iteration.
    prompt = "vllm1 This is a test prompt."

    for add_special in (False, True):
        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

        response = requests.post(
            server.url_for("tokenize"),
            json={
                "add_special_tokens": add_special,
                "model": model_name,
                "prompt": prompt,
            },
        )
        response.raise_for_status()

        result = response.json()
        assert result["tokens"] == tokens
        assert result["count"] == len(tokens)
        # Same value the server fixture passes as --max-model-len.
        assert result["max_model_len"] == 8192
        # return_token_strs was not requested, so no token strings are expected.
        assert result["token_strs"] is None
77
78
79
80


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Compare the ``tokenize`` endpoint with a local tokenizer on chat messages.

    For each tested combination of ``add_generation_prompt``,
    ``add_special_tokens`` and ``continue_final_message`` the chat template is
    rendered and encoded locally, and the server response must match.
    """
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")

    for add_generation in (False, True):
        for add_special in (False, True):
            for continue_final in (False, True):
                if add_generation and continue_final:
                    # Not exercised; presumably the two options are mutually
                    # exclusive in the chat template API -- TODO confirm.
                    continue

                # Built fresh per request so the partial assistant turn added
                # below can never leak into another combination.
                conversation = [
                    {"role": "user", "content": "Hi there!"},
                    {"role": "assistant", "content": "Nice to meet you!"},
                    {"role": "user", "content": "Can I ask a question? vllm1"},
                ]
                if continue_final:
                    conversation.append({"role": "assistant", "content": "Sure,"})

                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                    },
                )
                response.raise_for_status()

                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                # Same value the server fixture passes as --max-model-len.
                assert result["max_model_len"] == 8192
                # return_token_strs was not requested.
                assert result["token_strs"] is None
130
131


132
133
134
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat_with_tools(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Compare the ``tokenize`` endpoint with a local tokenizer when tools are given.

    Same matrix as the plain chat test, but a tool definition is passed both
    to the local chat template and to the server request.
    """
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")

    # The test only reads this definition, so one instance serves all requests.
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                },
            },
        }
    ]

    for add_generation in (False, True):
        for add_special in (False, True):
            for continue_final in (False, True):
                if add_generation and continue_final:
                    # Not exercised; presumably the two options are mutually
                    # exclusive in the chat template API -- TODO confirm.
                    continue

                # Built fresh per request so the partial assistant turn added
                # below can never leak into another combination.
                conversation = [
                    {
                        "role": "user",
                        "content": "What's the weather like in Paris today?",
                    }
                ]
                if continue_final:
                    conversation.append({"role": "assistant", "content": "Sure,"})

                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tools=tools,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                        "tools": tools,
                    },
                )
                response.raise_for_status()

                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                # Same value the server fixture passes as --max-model-len.
                assert result["max_model_len"] == 8192
                # return_token_strs was not requested.
                assert result["token_strs"] is None


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_with_return_token_strs(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Check that ``return_token_strs`` makes ``tokenize`` return token strings.

    The expected strings come from converting the locally encoded ids back
    to tokens with the reference tokenizer.
    """
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")

    prompt = "This is a token_strs test prompt! vllm1"
    response = requests.post(
        server.url_for("tokenize"),
        json={"prompt": prompt, "model": model_name, "return_token_strs": True},
    )
    response.raise_for_status()

    # The request does not set add_special_tokens; the reference encoding uses
    # True, which is what the assertions below expect the server to match.
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    tokens_str = tokenizer.convert_ids_to_tokens(tokens)

    result = response.json()
    assert result["tokens"] == tokens
    assert result["count"] == len(tokens)
    # Same value the server fixture passes as --max-model-len.
    assert result["max_model_len"] == 8192
    assert result["token_strs"] == tokens_str
230
231


232
233
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_detokenize(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Check that ``detokenize`` turns locally encoded ids back into the prompt."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")

    prompt = "This is a test prompt. vllm1"
    # Encoded without special tokens so the round trip can be compared with
    # the raw prompt text.
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    response = requests.post(
        server.url_for("detokenize"), json={"model": model_name, "tokens": tokens}
    )
    response.raise_for_status()

    # The whole response body must be exactly this one-key object.
    assert response.json() == {"prompt": prompt}
254
255
256
257
258


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenizer_info_basic(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Test basic tokenizer info endpoint functionality.

    Only the presence, type and non-emptiness of ``tokenizer_class`` are
    checked; ``model_name`` and ``tokenizer_name`` are accepted from the
    parametrization but not used in the body.
    """
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    result = response.json()
    assert "tokenizer_class" in result
    assert isinstance(result["tokenizer_class"], str)
    # Rejects an empty string.
    assert result["tokenizer_class"]


@pytest.mark.asyncio
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
    """Test that the response matches expected schema types.

    Every field is optional: a field that is missing from the response or
    whose value is null is skipped rather than treated as a failure.
    """
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    result = response.json()
    # Expected Python type of each field after JSON decoding.
    field_types = {
        "add_bos_token": bool,
        "add_prefix_space": bool,
        "clean_up_tokenization_spaces": bool,
        "split_special_tokens": bool,
        "bos_token": str,
        "eos_token": str,
        "pad_token": str,
        "unk_token": str,
        "chat_template": str,
        "errors": str,
        "model_max_length": int,
        "additional_special_tokens": list,
        "added_tokens_decoder": dict,
    }
    for field, expected_type in field_types.items():
        if field in result and result[field] is not None:
            assert isinstance(result[field], expected_type), (
                f"{field} should be {expected_type.__name__}"
            )
302
303
304
305


@pytest.mark.asyncio
async def test_tokenizer_info_added_tokens_structure(
    server: RemoteOpenAIServer,
):
    """Test added_tokens_decoder structure if present.

    When the field is missing or empty nothing is asserted. Otherwise each
    entry must map a string id to a dict with ``content`` and a boolean
    ``special`` flag.
    """
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    result = response.json()
    added_tokens = result.get("added_tokens_decoder")
    if added_tokens:
        for token_id, token_info in added_tokens.items():
            assert isinstance(token_id, str), "Token IDs should be strings"
            assert isinstance(token_info, dict), "Token info should be a dict"
            assert "content" in token_info, "Token info should have content"
            assert "special" in token_info, "Token info should have special flag"
            assert isinstance(token_info["special"], bool), (
                "Special flag should be boolean"
            )
322
323
324
325


@pytest.mark.asyncio
async def test_tokenizer_info_consistency_with_tokenize(
    server: RemoteOpenAIServer,
):
    """Test that tokenizer info is consistent with tokenization endpoint.

    Compares ``model_max_length`` from ``tokenizer_info`` with
    ``max_model_len`` from ``tokenize``. The comparison is skipped when
    either value is missing or falsy.
    """
    info_response = requests.get(server.url_for("tokenizer_info"))
    info_response.raise_for_status()
    info = info_response.json()
    tokenize_response = requests.post(
        server.url_for("tokenize"),
        json={"model": MODEL_NAME, "prompt": "Hello world!"},
    )
    tokenize_response.raise_for_status()
    tokenize_result = tokenize_response.json()
    info_max_len = info.get("model_max_length")
    tokenize_max_len = tokenize_result.get("max_model_len")
    if info_max_len and tokenize_max_len:
        assert info_max_len >= tokenize_max_len, (
            "Info max length should be >= tokenize max length"
        )
344
345
346
347
348
349
350
351
352
353


@pytest.mark.asyncio
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
    """Test chat template is properly included.

    Nothing is asserted when ``chat_template`` is missing or falsy; otherwise
    it must be a string with non-whitespace content.
    """
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    result = response.json()
    chat_template = result.get("chat_template")
    if chat_template:
        assert isinstance(chat_template, str), "Chat template should be a string"
        assert chat_template.strip(), "Chat template should not be empty"