# test_tokenization.py: tests for the /tokenize and /detokenize endpoints.
import openai  # use the official client for correctness check
import pytest
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
def server():
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME``, once per module.

    The server is entered as a context manager and yielded to the tests;
    the ``with`` block is exited when the module-scoped fixture is torn
    down.

    Yields:
        The object produced by entering ``RemoteOpenAIServer(MODEL_NAME,
        args)``.
    """
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        # The tests in this module assert "max_model_len": 8192 in the
        # endpoint responses, so keep this value in sync with them.
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the ``server`` fixture.

    Args:
        server: The module-scoped ``server`` fixture.

    Returns:
        Whatever ``server.get_async_client()`` returns; the tests annotate
        it as ``openai.AsyncOpenAI``.
    """
    async_client = server.get_async_client()
    return async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_tokenize_completions(client: openai.AsyncOpenAI,
                                    model_name: str):
    """Check ``/tokenize`` on a plain prompt against the local tokenizer.

    For ``add_special_tokens`` both off and on, the prompt is encoded
    locally and the same request is posted to the server; the JSON reply
    must equal the local tokens, their count, and a ``max_model_len`` of
    8192.

    Args:
        client: Async client fixture; only its ``base_url`` is used here.
        model_name: Model whose tokenizer is loaded and queried.
    """
    # Drop the last three characters of the client's base URL and any
    # surrounding slashes (presumably the trailing "v1/" path segment;
    # confirm against the client) to get the server root.
    client_url = str(client.base_url)
    base_url = client_url[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")

    prompt = "This is a test prompt."

    for add_special in (False, True):
        expected_tokens = tokenizer.encode(prompt,
                                           add_special_tokens=add_special)

        payload = {
            "add_special_tokens": add_special,
            "model": model_name,
            "prompt": prompt,
        }
        response = requests.post(f"{base_url}/tokenize", json=payload)
        response.raise_for_status()

        expected_body = {
            "tokens": expected_tokens,
            "count": len(expected_tokens),
            "max_model_len": 8192,
        }
        assert response.json() == expected_body


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
    """Check ``/tokenize`` on chat messages against the local tokenizer.

    For every combination of ``add_generation_prompt`` and
    ``add_special_tokens``, the conversation is rendered with the local
    chat template and encoded, then the same conversation is posted to the
    server; the JSON reply must equal the local tokens, their count, and a
    ``max_model_len`` of 8192.

    Args:
        client: Async client fixture; only its ``base_url`` is used here.
        model_name: Model whose tokenizer is loaded and queried.
    """
    # Drop the last three characters of the client's base URL and any
    # surrounding slashes (presumably the trailing "v1/" path segment;
    # confirm against the client) to get the server root.
    client_url = str(client.base_url)
    base_url = client_url[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")

    # The same three-turn conversation is used for every flag combination.
    conversation = [
        {
            "role": "user",
            "content": "Hi there!"
        },
        {
            "role": "assistant",
            "content": "Nice to meet you!"
        },
        {
            "role": "user",
            "content": "Can I ask a question?"
        },
    ]

    for add_generation in (False, True):
        for add_special in (False, True):
            # Local reference: render the template to text, then encode.
            rendered = tokenizer.apply_chat_template(
                add_generation_prompt=add_generation,
                conversation=conversation,
                tokenize=False)
            expected_tokens = tokenizer.encode(
                rendered, add_special_tokens=add_special)

            payload = {
                "add_generation_prompt": add_generation,
                "add_special_tokens": add_special,
                "messages": conversation,
                "model": model_name,
            }
            response = requests.post(f"{base_url}/tokenize", json=payload)
            response.raise_for_status()

            expected_body = {
                "tokens": expected_tokens,
                "count": len(expected_tokens),
                "max_model_len": 8192,
            }
            assert response.json() == expected_body


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
    """Check that ``/detokenize`` turns locally encoded tokens back to text.

    The prompt is encoded locally without special tokens, the token ids
    are posted to the server, and the JSON reply must be exactly
    ``{"prompt": <original prompt>}``.

    Args:
        client: Async client fixture; only its ``base_url`` is used here.
        model_name: Model whose tokenizer is loaded and queried.
    """
    # Drop the last three characters of the client's base URL and any
    # surrounding slashes (presumably the trailing "v1/" path segment;
    # confirm against the client) to get the server root.
    client_url = str(client.base_url)
    base_url = client_url[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")

    prompt = "This is a test prompt."
    token_ids = tokenizer.encode(prompt, add_special_tokens=False)

    payload = {
        "model": model_name,
        "tokens": token_ids,
    }
    response = requests.post(f"{base_url}/detokenize", json=payload)
    response.raise_for_status()

    expected_body = {"prompt": prompt}
    assert response.json() == expected_body