test_tokenization.py 5.15 KB
Newer Older
1
2
3
4
5
6
7
import openai  # use the official client for correctness check
import pytest
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer
8
9
from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
from .test_completion import zephyr_lora_files  # noqa: F401
10
11
12
13
14
15

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Launch one OpenAI-compatible server for the module and yield it.

    The server is started for ``MODEL_NAME`` with LoRA enabled and a single
    adapter registered under the name ``zephyr-lora2``, loaded from
    ``zephyr_lora_added_tokens_files``.

    Args:
        zephyr_lora_added_tokens_files: Location of the LoRA adapter files,
            passed to ``--lora-modules``.

    Yields:
        The ``RemoteOpenAIServer`` instance; the ``with`` block tears it
        down once the module's tests are done with it.
    """
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        # the tests in this module assert that the endpoint reports this value
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # lora config
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


38
39
40
41
42
43
44
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str,
                   zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Map a served model name to the tokenizer that should be loaded for it.

    The LoRA adapter served as ``zephyr-lora2`` is resolved to the adapter
    files location; any other model name is returned unchanged.
    """
    if model_name == "zephyr-lora2":
        return zephyr_lora_added_tokens_files
    return model_name


45
46
47
48
49
50
51
@pytest.fixture(scope="module")
def client(server):
    """Provide the async client handed out by the module's server fixture."""
    async_client = server.get_async_client()
    return async_client


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(client: openai.AsyncOpenAI,
                                    model_name: str, tokenizer_name: str):
    """Check ``/tokenize`` on a plain prompt against a local tokenizer.

    For both values of ``add_special_tokens`` the endpoint must return the
    same token ids as the locally loaded tokenizer, together with their
    count and the ``max_model_len`` of 8192.
    """
    # The endpoint is addressed outside the client's base path: drop the
    # last three characters of the base URL (presumably the "v1/" suffix --
    # confirm against the client) and any surrounding slashes.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    # The prompt is the same for every iteration, so build it once.
    prompt = "vllm1 This is a test prompt."

    for add_special in [False, True]:
        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

        response = requests.post(base_url + "/tokenize",
                                 json={
                                     "add_special_tokens": add_special,
                                     "model": model_name,
                                     "prompt": prompt
                                 })
        response.raise_for_status()

        assert response.json() == {
            "tokens": tokens,
            "count": len(tokens),
            "max_model_len": 8192
        }


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
                             tokenizer_name: str):
    """Check ``/tokenize`` on chat messages against a local tokenizer.

    For every combination of ``add_generation_prompt`` and
    ``add_special_tokens`` the conversation is rendered locally with the
    tokenizer's chat template and encoded; the endpoint must return the same
    token ids, their count and the ``max_model_len`` of 8192.
    """
    # The endpoint is addressed outside the client's base path: drop the
    # last three characters of the base URL (presumably the "v1/" suffix --
    # confirm against the client) and any surrounding slashes.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    # The conversation is identical for every flag combination, so it is
    # built once outside the loops.
    conversation = [{
        "role": "user",
        "content": "Hi there!"
    }, {
        "role": "assistant",
        "content": "Nice to meet you!"
    }, {
        "role": "user",
        "content": "Can I ask a question? vllm1"
    }]

    for add_generation in [False, True]:
        for add_special in [False, True]:
            # Expected tokens: render the template to text first, then
            # encode it with the flag under test.
            prompt = tokenizer.apply_chat_template(
                add_generation_prompt=add_generation,
                conversation=conversation,
                tokenize=False)
            tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

            response = requests.post(base_url + "/tokenize",
                                     json={
                                         "add_generation_prompt":
                                         add_generation,
                                         "add_special_tokens": add_special,
                                         "messages": conversation,
                                         "model": model_name
                                     })
            response.raise_for_status()

            assert response.json() == {
                "tokens": tokens,
                "count": len(tokens),
                "max_model_len": 8192
            }


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
                          tokenizer_name: str):
    """Check that ``/detokenize`` restores a locally tokenized prompt.

    The prompt is encoded locally without special tokens and the resulting
    ids are posted to the endpoint, which must return the original prompt
    text.
    """
    # The endpoint is addressed outside the client's base path: drop the
    # last three characters of the base URL (presumably the "v1/" suffix --
    # confirm against the client) and any surrounding slashes.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    prompt = "This is a test prompt. vllm1"
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    # Diagnostic output recording which server and model were queried.
    print(f"CALLING {base_url} FOR {model_name}")
    response = requests.post(base_url + "/detokenize",
                             json={
                                 "model": model_name,
                                 "tokens": tokens
                             })
    response.raise_for_status()

    assert response.json() == {"prompt": prompt}