"vllm/vscode:/vscode.git/clone" did not exist on "3d4e7d34be856cc4f54033e6a019059afacb5e76"
test_tokenization.py 5.83 KB
Newer Older
1
2
import openai  # use the official client for correctness check
import pytest
3
import pytest_asyncio
4
5
6
7
8
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer
9
10
from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
from .test_completion import zephyr_lora_files  # noqa: F401
11
12
13
14
15
16

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Launch a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    The fixture is module-scoped. The server is started with LoRA enabled
    and the adapter at ``zephyr_lora_added_tokens_files`` registered under
    the name ``zephyr-lora2``.

    Args:
        zephyr_lora_added_tokens_files: Value substituted into the
            ``--lora-modules`` argument for ``zephyr-lora2``.

    Yields:
        The object produced by entering ``RemoteOpenAIServer(MODEL_NAME,
        args)`` as a context manager.
    """
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        # the tests in this module assert "max_model_len": 8192 in responses
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # lora config
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


39
40
41
42
43
44
45
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str,
                   zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Pick the tokenizer identifier that corresponds to ``model_name``.

    Returns ``zephyr_lora_added_tokens_files`` when ``model_name`` is
    ``"zephyr-lora2"``; any other ``model_name`` is returned unchanged.
    """
    if model_name == "zephyr-lora2":
        return zephyr_lora_added_tokens_files
    return model_name


46
47
48
49
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from ``server.get_async_client()``.

    The client is yielded from inside the ``async with`` block, so the
    context manager is exited once the consuming test is done with it.
    """
    client_context = server.get_async_client()
    async with client_context as async_client:
        yield async_client
50
51
52
53


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(client: openai.AsyncOpenAI,
                                    model_name: str, tokenizer_name: str):
    """Check ``/tokenize`` on a plain prompt against a local tokenizer.

    For ``add_special_tokens`` both off and on, the prompt is encoded with
    the tokenizer loaded from ``tokenizer_name`` and the server response
    must equal the resulting tokens, their count, and a ``max_model_len``
    of 8192.
    """
    # Drop the last three characters of the client's base URL and any
    # surrounding slashes (presumably a trailing "v1/" -- confirm against
    # RemoteOpenAIServer) so endpoint paths can be appended directly.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    # The prompt does not depend on the loop variable, so build it once.
    prompt = "vllm1 This is a test prompt."

    for add_special in [False, True]:
        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

        response = requests.post(base_url + "/tokenize",
                                 json={
                                     "add_special_tokens": add_special,
                                     "model": model_name,
                                     "prompt": prompt
                                 })
        response.raise_for_status()

        assert response.json() == {
            "tokens": tokens,
            "count": len(tokens),
            "max_model_len": 8192
        }


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
                             tokenizer_name: str):
    """Check ``/tokenize`` on chat messages against a local tokenizer.

    Every combination of ``add_generation_prompt``, ``add_special_tokens``
    and ``continue_final_message`` is exercised, except the one where both
    ``add_generation_prompt`` and ``continue_final_message`` are true. The
    expected tokens come from applying the chat template locally and then
    encoding the rendered prompt; the response must equal those tokens,
    their count, and a ``max_model_len`` of 8192.
    """
    # Drop the last three characters of the client's base URL and any
    # surrounding slashes (presumably a trailing "v1/" -- confirm against
    # RemoteOpenAIServer) so endpoint paths can be appended directly.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    for add_generation in [False, True]:
        for add_special in [False, True]:
            # Rebuilt on every iteration: the continue_final branch below
            # appends to this list, and the next iteration needs it fresh.
            conversation = [{
                "role": "user",
                "content": "Hi there!"
            }, {
                "role": "assistant",
                "content": "Nice to meet you!"
            }, {
                "role": "user",
                "content": "Can I ask a question? vllm1"
            }]
            for continue_final in [False, True]:
                # This combination is skipped (presumably the two options
                # are mutually exclusive -- confirm against the server).
                if add_generation and continue_final:
                    continue
                if continue_final:
                    # Give the template a final assistant message to continue.
                    conversation.append({
                        "role": "assistant",
                        "content": "Sure,"
                    })

                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tokenize=False)
                tokens = tokenizer.encode(prompt,
                                          add_special_tokens=add_special)

                response = requests.post(base_url + "/tokenize",
                                         json={
                                             "add_generation_prompt":
                                             add_generation,
                                             "continue_final_message":
                                             continue_final,
                                             "add_special_tokens": add_special,
                                             "messages": conversation,
                                             "model": model_name
                                         })
                response.raise_for_status()

                assert response.json() == {
                    "tokens": tokens,
                    "count": len(tokens),
                    "max_model_len": 8192
                }
141
142
143
144


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
                          tokenizer_name: str):
    """Check that ``/detokenize`` restores the original prompt text.

    The prompt is encoded locally without special tokens using the
    tokenizer loaded from ``tokenizer_name``; the token ids are posted to
    the server and the response must be exactly ``{"prompt": prompt}``.
    """
    # Drop the last three characters of the client's base URL and any
    # surrounding slashes (presumably a trailing "v1/" -- confirm against
    # RemoteOpenAIServer) so endpoint paths can be appended directly.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    prompt = "This is a test prompt. vllm1"
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    # Diagnostic output identifying the target server and model.
    print(f"CALLING {base_url} FOR {model_name}")
    response = requests.post(base_url + "/detokenize",
                             json={
                                 "model": model_name,
                                 "tokens": tokens
                             })
    response.raise_for_status()

    assert response.json() == {"prompt": prompt}