test_embedding.py 8.72 KB
Newer Older
1
2
3
import base64

import numpy as np
4
5
import openai
import pytest
6
import os
7
import pytest_asyncio
8
9
10
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer
11

12
from ...utils import RemoteOpenAIServer, models_path_prefix
13

zhuwenwen's avatar
zhuwenwen committed
14
# Model passed to the server fixture and used to parametrize the tests below;
# the Hugging Face style name is joined onto the imported models_path_prefix.
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
15
# Jinja chat template that, for every message, concatenates the role, ': ',
# the content and '\n'. It is handed to the server via --chat-template and
# reused client-side in test_conversation_embedding to render the same prompt.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
16
17
18


@pytest.fixture(scope="module")
def server():
    """Start one embedding server per test module and yield its handle.

    The server is entered as a context manager, so the context is exited
    once the last test of the module has finished with the fixture.
    """
    # Half precision keeps the CI run fast and light on memory.
    precision_flags = ["--dtype", "bfloat16"]
    context_flags = ["--max-model-len", "8192"]
    template_flags = ["--chat-template", DUMMY_CHAT_TEMPLATE]

    launch_args = [
        *precision_flags,
        "--enforce-eager",
        *context_flags,
        *template_flags,
    ]

    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server
33
34


35
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture.

    The client is used as an async context manager, so it is released when
    the requesting test completes.
    """
    async with server.get_async_client() as api_client:
        yield api_client
39
40
41


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
    """Embed a single prompt, once as text and once as raw token IDs.

    Both requests must return exactly one 4096-wide vector and report a
    usage that counts prompt tokens only.
    """
    # Case 1: a one-element list of strings.
    text_prompt = ["The chef prepared a delicious meal."]
    text_result = await client.embeddings.create(
        model=model_name,
        input=text_prompt,
        encoding_format="float",
    )
    assert text_result.id is not None
    assert len(text_result.data) == 1
    assert len(text_result.data[0].embedding) == 4096
    text_usage = text_result.usage
    assert text_usage.completion_tokens == 0
    assert text_usage.prompt_tokens == 9
    assert text_usage.total_tokens == 9

    # Case 2: the prompt supplied directly as a flat list of token IDs.
    token_prompt = [1, 1, 1, 1, 1]
    token_result = await client.embeddings.create(
        model=model_name,
        input=token_prompt,
        encoding_format="float",
    )
    assert token_result.id is not None
    assert len(token_result.data) == 1
    assert len(token_result.data[0].embedding) == 4096
    token_usage = token_result.usage
    assert token_usage.completion_tokens == 0
    assert token_usage.prompt_tokens == 5
    assert token_usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
    """Embed several prompts in one request, as strings and as token lists.

    Each request must return one 4096-wide vector per prompt, and the usage
    must report the summed prompt token count with no completion tokens.
    """
    # Case 1: List[str]
    sentences = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_batch = await client.embeddings.create(
        model=model_name,
        input=sentences,
        encoding_format="float",
    )
    assert text_batch.id is not None
    assert len(text_batch.data) == 3
    assert len(text_batch.data[0].embedding) == 4096
    text_usage = text_batch.usage
    assert text_usage.completion_tokens == 0
    assert text_usage.prompt_tokens == 32
    assert text_usage.total_tokens == 32

    # Case 2: List[List[int]] -- four prompts holding 5 + 3 + 5 + 4 tokens.
    token_lists = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_batch = await client.embeddings.create(
        model=model_name,
        input=token_lists,
        encoding_format="float",
    )
    assert token_batch.id is not None
    assert len(token_batch.data) == 4
    assert len(token_batch.data[0].embedding) == 4096
    token_usage = token_batch.usage
    assert token_usage.completion_tokens == 0
    assert token_usage.prompt_tokens == 17
    assert token_usage.total_tokens == 17
110
111
112


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
                                      client: openai.AsyncOpenAI,
                                      model_name: str):
    """Compare embedding a chat payload with embedding its rendered prompt.

    The conversation is first posted as ``messages`` to the embeddings
    endpoint. It is then rendered locally with the same chat template and
    sent as a plain ``input`` string. Apart from ``id`` and ``created`` the
    two response bodies must be equal.
    """
    conversation = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # Chat-style request goes over raw HTTP with a "messages" field.
    chat_payload = {
        "model": model_name,
        "messages": conversation,
        "encoding_format": "float",
    }
    endpoint = server.url_for("v1/embeddings")
    chat_response = requests.post(endpoint, json=chat_payload)
    chat_response.raise_for_status()
    chat_body = chat_response.json()

    # Render the same conversation to text on the client side.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    rendered_prompt = tokenizer.apply_chat_template(
        conversation,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=rendered_prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_body = completion_response.model_dump(mode="json")

    # Strip the per-request fields one at a time before comparing the rest.
    chat_id = chat_body.pop("id")
    assert chat_id is not None
    completion_id = completion_body.pop("id")
    assert completion_id is not None
    chat_created = chat_body.pop("created")
    completion_created = completion_body.pop("created")
    assert chat_created <= completion_created
    assert chat_body == completion_body


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check that float, base64 and default encodings give equal vectors.

    The same two prompts are embedded three times: with
    ``encoding_format="float"``, with ``encoding_format="base64"`` (decoded
    here as float32), and with no explicit encoding format.
    """
    prompts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models"
    ]

    float_result = await client.embeddings.create(
        input=prompts,
        model=model_name,
        encoding_format="float",
    )
    base64_result = await client.embeddings.create(
        input=prompts,
        model=model_name,
        encoding_format="base64",
    )

    # Decode every base64 payload into a plain list of float32 values.
    decoded_vectors = [
        np.frombuffer(base64.b64decode(item.embedding),
                      dtype="float32").tolist()
        for item in base64_result.data
    ]

    assert float_result.data[0].embedding == decoded_vectors[0]
    assert float_result.data[1].embedding == decoded_vectors[1]

    # Default response is float32 decoded from base64 by OpenAI Client
    default_result = await client.embeddings.create(
        input=prompts,
        model=model_name,
    )

    assert float_result.data[0].embedding == default_result.data[0].embedding
    assert float_result.data[1].embedding == default_result.data[1].embedding
197
198
199


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    """Embed one prompt with ``truncate_prompt_tokens=10``.

    The prompt is sent as text and then as a 21-element token list. In both
    cases the usage must report exactly 10 prompt tokens.
    """
    truncation = {"truncate_prompt_tokens": 10}

    # Case 1: text prompt.
    text_prompt = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    text_result = await client.embeddings.create(
        model=model_name,
        input=text_prompt,
        extra_body=truncation,
    )
    assert text_result.id is not None
    assert len(text_result.data) == 1
    assert len(text_result.data[0].embedding) == 4096
    text_usage = text_result.usage
    assert text_usage.completion_tokens == 0
    assert text_usage.prompt_tokens == 10
    assert text_usage.total_tokens == 10

    # Case 2: prompt given as token IDs.
    token_prompt = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
    token_result = await client.embeddings.create(
        model=model_name,
        input=token_prompt,
        extra_body=truncation,
    )

    assert token_result.id is not None
    assert len(token_result.data) == 1
    assert len(token_result.data[0].embedding) == 4096
    token_usage = token_result.usage
    assert token_usage.completion_tokens == 0
    assert token_usage.prompt_tokens == 10
    assert token_usage.total_tokens == 10


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                   model_name: str):
    """Reject a ``truncate_prompt_tokens`` value above the model length.

    The request asks for truncation to 8193 tokens, one more than the
    ``--max-model-len`` of 8192 the server fixture is started with. The
    client must raise ``openai.BadRequestError`` and the error text must
    carry the server's explanation.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the failing call belongs inside the ``raises`` block: statements
    # placed after it in the block never run once the call raises, so the
    # message check is done on the captured exception afterwards.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})

    # NOTE(review): assumes the server's message is included in the string
    # form of the client exception -- confirm against the openai client.
    error_text = str(exc_info.value)
    assert ("truncate_prompt_tokens value is greater than max_model_len. "
            "Please, select a smaller truncation size." in error_text)