"vscode:/vscode.git/clone" did not exist on "6f866c452fc870273670c9cf9235e855edd33111"
test_embedding.py 12.1 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
import base64

import numpy as np
7
8
import openai
import pytest
9
import pytest_asyncio
10
11
import requests

12
from vllm.entrypoints.openai.protocol import EmbeddingResponse
13
from vllm.transformers_utils.tokenizer import get_tokenizer
14

15
16
from ...models.language.pooling.embed_utils import (
    run_embedding_correctness_test)
17
from ...utils import RemoteOpenAIServer
18

19
# Embedding model served and queried by every test in this module.
MODEL_NAME = "intfloat/multilingual-e5-small"
# Minimal Jinja chat template: one "<role>: <content>\n" line per message.
DUMMY_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{message['role'] + ': ' + message['content'] + '\\n'}}"
    "{% endfor %}"
)
# Precision string passed to both the server and the reference model.
DTYPE = "bfloat16"
22
23


24
25
26
27
28
29
30
31
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that requests ``run_with_both_engines`` for each test.

    The fixture does no work of its own. It can be promoted up to
    conftest.py to apply to every test in a package.
    """


32
@pytest.fixture(scope="module")
def server():
    """Run a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    The fixture is module-scoped and the server context is exited when the
    fixture is torn down.
    """
    cli_flags = ["--task", "embed"]
    # use half precision for speed and memory savings in CI environment
    cli_flags += ["--dtype", DTYPE]
    cli_flags += ["--enforce-eager", "--max-model-len", "512"]
    cli_flags += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as remote_server:
        yield remote_server
49
50


51
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture."""
    client_ctx = server.get_async_client()
    async with client_ctx as async_client:
        yield async_client
55
56


57
58
59
60
61
62
63
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield the reference model that ``hf_runner`` builds for MODEL_NAME."""
    runner_ctx = hf_runner(
        MODEL_NAME,
        dtype=DTYPE,
        is_sentence_transformer=True,
    )
    with runner_ctx as reference_model:
        yield reference_model


64
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
                                model_name: str):
    """Embed one text prompt, then one token-ID prompt, and check each reply.

    The text reply is additionally compared against ``hf_model`` through
    ``run_embedding_correctness_test``.
    """
    expected_dim = 384
    input_texts = ["The chef prepared a delicious meal."]

    # test single embedding
    text_reply = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    text_result = EmbeddingResponse.model_validate(
        text_reply.model_dump(mode="json"))

    assert text_result.id is not None
    assert len(text_result.data) == 1
    assert len(text_result.data[0].embedding) == expected_dim
    text_usage = text_result.usage
    assert text_usage.completion_tokens == 0
    assert text_usage.prompt_tokens == 11
    assert text_usage.total_tokens == 11

    run_embedding_correctness_test(
        hf_model,
        input_texts,
        [item.embedding for item in text_result.data],
    )

    # test using token IDs
    input_tokens = [1] * 5
    token_reply = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    token_result = EmbeddingResponse.model_validate(
        token_reply.model_dump(mode="json"))

    assert token_result.id is not None
    assert len(token_result.data) == 1
    assert len(token_result.data[0].embedding) == expected_dim
    token_usage = token_result.usage
    assert token_usage.completion_tokens == 0
    assert token_usage.prompt_tokens == 5
    assert token_usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
                               model_name: str):
    """Embed a batch of texts, then a batch of token-ID lists.

    The text batch is additionally compared against ``hf_model`` through
    ``run_embedding_correctness_test``.
    """
    expected_dim = 384

    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_reply = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    text_result = EmbeddingResponse.model_validate(
        text_reply.model_dump(mode="json"))

    assert text_result.id is not None
    assert len(text_result.data) == 3
    assert len(text_result.data[0].embedding) == expected_dim
    text_usage = text_result.usage
    assert text_usage.completion_tokens == 0
    assert text_usage.prompt_tokens == 33
    assert text_usage.total_tokens == 33

    run_embedding_correctness_test(
        hf_model,
        input_texts,
        [item.embedding for item in text_result.data],
    )

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_reply = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    token_result = EmbeddingResponse.model_validate(
        token_reply.model_dump(mode="json"))

    assert token_result.id is not None
    assert len(token_result.data) == 4
    assert len(token_result.data[0].embedding) == expected_dim
    token_usage = token_result.usage
    assert token_usage.completion_tokens == 0
    assert token_usage.prompt_tokens == 17
    assert token_usage.total_tokens == 17
153
154
155


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
                                      client: openai.AsyncOpenAI,
                                      model_name: str):
    """Compare a ``messages`` embedding request with its rendered prompt.

    The conversation is posted to ``v1/embeddings`` as ``messages``; the same
    conversation is then rendered with ``DUMMY_CHAT_TEMPLATE`` and embedded as
    a plain ``input`` string. Apart from ``id`` and ``created`` the two
    responses must be equal.
    """

    def comparable(response):
        # Drop the fields that legitimately differ between two requests.
        return response.model_dump(exclude={"id", "created"})

    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    chat_payload = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }
    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json=chat_payload)
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    # To be consistent with chat
    prompt_options = {"add_special_tokens": False}
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        extra_body=prompt_options,
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json"))

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created
    assert comparable(chat_embeddings) == comparable(completion_embeddings)
206
207
208
209


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check float, base64 and default encodings against ``hf_model``.

    The same two prompts are requested three times, and each set of
    embeddings is passed to ``run_embedding_correctness_test``.
    """
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # Explicit float encoding.
    responses_float = await client.embeddings.create(
        input=input_texts,
        model=model_name,
        encoding_format="float",
    )
    float_data = [item.embedding for item in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    # Explicit base64 encoding: decode each payload as float32 values.
    responses_base64 = await client.embeddings.create(
        input=input_texts,
        model=model_name,
        encoding_format="base64",
    )
    base64_data = [
        np.frombuffer(base64.b64decode(item.embedding),
                      dtype="float32").tolist()
        for item in responses_base64.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(
        input=input_texts,
        model=model_name,
    )
    default_data = [item.embedding for item in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
239
240
241


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    """Request ``truncate_prompt_tokens=10`` for text and token-ID prompts.

    In both cases the reported prompt and total token counts must equal 10.
    """
    expected_dim = 384
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # test single embedding
    text_reply = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        extra_body={"truncate_prompt_tokens": 10},
    )
    text_result = EmbeddingResponse.model_validate(
        text_reply.model_dump(mode="json"))

    assert text_result.id is not None
    assert len(text_result.data) == 1
    assert len(text_result.data[0].embedding) == expected_dim
    text_usage = text_result.usage
    assert text_usage.completion_tokens == 0
    assert text_usage.prompt_tokens == 10
    assert text_usage.total_tokens == 10

    # Same request shape, but with a prompt given as token IDs.
    input_tokens = [
        1, 24428, 289, 18341, 26165, 285, 19323,
        283, 289, 26789, 3871, 28728, 9901, 340,
        2229, 385, 340, 315, 28741, 28804, 2,
    ]
    token_reply = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        extra_body={"truncate_prompt_tokens": 10},
    )
    token_result = EmbeddingResponse.model_validate(
        token_reply.model_dump(mode="json"))

    assert token_result.id is not None
    assert len(token_result.data) == 1
    assert len(token_result.data[0].embedding) == expected_dim
    token_usage = token_result.usage
    assert token_usage.completion_tokens == 0
    assert token_usage.prompt_tokens == 10
    assert token_usage.total_tokens == 10


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                   model_name: str):
    """Expect ``openai.BadRequestError`` for ``truncate_prompt_tokens=8193``.

    The test passes only if the embeddings request raises
    ``openai.BadRequestError``; ``pytest.raises`` fails it otherwise.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the raising call belongs inside the context manager: statements
    # placed after it never execute once the exception propagates, so
    # assertions on a ``response`` object there would be dead code.
    with pytest.raises(openai.BadRequestError):
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358


@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer,
                           client: openai.AsyncOpenAI):
    """Compare the ``invocations`` route with the client embeddings call.

    The same request is sent through the OpenAI client and posted to
    ``invocations``; both replies must have the same top-level keys and the
    same ``data``.
    """
    request_args = dict(
        model=MODEL_NAME,
        input=["The chef prepared a delicious meal."],
        encoding_format="float",
    )

    completion_response = await client.embeddings.create(**request_args)

    invocation_url = server.url_for("invocations")
    invocation_response = requests.post(invocation_url, json=request_args)
    invocation_response.raise_for_status()

    expected = completion_response.model_dump()
    actual = invocation_response.json()

    assert expected.keys() == actual.keys()
    assert expected["data"] == actual["data"]


@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """Compare ``invocations`` with ``v1/embeddings`` for a chat request.

    The same ``messages`` payload is posted to both routes; both replies must
    have the same top-level keys and the same ``data``.
    """
    conversation = (
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    )
    messages = [{
        "role": role,
        "content": content,
    } for role, content in conversation]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "encoding_format": "float",
    }

    # Query the regular embeddings route first, then the invocations route.
    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(server.url_for("invocations"),
                                        json=request_args)
    invocation_response.raise_for_status()

    expected = chat_response.json()
    actual = invocation_response.json()

    assert expected.keys() == actual.keys()
    assert expected["data"] == actual["data"]