# test_embedding.py 13.9 KB
# (code-viewer navigation residue: "Newer Older")
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
import base64

import numpy as np
7
8
import openai
import pytest
9
import pytest_asyncio
10
import requests
11
12
import torch
import torch.nn.functional as F
13

14
15
16
17
from tests.models.language.pooling.embed_utils import (
    run_embedding_correctness_test)
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
18
from vllm.entrypoints.openai.protocol import EmbeddingResponse
19
from vllm.transformers_utils.tokenizer import get_tokenizer
20

21
# Embedding model served by the test server and used as the reference model.
MODEL_NAME = "intfloat/multilingual-e5-small"

# Minimal Jinja chat template: renders each message as "<role>: <content>\n".
DUMMY_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{message['role'] + ': ' + message['content'] + '\\n'}}"
    "{% endfor %}"
)

# Precision passed to both the server (--dtype) and the reference model.
DTYPE = "bfloat16"
24
25
26


@pytest.fixture(scope="module")
def server():
    """Start one vLLM OpenAI-compatible server for the whole module.

    The server runs ``MODEL_NAME`` with the pooling runner and is torn down
    when the ``RemoteOpenAIServer`` context exits.
    """
    runner_flags = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    precision_flags = ["--dtype", DTYPE]
    engine_flags = ["--enforce-eager", "--max-model-len", "512"]
    template_flags = ["--chat-template", DUMMY_CHAT_TEMPLATE]
    cli_args = [
        *runner_flags,
        *precision_flags,
        *engine_flags,
        *template_flags,
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
43
44


45
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the module's test server."""
    client_context = server.get_async_client()
    async with client_context as async_client:
        yield async_client
49
50


51
52
53
54
55
56
57
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield the ``hf_runner`` model for ``MODEL_NAME``, once per module.

    The runner is opened with ``is_sentence_transformer=True`` and the same
    ``DTYPE`` the server uses.
    """
    runner_context = hf_runner(MODEL_NAME,
                               dtype=DTYPE,
                               is_sentence_transformer=True)
    with runner_context as reference_model:
        yield reference_model


58
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
                                model_name: str):
    """Embed a single prompt, first as text and then as raw token IDs."""

    async def embed(prompt_input) -> EmbeddingResponse:
        # Round-trip through JSON so the reply is validated against the
        # EmbeddingResponse schema.
        raw = await client.embeddings.create(
            model=model_name,
            input=prompt_input,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    def check_shape_and_usage(result, expected_tokens):
        assert result.id is not None
        assert len(result.data) == 1
        assert len(result.data[0].embedding) == 384
        assert result.usage.completion_tokens == 0
        assert result.usage.prompt_tokens == expected_tokens
        assert result.usage.total_tokens == expected_tokens

    # test single embedding
    input_texts = ["The chef prepared a delicious meal."]
    text_result = await embed(input_texts)
    check_shape_and_usage(text_result, 11)

    vllm_outputs = [item.embedding for item in text_result.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test using token IDs
    token_result = await embed([1, 1, 1, 1, 1])
    check_shape_and_usage(token_result, 5)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
                               model_name: str):
    """Embed batches given as ``list[str]`` and as ``list[list[int]]``."""

    async def embed(batch_input) -> EmbeddingResponse:
        # Round-trip through JSON so the reply is validated against the
        # EmbeddingResponse schema.
        raw = await client.embeddings.create(
            model=model_name,
            input=batch_input,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    def check_shape_and_usage(result, expected_items, expected_tokens):
        assert result.id is not None
        assert len(result.data) == expected_items
        assert len(result.data[0].embedding) == 384
        assert result.usage.completion_tokens == 0
        assert result.usage.prompt_tokens == expected_tokens
        assert result.usage.total_tokens == expected_tokens

    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_result = await embed(input_texts)
    check_shape_and_usage(text_result, 3, 33)

    vllm_outputs = [item.embedding for item in text_result.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_result = await embed(input_tokens)
    check_shape_and_usage(token_result, 4, 17)
147
148
149


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
                                      client: openai.AsyncOpenAI,
                                      model_name: str):
    """Compare a ``messages`` request with the same prompt sent as ``input``.

    The conversation is rendered locally with ``DUMMY_CHAT_TEMPLATE`` and the
    two responses must match apart from ``id`` and ``created``.
    """
    turns = [
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    ]
    messages = [{"role": role, "content": text} for role, text in turns]

    # Chat-style request: posted directly over HTTP with a "messages" field.
    chat_payload = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }
    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json=chat_payload)
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Completion-style request: render the template locally, send the text.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json"))

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created

    # "id" and "created" legitimately differ between the two requests.
    per_request_fields = {"id", "created"}
    chat_payload_out = chat_embeddings.model_dump(exclude=per_request_fields)
    completion_payload_out = completion_embeddings.model_dump(
        exclude=per_request_fields)
    assert chat_payload_out == completion_payload_out
200
201
202
203


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check float, base64 and default encodings against the reference."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # Explicit float encoding.
    responses_float = await client.embeddings.create(
        input=input_texts,
        model=model_name,
        encoding_format="float",
    )
    float_data = [item.embedding for item in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    # Explicit base64 encoding: decode each payload as float32 values.
    responses_base64 = await client.embeddings.create(
        input=input_texts,
        model=model_name,
        encoding_format="base64",
    )
    base64_data = [
        np.frombuffer(base64.b64decode(item.embedding),
                      dtype="float32").tolist()
        for item in responses_base64.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(
        input=input_texts,
        model=model_name,
    )
    default_data = [item.embedding for item in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
233
234
235


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    """Request ``truncate_prompt_tokens=10`` for text and token-ID prompts.

    In both cases the reported prompt token count must equal 10.
    """

    async def embed_truncated(prompt_input) -> EmbeddingResponse:
        raw = await client.embeddings.create(
            model=model_name,
            input=prompt_input,
            extra_body={"truncate_prompt_tokens": 10},
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    def check_truncated(result):
        assert result.id is not None
        assert len(result.data) == 1
        assert len(result.data[0].embedding) == 384
        assert result.usage.completion_tokens == 0
        assert result.usage.prompt_tokens == 10
        assert result.usage.total_tokens == 10

    # test single embedding
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    check_truncated(await embed_truncated(input_texts))

    # Same check with a prompt supplied as 21 token IDs.
    input_tokens = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
    check_truncated(await embed_truncated(input_tokens))


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                   model_name: str):
    """Expect a bad-request error for an oversized ``truncate_prompt_tokens``.

    The request asks for a truncation size of 8193 tokens, and the call must
    raise ``openai.BadRequestError``.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the failing call belongs inside the ``raises`` block: anything
    # placed after it in the block is skipped once the exception is thrown.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})

    assert exc_info.value.status_code == 400
    # NOTE(review): the exact server wording is not visible from this file,
    # so only require that the error names the offending parameter.
    assert "truncate_prompt_tokens" in str(exc_info.value)
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317


@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer,
                           client: openai.AsyncOpenAI):
    """Check that ``invocations`` matches the embeddings client response.

    The same request body is sent through the OpenAI client and posted to the
    ``invocations`` route; both replies must have the same keys and close
    embeddings.
    """
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    request_args = {
        "model": MODEL_NAME,
        "input": input_texts,
        "encoding_format": "float",
    }

    completion_response = await client.embeddings.create(**request_args)

    invocation_response = requests.post(server.url_for("invocations"),
                                        json=request_args)
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()

    completion_items = completion_output["data"]
    invocation_items = invocation_output["data"]
    # zip() stops at the shorter list, so compare the lengths explicitly to
    # catch a response that is missing embeddings.
    assert len(completion_items) == len(invocation_items)

    for completion_data, invocation_data in zip(completion_items,
                                                invocation_items):
        assert completion_data.keys() == invocation_data.keys()
        check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
                               embeddings_1_lst=[invocation_data["embedding"]],
                               name_0="completion",
                               name_1="invocation")
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357


@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """Check that ``invocations`` matches ``v1/embeddings`` for chat input.

    The same ``messages`` request body is posted to both routes; the replies
    must have the same keys and close embeddings.
    """
    messages = [{
        "role": "user",
        "content": "The cat sat on the mat.",
    }, {
        "role": "assistant",
        "content": "A feline was resting on a rug.",
    }, {
        "role": "user",
        "content": "Stars twinkle brightly in the night sky.",
    }]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(server.url_for("invocations"),
                                        json=request_args)
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()

    chat_items = chat_output["data"]
    invocation_items = invocation_output["data"]
    # zip() stops at the shorter list, so compare the lengths explicitly to
    # catch a response that is missing embeddings.
    assert len(chat_items) == len(invocation_items)

    for chat_data, invocation_data in zip(chat_items, invocation_items):
        assert chat_data.keys() == invocation_data.keys()
        check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
                               embeddings_1_lst=[invocation_data["embedding"]],
                               name_0="chat",
                               name_1="invocation")
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check how the ``normalize`` request field affects the embedding.

    Three requests are made with ``normalize`` set to ``None``, ``True`` and
    ``False``. The default must match ``True``, ``False`` must differ, and
    L2-normalizing the ``False`` output must reproduce the ``True`` output.
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        """Post one embeddings request and return the vectors as a tensor."""
        request_args = {
            # Use the parametrized name so the test follows its parameter.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize
        }

        response = requests.post(server.url_for("v1/embeddings"),
                                 json=request_args)
        # Fail here on an HTTP error rather than with a KeyError on "data".
        response.raise_for_status()
        outputs = response.json()

        return torch.tensor([x['embedding'] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal,
                          atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal,
                              atol=1e-2), "wo_normal should not use normal."
    assert torch.allclose(
        w_normal, F.normalize(wo_normal, p=2, dim=-1),
        atol=1e-2), "w_normal should be close to normal(wo_normal)."