test_embedding.py 19.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import base64
5
import json
6
7

import numpy as np
8
9
import openai
import pytest
10
import pytest_asyncio
11
import requests
12
13
import torch
import torch.nn.functional as F
14

15
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
16
17
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
18
19
from vllm.entrypoints.openai.protocol import (
    EmbeddingResponse,
20
    PoolingResponse,
21
)
22
from vllm.transformers_utils.tokenizer import get_tokenizer
23
24
25
26
27
28
29
from vllm.utils.serial_utils import (
    EMBED_DTYPE_TO_TORCH_DTYPE,
    ENDIANNESS,
    MetadataItem,
    binary2tensor,
    decode_pooling_output,
)
30

31
# Model passed to the server fixture and used as the default parametrization.
MODEL_NAME = "intfloat/multilingual-e5-small"
# Minimal Jinja chat template: renders each message as "<role>: <content>\n".
DUMMY_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{message['role'] + ': ' + message['content'] + '\\n'}}"
    "{% endfor %}"
)
# Precision handed to both the server (--dtype) and the hf_runner reference.
DTYPE = "bfloat16"
34
35
36


@pytest.fixture(scope="module")
def server():
    """Enter a ``RemoteOpenAIServer`` context for ``MODEL_NAME`` and yield it.

    The fixture is module-scoped, so every test in this module shares the
    same server handle.
    """
    runner_flags = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    precision_flags = ["--dtype", DTYPE]
    engine_flags = ["--enforce-eager", "--max-model-len", "512"]
    template_flags = ["--chat-template", DUMMY_CHAT_TEMPLATE]
    args = [*runner_flags, *precision_flags, *engine_flags, *template_flags]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
53
54


55
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from the ``server`` fixture."""
    client_context = server.get_async_client()
    async with client_context as async_client:
        yield async_client
59
60


61
62
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield a module-scoped reference model created through ``hf_runner``."""
    runner_context = hf_runner(
        MODEL_NAME,
        dtype=DTYPE,
        is_sentence_transformer=True,
    )
    with runner_context as reference_model:
        yield reference_model


67
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a single text prompt, then a single token-ID prompt.

    Both responses are validated against ``EmbeddingResponse`` and checked for
    size and usage counts; the text result is also compared with ``hf_model``.
    """
    # --- single text prompt ---
    prompts = ["The chef prepared a delicious meal."]
    raw_response = await client.embeddings.create(
        model=model_name, input=prompts, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 11
    assert usage.total_tokens == 11

    vllm_outputs = [item.embedding for item in parsed.data]
    run_embedding_correctness_test(hf_model, prompts, vllm_outputs)

    # --- single prompt supplied as token IDs ---
    token_ids = [1] * 5
    raw_response = await client.embeddings.create(
        model=model_name, input=token_ids, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 5
    assert usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a batch given as ``list[str]`` and then as ``list[list[int]]``.

    Each response is validated for item count, vector size and usage totals;
    the text batch is also compared against ``hf_model``.
    """
    # --- batch of strings ---
    sentences = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    raw_response = await client.embeddings.create(
        model=model_name, input=sentences, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 3
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 33
    assert usage.total_tokens == 33

    vllm_outputs = [item.embedding for item in parsed.data]
    run_embedding_correctness_test(hf_model, sentences, vllm_outputs)

    # --- batch of token-ID lists ---
    token_batches = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    raw_response = await client.embeddings.create(
        model=model_name, input=token_batches, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 4
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
163
164
165


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check a ``messages`` request matches the equivalent rendered prompt.

    The conversation is first posted as chat messages, then rendered locally
    with ``DUMMY_CHAT_TEMPLATE`` and sent as a plain ``input``; apart from
    ``id`` and ``created`` the two responses must be equal.
    """
    turns = (
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    )
    messages = [{"role": role, "content": content} for role, content in turns]

    # Chat-style request goes first; the ``created`` ordering below relies on it.
    chat_payload = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }
    chat_response = requests.post(server.url_for("v1/embeddings"), json=chat_payload)
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Render the same conversation locally and embed it as a plain prompt.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created

    # ``id`` and ``created`` legitimately differ between the two requests.
    per_request_fields = {"id", "created"}
    chat_dump = chat_embeddings.model_dump(exclude=per_request_fields)
    completion_dump = completion_embeddings.model_dump(exclude=per_request_fields)
    assert chat_dump == completion_dump
221
222
223
224


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Compare float, base64 and default-encoded embeddings with ``hf_model``."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # Explicit float encoding.
    float_response = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_vectors = [item.embedding for item in float_response.data]
    run_embedding_correctness_test(hf_model, input_texts, float_vectors)

    # Explicit base64 encoding, decoded here as float32.
    base64_response = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    decoded_vectors = [
        np.frombuffer(base64.b64decode(item.embedding), dtype="float32").tolist()
        for item in base64_response.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, decoded_vectors)

    # Default response is float32 decoded from base64 by OpenAI Client
    default_response = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_vectors = [item.embedding for item in default_response.data]
    run_embedding_correctness_test(hf_model, input_texts, default_vectors)
256
257


258
259
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check base64 output for every ``embed_dtype`` / ``endianness`` pair.

    A float-encoded response is the baseline. For each combination the base64
    payload is decoded with ``binary2tensor``, cast to float32 and compared
    with the baseline using ``tol=1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Fail with the HTTP error itself rather than a KeyError on
            # "data" when the server rejects a combination.
            responses_base64.raise_for_status()

            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
298
299
300
301


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check ``bytes`` output for every ``embed_dtype`` / ``endianness`` pair.

    A float-encoded response is the baseline. For each combination the raw
    body is decoded with ``decode_pooling_output`` using the items listed in
    the ``metadata`` response header, cast to float32 and compared with the
    baseline using ``tol=1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    # Iterate the mapping directly (no throwaway key list), matching the
    # base64 variant of this test.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Fail with the HTTP error itself rather than a KeyError on the
            # missing "metadata" header when the server rejects the request.
            responses_bytes.raise_for_status()

            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]

            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """Expect HTTP 400 when ``param_name`` carries an unsupported value."""
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]
    invalid_value = f"bad_{param_name}"

    payload = {
        "model": model_name,
        "input": input_texts,
        "encoding_format": "base64",
    }
    # Overrides "encoding_format" when that is the parameter under test.
    payload[param_name] = invalid_value

    responses_base64 = requests.post(server.url_for("/v1/embeddings"), json=payload)

    assert responses_base64.status_code == 400
    error_message = responses_base64.json()["error"]["message"]
    assert "literal_error" in error_message
    assert invalid_value in error_message
366
367


368
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """Check ``truncate_prompt_tokens=10`` caps reported usage at 10 tokens.

    Exercised once with a text prompt and once with a 21-element token-ID
    prompt.
    """
    truncation = {"truncate_prompt_tokens": 10}

    # --- text prompt ---
    prompts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    raw_response = await client.embeddings.create(
        model=model_name, input=prompts, extra_body=truncation
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 10
    assert usage.total_tokens == 10

    # --- token-ID prompt ---
    # fmt: off
    token_ids = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871,
        28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2,
    ]
    # fmt: on
    raw_response = await client.embeddings.create(
        model=model_name, input=token_ids, extra_body=truncation
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 10
    assert usage.total_tokens == 10


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """Expect a 400 error for an oversized ``truncate_prompt_tokens`` value.

    The request asks for 8193 tokens, and the client must raise
    ``openai.BadRequestError``.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Keep only the raising call inside ``pytest.raises``: anything placed
    # after it in the block is skipped once the exception propagates, so
    # assertions written there would never execute.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )

    assert exc_info.value.status_code == 400
    # NOTE(review): the earlier (unreachable) checks looked for the text
    # "truncate_prompt_tokens value is greater than max_model_len. Please,
    # select a smaller truncation size." - confirm the server's exact wording
    # before asserting on the message here.
448
449
450


@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """Check ``invocations`` mirrors the embeddings API for a text input.

    The same request arguments go through the OpenAI client and through a
    direct POST to ``invocations``; top-level keys, per-item keys and the
    embedding values must agree.
    """
    request_args = {
        "model": MODEL_NAME,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
    }

    completion_response = await client.embeddings.create(**request_args)

    invocation_url = server.url_for("invocations")
    invocation_response = requests.post(invocation_url, json=request_args)
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()

    paired_items = zip(completion_output["data"], invocation_output["data"])
    for completion_item, invocation_item in paired_items:
        assert completion_item.keys() == invocation_item.keys()
        check_embeddings_close(
            embeddings_0_lst=[completion_item["embedding"]],
            embeddings_1_lst=[invocation_item["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
483
484
485
486


@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """Check ``invocations`` mirrors ``v1/embeddings`` for a ``messages`` input.

    The same payload is posted to both routes; top-level keys, per-item keys
    and the embedding values must agree.
    """
    turns = (
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    )
    request_args = {
        "model": MODEL_NAME,
        "messages": [{"role": role, "content": content} for role, content in turns],
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocation_url = server.url_for("invocations")
    invocation_response = requests.post(invocation_url, json=request_args)
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()

    paired_items = zip(chat_output["data"], invocation_output["data"])
    for chat_item, invocation_item in paired_items:
        assert chat_item.keys() == invocation_item.keys()
        check_embeddings_close(
            embeddings_0_lst=[chat_item["embedding"]],
            embeddings_1_lst=[invocation_item["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
530
531
532
533
534
535
536
537
538
539
540
541


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check the effect of the ``normalize`` request field on embeddings.

    Three requests are made (``normalize`` set to ``None``, ``True`` and
    ``False``). The default must match the normalized output, the
    un-normalized output must differ from it, and L2-normalizing the
    un-normalized output must reproduce the normalized one (``atol=1e-2``).
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        """POST one embeddings request and return the vectors as a tensor."""
        request_args = {
            # Use the parametrized model rather than the module constant so
            # the test honours its own ``model_name`` argument.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }

        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        # Surface HTTP failures directly instead of a KeyError on "data".
        response.raise_for_status()
        outputs = response.json()

        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
561
562
563
564


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """POST an ``embed`` task to ``pooling`` and expect one 384-long result."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "embed",
    }

    response = requests.post(server.url_for("pooling"), json=payload)
    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    first_item = poolings.data[0]
    assert len(first_item.data) == 384


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """POST a ``token_embed`` task to ``pooling`` and expect an 11 x 384 result."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "token_embed",
    }

    response = requests.post(server.url_for("pooling"), json=payload)
    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    per_token_rows = poolings.data[0].data
    assert len(per_token_rows) == 11
    assert len(per_token_rows[0]) == 384
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Expect a ``BadRequestError`` payload when ``pooling`` gets ``task``."""
    payload = {
        "model": model_name,
        "input": "test",
        "encoding_format": "float",
        "task": task,
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    error = response.json()["error"]
    assert error["type"] == "BadRequestError"
    expected_prefix = f"Task {task} is not supported"
    assert error["message"].startswith(expected_prefix)