test_online.py 21.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import base64
5
import json
6
7

import numpy as np
8
9
import openai
import pytest
10
import pytest_asyncio
11
import requests
12
13
import torch
import torch.nn.functional as F
14

15
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
16
17
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
18
19
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
20
from vllm.platforms import current_platform
21
from vllm.tokenizers import get_tokenizer
22
23
24
25
26
from vllm.utils.serial_utils import (
    EMBED_DTYPE_TO_TORCH_DTYPE,
    ENDIANNESS,
    MetadataItem,
    binary2tensor,
27
    build_metadata_items,
28
29
    decode_pooling_output,
)
30

31
32
33
34
35
# Skip every test in this module when running on ROCm; the reason is given in
# the skip message. `allow_module_level=True` is what permits calling
# `pytest.skip` here, outside of any test function.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

36
# Model passed to the server fixture, the reference runner and the requests.
MODEL_NAME = "intfloat/multilingual-e5-small"

# Chat template handed to the server via --chat-template: for every message it
# emits role + ': ' + content + '\n'. Written as adjacent literals so each line
# stays short; the resulting string is unchanged.
DUMMY_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{message['role'] + ': ' + message['content'] + '\\n'}}"
    "{% endfor %}"
)

# dtype used for both the served model (--dtype) and the reference model.
DTYPE = "bfloat16"
39
40
41


@pytest.fixture(scope="module")
def server():
    """Start one `RemoteOpenAIServer` for MODEL_NAME and share it module-wide."""
    cli_args = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    cli_args += ["--dtype", DTYPE]
    cli_args += ["--enforce-eager"]
    cli_args += ["--max-model-len", "512"]
    cli_args += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
58
59


60
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from `server.get_async_client()`."""
    async with server.get_async_client() as openai_client:
        yield openai_client
64
65


66
67
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield the reference model built by `hf_runner` for MODEL_NAME.

    The runner is created with the module's DTYPE and
    `is_sentence_transformer=True`, and is kept open for the whole module.
    """
    runner_ctx = hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True)
    with runner_ctx as reference_model:
        yield reference_model


72
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a single input, first as text and then as token IDs.

    Both responses are validated against `EmbeddingResponse` and checked for
    item count, embedding length and token usage; the text result is also
    compared with the reference model.
    """
    prompts = ["The chef prepared a delicious meal."]

    # --- single text prompt ---
    raw_response = await client.embeddings.create(
        model=model_name, input=prompts, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 11
    assert usage.total_tokens == 11

    served_vectors = [item.embedding for item in parsed.data]
    run_embedding_correctness_test(hf_model, prompts, served_vectors)

    # --- single prompt given as token IDs ---
    token_ids = [1] * 5
    raw_response = await client.embeddings.create(
        model=model_name, input=token_ids, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 5
    assert usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a batch given as list[str] and then as list[list[int]].

    Each response is validated against `EmbeddingResponse` and checked for
    item count, embedding length and token usage; the text batch is also
    compared with the reference model.
    """
    # --- batch as list[str] ---
    sentences = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    raw_response = await client.embeddings.create(
        model=model_name, input=sentences, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 3
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 33
    assert usage.total_tokens == 33

    served_vectors = [item.embedding for item in parsed.data]
    run_embedding_correctness_test(hf_model, sentences, served_vectors)

    # --- batch as list[list[int]] ---
    token_batches = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    raw_response = await client.embeddings.create(
        model=model_name, input=token_batches, encoding_format="float"
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 4
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
168
169
170


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Compare embedding a `messages` list with embedding the rendered prompt.

    The conversation is posted to `v1/embeddings` directly, then rendered
    locally with DUMMY_CHAT_TEMPLATE and embedded through the client. Apart
    from `id` and `created`, the two responses must be equal.
    """
    messages = [
        {"role": "user", "content": "The cat sat on the mat."},
        {"role": "assistant", "content": "A feline was resting on a rug."},
        {"role": "user", "content": "Stars twinkle brightly in the night sky."},
    ]

    # Chat-style request, sent as raw JSON with a "messages" field.
    chat_payload = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }
    chat_http = requests.post(server.url_for("v1/embeddings"), json=chat_payload)
    chat_http.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_http.json())

    # Render the same conversation locally and embed it as a plain prompt.
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    prompt_response = await client.embeddings.create(
        model=model_name,
        input=rendered_prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        prompt_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created

    # id and created are excluded from the comparison.
    ignored_fields = {"id", "created"}
    chat_dump = chat_embeddings.model_dump(exclude=ignored_fields)
    completion_dump = completion_embeddings.model_dump(exclude=ignored_fields)
    assert chat_dump == completion_dump
226
227
228
229


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Check float, base64 and default encodings against the reference model."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # Explicit float encoding.
    float_response = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_vectors = [item.embedding for item in float_response.data]
    run_embedding_correctness_test(hf_model, input_texts, float_vectors)

    # Explicit base64 encoding: decode each payload as float32 ourselves.
    base64_response = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    decoded_vectors = [
        np.frombuffer(base64.b64decode(item.embedding), dtype="float32").tolist()
        for item in base64_response.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, decoded_vectors)

    # Default response is float32 decoded from base64 by OpenAI Client
    default_response = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_vectors = [item.embedding for item in default_response.data]
    run_embedding_correctness_test(hf_model, input_texts, default_vectors)
261
262


263
264
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check base64 output for every embed_dtype / endianness combination.

    A float-encoded response serves as the baseline. For each key of
    EMBED_DTYPE_TO_TORCH_DTYPE and each entry of ENDIANNESS, the base64
    payload is decoded with `binary2tensor`, cast to float32 and compared to
    the baseline with `check_embeddings_close` (tol=1e-2).
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Surface a rejected combination as an HTTP error instead of an
            # opaque KeyError when indexing "data" below.
            responses_base64.raise_for_status()

            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
303
304
305
306


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check "bytes" output for every embed_dtype / endianness combination.

    A float-encoded response serves as the baseline. For each combination the
    response's "metadata" header is parsed into `MetadataItem`s, the raw body
    is decoded with `decode_pooling_output`, cast to float32 and compared to
    the baseline with `check_embeddings_close` (tol=1e-2).
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    # Iterating the mapping yields its keys; no intermediate list is needed.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Surface a rejected combination as an HTTP error instead of a
            # KeyError on the missing "metadata" header below.
            responses_bytes.raise_for_status()

            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]

            decoded = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in decoded]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )


348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check "bytes_only" output for every embed_dtype / endianness combination.

    The response must not carry a "metadata" header, so the metadata items
    are rebuilt locally with `build_metadata_items` from the requested dtype,
    endianness, the baseline embedding length and the number of inputs. The
    decoded body is cast to float32 and compared to a float-encoded baseline
    with `check_embeddings_close` (tol=1e-2).
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ] * 2

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    embedding_size = len(float_data[0])

    # Iterating the mapping yields its keys; no intermediate list is needed.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes_only",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Without this, an error response would be fed to the binary
            # decoder below and fail in a misleading place.
            responses_bytes.raise_for_status()

            assert "metadata" not in responses_bytes.headers
            body = responses_bytes.content
            items = build_metadata_items(
                embed_dtype=embed_dtype,
                endianness=endianness,
                shape=(embedding_size,),
                n_request=len(input_texts),
            )

            decoded = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in decoded]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )


397
398
399
400
401
402
403
404
405
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """An invalid value for `param_name` must yield HTTP 400 with a literal error."""
    bad_value = f"bad_{param_name}"
    # `param_name` comes last on purpose: when it is "encoding_format" it
    # overrides the "base64" entry above it.
    payload = {
        "model": model_name,
        "input": [
            "The best thing about vLLM is that it supports many different models",
        ],
        "encoding_format": "base64",
        param_name: bad_value,
    }

    http_response = requests.post(server.url_for("/v1/embeddings"), json=payload)

    assert http_response.status_code == 400
    error_message = http_response.json()["error"]["message"]
    assert "literal_error" in error_message
    assert bad_value in error_message
420
421


422
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """With truncate_prompt_tokens=10, usage must report exactly 10 prompt tokens.

    Checked once for a text prompt and once for a 21-element token-ID prompt.
    """
    token_limit = 10

    # --- text prompt ---
    prompts = ["Como o Brasil pode fomentar o desenvolvimento de modelos de IA?"]
    raw_response = await client.embeddings.create(
        model=model_name,
        input=prompts,
        extra_body={"truncate_prompt_tokens": token_limit},
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == token_limit
    assert usage.total_tokens == token_limit

    # --- token-ID prompt ---
    # fmt: off
    token_ids = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871,
        28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2,
    ]
    # fmt: on
    raw_response = await client.embeddings.create(
        model=model_name,
        input=token_ids,
        extra_body={"truncate_prompt_tokens": token_limit},
    )
    parsed = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))

    assert parsed.id is not None
    assert len(parsed.data) == 1
    assert len(parsed.data[0].embedding) == 384
    usage = parsed.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == token_limit
    assert usage.total_tokens == token_limit


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """A truncate_prompt_tokens value of 8193 must be rejected.

    The server fixture is started with --max-model-len 512, so the request is
    expected to fail with `openai.BadRequestError`.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the failing call belongs inside the `raises` block: statements
    # placed after it never run, because the exception leaves the block
    # immediately.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )

    # NOTE(review): the previous version asserted on `response.object` and
    # `response.message` inside the `raises` block. Those assertions were
    # unreachable and `response` was never bound. A check of the error text
    # ("truncate_prompt_tokens value is greater than max_model_len. Please,
    # select a smaller truncation size.") should be added against
    # `exc_info.value` once the server's actual wording is confirmed.
    assert exc_info.value is not None
502
503
504


@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The `invocations` route must agree with the client's embeddings call.

    The same request arguments are sent through `client.embeddings.create`
    and posted to `invocations`; top-level keys, per-item keys, item counts
    and the embeddings themselves are compared.
    """
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    request_args = {
        "model": MODEL_NAME,
        "input": input_texts,
        "encoding_format": "float",
    }

    completion_response = await client.embeddings.create(**request_args)

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()

    completion_items = completion_output["data"]
    invocation_items = invocation_output["data"]
    # zip() stops at the shorter sequence, so a missing item on either side
    # would otherwise go unnoticed.
    assert len(completion_items) == len(invocation_items)

    for completion_data, invocation_data in zip(completion_items, invocation_items):
        assert completion_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[completion_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
537
538
539
540


@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """The `invocations` route must agree with `v1/embeddings` for chat input.

    The same `messages` request is posted to both routes; top-level keys,
    per-item keys, item counts and the embeddings themselves are compared.
    """
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()

    chat_items = chat_output["data"]
    invocation_items = invocation_output["data"]
    # zip() stops at the shorter sequence, so a missing item on either side
    # would otherwise go unnoticed.
    assert len(chat_items) == len(invocation_items)

    for chat_data, invocation_data in zip(chat_items, invocation_items):
        assert chat_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[chat_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
584
585
586
587
588
589
590
591
592
593
594
595


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check the effect of the `normalize` request field on `v1/embeddings`.

    Three requests are made with normalize set to None, True and False. The
    None and True outputs must be close, the True and False outputs must
    differ, and L2-normalizing the False output must reproduce the True one.
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        """Post one request with the given `normalize` value; return a tensor."""
        request_args = {
            # Use the parametrized name so the parametrization is honoured
            # rather than silently falling back to the module constant.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }

        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        # Fail on the HTTP error itself rather than a KeyError on "data".
        response.raise_for_status()
        outputs = response.json()

        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
615
616
617
618


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """The `pooling` route with task "embed" returns one 384-element vector."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "embed",
    }

    http_response = requests.post(server.url_for("pooling"), json=payload)
    parsed = PoolingResponse.model_validate(http_response.json())

    assert len(parsed.data) == 1
    assert len(parsed.data[0].data) == 384


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """The `pooling` route with task "token_embed" returns 11 rows of 384 values."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "token_embed",
    }

    http_response = requests.post(server.url_for("pooling"), json=payload)
    parsed = PoolingResponse.model_validate(http_response.json())

    assert len(parsed.data) == 1
    token_rows = parsed.data[0].data
    assert len(token_rows) == 11
    assert len(token_rows[0]) == 384
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Each parametrized `task` must be rejected by `pooling` as a BadRequestError."""
    payload = {
        "model": model_name,
        "input": "test",
        "encoding_format": "float",
        "task": task,
    }

    http_response = requests.post(server.url_for("pooling"), json=payload)

    error = http_response.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")