test_online.py 19.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import base64
5
import json
6
7

import numpy as np
8
9
import openai
import pytest
10
import pytest_asyncio
11
import requests
12
13
import torch
import torch.nn.functional as F
14

15
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
16
17
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
18
19
from vllm.entrypoints.openai.protocol import (
    EmbeddingResponse,
20
    PoolingResponse,
21
)
22
from vllm.platforms import current_platform
23
from vllm.transformers_utils.tokenizer import get_tokenizer
24
25
26
27
28
29
30
from vllm.utils.serial_utils import (
    EMBED_DTYPE_TO_TORCH_DTYPE,
    ENDIANNESS,
    MetadataItem,
    binary2tensor,
    decode_pooling_output,
)
31

32
33
34
35
36
# Skip the whole module on ROCm, where encoder self-attention is not implemented.
if current_platform.is_rocm():
    _rocm_skip_reason = "Encoder self-attention is not implemented on ROCm."
    pytest.skip(_rocm_skip_reason, allow_module_level=True)

37
# Model passed to both the vLLM server fixture and the reference-model fixture.
MODEL_NAME = "intfloat/multilingual-e5-small"
# Minimal Jinja chat template: renders every message as "<role>: <content>"
# followed by a newline.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
# Precision used for the server (`--dtype`) and for the reference model.
DTYPE = "bfloat16"
40
41
42


@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for ``MODEL_NAME`` shared by the module.

    The server is launched with the pooling runner, ``DTYPE`` precision, eager
    mode, a 512-token maximum model length and ``DUMMY_CHAT_TEMPLATE``. It is
    yielded from inside the context manager, so it stays alive until the last
    test of the module has finished.
    """
    args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
59
60


61
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from ``server.get_async_client()``.

    The client is yielded from inside ``async with`` so that it is released
    once the requesting test has finished.
    """
    async with server.get_async_client() as async_client:
        yield async_client
65
66


67
68
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield the reference model built by ``hf_runner`` for ``MODEL_NAME``.

    The model is loaded once per module with ``DTYPE`` precision and
    ``is_sentence_transformer=True``; tests pass it to
    ``run_embedding_correctness_test`` to validate the server's embeddings.
    """
    with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as hf_model:
        yield hf_model


73
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a single prompt, given first as text and then as token IDs.

    Both requests must return exactly one 384-dimensional embedding with
    matching usage counters. The text embedding is additionally compared with
    ``hf_model`` through ``run_embedding_correctness_test``.
    """
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    # test single embedding
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 11
    assert embeddings.usage.total_tokens == 11

    vllm_outputs = [d.embedding for d in embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # Five token IDs were sent, so five prompt tokens are expected.
    assert embeddings.usage.prompt_tokens == 5
    assert embeddings.usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a batch of prompts, given first as ``list[str]`` then ``list[list[int]]``.

    Each request must return one 384-dimensional embedding per input with the
    expected aggregated usage counters. The text batch is additionally compared
    with ``hf_model`` through ``run_embedding_correctness_test``.
    """
    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 33
    assert embeddings.usage.total_tokens == 33

    vllm_outputs = [d.embedding for d in embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # 5 + 3 + 5 + 4 token IDs were sent across the four prompts.
    assert embeddings.usage.prompt_tokens == 17
    assert embeddings.usage.total_tokens == 17
169
170
171


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check that embedding chat messages equals embedding the rendered prompt.

    The conversation is posted as ``messages`` to ``v1/embeddings``. The same
    conversation is then rendered locally with ``DUMMY_CHAT_TEMPLATE`` and sent
    as a plain ``input`` string. Apart from ``id`` and ``created``, both
    responses must be identical.
    """
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # The OpenAI client has no `messages` argument for embeddings, so the chat
    # variant is sent as a raw HTTP request.
    chat_response = requests.post(
        server.url_for("v1/embeddings"),
        json={
            "model": model_name,
            "messages": messages,
            "encoding_format": "float",
        },
    )
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    # The chat request was issued first, so it cannot be newer.
    assert chat_embeddings.created <= completion_embeddings.created
    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
        completion_embeddings.model_dump(exclude={"id", "created"})
    )
227
228
229
230


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Request the same batch as float, base64 and default encoding.

    Every variant is compared with ``hf_model`` through
    ``run_embedding_correctness_test``. The base64 payload is decoded here as
    a float32 buffer.
    """
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    responses_base64 = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    base64_data = [
        np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()
        for data in responses_base64.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_data = [d.embedding for d in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
262
263


264
265
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check base64 embeddings for every ``embed_dtype``/``endianness`` pair.

    A float-encoded response is the baseline. For each combination taken from
    ``EMBED_DTYPE_TO_TORCH_DTYPE`` and ``ENDIANNESS`` the base64 payload is
    decoded with ``binary2tensor``, cast to float32 and must match the
    baseline within ``tol=1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Surface HTTP errors directly instead of a KeyError on "data".
            responses_base64.raise_for_status()

            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
304
305
306
307


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check ``bytes`` embeddings for every ``embed_dtype``/``endianness`` pair.

    A float-encoded response is the baseline. For each combination the raw
    response body is decoded with ``decode_pooling_output``, using the item
    descriptions carried in the ``metadata`` response header. The result is
    cast to float32 and must match the baseline within ``tol=1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Surface HTTP errors directly instead of a missing-header KeyError.
            responses_bytes.raise_for_status()

            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]

            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """Check that an invalid value for ``param_name`` is rejected with HTTP 400.

    The error message must contain ``literal_error`` and echo the offending
    value ``bad_<param_name>``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_base64 = requests.post(
        server.url_for("/v1/embeddings"),
        json={
            "model": model_name,
            "input": input_texts,
            "encoding_format": "base64",
            # Placed last so that it also overrides "encoding_format" when
            # that is the parameter under test.
            param_name: f"bad_{param_name}",
        },
    )

    assert responses_base64.status_code == 400
    error_message = responses_base64.json()["error"]["message"]
    assert "literal_error" in error_message
    assert f"bad_{param_name}" in error_message
372
373


374
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """Check that ``truncate_prompt_tokens`` caps the prompt at 10 tokens.

    The check is made once with a text prompt and once with a 21-element
    token-ID prompt. Both must yield one 384-dimensional embedding and report
    exactly 10 prompt tokens.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # test single embedding
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10

    # Same check with a pre-tokenized prompt that is longer than the limit.
    input_tokens = [
        1,
        24428,
        289,
        18341,
        26165,
        285,
        19323,
        283,
        289,
        26789,
        3871,
        28728,
        9901,
        340,
        2229,
        385,
        340,
        315,
        28741,
        28804,
        2,
    ]
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """Check that an oversized ``truncate_prompt_tokens`` is rejected.

    8193 exceeds the ``--max-model-len`` of 512 configured by the ``server``
    fixture, so the request must raise ``openai.BadRequestError``.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the request belongs inside the `raises` block: any statement placed
    # after the raising call would never execute.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )

    error = exc_info.value
    assert error.status_code == 400
    # NOTE(review): deliberately loose match — the exact server wording was
    # never exercised by the previous (unreachable) assertions; tighten once
    # confirmed against the server.
    assert "truncate_prompt_tokens" in str(error)
454
455
456


@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """Check that ``invocations`` matches the embeddings API for a text input.

    The same request arguments are sent through the OpenAI client and posted
    to the ``invocations`` route. Both payloads must expose the same keys and
    their embeddings must pass ``check_embeddings_close``.
    """
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    request_args = {
        "model": MODEL_NAME,
        "input": input_texts,
        "encoding_format": "float",
    }

    completion_response = await client.embeddings.create(**request_args)

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
    for completion_data, invocation_data in zip(
        completion_output["data"], invocation_output["data"]
    ):
        assert completion_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[completion_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
489
490
491
492


@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """Check that ``invocations`` matches ``v1/embeddings`` for chat messages.

    The same ``messages`` request is posted to both routes. Both payloads must
    expose the same keys and their embeddings must pass
    ``check_embeddings_close``.
    """
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
    for chat_data, invocation_data in zip(
        chat_output["data"], invocation_output["data"]
    ):
        assert chat_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[chat_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
536
537
538
539
540
541
542
543
544
545
546
547


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check the effect of the ``normalize`` request parameter.

    Embeddings are requested with ``normalize`` set to ``None``, ``True`` and
    ``False``. The default must match the normalized output, the unnormalized
    output must differ from it, and L2-normalizing the unnormalized output
    locally must reproduce the normalized one (all within ``atol=1e-2``).
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        """Return the embeddings for ``input_text`` as a 2-D float tensor."""
        request_args = {
            # Use the parametrized model name rather than the module constant.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }

        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        # Fail on the HTTP error itself rather than on a missing "data" key.
        response.raise_for_status()
        outputs = response.json()

        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
567
568
569
570


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """Check the ``pooling`` route with ``task="embed"``.

    One input must produce one pooled item holding a 384-element vector.
    """
    task = "embed"
    input_text = ["The chef prepared a delicious meal."]

    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )

    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 384


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """Check the ``pooling`` route with ``task="token_embed"``.

    One input must produce one pooled item holding 11 rows of 384 elements
    each.
    """
    task = "token_embed"
    input_text = ["The chef prepared a delicious meal."]

    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )

    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    # 11 is also the prompt-token count asserted for this sentence in
    # test_single_embedding.
    assert len(poolings.data[0].data) == 11
    assert len(poolings.data[0].data[0]) == 384
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Check that the ``pooling`` route rejects each parametrized ``task``.

    The response must carry a ``BadRequestError`` whose message starts with
    ``Task <task> is not supported``.
    """
    payload = {
        "model": model_name,
        "input": "test",
        "encoding_format": "float",
        "task": task,
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    error = response.json()["error"]
    expected_prefix = f"Task {task} is not supported"
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(expected_prefix)