test_online.py 19.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import base64
5
import json
6
7

import numpy as np
8
9
import openai
import pytest
10
import pytest_asyncio
11
import requests
12
13
import torch
import torch.nn.functional as F
14

15
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
16
17
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
18
19
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
20
from vllm.platforms import current_platform
21
from vllm.transformers_utils.tokenizer import get_tokenizer
22
23
24
25
26
27
28
from vllm.utils.serial_utils import (
    EMBED_DTYPE_TO_TORCH_DTYPE,
    ENDIANNESS,
    MetadataItem,
    binary2tensor,
    decode_pooling_output,
)
29

30
31
32
33
34
# The whole module is skipped on ROCm (see the skip reason below).
_ROCM_SKIP_REASON = "Encoder self-attention is not implemented on ROCm."
if current_platform.is_rocm():
    pytest.skip(_ROCM_SKIP_REASON, allow_module_level=True)

# Model served by the ``server`` fixture and loaded by the ``hf_model`` fixture.
MODEL_NAME = "intfloat/multilingual-e5-small"
# Minimal Jinja template rendering each message as "<role>: <content>\n".
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
# Passed to the server via ``--dtype`` and to ``hf_runner`` as ``dtype``.
DTYPE = "bfloat16"
38
39
40


@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for MODEL_NAME and share it module-wide."""
    cli_flags = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    cli_flags += ["--dtype", DTYPE]
    cli_flags += ["--enforce-eager"]
    cli_flags += ["--max-model-len", "512"]
    cli_flags += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as remote_server:
        yield remote_server
57
58


59
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client returned by ``server.get_async_client()``."""
    async with server.get_async_client() as openai_client:
        yield openai_client
63
64


65
66
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield the ``hf_runner`` model for MODEL_NAME, shared across the module."""
    runner_ctx = hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True)
    with runner_ctx as reference_model:
        yield reference_model


71
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a single prompt, first as text and then as a list of token IDs."""

    def validated(raw_response, expected_tokens: int) -> EmbeddingResponse:
        # Re-parse the client response with the server-side schema, then
        # check the payload shape and the reported token usage.
        parsed = EmbeddingResponse.model_validate(
            raw_response.model_dump(mode="json")
        )
        assert parsed.id is not None
        assert len(parsed.data) == 1
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == expected_tokens
        assert parsed.usage.total_tokens == expected_tokens
        return parsed

    # test single embedding
    prompts = ["The chef prepared a delicious meal."]
    text_response = await client.embeddings.create(
        model=model_name,
        input=prompts,
        encoding_format="float",
    )
    text_embeddings = validated(text_response, expected_tokens=11)

    # Text results are additionally compared against hf_model.
    vllm_vectors = [item.embedding for item in text_embeddings.data]
    run_embedding_correctness_test(hf_model, prompts, vllm_vectors)

    # test using token IDs
    token_ids = [1, 1, 1, 1, 1]
    token_response = await client.embeddings.create(
        model=model_name,
        input=token_ids,
        encoding_format="float",
    )
    validated(token_response, expected_tokens=5)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a batch given as ``list[str]`` and then as ``list[list[int]]``."""

    def validated(raw_response, batch_size: int, token_total: int):
        # Re-parse the client response with the server-side schema, then
        # check the payload shape and the reported token usage.
        parsed = EmbeddingResponse.model_validate(
            raw_response.model_dump(mode="json")
        )
        assert parsed.id is not None
        assert len(parsed.data) == batch_size
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == token_total
        assert parsed.usage.total_tokens == token_total
        return parsed

    # test list[str]
    prompts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_response = await client.embeddings.create(
        model=model_name,
        input=prompts,
        encoding_format="float",
    )
    text_embeddings = validated(text_response, batch_size=3, token_total=33)

    # Text results are additionally compared against hf_model.
    vllm_vectors = [item.embedding for item in text_embeddings.data]
    run_embedding_correctness_test(hf_model, prompts, vllm_vectors)

    # test list[list[int]]
    token_batches = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_response = await client.embeddings.create(
        model=model_name,
        input=token_batches,
        encoding_format="float",
    )
    validated(token_response, batch_size=4, token_total=17)
167
168
169


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Embedding ``messages`` must match embedding the locally rendered prompt."""
    turns = [
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    ]
    messages = [{"role": role, "content": content} for role, content in turns]

    # Path 1: post the conversation itself to the embeddings endpoint.
    chat_payload = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }
    chat_response = requests.post(server.url_for("v1/embeddings"), json=chat_payload)
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Path 2: render the same chat template locally and embed the string.
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    template_options = {
        "chat_template": DUMMY_CHAT_TEMPLATE,
        "add_generation_prompt": True,
        "continue_final_message": False,
        "tokenize": False,
    }
    prompt = tokenizer.apply_chat_template(messages, **template_options)
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    # The chat request was issued first, so its timestamp cannot be later.
    assert chat_embeddings.created <= completion_embeddings.created

    # Everything except the per-request identifiers must be identical.
    per_request_fields = {"id", "created"}
    chat_dump = chat_embeddings.model_dump(exclude=per_request_fields)
    completion_dump = completion_embeddings.model_dump(exclude=per_request_fields)
    assert chat_dump == completion_dump
225
226
227
228


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Check float, base64 and default encodings against ``hf_model``."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # Explicit float encoding.
    float_response = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_vectors = [item.embedding for item in float_response.data]
    run_embedding_correctness_test(hf_model, input_texts, float_vectors)

    # Explicit base64 encoding: decode each payload as float32 by hand.
    base64_response = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    decoded_vectors = [
        np.frombuffer(base64.b64decode(item.embedding), dtype="float32").tolist()
        for item in base64_response.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, decoded_vectors)

    # Default response is float32 decoded from base64 by OpenAI Client
    default_response = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_vectors = [item.embedding for item in default_response.data]
    run_embedding_correctness_test(hf_model, input_texts, default_vectors)
260
261


262
263
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Decode base64 embeddings for every ``embed_dtype`` / ``endianness`` pair.

    Each decoded result is compared against a plain ``float`` response for the
    same input, within a tolerance of ``1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    # Reference values: the same input requested with float encoding.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Fail with the HTTP error itself rather than a KeyError on
            # ["data"] if the server rejects this combination.
            responses_base64.raise_for_status()

            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                # Upcast to float32 so every dtype is compared the same way.
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
302
303
304
305


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Decode ``bytes`` embeddings for every ``embed_dtype`` / ``endianness`` pair.

    The response body is decoded using the JSON found in the ``metadata``
    response header, then compared against a plain ``float`` response for the
    same input, within a tolerance of ``1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    # Reference values: the same input requested with float encoding.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    # Iterate the mapping directly, as the base64 variant of this test does.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Fail with the HTTP error itself rather than a KeyError on the
            # "metadata" header if the server rejects this combination.
            responses_bytes.raise_for_status()

            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]

            bytes_data = decode_pooling_output(items=items, body=body)
            # Upcast to float32 so every dtype is compared the same way.
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """An invalid value for ``param_name`` must produce a 400 literal error."""
    bad_value = f"bad_{param_name}"

    payload = {
        "model": model_name,
        "input": [
            "The best thing about vLLM is that it supports many different models",
        ],
        "encoding_format": "base64",
    }
    # Set last so that it overrides "encoding_format" when that is the
    # parameter under test.
    payload[param_name] = bad_value

    rejected = requests.post(server.url_for("/v1/embeddings"), json=payload)

    assert rejected.status_code == 400
    error_message = rejected.json()["error"]["message"]
    assert "literal_error" in error_message
    assert bad_value in error_message
370
371


372
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """With ``truncate_prompt_tokens=10`` the reported usage must be 10 tokens."""
    truncation = {"truncate_prompt_tokens": 10}

    def check_truncated(raw_response) -> None:
        # Re-parse with the server-side schema, then check shape and usage.
        parsed = EmbeddingResponse.model_validate(
            raw_response.model_dump(mode="json")
        )
        assert parsed.id is not None
        assert len(parsed.data) == 1
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == 10
        assert parsed.usage.total_tokens == 10

    # test single embedding
    prompts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    text_response = await client.embeddings.create(
        model=model_name, input=prompts, extra_body=truncation
    )
    check_truncated(text_response)

    # Same check with a 21-element token-ID prompt.
    token_ids = [
        1, 24428, 289, 18341, 26165, 285, 19323,
        283, 289, 26789, 3871, 28728, 9901, 340,
        2229, 385, 340, 315, 28741, 28804, 2,
    ]  # fmt: skip
    token_response = await client.embeddings.create(
        model=model_name, input=token_ids, extra_body=truncation
    )
    check_truncated(token_response)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """A ``truncate_prompt_tokens`` of 8193 must be rejected as a bad request.

    The client is expected to raise ``openai.BadRequestError``; the test fails
    if the request succeeds or raises a different error.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the request belongs inside the ``raises`` block. Statements placed
    # after the raising call never execute, so assertions written there (as
    # this test previously did, on an unbound ``response``) check nothing.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )

    assert exc_info.value.status_code == 400
    # NOTE(review): the old, unreachable check expected the message to contain
    # "truncate_prompt_tokens value is greater than max_model_len. Please,
    # select a smaller truncation size." Confirm the server's current wording
    # before asserting on ``str(exc_info.value)`` here.
452
453
454


@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """``invocations`` must answer an embedding request like the OpenAI client."""
    request_args = {
        "model": MODEL_NAME,
        "input": [
            "The chef prepared a delicious meal.",
        ],
        "encoding_format": "float",
    }

    # Same arguments through the OpenAI client ...
    completion_response = await client.embeddings.create(**request_args)

    # ... and as a raw JSON body to the invocations route.
    invocations_url = server.url_for("invocations")
    invocation_response = requests.post(invocations_url, json=request_args)
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    # Both payloads must expose the same top-level and per-item fields.
    assert completion_output.keys() == invocation_output.keys()
    paired_items = zip(completion_output["data"], invocation_output["data"])
    for from_completion, from_invocation in paired_items:
        assert from_completion.keys() == from_invocation.keys()
        check_embeddings_close(
            embeddings_0_lst=[from_completion["embedding"]],
            embeddings_1_lst=[from_invocation["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
487
488
489
490


@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """``invocations`` must answer a ``messages`` request like ``v1/embeddings``."""
    turns = [
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    ]
    request_args = {
        "model": MODEL_NAME,
        "messages": [{"role": role, "content": text} for role, text in turns],
        "encoding_format": "float",
    }

    # Post the identical JSON body to both routes, embeddings route first.
    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocations_url = server.url_for("invocations")
    invocation_response = requests.post(invocations_url, json=request_args)
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    # Both payloads must expose the same top-level and per-item fields.
    assert chat_output.keys() == invocation_output.keys()
    paired_items = zip(chat_output["data"], invocation_output["data"])
    for from_chat, from_invocation in paired_items:
        assert from_chat.keys() == from_invocation.keys()
        check_embeddings_close(
            embeddings_0_lst=[from_chat["embedding"]],
            embeddings_1_lst=[from_invocation["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
534
535
536
537
538
539
540
541
542
543
544
545


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check the effect of the ``normalize`` request field on embeddings.

    Three requests are made (``normalize`` = None, True, False) and compared
    with an absolute tolerance of ``1e-2``.
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        """Request embeddings with the given ``normalize`` value as a tensor."""
        request_args = {
            # Use the parametrized model rather than the module constant so
            # that the ``model_name`` parameter is actually honoured.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }

        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        # Fail with the HTTP error itself rather than a KeyError on ["data"].
        response.raise_for_status()
        outputs = response.json()

        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    # L2-normalizing the raw output locally must reproduce the server result.
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
565
566
567
568


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """The ``pooling`` route with task ``embed`` returns one 384-length vector."""
    task = "embed"
    input_text = ["The chef prepared a delicious meal."]

    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )
    # Fail with the HTTP error itself instead of a schema validation failure
    # on an error payload.
    response.raise_for_status()

    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 384


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """The ``pooling`` route with task ``token_embed`` returns an 11 x 384 result."""
    task = "token_embed"
    input_text = ["The chef prepared a delicious meal."]

    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )
    # Fail with the HTTP error itself instead of a schema validation failure
    # on an error payload.
    response.raise_for_status()

    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    # 11 rows for this prompt, each of length 384.
    assert len(poolings.data[0].data) == 11
    assert len(poolings.data[0].data[0]) == 384
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """The ``pooling`` route must answer these tasks with a BadRequestError."""
    payload = {
        "model": model_name,
        "input": "test",
        "encoding_format": "float",
        "task": task,
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    # No status check here: an error payload is the expected outcome.
    error = response.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")