# test_pooling.py 14.3 KB
# Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import base64
5
import json
6
7
8
9

import numpy as np
import pytest
import requests
10
import torch
11

12
from tests.models.utils import check_embeddings_close
13
from tests.utils import RemoteOpenAIServer
14
from vllm.entrypoints.openai.protocol import PoolingResponse
15
from vllm.transformers_utils.tokenizer import get_tokenizer
16
17
18
19
20
21
22
from vllm.utils.serial_utils import (
    EMBED_DTYPE_TO_TORCH_DTYPE,
    ENDIANNESS,
    MetadataItem,
    binary2tensor,
    decode_pooling_output,
)
23

24
# Model served by the module-scoped server fixture and used by every test here.
MODEL_NAME = "internlm/internlm2-1_8b-reward"

# Minimal Jinja chat template: for each message it emits "<role>: <content>"
# followed by the template's '\n' literal.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501


@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared by the module.

    The server is launched with the pooling runner and ``DUMMY_CHAT_TEMPLATE``
    and is kept open through its context manager for the fixture's lifetime.

    Yields:
        The ``RemoteOpenAIServer`` instance entered by the ``with`` statement.
    """
    args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
        "--trust-remote-code",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
    """Pool a single input, given first as text and then as token IDs.

    Each response must contain exactly one item, and the length of that
    item's data must match the prompt token count asserted on ``usage``.
    """
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    # test single pooling
    response = requests.post(
        server.url_for("pooling"),
        json={"model": model_name, "input": input_texts, "encoding_format": "float"},
    )
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())

    assert poolings.id is not None
    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 8
    assert poolings.usage.completion_tokens == 0
    assert poolings.usage.prompt_tokens == 8
    assert poolings.usage.total_tokens == 8

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
    response = requests.post(
        server.url_for("pooling"),
        json={"model": model_name, "input": input_tokens, "encoding_format": "float"},
    )
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())

    assert poolings.id is not None
    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 5
    assert poolings.usage.completion_tokens == 0
    assert poolings.usage.prompt_tokens == 5
    assert poolings.usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
    """Pool batched inputs, given as ``list[str]`` and as ``list[list[int]]``.

    The response must contain one item per input, and ``usage`` must report
    the token total across the whole batch with no completion tokens.
    """
    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    response = requests.post(
        server.url_for("pooling"),
        json={"model": model_name, "input": input_texts, "encoding_format": "float"},
    )
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())

    assert poolings.id is not None
    assert len(poolings.data) == 3
    assert len(poolings.data[0].data) == 8
    assert poolings.usage.completion_tokens == 0
    assert poolings.usage.prompt_tokens == 29
    assert poolings.usage.total_tokens == 29

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    response = requests.post(
        server.url_for("pooling"),
        json={"model": model_name, "input": input_tokens, "encoding_format": "float"},
    )
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())

    assert poolings.id is not None
    assert len(poolings.data) == 4
    assert len(poolings.data[0].data) == 5
    assert poolings.usage.completion_tokens == 0
    # 5 + 3 + 5 + 4 token IDs across the four inputs
    assert poolings.usage.prompt_tokens == 17
    assert poolings.usage.total_tokens == 17


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str):
    """Check that chat-style and prompt-style pooling requests agree.

    The same conversation is sent once as ``messages`` and once as a prompt
    rendered locally with ``DUMMY_CHAT_TEMPLATE``. Apart from ``id`` and
    ``created``, both responses must be equal.
    """
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    chat_response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "messages": messages,
            "encoding_format": "float",
        },
    )
    chat_response.raise_for_status()
    chat_poolings = PoolingResponse.model_validate(chat_response.json())

    # Render the conversation client-side with the same template the server
    # was started with, so it can be submitted as a plain "input" prompt.
    tokenizer = get_tokenizer(
        tokenizer_name=model_name,
        tokenizer_mode="fast",
        trust_remote_code=True,
    )
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completions_response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": prompt,
            "encoding_format": "float",
            # To be consistent with chat
            "add_special_tokens": False,
        },
    )
    completions_response.raise_for_status()
    completion_poolings = PoolingResponse.model_validate(completions_response.json())

    assert chat_poolings.id is not None
    assert completion_poolings.id is not None
    # The chat request was issued first, so it cannot have a later timestamp.
    assert chat_poolings.created <= completion_poolings.created
    assert chat_poolings.model_dump(exclude={"id", "created"}) == (
        completion_poolings.model_dump(exclude={"id", "created"})
    )
192
193
194
195


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_pooling(server: RemoteOpenAIServer, model_name: str):
    """Compare ``float``, ``base64`` and default-encoded pooling responses.

    The explicit ``float`` response is the reference. The ``base64`` payload
    (decoded as float32) and the response obtained without any
    ``encoding_format`` must both be close to it.
    """
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    float_response = requests.post(
        server.url_for("pooling"),
        json={
            "input": input_texts,
            "model": model_name,
            "encoding_format": "float",
        },
    )
    float_response.raise_for_status()
    responses_float = PoolingResponse.model_validate(float_response.json())
    # Drop the trailing size-1 axis so each item becomes a flat list of floats.
    float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]

    base64_response = requests.post(
        server.url_for("pooling"),
        json={
            "input": input_texts,
            "model": model_name,
            "encoding_format": "base64",
        },
    )
    base64_response.raise_for_status()
    responses_base64 = PoolingResponse.model_validate(base64_response.json())

    decoded_responses_base64_data = [
        np.frombuffer(base64.b64decode(data.data), dtype="float32").tolist()
        for data in responses_base64.data
    ]

    check_embeddings_close(
        embeddings_0_lst=float_data,
        embeddings_1_lst=decoded_responses_base64_data,
        name_0="float32",
        name_1="base64",
    )

    # No encoding_format is sent here. The response is parsed exactly like the
    # explicit "float" one, so the server default is expected to match it.
    default_response = requests.post(
        server.url_for("pooling"),
        json={
            "input": input_texts,
            "model": model_name,
        },
    )
    default_response.raise_for_status()
    responses_default = PoolingResponse.model_validate(default_response.json())
    default_data = [
        np.array(d.data).squeeze(-1).tolist() for d in responses_default.data
    ]

    check_embeddings_close(
        embeddings_0_lst=float_data,
        embeddings_1_lst=default_data,
        name_0="float32",
        name_1="default",
    )
258
259


260
261
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, model_name: str
):
    """Check base64 pooling output for every embed dtype and endianness.

    For each combination of ``EMBED_DTYPE_TO_TORCH_DTYPE`` entry and
    ``ENDIANNESS`` value, the base64 payload is decoded with
    ``binary2tensor``, cast to float32 and compared against the ``float``
    response with a tolerance of ``1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    url = server.url_for("pooling")
    float_response = requests.post(
        url,
        json={
            "model": model_name,
            "input": input_texts,
            "encoding_format": "float",
        },
    )
    # Fail on the HTTP error itself rather than on a later validation error.
    float_response.raise_for_status()
    responses_float = PoolingResponse.model_validate(float_response.json())
    float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                url,
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            responses_base64.raise_for_status()

            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["data"])
                # Shape (-1,) decodes the payload as a flat 1-D tensor.
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
307
308
309
310


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, model_name: str
):
    """Check ``bytes`` pooling output for every embed dtype and endianness.

    For each combination, the raw response body is decoded with
    ``decode_pooling_output`` using the item descriptions from the
    ``metadata`` response header. The result is cast to float32, flattened
    and compared against the ``float`` response with a tolerance of ``1e-2``.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    url = server.url_for("pooling")
    float_response = requests.post(
        url,
        json={
            "model": model_name,
            "input": input_texts,
            "encoding_format": "float",
        },
    )
    # Fail on the HTTP error itself rather than on a later validation error.
    float_response.raise_for_status()
    responses_float = PoolingResponse.model_validate(float_response.json())
    float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                url,
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            responses_bytes.raise_for_status()

            # The JSON "metadata" header describes the items; the body holds
            # the binary payload that decode_pooling_output consumes.
            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]

            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """Check that an invalid value for ``param_name`` is rejected with HTTP 400.

    The error message must contain ``literal_error`` and the offending value.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_base64 = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_texts,
            "encoding_format": "base64",
            # Listed last on purpose: when param_name is "encoding_format"
            # this entry overrides the valid value above.
            param_name: f"bad_{param_name}",
        },
    )

    assert responses_base64.status_code == 400
    error_message = responses_base64.json()["error"]["message"]
    assert "literal_error" in error_message
    assert f"bad_{param_name}" in error_message
382
383


384
385
386
387
388
389
390
391
392
393
394
395
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
    """Check that ``invocations`` matches ``pooling`` for an ``input`` request.

    The same payload is posted to both endpoints. The responses must have the
    same top-level keys, the same number of items, and item data that is close.
    """
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    request_args = {
        "model": MODEL_NAME,
        "input": input_texts,
        "encoding_format": "float",
    }

    completion_response = requests.post(server.url_for("pooling"), json=request_args)
    completion_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    completion_output = completion_response.json()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
    # zip() stops at the shorter sequence, so compare the lengths explicitly.
    assert len(completion_output["data"]) == len(invocation_output["data"])
    for completion_data, invocation_data in zip(
        completion_output["data"], invocation_output["data"]
    ):
        assert completion_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=completion_data["data"],
            embeddings_1_lst=invocation_data["data"],
            name_0="completion",
            name_1="invocation",
        )
418
419
420
421


@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """Check that ``invocations`` matches ``pooling`` for a ``messages`` request.

    The same chat payload is posted to both endpoints. The responses must have
    the same top-level keys, the same number of items, and item data that is
    close.
    """
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("pooling"), json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
    # zip() stops at the shorter sequence, so compare the lengths explicitly.
    assert len(chat_output["data"]) == len(invocation_output["data"])
    for chat_data, invocation_data in zip(
        chat_output["data"], invocation_output["data"]
    ):
        assert chat_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=chat_data["data"],
            embeddings_1_lst=invocation_data["data"],
            name_0="chat",
            name_1="invocation",
        )