"vscode:/vscode.git/clone" did not exist on "ffd8e40dc75a9ee3ecf05c48baca7e76abc180e9"
test_embedding.py 8.66 KB
Newer Older
1
2
3
import base64

import numpy as np
4
5
import openai
import pytest
6
import pytest_asyncio
7
8
9
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer
10

11
from ...utils import RemoteOpenAIServer
12

13
14
# Embedding model that the server fixture launches and every test targets.
MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
# Minimal Jinja chat template: renders each message as "<role>: <content>"
# followed by a newline. It is passed to the server via --chat-template and
# reused to build the reference prompt in the conversation embedding test.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
15
16
17


@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for the module and yield it.

    The fixture is module-scoped, so all tests in this file share a single
    server; the ``with`` block tears it down once the module is finished.
    """
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        "8192",
        # Needed so that requests carrying ``messages`` can be rendered.
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


34
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the shared ``server`` fixture.

    The client is used as an async context manager, so it is closed when
    the requesting test finishes.
    """
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
    """Embed a single input, first as text and then as a list of token IDs.

    Both requests must return exactly one 4096-element embedding and report
    usage that counts prompt tokens only.
    """
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    # test single embedding
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 9
    assert embeddings.usage.total_tokens == 9

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    # One prompt token per supplied token ID.
    assert embeddings.usage.prompt_tokens == 5
    assert embeddings.usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
    """Embed a batch of inputs, as a list of strings and as token-ID lists.

    Each request must return one 4096-element embedding per input, and the
    reported usage must cover the whole batch.
    """
    # test List[str]
    input_texts = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 32
    assert embeddings.usage.total_tokens == 32

    # test List[List[int]]
    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                    [25, 32, 64, 77]]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    # 5 + 3 + 5 + 4 token IDs across the four inputs.
    assert embeddings.usage.prompt_tokens == 17
    assert embeddings.usage.total_tokens == 17


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
                                      client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check that embedding ``messages`` matches embedding the rendered text.

    The conversation is posted directly to ``v1/embeddings`` with a
    ``messages`` field. The same conversation is then rendered locally with
    ``DUMMY_CHAT_TEMPLATE`` and embedded as a plain prompt; apart from
    ``id`` and ``created`` the two responses must be identical.
    """
    messages = [{
        "role": "user",
        "content": "The cat sat on the mat.",
    }, {
        "role": "assistant",
        "content": "A feline was resting on a rug.",
    }, {
        "role": "user",
        "content": "Stars twinkle brightly in the night sky.",
    }]

    # Sent with ``requests`` rather than the OpenAI client because the
    # payload uses a ``messages`` field instead of ``input``.
    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json={
                                      "model": model_name,
                                      "messages": messages,
                                      "encoding_format": "float",
                                  })
    chat_response.raise_for_status()
    chat_embeddings = chat_response.json()

    # Render the same conversation locally to obtain the reference prompt.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = completion_response.model_dump(mode="json")

    # ``id`` and ``created`` differ per request, so drop them before the
    # full comparison. The chat request was issued first, hence ``<=``.
    assert chat_embeddings.pop("id") is not None
    assert completion_embeddings.pop("id") is not None
    assert chat_embeddings.pop("created") <= completion_embeddings.pop(
        "created")
    assert chat_embeddings == completion_embeddings


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check that float, base64 and default encodings give equal embeddings.

    The same two inputs are embedded three times: with
    ``encoding_format="float"``, with ``encoding_format="base64"`` (decoded
    here as float32), and with no explicit encoding format.
    """
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models"
    ]

    responses_float = await client.embeddings.create(input=input_texts,
                                                     model=model_name,
                                                     encoding_format="float")

    responses_base64 = await client.embeddings.create(input=input_texts,
                                                      model=model_name,
                                                      encoding_format="base64")

    # Each base64 payload is decoded as a buffer of float32 values.
    decoded_responses_base64_data = [
        np.frombuffer(base64.b64decode(data.embedding),
                      dtype="float32").tolist()
        for data in responses_base64.data
    ]

    assert responses_float.data[0].embedding == decoded_responses_base64_data[
        0]
    assert responses_float.data[1].embedding == decoded_responses_base64_data[
        1]

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(input=input_texts,
                                                       model=model_name)

    assert responses_float.data[0].embedding == responses_default.data[
        0].embedding
    assert responses_float.data[1].embedding == responses_default.data[
        1].embedding


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    """Check that ``truncate_prompt_tokens`` caps the reported prompt length.

    A text input and a 21-element token-ID input are each sent with
    ``truncate_prompt_tokens=10``; both must report exactly 10 prompt tokens
    and still return one 4096-element embedding.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # test single embedding
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        extra_body={"truncate_prompt_tokens": 10})
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10

    # Same check with the input supplied as token IDs.
    input_tokens = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        extra_body={"truncate_prompt_tokens": 10})

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                   model_name: str):
    """Reject a ``truncate_prompt_tokens`` value above the model length.

    The server is launched with ``--max-model-len 8192``, so a truncation
    size of 8193 must produce an ``openai.BadRequestError`` whose text
    carries the explanatory message.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})

    # Checked after the ``with`` block: statements placed inside it after
    # the raising call never execute, so they would verify nothing.
    assert ("truncate_prompt_tokens value is greater than max_model_len. "
            "Please, select a smaller truncation size."
            in str(exc_info.value))