# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import requests
6
7
import torch
import torch.nn.functional as F
8

9
from tests.utils import RemoteOpenAIServer
10
11
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
12
13
14

# Model that the module-scoped ``server`` fixture launches and that every
# request in this file targets.
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
# Forwarded to the server through --dtype; float32 is used to avoid a NaN issue.
DTYPE = "float32"

# Shared prompt, plus the token ids that test_basic expects the /tokenize
# endpoint to return for it.
input_text = "This product was excellent and exceeded my expectations"
input_tokens = [1986, 1985, 572, 9073, 323, 33808, 847, 16665]


@pytest.fixture(scope="module")
def server():
    """Start a ``RemoteOpenAIServer`` for MODEL_NAME, shared by this module.

    Yields the server object from inside its ``with`` block, so the context
    is exited when the fixture is torn down.
    """
    cli_flags = ["--enforce-eager"]
    cli_flags += ["--max-model-len", "512"]
    cli_flags += ["--dtype", DTYPE]

    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as running_server:
        yield running_server


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_basic(server: RemoteOpenAIServer, model_name: str):
    """Smoke-test the model listing and tokenization endpoints."""
    # /v1/models: the first listed model must be the one the server was
    # launched with.
    response = requests.get(server.url_for("/v1/models"))
    # Surface an HTTP failure directly instead of a KeyError on the body.
    response.raise_for_status()
    served_model = response.json()["data"][0]["id"]
    assert served_model == MODEL_NAME

    # /tokenize: the shared prompt must map to the reference token ids.
    response = requests.post(
        server.url_for("/tokenize"),
        json={"model": model_name, "prompt": input_text},
    )
    response.raise_for_status()
    assert response.json()["tokens"] == input_tokens


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_completion_request(server: RemoteOpenAIServer, model_name: str):
    """Classify one prompt given as ``str`` and then as ``list[int]``."""
    # Both input forms are sent in the original order and must produce the
    # same response shape.
    for single_input in (input_text, input_tokens):
        response = requests.post(
            server.url_for("classify"),
            json={"model": model_name, "input": single_input},
        )

        response.raise_for_status()
        output = ClassificationResponse.model_validate(response.json())

        assert output.object == "list"
        assert output.model == MODEL_NAME
        assert len(output.data) == 1
        assert hasattr(output.data[0], "label")
        assert hasattr(output.data[0], "probs")


81
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_completion_request_batched(server: RemoteOpenAIServer, model_name: str):
    """Classify a batch given as ``list[str]`` and then as ``list[list[int]]``."""
    N = 10

    for single_input in (input_text, input_tokens):
        response = requests.post(
            server.url_for("classify"),
            json={"model": model_name, "input": [single_input] * N},
        )
        # Report an HTTP failure as such, rather than as a schema validation
        # error on the error payload.
        response.raise_for_status()
        output = ClassificationResponse.model_validate(response.json())

        # One result per batch entry, returned in request order.
        assert len(output.data) == N
        for i, item in enumerate(output.data):
            assert item.index == i
            assert hasattr(item, "label")
            assert hasattr(item, "probs")
            assert len(item.probs) == item.num_classes
            assert item.label in ["Default", "Spoiled"]


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
    """An empty string or an empty list as ``input`` yields an HTTP 400 error."""
    for empty_input in ("", []):
        classification_response = requests.post(
            server.url_for("classify"),
            json={"model": model_name, "input": empty_input},
        )

        error = classification_response.json()
        assert classification_response.status_code == 400
        assert "error" in error

136
137
138
139
140
141
142

@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
    """Check ``truncate_prompt_tokens`` on a long prompt and on a bad value."""
    long_text = "hello " * 600

    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": long_text, "truncate_prompt_tokens": 5},
    )

    classification_response.raise_for_status()
    output = ClassificationResponse.model_validate(classification_response.json())

    assert len(output.data) == 1
    assert output.data[0].index == 0
    assert hasattr(output.data[0], "probs")
    # The reported usage must reflect the requested truncation length.
    assert output.usage.prompt_tokens == 5
    assert output.usage.total_tokens == 5

    # invalid_truncate_prompt_tokens
    # 513 is one above the --max-model-len of 512 that the server fixture
    # passes on the command line.
    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513},
    )

    error = classification_response.json()
    assert classification_response.status_code == 400
    assert "truncate_prompt_tokens" in error["error"]["message"]


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_add_special_tokens(server: RemoteOpenAIServer, model_name: str):
    """Requests with ``add_special_tokens`` False and True both succeed."""
    # NOTE: the add_special_tokens parameter doesn't seem to be working with
    # this model (it is reported as working with
    # papluca/xlm-roberta-base-language-detection), so only a successful,
    # schema-valid response is checked for each value.
    for add_special_tokens in (False, True):
        response = requests.post(
            server.url_for("classify"),
            json={
                "model": model_name,
                "input": input_text,
                "add_special_tokens": add_special_tokens,
            },
        )
        response.raise_for_status()
        ClassificationResponse.model_validate(response.json())


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_request(server: RemoteOpenAIServer, model_name: str):
    """Classify a chat conversation under several chat-related options.

    Each accepted option set is sent to ``classify`` and checked for the
    response shape and the expected ``usage.prompt_tokens``; the final
    request combines two options and must return the documented error text.
    """
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # Each case: (extra request fields, expected usage.prompt_tokens).
    cases = [
        # chat request basic usage
        ({}, 51),
        # add_generation_prompt
        ({"add_generation_prompt": True}, 54),
        # continue_final_message
        ({"continue_final_message": True}, 49),
        # add_special_tokens
        # The add_special_tokens parameter doesn't seem to be working with
        # this model, hence the same count as the basic case.
        ({"add_special_tokens": True}, 51),
    ]
    for extra_fields, expected_prompt_tokens in cases:
        response = requests.post(
            server.url_for("classify"),
            json={"model": model_name, "messages": messages, **extra_fields},
        )

        response.raise_for_status()
        output = ClassificationResponse.model_validate(response.json())

        assert output.object == "list"
        assert output.model == MODEL_NAME
        assert len(output.data) == 1
        assert hasattr(output.data[0], "label")
        assert hasattr(output.data[0], "probs")
        # The failure message names the case that produced the mismatch.
        assert output.usage.prompt_tokens == expected_prompt_tokens, extra_fields

    # test continue_final_message with add_generation_prompt
    response = requests.post(
        server.url_for("classify"),
        json={
            "model": model_name,
            "messages": messages,
            "continue_final_message": True,
            "add_generation_prompt": True,
        },
    )
    assert (
        "Cannot set both `continue_final_message` and `add_generation_prompt` to True."
        in response.json()["error"]["message"]
    )


@pytest.mark.asyncio
async def test_invocations_completion_request(server: RemoteOpenAIServer):
    """``invocations`` must match ``classify`` for an ``input``-style request."""
    request_args = {
        "model": MODEL_NAME,
        "input": input_text,
    }

    classification_response = requests.post(
        server.url_for("classify"), json=request_args
    )
    classification_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    classification_output = classification_response.json()
    invocation_output = invocation_response.json()

    assert classification_output.keys() == invocation_output.keys()
    # zip() stops at the shorter sequence, so compare lengths first; otherwise
    # a missing or extra item on one side would go unnoticed.
    assert len(classification_output["data"]) == len(invocation_output["data"])
    for classification_data, invocation_data in zip(
        classification_output["data"], invocation_output["data"]
    ):
        assert classification_data.keys() == invocation_data.keys()
        assert classification_data["probs"] == pytest.approx(
            invocation_data["probs"], rel=0.01
        )


@pytest.mark.asyncio
async def test_invocations_chat_request(server: RemoteOpenAIServer):
    """``invocations`` must match ``classify`` for a ``messages``-style request."""
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    request_args = {"model": MODEL_NAME, "messages": messages}

    classification_response = requests.post(
        server.url_for("classify"), json=request_args
    )
    classification_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    classification_output = classification_response.json()
    invocation_output = invocation_response.json()

    assert classification_output.keys() == invocation_output.keys()
    # zip() stops at the shorter sequence, so compare lengths first; otherwise
    # a missing or extra item on one side would go unnoticed.
    assert len(classification_output["data"]) == len(invocation_output["data"])
    for classification_data, invocation_data in zip(
        classification_output["data"], invocation_output["data"]
    ):
        assert classification_data.keys() == invocation_data.keys()
        assert classification_data["probs"] == pytest.approx(
            invocation_data["probs"], rel=0.01
        )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
    """Compare classify outputs with ``use_activation`` None, True and False."""

    async def get_outputs(use_activation):
        """Return the ``probs`` of every response item stacked into a tensor."""
        response = requests.post(
            server.url_for("classify"),
            json={
                "model": model_name,
                "input": input_text,
                "use_activation": use_activation,
            },
        )
        # Report an HTTP failure as such instead of a KeyError on "data".
        response.raise_for_status()
        outputs = response.json()
        return torch.tensor([x["probs"] for x in outputs["data"]])

    default = await get_outputs(use_activation=None)
    w_activation = await get_outputs(use_activation=True)
    wo_activation = await get_outputs(use_activation=False)

    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    # Applying softmax to the un-activated output must reproduce the
    # activated output.
    assert torch.allclose(F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_score(server: RemoteOpenAIServer, model_name: str):
    """The ``score`` route answers "Not Found" on this server."""
    # score api is only enabled for num_labels == 1.
    response = requests.post(
        server.url_for("score"),
        json={
            "model": model_name,
            "queries": "ping",
            "documents": "pong",
        },
    )
    assert response.json()["detail"] == "Not Found"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_rerank(server: RemoteOpenAIServer, model_name: str):
    """The ``rerank`` route answers "Not Found" on this server."""
    # rerank api is only enabled for num_labels == 1.
    response = requests.post(
        server.url_for("rerank"),
        json={
            "model": model_name,
            "query": "ping",
            "documents": ["pong"],
        },
    )
    assert response.json()["detail"] == "Not Found"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
    """Send the shared prompt to ``pooling`` with task ``classify``."""
    payload = {
        "model": model_name,
        "input": input_text,
        "encoding_format": "float",
        "task": "classify",
    }
    raw_response = requests.post(server.url_for("pooling"), json=payload)

    parsed = PoolingResponse.model_validate(raw_response.json())
    # A single input yields a single item, whose data holds two values.
    assert len(parsed.data) == 1
    assert len(parsed.data[0].data) == 2


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
    """Send the shared prompt to ``pooling`` with task ``token_classify``."""
    task = "token_classify"
    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )
    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1
    # len(input_tokens) is 8, the row count asserted here; presumably one row
    # per prompt token, each row holding two values.
    assert len(poolings.data[0].data) == len(input_tokens)
    assert len(poolings.data[0].data[0]) == 2


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """``pooling`` reports an unsupported-task error for each listed task."""
    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )
    # Decode the body once and check both fields of the same error object.
    error = response.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Unsupported task: {task!r}")