# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import requests
6
7
import torch
import torch.nn.functional as F
8

9
from tests.utils import RemoteOpenAIServer
10
11
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
12
13
14

# Model served by the `server` fixture and referenced in every request below.
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
DTYPE = "float32"  # Use float32 to avoid NaN issue

# Shared prompt, and the token ids test_basic expects /tokenize to return for it.
input_text = "This product was excellent and exceeded my expectations"
input_tokens = [1986, 1985, 572, 9073, 323, 33808, 847, 16665]


@pytest.fixture(scope="module")
def server():
    """Launch a RemoteOpenAIServer for MODEL_NAME once per module and yield it."""
    cli_flags = ["--enforce-eager"]
    cli_flags += ["--max-model-len", "512"]
    cli_flags += ["--dtype", DTYPE]

    # The context manager owns the server lifetime; it is exited once the
    # module's tests are done with the fixture.
    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as running_server:
        yield running_server


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_basic(server: RemoteOpenAIServer, model_name: str):
    """Check the served model id and the tokenization of the shared prompt."""
    # test /v1/models
    response = requests.get(server.url_for("/v1/models"))
    # Fail on the HTTP status rather than on a KeyError in the error body.
    response.raise_for_status()
    served_model = response.json()["data"][0]["id"]
    assert served_model == MODEL_NAME

    # test /tokenize
    response = requests.post(
        server.url_for("/tokenize"),
        json={"model": model_name, "prompt": input_text},
    )
    response.raise_for_status()
    assert response.json()["tokens"] == input_tokens


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_completion_request(server: RemoteOpenAIServer, model_name: str):
    """Classify a single prompt given as text and as a list of token ids."""
    # test input: str
    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": input_text},
    )

    classification_response.raise_for_status()
    output = ClassificationResponse.model_validate(classification_response.json())

    assert output.object == "list"
    assert output.model == MODEL_NAME
    assert len(output.data) == 1
    assert hasattr(output.data[0], "label")
    assert hasattr(output.data[0], "probs")

    # test input: list[int]
    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": input_tokens},
    )

    classification_response.raise_for_status()
    output = ClassificationResponse.model_validate(classification_response.json())

    assert output.object == "list"
    assert output.model == MODEL_NAME
    assert len(output.data) == 1
    assert hasattr(output.data[0], "label")
    assert hasattr(output.data[0], "probs")


81
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_completion_request_batched(server: RemoteOpenAIServer, model_name: str):
    """Classify a batch of N identical prompts, as text and as token ids."""
    N = 10

    # test input: list[str]
    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": [input_text] * N},
    )
    # Surface HTTP failures directly instead of as a schema validation error.
    classification_response.raise_for_status()
    output = ClassificationResponse.model_validate(classification_response.json())

    assert len(output.data) == N
    for i, item in enumerate(output.data):
        assert item.index == i
        assert hasattr(item, "label")
        assert hasattr(item, "probs")
        assert len(item.probs) == item.num_classes
        assert item.label in ["Default", "Spoiled"]

    # test input: list[list[int]]
    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": [input_tokens] * N},
    )
    classification_response.raise_for_status()
    output = ClassificationResponse.model_validate(classification_response.json())

    assert len(output.data) == N
    for i, item in enumerate(output.data):
        assert item.index == i
        assert hasattr(item, "label")
        assert hasattr(item, "probs")
        assert len(item.probs) == item.num_classes
        assert item.label in ["Default", "Spoiled"]


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
    """An empty string input yields a 400; an empty list yields an empty result."""
    # input: "" -> the request is expected to be rejected
    empty_str_payload = {"model": model_name, "input": ""}
    rejected = requests.post(server.url_for("classify"), json=empty_str_payload)

    rejected_body = rejected.json()
    assert rejected.status_code == 400
    assert "error" in rejected_body

    # input: [] -> the request is expected to succeed with no data items
    empty_list_payload = {"model": model_name, "input": []}
    accepted = requests.post(server.url_for("classify"), json=empty_list_payload)
    accepted.raise_for_status()
    parsed = ClassificationResponse.model_validate(accepted.json())

    assert parsed.object == "list"
    assert isinstance(parsed.data, list)
    assert len(parsed.data) == 0

138
139
140
141
142
143
144

@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
    """Check `truncate_prompt_tokens` for a valid value and an invalid one."""
    long_text = "hello " * 600

    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": long_text, "truncate_prompt_tokens": 5},
    )

    classification_response.raise_for_status()
    output = ClassificationResponse.model_validate(classification_response.json())

    assert len(output.data) == 1
    assert output.data[0].index == 0
    assert hasattr(output.data[0], "probs")
    # Usage should report the truncated length, not the original prompt length.
    assert output.usage.prompt_tokens == 5
    assert output.usage.total_tokens == 5

    # invalid_truncate_prompt_tokens
    # 513 is one more than the --max-model-len of 512 set by the server fixture.
    classification_response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513},
    )

    error = classification_response.json()
    assert classification_response.status_code == 400
    assert "truncate_prompt_tokens" in error["error"]["message"]


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_add_special_tokens(server: RemoteOpenAIServer, model_name: str):
    """Check that `add_special_tokens` is accepted with both boolean values."""
    # The add_special_tokens parameter doesn't seem to be working with this
    # model (it is working with papluca/xlm-roberta-base-language-detection),
    # so only the response schema is validated here.
    response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": input_text, "add_special_tokens": False},
    )
    response.raise_for_status()
    ClassificationResponse.model_validate(response.json())

    response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": input_text, "add_special_tokens": True},
    )
    response.raise_for_status()
    ClassificationResponse.model_validate(response.json())


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_request(server: RemoteOpenAIServer, model_name: str):
    """Classify a chat conversation under several chat-template options.

    Each variant checks the response envelope and the reported prompt token
    count; the final variant checks that two mutually exclusive options are
    rejected with the expected error message.
    """
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # test chat request basic usage
    response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "messages": messages},
    )

    response.raise_for_status()
    output = ClassificationResponse.model_validate(response.json())

    assert output.object == "list"
    assert output.model == MODEL_NAME
    assert len(output.data) == 1
    assert hasattr(output.data[0], "label")
    assert hasattr(output.data[0], "probs")
    assert output.usage.prompt_tokens == 51

    # test add_generation_prompt
    response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "messages": messages, "add_generation_prompt": True},
    )

    response.raise_for_status()
    output = ClassificationResponse.model_validate(response.json())

    assert output.object == "list"
    assert output.model == MODEL_NAME
    assert len(output.data) == 1
    assert hasattr(output.data[0], "label")
    assert hasattr(output.data[0], "probs")
    assert output.usage.prompt_tokens == 54

    # test continue_final_message
    response = requests.post(
        server.url_for("classify"),
        json={
            "model": model_name,
            "messages": messages,
            "continue_final_message": True,
        },
    )

    response.raise_for_status()
    output = ClassificationResponse.model_validate(response.json())

    assert output.object == "list"
    assert output.model == MODEL_NAME
    assert len(output.data) == 1
    assert hasattr(output.data[0], "label")
    assert hasattr(output.data[0], "probs")
    assert output.usage.prompt_tokens == 49

    # test add_special_tokens
    # The add_special_tokens parameter doesn't seem to be working with this model.
    response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "messages": messages, "add_special_tokens": True},
    )

    response.raise_for_status()
    output = ClassificationResponse.model_validate(response.json())

    assert output.object == "list"
    assert output.model == MODEL_NAME
    assert len(output.data) == 1
    assert hasattr(output.data[0], "label")
    assert hasattr(output.data[0], "probs")
    assert output.usage.prompt_tokens == 51

    # test continue_final_message with add_generation_prompt
    response = requests.post(
        server.url_for("classify"),
        json={
            "model": model_name,
            "messages": messages,
            "continue_final_message": True,
            "add_generation_prompt": True,
        },
    )
    # No raise_for_status() here: this request is expected to be rejected.
    assert (
        "Cannot set both `continue_final_message` and `add_generation_prompt` to True."
        in response.json()["error"]["message"]
    )


@pytest.mark.asyncio
async def test_invocations_completion_request(server: RemoteOpenAIServer):
    """Check that `invocations` matches `classify` for a text-input request."""
    request_args = {
        "model": MODEL_NAME,
        "input": input_text,
    }

    classification_response = requests.post(
        server.url_for("classify"), json=request_args
    )
    classification_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    classification_output = classification_response.json()
    invocation_output = invocation_response.json()

    assert classification_output.keys() == invocation_output.keys()
    # zip() stops at the shorter sequence, so compare lengths explicitly first.
    assert len(classification_output["data"]) == len(invocation_output["data"])
    for classification_data, invocation_data in zip(
        classification_output["data"], invocation_output["data"]
    ):
        assert classification_data.keys() == invocation_data.keys()
        assert classification_data["probs"] == pytest.approx(
            invocation_data["probs"], rel=0.01
        )


@pytest.mark.asyncio
async def test_invocations_chat_request(server: RemoteOpenAIServer):
    """Check that `invocations` matches `classify` for a chat-messages request."""
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    request_args = {"model": MODEL_NAME, "messages": messages}

    classification_response = requests.post(
        server.url_for("classify"), json=request_args
    )
    classification_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    classification_output = classification_response.json()
    invocation_output = invocation_response.json()

    assert classification_output.keys() == invocation_output.keys()
    # zip() stops at the shorter sequence, so compare lengths explicitly first.
    assert len(classification_output["data"]) == len(invocation_output["data"])
    for classification_data, invocation_data in zip(
        classification_output["data"], invocation_output["data"]
    ):
        assert classification_data.keys() == invocation_data.keys()
        assert classification_data["probs"] == pytest.approx(
            invocation_data["probs"], rel=0.01
        )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
    """Compare classify outputs for `use_activation` set to None, True and False."""

    async def get_outputs(use_activation):
        """Classify the shared prompt and return the `probs` rows as a tensor."""
        response = requests.post(
            server.url_for("classify"),
            json={
                "model": model_name,
                "input": input_text,
                "use_activation": use_activation,
            },
        )
        # Fail on the HTTP status rather than on a KeyError for "data".
        response.raise_for_status()
        outputs = response.json()
        return torch.tensor([x["probs"] for x in outputs["data"]])

    default = await get_outputs(use_activation=None)
    w_activation = await get_outputs(use_activation=True)
    wo_activation = await get_outputs(use_activation=False)

    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    # Softmax over the un-activated output should reproduce the activated one.
    assert torch.allclose(F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_score(server: RemoteOpenAIServer, model_name: str):
    """Check that the `score` endpoint answers with a BadRequestError here."""
    # score api is only enabled for num_labels == 1.
    response = requests.post(
        server.url_for("score"),
        json={
            "model": model_name,
            "queries": "ping",
            "documents": "pong",
        },
    )
    assert response.json()["error"]["type"] == "BadRequestError"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_rerank(server: RemoteOpenAIServer, model_name: str):
    """Check that the `rerank` endpoint answers with a BadRequestError here."""
    # rerank api is only enabled for num_labels == 1.
    response = requests.post(
        server.url_for("rerank"),
        json={
            "model": model_name,
            "query": "ping",
            "documents": ["pong"],
        },
    )
    assert response.json()["error"]["type"] == "BadRequestError"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
    """Request the `classify` task through `pooling` and check the output size."""
    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": "classify",
        },
    )
    # Surface HTTP failures directly instead of as a schema validation error.
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1
    # Presumably one value per class; the batched classify test lists two labels.
    assert len(poolings.data[0].data) == 2


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
    """Request the `token_classify` task through `pooling` and check its shape."""
    task = "token_classify"
    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )
    # Surface HTTP failures directly instead of as a schema validation error.
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1
    # Presumably one row per prompt token (input_tokens has 8 entries), each
    # holding two values.
    assert len(poolings.data[0].data) == 8
    assert len(poolings.data[0].data[0]) == 2


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Check that `pooling` rejects each parametrized task with a clear error."""
    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )
    # Parse the error body once and check both fields on it.
    error = response.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")