test_metrics.py 15 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import asyncio
4
5
6
7
import subprocess
import sys
import tempfile
import time
8
9
10
11
from http import HTTPStatus

import openai
import pytest
12
import os
13
import pytest_asyncio
14
15
16
17
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer

18
19
from vllm import version

20
from ...conftest import LocalAssetServer
21
from ...utils import RemoteOpenAIServer, models_path_prefix
22

23
# Checkpoints exercised by this module, keyed by modality. Each path is
# resolved under ``models_path_prefix``.
MODELS = {
    "text": os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
    "multimodal": os.path.join(
        models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct"
    ),
}
# Passed to ``--show-hidden-metrics-for-version`` by the ``server`` fixture.
PREV_MINOR_VERSION = version._prev_minor_version()
28
29


30
31
@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
    """Yield one key of ``MODELS`` per parametrized, module-scoped run."""
    yield request.param


35
36
37
38
39
40
41
42
43
44
45
46
47
48
@pytest.fixture(scope="module")
def default_server_args():
    """Return the command-line flags shared by servers started in this module."""
    # Half precision keeps CI runs fast and light on memory.
    precision = ["--dtype", "bfloat16"]
    context_limit = ["--max-model-len", "1024"]
    eager_mode = ["--enforce-eager"]
    batch_limit = ["--max-num-seqs", "128"]
    return [*precision, *context_limit, *eager_mode, *batch_limit]


49
50
51
52
53
54
55
56
57
@pytest.fixture(
    scope="module",
    params=[
        "",
        "--enable-chunked-prefill",
        "--disable-frontend-multiprocessing",
        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
    ],
)
def server(model_key, default_server_args, request):
    """Start a ``RemoteOpenAIServer`` for the selected model and yield it.

    The fixture is parametrized over one optional extra CLI flag; the empty
    string means "no extra flag". The server context is exited when the
    fixture is torn down.
    """
    # Work on a copy: ``default_server_args`` is a module-scoped fixture, so
    # appending to it in place would leak this parameter's flag into every
    # server started for the parameters that follow.
    server_args = list(default_server_args)
    if request.param:
        server_args.append(request.param)

    model_name = MODELS[model_key]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from ``server``.

    The client's async context is exited when the fixture is torn down.
    """
    async with server.get_async_client() as async_client:
        yield async_client
71
72
73


# Prompt that the completion-based tests in this module tokenize and send.
_PROMPT = "Hello my name is Robert and I love magic"
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113


def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
    num_prompt_tokens = len(prompt_ids)

    # {metric_family: [(suffix, expected_value)]}
    return {
        "vllm:time_to_first_token_seconds": [("_count", num_requests)],
        "vllm:time_per_output_token_seconds": [
            ("_count", num_requests * (max_tokens - 1))
        ],
        "vllm:e2e_request_latency_seconds": [("_count", num_requests)],
        "vllm:request_queue_time_seconds": [("_count", num_requests)],
        "vllm:request_inference_time_seconds": [("_count", num_requests)],
        "vllm:request_prefill_time_seconds": [("_count", num_requests)],
        "vllm:request_decode_time_seconds": [("_count", num_requests)],
        "vllm:request_prompt_tokens": [
            ("_sum", num_requests * num_prompt_tokens),
            ("_count", num_requests),
        ],
        "vllm:request_generation_tokens": [
            ("_sum", num_requests * max_tokens),
            ("_count", num_requests),
        ],
        "vllm:request_params_n": [("_count", num_requests)],
        "vllm:request_params_max_tokens": [
            ("_sum", num_requests * max_tokens),
            ("_count", num_requests),
        ],
        "vllm:iteration_tokens_total": [
            (
                "_sum",
                num_requests * (num_prompt_tokens + max_tokens),
            ),
            ("_count", num_requests * max_tokens),
        ],
        "vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
        "vllm:generation_tokens": [("_total", num_requests * max_tokens)],
        "vllm:request_success": [("_total", num_requests)],
    }
114
115
116


@pytest.mark.asyncio
async def test_metrics_counts(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Check metric sample values after a fixed number of completion requests.

    Sends ``num_requests`` completions of ``max_tokens`` tokens each, scrapes
    the server's ``metrics`` endpoint and compares the samples against
    ``_get_expected_values``. Skipped for the multimodal model.
    """
    if model_key == "multimodal":
        pytest.skip("Unnecessary test")

    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)
    num_requests = 10
    max_tokens = 10

    for _ in range(num_requests):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
            model=model_name,
            prompt=prompt_ids,
            max_tokens=max_tokens,
        )

    response = requests.get(server.url_for("metrics"))
    print(response.text)
    assert response.status_code == HTTPStatus.OK

    # Parse the exposition text once rather than once per expected family.
    # ``setdefault`` keeps the first family seen for a given name.
    families_by_name = {}
    for family in text_string_to_metric_families(response.text):
        families_by_name.setdefault(family.name, family)

    # Loop over all expected metric_families
    expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
    for metric_family, suffix_values_list in expected_values.items():
        # NOTE(review): this compares bare family names against
        # EXPECTED_METRICS_V1, whose histogram entries carry sample suffixes,
        # so only families whose bare name is listed there get checked.
        # Confirm that this is intended.
        if metric_family not in EXPECTED_METRICS_V1 or (
            not server.show_hidden_metrics
            and metric_family in HIDDEN_DEPRECATED_METRICS
        ):
            continue

        # Check to see if the metric_family is found in the prom endpoint.
        family = families_by_name.get(metric_family)
        assert family is not None, f"Did not find {metric_family} in prom endpoint"

        # Check that each suffix is found in the prom endpoint.
        for suffix, expected_value in suffix_values_list:
            metric_name_w_suffix = f"{metric_family}{suffix}"
            sample = next(
                (s for s in family.samples if s.name == metric_name_w_suffix),
                None,
            )
            assert sample is not None, (
                f"Did not find {metric_name_w_suffix} in prom endpoint"
            )

            # For each suffix, make sure the value matches what we expect.
            assert sample.value == expected_value, (
                f"{metric_name_w_suffix} expected value of "
                f"{expected_value} did not match found value "
                f"{sample.value}"
            )
182
183


184
185
186
# Metric names that ``test_metrics_exist`` expects to find in the scraped
# ``metrics`` text for every model.
EXPECTED_METRICS_V1 = [
    # Scheduler and cache state.
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:kv_cache_usage_perc",
    "vllm:prefix_cache_queries",
    "vllm:prefix_cache_hits",
    "vllm:num_preemptions_total",
    # Token and request totals.
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:iteration_tokens_total",
    "vllm:cache_config_info",
    "vllm:request_success_total",
    # Per-request token counts.
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
    # Per-request sampling parameters.
    "vllm:request_params_n_sum",
    "vllm:request_params_n_bucket",
    "vllm:request_params_n_count",
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
    # Token-level latencies.
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
    "vllm:inter_token_latency_seconds_sum",
    "vllm:inter_token_latency_seconds_bucket",
    "vllm:inter_token_latency_seconds_count",
    # Request-level latencies.
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
    "vllm:request_queue_time_seconds_sum",
    "vllm:request_queue_time_seconds_bucket",
    "vllm:request_queue_time_seconds_count",
    "vllm:request_inference_time_seconds_sum",
    "vllm:request_inference_time_seconds_bucket",
    "vllm:request_inference_time_seconds_count",
    "vllm:request_prefill_time_seconds_sum",
    "vllm:request_prefill_time_seconds_bucket",
    "vllm:request_prefill_time_seconds_count",
    "vllm:request_decode_time_seconds_sum",
    "vllm:request_decode_time_seconds_bucket",
    "vllm:request_decode_time_seconds_count",
]

234
235
236
237
238
# Extra metrics that ``test_metrics_exist`` expects only for the multimodal
# model, in addition to EXPECTED_METRICS_V1.
EXPECTED_METRICS_MM: list[str] = [
    "vllm:mm_cache_queries",
    "vllm:mm_cache_hits",
]

239
# Metrics the tests skip unless the server reports ``show_hidden_metrics``.
HIDDEN_DEPRECATED_METRICS: list[str] = [
    "vllm:gpu_cache_usage_perc",
    "vllm:gpu_prefix_cache_queries",
    "vllm:gpu_prefix_cache_hits",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
]
247

248
249

@pytest.mark.asyncio
async def test_metrics_exist(
    local_asset_server: LocalAssetServer,
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Check that every expected metric name appears in the ``metrics`` text.

    One request is sent first (a text completion for the text model, an
    image + text chat completion otherwise), then the endpoint is scraped and
    searched for each expected name. Hidden deprecated metrics are skipped
    unless the server shows them.
    """
    model_name = MODELS[model_key]

    # sending a request triggers the metrics to be logged.
    if model_key == "text":
        await client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=5,
            temperature=0.0,
        )
    else:
        # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": local_asset_server.url_for(
                    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                ),
            },
        }
        text_part = {"type": "text", "text": "What's in this image?"}
        await client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": [image_part, text_part]}],
            max_tokens=5,
            temperature=0.0,
        )

    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    expected_metrics = EXPECTED_METRICS_V1
    if model_key == "multimodal":
        # NOTE: Don't use in-place assignment; that would extend the shared
        # module-level EXPECTED_METRICS_V1 list.
        expected_metrics = expected_metrics + EXPECTED_METRICS_MM

    for metric in expected_metrics:
        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
            continue
        # Name the missing metric so a failure is actionable.
        assert metric in response.text, f"Did not find {metric} in prom endpoint"
302
303


304
@pytest.mark.asyncio
async def test_abort_metrics_reset(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Check that running/waiting counts and KV cache usage reset after aborts.

    Starts three long completions, confirms the server reports activity,
    cancels the client tasks and then expects all three readings to be zero.
    """
    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)

    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
        server,
    )

    # Expect no running requests or kvcache usage
    assert running_requests == 0
    assert waiting_requests == 0
    assert kv_cache_usage == 0.0

    # Start some long-running requests that we can abort
    tasks = [
        asyncio.create_task(
            client.completions.create(
                model=model_name,
                prompt=prompt_ids,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0,
            )
        )
        for _ in range(3)
    ]

    # Wait a bit for requests to start processing
    await asyncio.sleep(0.5)

    # Check that we have running requests
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
        server,
    )

    # Expect running requests and kvcache usage
    assert running_requests > 0
    assert kv_cache_usage > 0

    # Cancel all tasks to abort the requests
    for task in tasks:
        task.cancel()

    # Await the cancelled tasks so the cancellation is actually delivered and
    # no task is left pending or with an unretrieved exception.
    # ``return_exceptions=True`` collects each CancelledError as a result.
    await asyncio.gather(*tasks, return_exceptions=True)

    # Wait for cancellations to be processed
    await asyncio.sleep(1.0)

    # Read the metrics again; the helper asserts the HTTP status itself.
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
        _get_running_metrics_from_api(server)
    )

    # Verify running and waiting requests counts and KV cache usage are zero
    assert running_requests_after == 0, (
        f"Expected 0 running requests after abort, got {running_requests_after}"
    )
    assert waiting_requests_after == 0, (
        f"Expected 0 waiting requests after abort, got {waiting_requests_after}"
    )
    assert kv_cache_usage_after == 0, (
        f"Expected 0% KV cache usage after abort, got {kv_cache_usage_after}"
    )
373
374


375
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)

    Scrapes the server's ``metrics`` endpoint and reads the sample whose name
    equals its family name for each of the three metrics. Asserts that the
    request succeeded and that all three values were found.
    """
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    running_metric = "vllm:num_requests_running"
    waiting_metric = "vllm:num_requests_waiting"
    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"

    # Extract the current readings; None marks a metric not seen yet.
    readings = dict.fromkeys(
        (running_metric, waiting_metric, kv_cache_usage_metric)
    )
    for family in text_string_to_metric_families(response.text):
        if family.name not in readings:
            continue
        # Take the first sample named exactly like its family.
        for sample in family.samples:
            if sample.name == family.name:
                readings[family.name] = sample.value
                break

    running_requests = readings[running_metric]
    waiting_requests = readings[waiting_metric]
    kv_cache_usage = readings[kv_cache_usage_metric]

    assert running_requests is not None
    assert waiting_requests is not None
    assert kv_cache_usage is not None

    return running_requests, waiting_requests, kv_cache_usage


410
def test_metrics_exist_run_batch():
    """Check that ``run_batch`` with ``--enable-metrics`` serves ``/metrics``.

    Launches ``vllm.entrypoints.openai.run_batch`` as a subprocess on a
    one-line embeddings batch, polls until the configured URL answers, then
    requests ``/metrics`` and waits for the subprocess to exit.
    """
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "localhost"
    port = "8001"
    server_url = f"http://{base_url}:{port}"

    # Upper bound on waiting for the endpoint, so a subprocess that never
    # comes up fails the test instead of hanging it. Chosen generously;
    # adjust to the CI environment if needed.
    startup_timeout_s = 600
    # Per-request bound so a single stalled connection cannot block forever.
    request_timeout_s = 10

    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(input_batch)
        input_file.flush()
        proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.openai.run_batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"),
                "--enable-metrics",
                "--url",
                base_url,
                "--port",
                port,
            ],
        )

        def is_server_up(url):
            try:
                response = requests.get(url, timeout=request_timeout_s)
                return response.status_code == 200
            except (requests.ConnectionError, requests.Timeout):
                return False

        try:
            deadline = time.monotonic() + startup_timeout_s
            while not is_server_up(server_url):
                # A dead subprocess will never start answering; fail fast.
                assert proc.poll() is None, (
                    f"run_batch exited with code {proc.returncode} "
                    "before the metrics endpoint came up"
                )
                assert time.monotonic() < deadline, (
                    f"{server_url} did not come up within {startup_timeout_s}s"
                )
                time.sleep(1)

            response = requests.get(
                server_url + "/metrics", timeout=request_timeout_s
            )
            assert response.status_code == HTTPStatus.OK

            proc.wait()
        finally:
            # Do not leak the subprocess when an assertion above fails.
            if proc.poll() is None:
                proc.kill()
                proc.wait()