"docs/pages/integrations/lmcache-integration.md" did not exist on "7ca6a562f4d5b5926b1d1299425e90033cb725c9"
test_trtllm.py 19.3 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
from typing import Any
9
10
11

import pytest

12
from tests.serve.common import (
13
    SERVE_TEST_DIR,
14
15
16
17
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
18
from tests.utils.constants import DefaultPort
19
from tests.utils.engine_process import EngineConfig
20
from tests.utils.payload_builder import (
21
22
    TEXT_PROMPT,
    chat_payload,
23
    chat_payload_default,
24
    completion_payload,
25
26
    completion_payload_default,
    metric_payload_default,
27
    multimodal_payload_default,
28
)
29
from tests.utils.payloads import BasePayload
30
31
32
33

logger = logging.getLogger(__name__)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@dataclass
class VideoGenerationPayload(BasePayload):
    """Request payload for the ``/v1/videos`` endpoint (TRT-LLM video diffusion).

    Sets the endpoint and timeout defaults and provides the response
    parsing and validation used for video generation responses.
    """

    endpoint: str = "/v1/videos"
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Check a video generation response and return its content marker.

        Args:
            response: HTTP response object exposing ``raise_for_status()``
                and ``json()``.

        Returns:
            The ``url`` of the first ``data`` entry when that entry has a
            ``url`` key; otherwise the literal ``"b64_video_returned"``.

        Raises:
            AssertionError: If the status is not ``"completed"``, ``data`` is
                missing or empty, or the selected ``url`` / ``b64_json``
                value is empty.
        """
        response.raise_for_status()
        body = response.json()

        status = body.get("status")
        assert status == "completed", (
            f"Video generation not completed. Status: {status}, "
            f"Error: {body.get('error', 'none')}"
        )
        assert "data" in body, (
            f"Missing 'data' in response. Keys: {list(body.keys())}"
        )

        videos = body["data"]
        assert len(videos) > 0, "Empty data in video response"

        first = videos[0]
        if "url" not in first:
            # No URL key at all: the video must have been returned inline.
            assert first.get("b64_json"), "Video response b64_json is empty"
            return "b64_video_returned"

        video_url = first["url"]
        assert video_url, "Video response url is empty"
        return video_url

    def validate(self, response: Any, content: str) -> None:
        """Assert that ``content`` (the value from ``response_handler``) is non-empty."""
        assert content, "Video response content is empty"


63
@dataclass
class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios."""

    # Each instance gets its own list (default_factory), seeded with the
    # "TRTLLM:EngineCore" entry.
    # NOTE(review): presumably names of leftover engine processes that the
    # test harness cleans up — confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
68

69

70
# Directory passed as `directory=` to the TRT-LLM configs in this module.
# A non-empty TRTLLM_DIR environment variable takes precedence; an unset or
# empty value falls back to the in-repo examples directory.
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)
73

74
75
76
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
77
78
79
# Scenario name -> TRTLLMConfig. The keys are used both as the config `name`
# and as the parameter ids of the `trtllm_config_test` fixture.
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            # KV cache cap (2x safety over min=1296)
            pytest.mark.requested_trtllm_kv_tokens(2592),
            # 3x measured time (44.66s) + download time (150s)
            pytest.mark.timeout(300),
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "aggregated_unified": TRTLLMConfig(
        name="aggregated_unified",
        directory=trtllm_dir,
        script_name="agg.sh",
        script_args=["--unified"],
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),
            pytest.mark.requested_trtllm_kv_tokens(2592),
            pytest.mark.timeout(300),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(6.6),  # actual nvidia-smi peak 6.6 GiB
            # KV cache cap (2x safety over min=256)
            pytest.mark.requested_trtllm_kv_tokens(512),
            pytest.mark.timeout(432),  # ~6x profiled wall time 72s
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=10,
        health_check_workers=True,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            # KV cache cap (2x safety over min=1296)
            pytest.mark.requested_trtllm_kv_tokens(2592),
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            # 3x measured time (37.91s) + download time (180s)
            pytest.mark.timeout(300),
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2608"
            ),
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
    # (.pt tensor file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            # Embeddings generation (~60s) + model load (~120s) + inference
            pytest.mark.timeout(900),
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
                image_url="file:///tmp/llava_embeddings.pt",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
            # Video generation is slow even at small resolution
            pytest.mark.timeout(600),
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            # 3x measured time (83.85s) + download time (210s) for 7B model
            pytest.mark.timeout(480),
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


Alec's avatar
Alec committed
470
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Fixture that provides different trtllm test configurations.

    Returns the ``trtllm_configs`` entry whose key is the current fixture
    parameter (``request.param``).
    """
    return trtllm_configs[request.param]
474
475


476
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    Runs once per entry of ``trtllm_configs`` (via the ``trtllm_config_test``
    fixture), rebinding the frontend port to the per-test dynamic port before
    handing the config to ``run_serve_deployment``.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # dataclasses.replace() is a shallow copy, so the copy is given its own
    # env dict here; updating config.env in place would write into the dict
    # owned by the shared module-level trtllm_configs entry.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
505
506


507
# TODO: turn this into a regular parametrized case (an entry in trtllm_configs) instead of a standalone test
508
509
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Run a single aggregated chat-completions test using Qwen 0.6B with the
    test logits processor enabled, and expect "Hello world!" in the response.

    Reuses the directory, launch script, delayed start and timeout of the
    "aggregated" entry in ``trtllm_configs``.
    """

    # Enable HelloWorld logits processor only for this test
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    base = trtllm_configs["aggregated"]
    config = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=base.directory,
        script_name=base.script_name,  # agg_metrics.sh
        marks=[],  # not used by this direct test
        request_payloads=[
            chat_payload_default(expected_response=["Hello world!"]),
        ],
        model="Qwen/Qwen3-0.6B",
        delayed_start=base.delayed_start,
        timeout=base.timeout,
        # Per-test frontend port so the test can run safely in parallel.
        frontend_port=dynamo_dynamic_ports.frontend_port,
    )
    # Non-port env is set here; ports are passed to run_serve_deployment.
    config.env.update(
        {
            "MODEL_PATH": config.model,
            "SERVED_MODEL_NAME": config.model,
        }
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)