test_trtllm.py 20.5 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
from typing import Any
9
10
11

import pytest

12
from tests.serve.common import (
13
    SERVE_TEST_DIR,
14
15
16
17
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
18
from tests.utils.constants import DefaultPort
19
from tests.utils.engine_process import EngineConfig
20
from tests.utils.payload_builder import (
21
22
    TEXT_PROMPT,
    chat_payload,
23
    chat_payload_default,
24
    completion_payload,
25
26
    completion_payload_default,
    metric_payload_default,
27
    multimodal_payload_default,
28
)
29
from tests.utils.payloads import BasePayload
30
31
32
33

logger = logging.getLogger(__name__)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@dataclass
class VideoGenerationPayload(BasePayload):
    """Request payload for the TRT-LLM video diffusion endpoint (/v1/videos)."""

    endpoint: str = "/v1/videos"
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Check a /v1/videos response and return a token for the first video.

        Args:
            response: HTTP response object exposing ``raise_for_status()`` and
                ``json()`` (assumed requests-style — TODO confirm).

        Returns:
            The first data entry's ``url`` when that entry has a ``url`` key;
            otherwise the fixed marker ``"b64_video_returned"``.

        Raises:
            AssertionError: if the status is not ``"completed"``, ``data`` is
                missing or empty, or the selected field (``url`` /
                ``b64_json``) is empty.
        """
        response.raise_for_status()
        body = response.json()

        status = body.get("status")
        assert status == "completed", (
            f"Video generation not completed. Status: {status}, "
            f"Error: {body.get('error', 'none')}"
        )
        assert (
            "data" in body
        ), f"Missing 'data' in response. Keys: {list(body.keys())}"

        videos = body["data"]
        assert len(videos) > 0, "Empty data in video response"

        # Only the first generated video is inspected.
        first = videos[0]
        if "url" not in first:
            # No URL key: the video must have been returned inline as base64.
            assert first.get("b64_json"), "Video response b64_json is empty"
            return "b64_video_returned"

        assert first["url"], "Video response url is empty"
        return first["url"]

    def validate(self, response: Any, content: str) -> None:
        """Require that ``content`` is non-empty; ``response`` is not inspected."""
        assert content, "Video response content is empty"


63
@dataclass
class TRTLLMConfig(EngineConfig):
    """EngineConfig specialization used by the TensorRT-LLM serve test scenarios."""

    # default_factory gives every instance its own list, pre-populated with the
    # "TRTLLM:EngineCore" entry.
    # NOTE(review): presumably names of leftover processes the harness cleans
    # up after a run — confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
68

69

70
# Directory containing the TRT-LLM launch scripts. A non-empty TRTLLM_DIR
# environment variable takes precedence over the in-workspace default.
trtllm_dir = os.getenv("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)
73

74
75
76
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
77
78
79
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
80
        directory=trtllm_dir,
81
        script_name="agg_metrics.sh",
82
        marks=[
83
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
84
85
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
86
87
88
89
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
90
91
92
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
93
        ],
94
        model="Qwen/Qwen3-0.6B",
95
        frontend_port=DefaultPort.FRONTEND.value,
96
        delayed_start=5,
97
98
99
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
100
            metric_payload_default(min_num_requests=6, backend="trtllm"),
101
        ],
102
    ),
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
    "aggregated_unified": TRTLLMConfig(
        name="aggregated_unified",
        directory=trtllm_dir,
        script_name="agg.sh",
        script_args=["--unified"],
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),
            pytest.mark.requested_trtllm_kv_tokens(2592),
            pytest.mark.timeout(300),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
124
125
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
126
        directory=trtllm_dir,
127
        script_name="disagg.sh",
128
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
129
        model="Qwen/Qwen3-0.6B",
130
        frontend_port=DefaultPort.FRONTEND.value,
131
132
133
134
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
135
    ),
136
137
138
139
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
140
        marks=[
141
142
143
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450"
            ),
144
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
145
146
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
147
148
149
150
151
            pytest.mark.profiled_vram_gib(6.6),  # actual nvidia-smi peak 6.6 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                512
            ),  # KV cache cap (2x safety over min=256)
            pytest.mark.timeout(432),  # ~6x profiled wall time 72s
152
        ],
153
        model="Qwen/Qwen3-0.6B",
154
        frontend_port=DefaultPort.FRONTEND.value,
155
156
        delayed_start=10,
        health_check_workers=True,
157
158
159
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
160
161
162
163
164
165
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
166
167
        ],
    ),
168
169
170
171
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
172
173
174
175
176
177
178
179
180
181
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
182
        model="Qwen/Qwen3-0.6B",
183
        frontend_port=DefaultPort.FRONTEND.value,
184
        delayed_start=5,
185
186
187
188
189
190
191
192
193
194
195
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
Dmitry Tokarev's avatar
Dmitry Tokarev committed
196
197
        marks=[
            pytest.mark.gpu_2,
198
            pytest.mark.pre_merge,
Dmitry Tokarev's avatar
Dmitry Tokarev committed
199
200
            pytest.mark.trtllm,
        ],
201
        model="Qwen/Qwen3-0.6B",
202
        frontend_port=DefaultPort.FRONTEND.value,
203
204
205
206
207
208
209
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
210
211
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
212
        directory=trtllm_dir,
213
        script_name="agg_router.sh",
214
215
216
217
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
218
219
220
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
221
        ],
222
        model="Qwen/Qwen3-0.6B",
223
        frontend_port=DefaultPort.FRONTEND.value,
224
225
226
227
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
228
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
229
230
231
232
                ]
            )
        ],
        env={
233
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
234
        },
235
236
237
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
238
        directory=trtllm_dir,
239
        script_name="disagg_router.sh",
240
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
241
        model="Qwen/Qwen3-0.6B",
242
        frontend_port=DefaultPort.FRONTEND.value,
243
244
245
246
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
247
    ),
248
249
250
251
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
252
253
254
255
256
257
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
258
        model="Qwen/Qwen2-VL-7B-Instruct",
259
        frontend_port=DefaultPort.FRONTEND.value,
260
261
262
263
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
264
265
266
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
267
        script_name="agg_multimodal_router.sh",
268
        marks=[
269
270
271
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2608"
            ),
272
273
274
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
275
            pytest.mark.pre_merge,
276
        ],
277
        model="Qwen/Qwen3-VL-2B-Instruct",
278
279
280
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
281
282
283
284
285
286
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
287
    ),
288
289
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
290
291
292
293
294
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
295
        directory=trtllm_dir,
296
        script_name="epd_multimodal_image_and_embeddings.sh",
297
        marks=[
298
            pytest.mark.gpu_1,
299
300
            pytest.mark.trtllm,
            pytest.mark.multimodal,
301
            pytest.mark.pre_merge,
302
        ],
303
        model="Qwen/Qwen3-VL-2B-Instruct",
304
        frontend_port=DefaultPort.FRONTEND.value,
305
        timeout=900,
306
        delayed_start=120,
307
308
309
310
311
312
313
314
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
315
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
316
317
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
318
    ),
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
    # Tests the encode worker together with an aggregated prefill/decode (PD)
    # worker on the same GPU.
    # TODO: promote from nightly to pre-merge once TRTLLM bug #5938603 is fixed.
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
345
346
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
347
    # (.safetensors file) are sent via file:// URL instead of a raw image URL.
348
349
350
351
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
352
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.safetensors
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
378
                image_url="file:///tmp/llava_embeddings.safetensors",
379
380
381
382
383
384
385
386
387
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
408
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
409
410
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
411
412
413
414
415
416
417
418
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
419
420
421
422
423
424
425
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
426
        delayed_start=5,
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
    # Aggregated multimodal with --frontend-decoding enabled.
    # Verifies image URL inference works when images are decoded by the Rust
    # MediaDecoder in the frontend instead of the Python backend.
    "aggregated_multimodal_frontend_decoding": TRTLLMConfig(
        name="aggregated_multimodal_frontend_decoding",
        directory=trtllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(900),
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "AGG_ENGINE_ARGS": "/workspace/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml",
            "DYN_TRTLLM_FRONTEND_DECODING": "true",
        },
    ),
475
476
477
478
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
479
480
481
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
482
            pytest.mark.post_merge,
483
            pytest.mark.skip(reason="DIS-1566"),
484
485
486
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
487
        ],
488
489
490
491
492
493
494
495
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
496
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
497
498
        ],
    ),
499
500
501
}


Alec's avatar
Alec committed
502
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Resolve the current fixture parameter to its entry in ``trtllm_configs``."""
    scenario_key = request.param
    return trtllm_configs[scenario_key]
506
507


508
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    Args:
        trtllm_config_test: The TRTLLMConfig selected by the parametrized fixture.
        request: pytest request object, forwarded to ``run_serve_deployment``.
        runtime_services_dynamic_ports: Not referenced in the body; requested
            for its fixture side effects (presumably starts runtime services —
            verify against the fixture definition).
        dynamo_dynamic_ports: Per-test port allocation; supplies the frontend
            port and is passed to ``run_serve_deployment``.
        num_system_ports: Number of system ports allocated (parametrized to 2).
        predownload_models: Not referenced in the body; requested for its
            fixture side effects (presumably pre-caches models — verify).
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # dataclasses.replace() copies fields shallowly, so the shared config's
    # env dict must not be updated in place: that would leak state into the
    # module-level trtllm_configs entry. Build a fresh dict instead.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
537
538


539
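# TODO: make this a regular test (presumably a `trtllm_configs` entry run via
# test_deployment) instead of a standalone function.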
540
541
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Run one aggregated chat-completions request against Qwen 0.6B with the
    test logits processor switched on; the response must contain
    "Hello world!".
    """

    # The HelloWorld logits processor is enabled for this test only;
    # monkeypatch restores the environment afterwards.
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    # Borrow launch settings from the shared "aggregated" scenario.
    aggregated = trtllm_configs["aggregated"]
    chat_only = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        model="Qwen/Qwen3-0.6B",
        directory=aggregated.directory,
        script_name=aggregated.script_name,  # same launch script as "aggregated"
        delayed_start=aggregated.delayed_start,
        timeout=aggregated.timeout,
        marks=[],  # marks come from the decorators on this test instead
        request_payloads=[
            chat_payload_default(expected_response=["Hello world!"]),
        ],
    )

    # Bind the per-test frontend port, then add the model-related env entries.
    config = dataclasses.replace(
        chat_only, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    config.env["MODEL_PATH"] = config.model
    config.env["SERVED_MODEL_NAME"] = config.model

    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)