# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
9
10

import pytest

11
from tests.serve.common import (
12
    SERVE_TEST_DIR,
13
14
15
16
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
17
from tests.utils.constants import DefaultPort
18
from tests.utils.engine_process import EngineConfig
19
from tests.utils.payload_builder import (
20
21
    TEXT_PROMPT,
    chat_payload,
22
    chat_payload_default,
23
    completion_payload,
24
25
    completion_payload_default,
    metric_payload_default,
26
    multimodal_payload_default,
27
)
28
from tests.utils.payloads import VideoGenerationPayload
29
30
31
32
33

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass
class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios.

    Extends ``EngineConfig`` and only overrides the default of ``stragglers``;
    every other field is inherited unchanged.
    """

    # A default_factory is used so each instance gets its own list rather
    # than sharing one mutable default.
    # NOTE(review): presumably names of leftover processes to clean up after
    # a run -- confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
38

39

40
# Directory holding the trtllm launch scripts. The TRTLLM_DIR environment
# variable wins when set to a non-empty value; otherwise fall back to the
# examples directory under WORKSPACE_DIR.
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)
43

44
45
46
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
#
# Maps a scenario name to its TRTLLMConfig. The keys are what the
# `trtllm_config_test` fixture is parametrized over, and each entry's `marks`
# carry the pytest markers for that scenario.
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "aggregated_unified": TRTLLMConfig(
        name="aggregated_unified",
        directory=trtllm_dir,
        script_name="agg.sh",
        script_args=["--unified"],
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),
            pytest.mark.requested_trtllm_kv_tokens(2592),
            pytest.mark.timeout(300),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450"
            ),
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(6.6),  # actual nvidia-smi peak 6.6 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                512
            ),  # KV cache cap (2x safety over min=256)
            pytest.mark.timeout(432),  # ~6x profiled wall time 72s
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=10,
        health_check_workers=True,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2608"
            ),
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
    # (.safetensors file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.safetensors
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
                image_url="file:///tmp/llava_embeddings.safetensors",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                timeout=300,
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    # Aggregated multimodal with --frontend-decoding enabled.
    # Verifies image URL inference works when images are decoded by the Rust
    # MediaDecoder in the frontend instead of the Python backend.
    "aggregated_multimodal_frontend_decoding": TRTLLMConfig(
        name="aggregated_multimodal_frontend_decoding",
        directory=trtllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(900),
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            # NOTE(review): absolute /workspace path, unlike trtllm_dir which
            # honours TRTLLM_DIR / WORKSPACE_DIR -- confirm this is intended.
            "AGG_ENGINE_ARGS": "/workspace/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml",
            "DYN_TRTLLM_FRONTEND_DECODING": "true",
        },
    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


Alec's avatar
Alec committed
473
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Fixture that provides different trtllm test configurations.

    Parametrized with ``params_with_model_mark(trtllm_configs)``; the current
    ``request.param`` is used as a key into ``trtllm_configs``.

    Returns:
        The ``trtllm_configs`` entry for the current parameter. This is the
        shared module-level object, not a copy.
    """
    return trtllm_configs[request.param]
477
478


479
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    Args:
        trtllm_config_test: Scenario configuration taken from ``trtllm_configs``.
        request: pytest request object, forwarded to ``run_serve_deployment``.
        runtime_services_dynamic_ports: Not referenced in the body; presumably
            requested for its setup side effects.
        dynamo_dynamic_ports: Per-test port allocation; supplies
            ``frontend_port`` and is forwarded as ``ports``.
        num_system_ports: Number of system ports (parametrized to 2 above).
        predownload_models: Not referenced in the body; presumably requested
            for its setup side effects.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # dataclasses.replace() makes a shallow copy, so the copy would share its
    # `env` dict with the module-level entry in `trtllm_configs`. Build a new
    # dict instead of calling `.update()` so the shared config is not mutated.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
508
509


510
# TODO: make this a regular `trtllm_configs` entry instead of a standalone
# test (presumably what the original "make this a normal guy" note meant).
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Run a single aggregated chat-completions test using Qwen 0.6B with the
    test logits processor enabled, and expect "Hello world!" in the response.

    The directory, launch script, ``delayed_start`` and ``timeout`` are taken
    from the "aggregated" entry of ``trtllm_configs``; the payload list and
    marks are specific to this test.
    """

    # Enable HelloWorld logits processor only for this test
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    base = trtllm_configs["aggregated"]
    config = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=base.directory,
        script_name=base.script_name,  # agg_metrics.sh (from the "aggregated" entry)
        marks=[],  # not used by this direct test
        request_payloads=[
            chat_payload_default(expected_response=["Hello world!"]),
        ],
        model="Qwen/Qwen3-0.6B",
        delayed_start=base.delayed_start,
        timeout=base.timeout,
    )

    # Use the per-test frontend port and add the model env vars through a
    # fresh dict rather than mutating `config.env` in place.
    config = dataclasses.replace(
        config,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **config.env,
            "MODEL_PATH": config.model,
            "SERVED_MODEL_NAME": config.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)