test_trtllm.py 18.6 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
from typing import Any
9
10
11

import pytest

12
from tests.serve.common import (
13
    SERVE_TEST_DIR,
14
15
16
17
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
18
from tests.utils.constants import DefaultPort
19
from tests.utils.engine_process import EngineConfig
20
from tests.utils.payload_builder import (
21
22
    TEXT_PROMPT,
    chat_payload,
23
    chat_payload_default,
24
    completion_payload,
25
26
    completion_payload_default,
    metric_payload_default,
27
    multimodal_payload_default,
28
)
29
from tests.utils.payloads import BasePayload
30
31
32
33

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@dataclass
class VideoGenerationPayload(BasePayload):
    """Request payload aimed at the TRT-LLM video diffusion route, /v1/videos."""

    endpoint: str = "/v1/videos"
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Check a video-generation response and return its content marker.

        Raises through ``response.raise_for_status()`` on an HTTP error, then
        asserts that the JSON body reports ``status == "completed"`` and holds
        a non-empty ``data`` list.

        Returns:
            The first entry's ``url`` when that key is present, otherwise the
            fixed marker ``"b64_video_returned"`` once ``b64_json`` is
            confirmed to be non-empty.
        """
        response.raise_for_status()
        body = response.json()

        status = body.get("status")
        assert status == "completed", (
            f"Video generation not completed. Status: {status}, "
            f"Error: {body.get('error', 'none')}"
        )
        assert "data" in body, f"Missing 'data' in response. Keys: {list(body.keys())}"

        videos = body["data"]
        assert len(videos) > 0, "Empty data in video response"

        # Only the first returned entry is inspected.
        first = videos[0]
        if "url" not in first:
            assert first.get("b64_json"), "Video response b64_json is empty"
            return "b64_video_returned"

        video_url = first["url"]
        assert video_url, "Video response url is empty"
        return video_url

    def validate(self, response: Any, content: str) -> None:
        """Assert that the value produced by ``response_handler`` is truthy."""
        assert content, "Video response content is empty"


63
@dataclass
class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios.

    Extends ``EngineConfig`` with a TRT-LLM specific default for
    ``stragglers``.
    """

    # NOTE(review): presumably the process names swept up after a run -
    # confirm against EngineConfig. A default_factory gives each instance
    # its own list rather than one shared mutable default.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
68

69

70
# Directory holding the TRT-LLM launch scripts. The TRTLLM_DIR environment
# variable wins when set to a non-empty value; otherwise fall back to the
# examples directory under the workspace.
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)
73

74
75
76
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
#
# Maps a scenario name to its TRTLLMConfig. Each key matches the config's
# ``name``; the ``trtllm_config_test`` fixture looks entries up by key.
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(6.6),  # actual nvidia-smi peak 6.6 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                512
            ),  # KV cache cap (2x safety over min=256)
            pytest.mark.timeout(432),  # ~6x profiled wall time 72s
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=10,
        health_check_workers=True,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2608"
            ),
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
    # (.pt tensor file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
                image_url="file:///tmp/llava_embeddings.pt",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


Alec's avatar
Alec committed
449
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Fixture that provides different trtllm test configurations.

    Parametrized from ``params_with_model_mark(trtllm_configs)``; each
    ``request.param`` is used as a key into ``trtllm_configs``.

    Returns:
        The shared module-level config stored under that key (not a copy).
    """
    return trtllm_configs[request.param]
453
454


455
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    A per-test copy of the parametrized config is built with the dynamically
    allocated frontend port and with MODEL_PATH / SERVED_MODEL_NAME set to the
    config's model, then handed to ``run_serve_deployment``.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    #
    # dataclasses.replace() makes a shallow copy, so updating ``config.env`` in
    # place would write into the dict owned by the shared module-level entry in
    # ``trtllm_configs``. Build a fresh dict to keep that entry untouched.
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
484
485


486
# TODO: fold this one-off test into trtllm_configs as a regular parametrized scenario
487
488
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Run a single aggregated chat-completions test using Qwen 0.6B with the
    test logits processor enabled, and expect "Hello world" in the response.

    The launch directory, script, start delay and timeout are borrowed from
    the "aggregated" entry of ``trtllm_configs``; only the payload differs.
    """

    # Enable HelloWorld logits processor only for this test
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    base = trtllm_configs["aggregated"]
    config = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=base.directory,
        script_name=base.script_name,  # same launch script as "aggregated"
        marks=[],  # not used by this direct test
        request_payloads=[
            chat_payload_default(expected_response=["Hello world!"]),
        ],
        model="Qwen/Qwen3-0.6B",
        delayed_start=base.delayed_start,
        timeout=base.timeout,
    )

    # Per-test frontend port plus the model env vars, built as a fresh dict
    # rather than by mutating ``config.env`` after a shallow copy.
    config = dataclasses.replace(
        config,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **config.env,
            "MODEL_PATH": config.model,
            "SERVED_MODEL_NAME": config.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)