test_trtllm.py 18.4 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
from typing import Any
9
10
11

import pytest

12
from tests.serve.common import (
13
    SERVE_TEST_DIR,
14
15
16
17
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
18
from tests.utils.constants import DefaultPort
19
from tests.utils.engine_process import EngineConfig
20
from tests.utils.payload_builder import (
21
22
    TEXT_PROMPT,
    chat_payload,
23
    chat_payload_default,
24
    completion_payload,
25
26
    completion_payload_default,
    metric_payload_default,
27
    multimodal_payload_default,
28
)
29
from tests.utils.payloads import BasePayload
30
31
32
33

logger = logging.getLogger(__name__)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@dataclass
class VideoGenerationPayload(BasePayload):
    """Request payload for the ``/v1/videos`` endpoint (TRT-LLM video diffusion).

    ``response_handler`` checks that the JSON body reports a completed
    generation with at least one ``data`` entry and returns either the
    entry's URL or a fixed marker string when the video came back as base64.
    """

    endpoint: str = "/v1/videos"
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Validate the video response and return its content marker.

        Returns the first entry's ``url`` when that key is present, otherwise
        the literal ``"b64_video_returned"`` once ``b64_json`` is confirmed
        to be non-empty. Raises whatever ``response.raise_for_status()``
        raises on an HTTP error, and ``AssertionError`` on a malformed body.
        """
        response.raise_for_status()
        body = response.json()

        status = body.get("status")
        assert status == "completed", (
            f"Video generation not completed. Status: {status}, "
            f"Error: {body.get('error', 'none')}"
        )
        assert "data" in body, f"Missing 'data' in response. Keys: {list(body.keys())}"

        videos = body["data"]
        assert len(videos) > 0, "Empty data in video response"

        # Only the first generated video is inspected.
        first = videos[0]
        if "url" not in first:
            assert first.get("b64_json"), "Video response b64_json is empty"
            return "b64_video_returned"

        video_url = first["url"]
        assert video_url, "Video response url is empty"
        return video_url

    def validate(self, response: Any, content: str) -> None:
        """Require that ``response_handler`` produced a non-empty value."""
        assert content, "Video response content is empty"


63
@dataclass
class TRTLLMConfig(EngineConfig):
    """Engine configuration for TensorRT-LLM serve test scenarios.

    Behaves like ``EngineConfig`` except that ``stragglers`` defaults to
    ``["TRTLLM:EngineCore"]``.
    """

    # NOTE(review): presumably the names of leftover engine processes the test
    # harness should clean up after a run -- confirm against EngineConfig.
    # A default_factory is used so every instance gets its own list.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
68

69

70
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
71
    WORKSPACE_DIR, "examples/backends/trtllm"
72
)
73

74
75
76
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
#
# Maps a scenario name to its TRTLLMConfig. The mapping is handed to
# params_with_model_mark() to parametrize the trtllm_config_test fixture, so
# test_deployment runs once per entry. test_deployment replaces frontend_port
# with a per-test port and sets MODEL_PATH / SERVED_MODEL_NAME to the entry's
# model before launching.
trtllm_configs = {
    # Also used as the base for
    # test_chat_only_aggregated_with_test_logits_processor.
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Currently skipped (reason: "unstable").
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.skip(reason="unstable"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (103.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Metrics are checked on both system ports.
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    # Chat requests covering combinations of logprobs / top_logprobs.
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    # Same logprobs request matrix as "aggregated_logprobs", run via disagg.sh.
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            # The expected log patterns depend on the DYN_LOG levels set in
            # env below (publisher=trace, selector=info).
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    # Currently skipped; see the linked issue in the skip reason.
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2608"
            ),
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        # All three worker roles are pinned to device 0.
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
    # (.pt tensor file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        # Unlike the other entries, the launch script lives in SERVE_TEST_DIR
        # rather than trtllm_dir.
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
                image_url="file:///tmp/llava_embeddings.pt",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            # Request size and frame count match the --default-* script args
            # above (480x272, 17 frames).
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    # Currently skipped (reason: "DIS-1566").
    # NOTE(review): unlike the other entries this one passes no frontend_port;
    # test_deployment overrides the port per test anyway.
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        # test_deployment sets these same two keys to config.model, i.e. the
        # same value as given here.
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


Alec's avatar
Alec committed
446
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Return the ``trtllm_configs`` entry selected by the current fixture param."""
    scenario_name = request.param
    return trtllm_configs[scenario_name]
450
451


452
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """Run one dynamo serve deployment for the parametrized TRT-LLM scenario.

    ``trtllm_config_test`` supplies the scenario's ``TRTLLMConfig``. A
    per-test copy is made with the dynamically allocated frontend port and
    with ``MODEL_PATH`` / ``SERVED_MODEL_NAME`` set to the scenario's model,
    then handed to ``run_serve_deployment``.

    ``runtime_services_dynamic_ports`` and ``predownload_models`` are not
    referenced in the body; they are requested only as fixtures.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # dataclasses.replace() is a shallow copy: without an explicit ``env`` the
    # copy would share its dict with the module-level entry in trtllm_configs,
    # and updating it would mutate that shared config. Build a fresh dict.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
481
482


483
# TODO: make this a regular parametrized scenario (an entry in trtllm_configs) instead of a standalone test
484
485
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Aggregated chat-completions run on Qwen 0.6B with the test logits
    processor switched on; the single chat request must produce
    "Hello world!" in its response.

    Directory, launch script, delayed_start and timeout are borrowed from
    the "aggregated" entry of ``trtllm_configs``.
    """

    # Opt in to the HelloWorld logits processor for this test only.
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    aggregated = trtllm_configs["aggregated"]
    hello_world_chat = chat_payload_default(expected_response=["Hello world!"])
    chat_only = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=aggregated.directory,
        script_name=aggregated.script_name,  # same launch script as "aggregated"
        marks=[],  # not used by this direct test
        request_payloads=[hello_world_chat],
        model="Qwen/Qwen3-0.6B",
        delayed_start=aggregated.delayed_start,
        timeout=aggregated.timeout,
    )

    # Swap in the per-test frontend port, then point the launch script at the model.
    config = dataclasses.replace(
        chat_only, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    config.env.update(MODEL_PATH=config.model, SERVED_MODEL_NAME=config.model)
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)