test_trtllm.py 18.3 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
from typing import Any
9
10
11

import pytest

12
from tests.serve.common import (
13
    SERVE_TEST_DIR,
14
15
16
17
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
18
from tests.utils.constants import DefaultPort
19
from tests.utils.engine_process import EngineConfig
20
from tests.utils.payload_builder import (
21
22
    TEXT_PROMPT,
    chat_payload,
23
    chat_payload_default,
24
    completion_payload,
25
26
    completion_payload_default,
    metric_payload_default,
27
    multimodal_payload_default,
28
)
29
from tests.utils.payloads import BasePayload
30
31
32
33

logger = logging.getLogger(__name__)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@dataclass
class VideoGenerationPayload(BasePayload):
    """Request payload aimed at the ``/v1/videos`` endpoint (TRT-LLM video diffusion).

    Overrides the endpoint and timeout defaults and supplies response checks
    specific to the video-generation response body.
    """

    endpoint: str = "/v1/videos"
    timeout: int = 300  # presumably seconds — confirm against BasePayload

    def response_handler(self, response: Any) -> str:
        """Check a video-generation response and extract a content marker.

        Args:
            response: HTTP response object exposing ``raise_for_status()`` and
                ``json()``.

        Returns:
            The ``url`` of the first ``data`` entry when that key is present,
            otherwise the fixed marker ``"b64_video_returned"``.

        Raises:
            AssertionError: if the status is not ``"completed"``, ``data`` is
                missing or empty, or the first entry carries neither a
                non-empty ``url`` nor a non-empty ``b64_json``.
        """
        response.raise_for_status()
        body = response.json()

        status = body.get("status")
        assert status == "completed", (
            f"Video generation not completed. Status: {status}, "
            f"Error: {body.get('error', 'none')}"
        )

        assert "data" in body, f"Missing 'data' in response. Keys: {list(body.keys())}"
        videos = body["data"]
        assert len(videos) > 0, "Empty data in video response"

        # Only the first returned entry is inspected.
        first = videos[0]
        if "url" not in first:
            # No URL key at all: the video must have been returned inline.
            assert first.get("b64_json"), "Video response b64_json is empty"
            return "b64_video_returned"

        video_url = first["url"]
        assert video_url, "Video response url is empty"
        return video_url

    def validate(self, response: Any, content: str) -> None:
        """Require that ``response_handler`` produced non-empty content."""
        assert content, "Video response content is empty"


63
@dataclass
class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios"""

    # Defaults to a fresh single-element list per instance.
    # NOTE(review): presumably names of leftover processes the harness should
    # clean up after a run — confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
68

69

70
# Directory passed as `directory=` to most TRT-LLM configs in this module.
# The TRTLLM_DIR environment variable overrides it; because of the `or`, an
# unset *or empty* variable falls back to the in-repo examples directory.
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)
73

74
75
76
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
#
# Maps a scenario name to its TRTLLMConfig. Each entry's `name` matches its key.
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.skip(reason="unstable"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (103.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # One metrics check per system port.
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    # Logprobs scenarios: the same four chat requests (logprobs on/off combined
    # with top_logprobs of 5, None and 0) against aggregated and disaggregated
    # launch scripts.
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            # The expected log patterns rely on the DYN_LOG levels set in `env`.
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
    # (.pt tensor file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
                image_url="file:///tmp/llava_embeddings.pt",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    # Completions-only endpoint scenario; currently skipped (DIS-1566).
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


Alec's avatar
Alec committed
443
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Fixture that provides different trtllm test configurations.

    Parametrized via ``params_with_model_mark(trtllm_configs)``; each param is
    used as a key into ``trtllm_configs`` and the matching config object is
    returned. Note that this is the shared module-level instance, not a copy.
    """
    return trtllm_configs[request.param]
447
448


449
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    Runs once per entry provided by the ``trtllm_config_test`` fixture. The
    config is copied with a per-test frontend port and with ``MODEL_PATH`` /
    ``SERVED_MODEL_NAME`` set to the config's model, then handed to
    ``run_serve_deployment``.

    ``runtime_services_dynamic_ports`` and ``predownload_models`` are not used
    in the body; presumably they are requested for their setup side effects.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    #
    # dataclasses.replace() makes a shallow copy, so the copy would otherwise
    # share its `env` dict with the module-level config. Build a fresh dict
    # instead of update()-ing in place, so nothing leaks back into
    # `trtllm_configs` between parametrized runs.
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
478
479


480
# TODO make this a normal guy
481
482
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Run a single aggregated chat-completions test using Qwen 0.6B with the
    test logits processor enabled, and expect "Hello world!" in the response.

    The directory, launch script, ``delayed_start`` and ``timeout`` are taken
    from the "aggregated" entry of ``trtllm_configs``; only a single chat
    payload is sent.
    """

    # Enable HelloWorld logits processor only for this test
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    base = trtllm_configs["aggregated"]
    config = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=base.directory,
        script_name=base.script_name,  # agg_metrics.sh (from the "aggregated" config)
        marks=[],  # not used by this direct test
        request_payloads=[
            chat_payload_default(expected_response=["Hello world!"]),
        ],
        model="Qwen/Qwen3-0.6B",
        delayed_start=base.delayed_start,
        timeout=base.timeout,
        # Per-test frontend port, set directly instead of via a second
        # dataclasses.replace() pass over a config that was just built.
        frontend_port=dynamo_dynamic_ports.frontend_port,
    )
    # `config` is local to this test, so updating its env in place does not
    # touch the shared module-level configs.
    config.env.update(
        {
            "MODEL_PATH": config.model,
            "SERVED_MODEL_NAME": config.model,
        }
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)