test_trtllm.py 17.1 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
from typing import Any
9
10
11

import pytest

12
from tests.serve.common import (
13
    SERVE_TEST_DIR,
14
15
16
17
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
18
from tests.utils.constants import DefaultPort
19
from tests.utils.engine_process import EngineConfig
20
from tests.utils.payload_builder import (
21
22
    TEXT_PROMPT,
    chat_payload,
23
    chat_payload_default,
24
    completion_payload,
25
26
    completion_payload_default,
    metric_payload_default,
27
    multimodal_payload_default,
28
)
29
from tests.utils.payloads import BasePayload
30
31
32
33

logger = logging.getLogger(__name__)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@dataclass
class VideoGenerationPayload(BasePayload):
    """Request payload aimed at the TRT-LLM video diffusion endpoint (/v1/videos)."""

    endpoint: str = "/v1/videos"
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Check a /v1/videos response and extract a token describing the video.

        Args:
            response: HTTP response object exposing ``raise_for_status()`` and
                ``json()``.

        Returns:
            The ``url`` of the first ``data`` entry when that key is present,
            otherwise the fixed marker ``"b64_video_returned"`` once a
            non-empty ``b64_json`` value has been confirmed.

        Raises:
            AssertionError: if the status is not ``"completed"``, ``data`` is
                missing or empty, or the first entry carries neither a
                non-empty ``url`` nor a non-empty ``b64_json``.
        """
        response.raise_for_status()
        body = response.json()

        status = body.get("status")
        assert status == "completed", (
            f"Video generation not completed. Status: {status}, "
            f"Error: {body.get('error', 'none')}"
        )
        assert (
            "data" in body
        ), f"Missing 'data' in response. Keys: {list(body.keys())}"

        videos = body["data"]
        assert len(videos) > 0, "Empty data in video response"

        # Only the first generated entry is inspected.
        first = videos[0]
        if "url" not in first:
            # No URL key at all: the video must have been returned inline.
            assert first.get("b64_json"), "Video response b64_json is empty"
            return "b64_video_returned"

        assert first["url"], "Video response url is empty"
        return first["url"]

    def validate(self, response: Any, content: str) -> None:
        """Require that ``content`` is non-empty; ``response`` is not inspected."""
        assert content, "Video response content is empty"


63
@dataclass
class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios"""

    # Defaults to a one-element list containing "TRTLLM:EngineCore"; the
    # default_factory builds a new list for every instance.
    # NOTE(review): presumably names of leftover processes the test harness
    # should clean up — confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
68

69

70
# Directory passed as `directory` to most TRTLLMConfig entries in this module.
# A non-empty TRTLLM_DIR environment variable wins; because of the `or`, an
# unset *or empty* variable falls back to examples/backends/trtllm under
# WORKSPACE_DIR.
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)
73

74
75
76
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
77
78
79
# Registry of TRT-LLM serve test scenarios. Each key equals the `name` of the
# TRTLLMConfig it maps to; the fixture in this module looks entries up by key.
trtllm_configs = {
    # Aggregated deployment: chat + completion requests plus a metrics check.
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    # Disaggregated deployment on two GPUs: chat + completion requests only.
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Disaggregated deployment on a single GPU. Currently skipped ("unstable").
    # Metrics are checked on both the SYSTEM1 and SYSTEM2 ports.
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.skip(reason="unstable"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (103.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    # Chat logprobs matrix on the aggregated deployment: logprobs on/off and
    # top_logprobs of 5, None and 0.
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    # Same logprobs matrix as "aggregated_logprobs", on the disaggregated deployment.
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    # Aggregated deployment with the router; the chat payload carries
    # expected_log regexes for KV-event and worker-selection log lines.
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        # NOTE(review): presumably these log targets/levels are what make the
        # expected_log lines above appear — confirm against the router crates.
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    # Disaggregated deployment with the router (nightly only).
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Disaggregated multimodal deployment with a 7B vision-language model (nightly only).
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    # Aggregated multimodal deployment with the router, single GPU, pre-merge.
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        # All three worker roles are pinned to device 0.
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
    # (.pt tensor file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    #
    # Unlike the other entries, the launch script lives in SERVE_TEST_DIR
    # rather than trtllm_dir.
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
                image_url="file:///tmp/llava_embeddings.pt",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=60,  # Model loading takes time
        request_payloads=[
            # The request size and frame count match the script_args defaults above.
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    # Completions-only endpoint test with a 7B base model. Currently skipped
    # (DIS-1566). No frontend_port is set here; test_deployment overrides
    # frontend_port with a per-test port in any case.
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        # Same values test_deployment derives from `model` and writes into env.
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


Alec's avatar
Alec committed
420
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Yield one TRT-LLM scenario per parametrized run.

    ``request.param`` is used as a key into ``trtllm_configs``; the returned
    object is the registry entry itself, not a copy.
    """
    scenario_key = request.param
    return trtllm_configs[scenario_key]
424
425


426
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    Args:
        trtllm_config_test: Scenario config taken from ``trtllm_configs``.
        request: pytest request object, forwarded to ``run_serve_deployment``.
        runtime_services_dynamic_ports: Requested as a fixture only; not
            referenced in the body.
        dynamo_dynamic_ports: Per-test ports; supplies ``frontend_port`` and is
            forwarded as ``ports=``.
        num_system_ports: Number of system ports (parametrized to 2).
        predownload_models: Requested as a fixture only; not referenced in
            the body.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # dataclasses.replace() is a shallow copy: without an explicit `env` the
    # new config would share its env dict with the module-level entry in
    # trtllm_configs, and any later mutation would leak into other tests.
    # Build a fresh dict instead.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
455
456


457
# TODO: turn this one-off test into a regular scenario (presumably a trtllm_configs entry run by test_deployment).
458
459
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Send one chat-completions request to an aggregated Qwen 0.6B deployment
    with the test logits processor switched on, expecting "Hello world!" in
    the response.
    """

    # The test logits processor is enabled for this test only; monkeypatch
    # restores the environment afterwards.
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    # Borrow launch settings from the "aggregated" scenario (agg_metrics.sh).
    aggregated = trtllm_configs["aggregated"]
    hello_world_payloads = [
        chat_payload_default(expected_response=["Hello world!"]),
    ]
    static_config = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=aggregated.directory,
        script_name=aggregated.script_name,
        marks=[],  # marks come from the decorators on this test instead
        request_payloads=hello_world_payloads,
        model="Qwen/Qwen3-0.6B",
        delayed_start=aggregated.delayed_start,
        timeout=aggregated.timeout,
    )

    # Swap in the per-test frontend port, then add the model-related env vars.
    config = dataclasses.replace(
        static_config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    model_env = {
        "MODEL_PATH": config.model,
        "SERVED_MODEL_NAME": config.model,
    }
    config.env.update(model_env)

    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)