# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
from typing import Any
9
10
11

import pytest

12
13
14
15
16
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
17
from tests.utils.constants import DefaultPort
18
from tests.utils.engine_process import EngineConfig
19
from tests.utils.payload_builder import (
20
21
    TEXT_PROMPT,
    chat_payload,
22
    chat_payload_default,
23
    completion_payload,
24
25
    completion_payload_default,
    metric_payload_default,
26
    multimodal_payload_default,
27
)
28
from tests.utils.payloads import BasePayload
29
30
31
32

logger = logging.getLogger(__name__)


33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
@dataclass
class VideoGenerationPayload(BasePayload):
    """Payload for /v1/videos endpoint (TRT-LLM video diffusion)."""

    # Route the request is sent to.
    endpoint: str = "/v1/videos"
    # Request timeout; presumably seconds -- confirm against BasePayload.
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Check a /v1/videos response and return a token describing the video.

        Args:
            response: HTTP response object exposing ``raise_for_status()`` and
                ``json()``.

        Returns:
            The ``url`` of the first ``data`` entry when one is provided,
            otherwise the fixed marker ``"b64_video_returned"`` when the entry
            carries a non-empty ``b64_json`` instead.

        Raises:
            AssertionError: If the status is not ``"completed"``, ``data`` is
                missing or empty, or the first entry has neither a usable
                ``url`` nor a ``b64_json`` value.
        """
        response.raise_for_status()
        result = response.json()
        assert result.get("status") == "completed", (
            f"Video generation not completed. Status: {result.get('status')}, "
            f"Error: {result.get('error', 'none')}"
        )
        assert (
            "data" in result
        ), f"Missing 'data' in response. Keys: {list(result.keys())}"
        # Truthiness (rather than len()) also turns a null "data" value into a
        # readable assertion failure instead of a TypeError.
        assert result["data"], "Empty data in video response"
        entry = result["data"][0]
        # A null "url" means "not provided"; fall through to the b64_json
        # check instead of rejecting an otherwise valid base64 response.
        url = entry.get("url")
        if url is not None:
            assert url, "Video response url is empty"
            return url
        assert entry.get("b64_json"), "Video response b64_json is empty"
        return "b64_video_returned"

    def validate(self, response: Any, content: str) -> None:
        """Assert that ``content`` (the value from ``response_handler``) is non-empty."""
        assert content, "Video response content is empty"


62
@dataclass
class TRTLLMConfig(EngineConfig):
    """Engine configuration used by the TRT-LLM serve test scenarios."""

    # Defaults to the TRT-LLM engine-core process name. How the base class
    # consumes this list is not visible here -- presumably leftover-process
    # cleanup; confirm against EngineConfig.
    stragglers: list[str] = field(
        default_factory=lambda: ["TRTLLM:EngineCore"],
    )
67

68

69
# Directory holding the TRT-LLM launch scripts. A non-empty TRTLLM_DIR
# environment variable wins; otherwise fall back to the in-repo examples.
trtllm_dir = os.environ.get("TRTLLM_DIR")
if not trtllm_dir:
    trtllm_dir = os.path.join(WORKSPACE_DIR, "examples/backends/trtllm")
72

73
74
75
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time

# Model identifiers shared by several scenarios below.
_QWEN3_TEXT_MODEL = "Qwen/Qwen3-0.6B"
_QWEN3_VL_MODEL = "Qwen/Qwen3-VL-2B-Instruct"
_DEEPSEEK_BASE_MODEL = "deepseek-ai/deepseek-llm-7b-base"


def _chat_and_completion_payloads():
    """Return a fresh [default chat, default completion] payload list."""
    return [chat_payload_default(), completion_payload_default()]


def _logprobs_chat_payloads():
    """Return fresh chat payloads covering the logprobs/top_logprobs combinations."""
    combinations = ((True, 5), (False, 5), (True, None), (True, 0))
    return [
        chat_payload(content=TEXT_PROMPT, logprobs=enabled, top_logprobs=top_k)
        for enabled, top_k in combinations
    ]


def _describe_image_payloads():
    """Return a fresh single-item list with the image-description payload."""
    return [
        multimodal_payload_default(
            text="Describe what you see in this image.",
            expected_response=["mountain", "rock", "trees", "road"],
        )
    ]


trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            # 3x measured time (44.66s) + download time (150s)
            pytest.mark.timeout(300),
        ],
        model=_QWEN3_TEXT_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            *_chat_and_completion_payloads(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model=_QWEN3_TEXT_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=_chat_and_completion_payloads(),
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.skip(reason="unstable"),
            # 3x measured time (103.66s) + download time (150s)
            pytest.mark.timeout(480),
        ],
        model=_QWEN3_TEXT_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            *_chat_and_completion_payloads(),
            # One metrics check per system port.
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
        model=_QWEN3_TEXT_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=_logprobs_chat_payloads(),
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.pre_merge, pytest.mark.trtllm],
        model=_QWEN3_TEXT_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=_logprobs_chat_payloads(),
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            # 3x measured time (37.91s) + download time (180s)
            pytest.mark.timeout(300),
        ],
        model=_QWEN3_TEXT_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        # Log filter for the publisher/selector targets named in the
        # expected_log patterns above.
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model=_QWEN3_TEXT_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=_chat_and_completion_payloads(),
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model=_QWEN3_VL_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=_describe_image_payloads(),
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model=_QWEN3_VL_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=_describe_image_payloads(),
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model=_QWEN3_VL_MODEL,
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=_describe_image_payloads(),
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Video generation is slow even at small resolution
            pytest.mark.timeout(600),
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=60,  # Model loading takes time
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            # 3x measured time (83.85s) + download time (210s) for 7B model
            pytest.mark.timeout(480),
        ],
        model=_DEEPSEEK_BASE_MODEL,
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": _DEEPSEEK_BASE_MODEL,
            "SERVED_MODEL_NAME": _DEEPSEEK_BASE_MODEL,
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


Alec's avatar
Alec committed
376
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Yield the trtllm scenario config selected by the current fixture param.

    ``request.param`` is used as a key into ``trtllm_configs``.
    """
    scenario_key = request.param
    return trtllm_configs[scenario_key]
380
381


382
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    The scenario config supplied by ``trtllm_config_test`` is copied with the
    per-test frontend port, its env is extended with the model settings, and
    the result is handed to ``run_serve_deployment``.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # dataclasses.replace() is a shallow copy: without an explicit env copy
    # the update below would write into the dict owned by the shared
    # module-level config.
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env=dict(trtllm_config_test.env),
    )
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    config.env.update(
        {
            "MODEL_PATH": config.model,
            "SERVED_MODEL_NAME": config.model,
        }
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
411
412


413
# TODO make this a normal guy
414
415
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Aggregated chat-completions run on Qwen 0.6B with the test logits
    processor switched on; the single chat payload expects "Hello world!"
    in the response.
    """

    # The HelloWorld logits processor is opt-in via this env var; monkeypatch
    # keeps the setting scoped to this test.
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    aggregated = trtllm_configs["aggregated"]
    hello_world_payload = chat_payload_default(expected_response=["Hello world!"])
    chat_only = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=aggregated.directory,
        script_name=aggregated.script_name,  # same launch script as "aggregated"
        marks=[],  # not used by this direct test
        request_payloads=[hello_world_payload],
        model="Qwen/Qwen3-0.6B",
        delayed_start=aggregated.delayed_start,
        timeout=aggregated.timeout,
    )

    # Swap in the per-test frontend port, then add the model settings to env.
    config = dataclasses.replace(
        chat_only, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    config.env["MODEL_PATH"] = config.model
    config.env["SERVED_MODEL_NAME"] = config.model
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)