"tests/vscode:/vscode.git/clone" did not exist on "35055c6f7711ac2eefe62c3a4cfda4eb9149a4dd"
test_trtllm.py 21.9 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
9
10

import pytest

11
from tests.serve.common import (
12
    SERVE_TEST_DIR,
13
14
15
16
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
17
from tests.utils.constants import DefaultPort
18
from tests.utils.engine_process import EngineConfig
19
from tests.utils.payload_builder import (
20
21
    TEXT_PROMPT,
    chat_payload,
22
    chat_payload_default,
23
    completion_payload,
24
25
    completion_payload_default,
    metric_payload_default,
26
    multimodal_payload_default,
27
)
28
from tests.utils.payloads import ImageGenerationPayload, VideoGenerationPayload
29
30
31
32
33

logger = logging.getLogger(__name__)


@dataclass
class TRTLLMConfig(EngineConfig):
    """Engine configuration used by the TensorRT-LLM serve test scenarios.

    Inherits every field from ``EngineConfig`` and only overrides the
    default value of ``stragglers``.
    """

    # A factory is used so each instance gets its own list object.
    # NOTE(review): presumably names of leftover engine processes the harness
    # should clean up -- confirm against EngineConfig.
    stragglers: list[str] = field(
        default_factory=lambda: ["TRTLLM:EngineCore"],
    )
38

39

40
# Directory holding the TensorRT-LLM launch scripts. A non-empty TRTLLM_DIR
# environment variable wins; otherwise fall back to the in-workspace examples.
trtllm_dir = (
    os.getenv("TRTLLM_DIR")
    or os.path.join(WORKSPACE_DIR, "examples/backends/trtllm")
)
43

44
45
46
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
47
48
49
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
50
        directory=trtllm_dir,
51
        script_name="agg_metrics.sh",
52
        marks=[
53
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
54
55
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
56
57
58
59
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
60
61
62
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
63
        ],
64
        model="Qwen/Qwen3-0.6B",
65
        frontend_port=DefaultPort.FRONTEND.value,
66
        delayed_start=5,
67
68
69
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
70
            metric_payload_default(min_num_requests=6, backend="trtllm"),
71
        ],
72
    ),
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
    "aggregated_unified": TRTLLMConfig(
        name="aggregated_unified",
        directory=trtllm_dir,
        script_name="agg.sh",
        script_args=["--unified"],
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),
            pytest.mark.requested_trtllm_kv_tokens(2592),
            pytest.mark.timeout(300),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
94
95
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
96
        directory=trtllm_dir,
97
        script_name="disagg.sh",
98
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
99
        model="Qwen/Qwen3-0.6B",
100
        frontend_port=DefaultPort.FRONTEND.value,
101
102
103
104
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
105
    ),
106
107
108
109
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
110
        marks=[
111
112
113
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450"
            ),
114
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
115
116
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
117
118
119
120
121
            pytest.mark.profiled_vram_gib(6.6),  # actual nvidia-smi peak 6.6 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                512
            ),  # KV cache cap (2x safety over min=256)
            pytest.mark.timeout(432),  # ~6x profiled wall time 72s
122
        ],
123
        model="Qwen/Qwen3-0.6B",
124
        frontend_port=DefaultPort.FRONTEND.value,
125
126
        delayed_start=10,
        health_check_workers=True,
127
128
129
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
130
131
132
133
134
135
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
136
137
        ],
    ),
138
139
140
141
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
142
143
144
145
146
147
148
149
150
151
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
152
        model="Qwen/Qwen3-0.6B",
153
        frontend_port=DefaultPort.FRONTEND.value,
154
        delayed_start=5,
155
156
157
158
159
160
161
162
163
164
165
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
Dmitry Tokarev's avatar
Dmitry Tokarev committed
166
167
        marks=[
            pytest.mark.gpu_2,
168
            pytest.mark.pre_merge,
Dmitry Tokarev's avatar
Dmitry Tokarev committed
169
170
            pytest.mark.trtllm,
        ],
171
        model="Qwen/Qwen3-0.6B",
172
        frontend_port=DefaultPort.FRONTEND.value,
173
174
175
176
177
178
179
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
180
181
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
182
        directory=trtllm_dir,
183
        script_name="agg_router.sh",
184
185
186
187
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
188
189
190
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
191
        ],
192
        model="Qwen/Qwen3-0.6B",
193
        frontend_port=DefaultPort.FRONTEND.value,
194
195
196
197
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
198
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
199
200
201
202
                ]
            )
        ],
        env={
203
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
204
        },
205
206
207
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
208
        directory=trtllm_dir,
209
        script_name="disagg_router.sh",
210
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
211
        model="Qwen/Qwen3-0.6B",
212
        frontend_port=DefaultPort.FRONTEND.value,
213
214
215
216
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
217
    ),
218
219
220
221
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
222
223
224
225
226
227
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
228
        model="Qwen/Qwen2-VL-7B-Instruct",
229
        frontend_port=DefaultPort.FRONTEND.value,
230
231
232
233
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
234
235
236
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
237
        script_name="agg_multimodal_router.sh",
238
        marks=[
239
240
241
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2608"
            ),
242
243
244
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
245
            pytest.mark.pre_merge,
246
        ],
247
        model="Qwen/Qwen3-VL-2B-Instruct",
248
249
250
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
251
252
253
254
255
256
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
257
    ),
258
259
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
260
261
262
263
264
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
265
        directory=trtllm_dir,
266
        script_name="epd_multimodal_image_and_embeddings.sh",
267
        marks=[
268
            pytest.mark.gpu_1,
269
270
            pytest.mark.trtllm,
            pytest.mark.multimodal,
271
            pytest.mark.pre_merge,
272
        ],
273
        model="Qwen/Qwen3-VL-2B-Instruct",
274
        frontend_port=DefaultPort.FRONTEND.value,
275
        timeout=900,
276
        delayed_start=120,
277
278
279
280
281
282
283
284
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
285
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
286
287
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
288
    ),
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
315
316
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
317
    # (.safetensors file) are sent via file:// URL instead of a raw image URL.
318
319
320
321
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
322
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.safetensors
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
348
                image_url="file:///tmp/llava_embeddings.safetensors",
349
350
351
352
353
354
355
356
357
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
378
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
379
380
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
381
382
383
384
385
386
387
388
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
389
390
391
392
393
394
395
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
396
        delayed_start=5,
397
398
399
400
401
402
403
404
405
406
407
408
409
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
410
                timeout=300,
411
412
413
414
415
416
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
    # TensorRT-LLM image diffusion test using the FLUX.2-klein-4B model.
    # Validates the end-to-end image generation pipeline (frontend → worker → /v1/images/generations).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (256x256)
    # to fit within CI GPU memory constraints.
    "image_diffusion": TRTLLMConfig(
        name="image_diffusion",
        directory=trtllm_dir,
        script_name="agg_image_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "256",
            "--default-width",
            "256",
            "--default-num-images-per-prompt",
            "1",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 20.0 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(
                20.0
            ),  # actual nvidia-smi peak 20.0 GiB [gluo FIXME] re-profile, as a new model is now used
            pytest.mark.requested_trtllm_vram_gib(20.0),
            pytest.mark.timeout(
                600
            ),  # Image generation is slow even at small resolution
        ],
        model="black-forest-labs/FLUX.2-klein-4B",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            ImageGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "256x256",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
    # Aggregated multimodal with --frontend-decoding enabled.
    # Verifies image URL inference works when images are decoded by the Rust
    # MediaDecoder in the frontend instead of the Python backend.
    "aggregated_multimodal_frontend_decoding": TRTLLMConfig(
        name="aggregated_multimodal_frontend_decoding",
        directory=trtllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(900),
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "AGG_ENGINE_ARGS": "/workspace/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml",
            "DYN_TRTLLM_FRONTEND_DECODING": "true",
        },
    ),
505
506
507
508
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
509
510
511
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
512
            pytest.mark.post_merge,
513
            pytest.mark.skip(reason="DIS-1566"),
514
515
516
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
517
        ],
518
519
520
521
522
523
524
525
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
526
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
527
528
        ],
    ),
529
530
531
}


Alec's avatar
Alec committed
532
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Yield-free fixture returning the trtllm config selected by the current param.

    ``request.param`` is used as a key into the module-level ``trtllm_configs``
    mapping, so each parametrized run receives the matching config object.
    """
    scenario_key = request.param
    selected_config = trtllm_configs[scenario_key]
    return selected_config
536
537


538
@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.

    Args:
        trtllm_config_test: The parametrized trtllm config under test.
        request: The pytest request object, forwarded to ``run_serve_deployment``.
        runtime_services_dynamic_ports: Not referenced in the body; requested
            as a fixture only (presumably for its setup side effects).
        dynamo_dynamic_ports: Per-test port allocation; supplies the frontend
            port and is forwarded as ``ports=``.
        num_system_ports: Number of system ports requested (parametrized to 2).
        predownload_models: Not referenced in the body; requested as a fixture
            only (presumably so models are cached before launch).
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Use per-test ports so tests can run safely under pytest-xdist.
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    #
    # dataclasses.replace() makes a shallow copy, so the copy would share its
    # ``env`` dict with the module-level entry in ``trtllm_configs``. Build a
    # fresh merged dict instead of updating in place, so the shared config is
    # never mutated by a test run.
    config = dataclasses.replace(
        trtllm_config_test,
        frontend_port=dynamo_dynamic_ports.frontend_port,
        env={
            **trtllm_config_test.env,
            "MODEL_PATH": trtllm_config_test.model,
            "SERVED_MODEL_NAME": trtllm_config_test.model,
        },
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
567
568


569
# TODO: make this a regular test (presumably a parametrized `trtllm_configs` entry rather than a standalone function)
570
571
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Aggregated chat-completions smoke test with the test logits processor on.

    Launches Qwen 0.6B using the directory, launch script, start delay and
    timeout of the "aggregated" config, sends one default chat payload, and
    requires "Hello world!" to appear in the response.
    """
    # The HelloWorld logits processor is switched on for this test only;
    # monkeypatch restores the environment afterwards.
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    template = trtllm_configs["aggregated"]
    hello_world_chat = chat_payload_default(expected_response=["Hello world!"])
    chat_only = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=template.directory,
        script_name=template.script_name,  # same launch script as "aggregated"
        marks=[],  # marks come from the decorators above, not from the config
        request_payloads=[hello_world_chat],
        model="Qwen/Qwen3-0.6B",
        delayed_start=template.delayed_start,
        timeout=template.timeout,
    )

    # Swap in the per-test frontend port, then publish the model name via env.
    config = dataclasses.replace(
        chat_only,
        frontend_port=dynamo_dynamic_ports.frontend_port,
    )
    config.env.update(
        MODEL_PATH=config.model,
        SERVED_MODEL_NAME=config.model,
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)