feat: add TRTLLM text-to-image support (#8200)

Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com> Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>

feat: add TRTLLM text-to-image support (#8200)
Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com> Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
1b0334c8 · GuanLuo · GitHub · 5135c321 · 1b0334c8
Unverified Commit 1b0334c8 authored Apr 22, 2026 by GuanLuo Committed by GitHub Apr 22, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 60 additions and 1 deletion

tests/serve/test_trtllm.py tests/serve/test_trtllm.py +60 -1

No files found.
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -25,7 +25,7 @@ from tests.utils.payload_builder import (
    metric_payload_default,
    multimodal_payload_default,
 )
-from tests.utils.payloads import VideoGenerationPayload
+from tests.utils.payloads import ImageGenerationPayload, VideoGenerationPayload

 logger = logging.getLogger(__name__)

@@ -414,6 +414,65 @@ trtllm_configs = {
            ),
        ],
    ),
+    # TensorRT-LLM image diffusion test using Flux.1-dev model.
+    # Validates the end-to-end image generation pipeline (frontend → worker → /v1/images/generations).
+    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
+    # --disable-torch-compile, and small default resolution (256x256)
+    # to fit within CI GPU memory constraints.
+    "image_diffusion": TRTLLMConfig(
+        name="image_diffusion",
+        directory=trtllm_dir,
+        script_name="agg_image_diffusion.sh",
+        script_args=[
+            "--skip-warmup",
+            "--disable-torch-compile",
+            "--default-height",
+            "256",
+            "--default-width",
+            "256",
+            "--default-num-images-per-prompt",
+            "1",
+        ],
+        marks=[
+            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 20.0 GiB
+            pytest.mark.trtllm,
+            pytest.mark.pre_merge,
+            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
+            # doesn't apply.  requested_trtllm_vram_gib maps to
+            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
+            # diffusion engine itself, but the parallel scheduler requires one
+            # of the KV/VRAM markers to accept the test.  We set it to the
+            # profiled peak so the scheduler's VRAM budget is accurate.
+            pytest.mark.profiled_vram_gib(
+                20.0
+            ),  # actual nvidia-smi peak 20.0 GiB [gluo FIXME] reprofil as new model is used
+            pytest.mark.requested_trtllm_vram_gib(20.0),
+            pytest.mark.timeout(
+                600
+            ),  # Image generation is slow even at small resolution
+        ],
+        model="black-forest-labs/FLUX.2-klein-4B",
+        frontend_port=DefaultPort.FRONTEND.value,
+        timeout=300,
+        delayed_start=5,
+        request_payloads=[
+            ImageGenerationPayload(
+                body={
+                    "prompt": "A golden retriever running on a beach",
+                    "size": "256x256",
+                    "response_format": "url",
+                    "nvext": {
+                        "num_inference_steps": 10,
+                        "guidance_scale": 5.0,
+                        "seed": 42,
+                    },
+                },
+                repeat_count=1,
+                expected_response=[],
+                expected_log=[],
+            ),
+        ],
+    ),
    # Aggregated multimodal with --frontend-decoding enabled.
    # Verifies image URL inference works when images are decoded by the Rust
    # MediaDecoder in the frontend instead of the Python backend.