test: add TensorRT-LLM multimodal EPD test for nightly CI (#6193)

Signed-off-by: Kavita Narang <knarang@nvidia.com>

test: add TensorRT-LLM multimodal EPD test for nightly CI (#6193)
Signed-off-by: Kavita Narang <knarang@nvidia.com>
da7d3e9e · knarangN · GitHub · f91b42b9 · da7d3e9e
Unverified commit da7d3e9e, authored Feb 20, 2026 by knarangN; committed by GitHub on Feb 20, 2026.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 22 additions and 6 deletions

tests/serve/test_trtllm.py +22 -6

No files found.
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -199,21 +199,37 @@ trtllm_configs = {
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
-    "epd_multimodal_image_and_embeddings": TRTLLMConfig(
+    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for nightly CI
-        name="epd_multimodal_image_and_embeddings",
+    # Uses llava model with 2 GPUs (encode shares GPU with prefill)
+    #
+    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
+    #       once CI supports gpu_8 runners and launch scripts are available
+    "epd_multimodal": TRTLLMConfig(
+        name="epd_multimodal",
        directory=trtllm_dir,
-        script_name="epd_multimodal_image_and_embeddings.sh",
+        script_name="epd_multimodal_image.sh",
        marks=[
-            pytest.mark.gpu_4,
+            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
-        timeout=1200,
+        timeout=900,
        delayed_start=120,
-        request_payloads=[multimodal_payload_default()],
+        request_payloads=[
+            multimodal_payload_default(
+                text="Describe what you see in this image.",
+                expected_response=["mountain", "rock", "trees", "road"],
+            )
+        ],
+        env={
+            # Override GPU assignments to fit on 2 GPUs (encode shares with prefill)
+            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
+            "DECODE_CUDA_VISIBLE_DEVICES": "1",
+            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
+        },
    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",