ci: TRT-LLM multimodal CI (#4118)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

ci: TRT-LLM multimodal CI (#4118)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
a8e5328e · Indrajit Bhosale · GitHub · 0b8b7ffb · a8e5328e · a8e5328e
Unverified commit a8e5328e, authored Nov 19, 2025 by Indrajit Bhosale and committed by GitHub on Nov 19, 2025
5 changed files
--- a/examples/backends/trtllm/launch/disagg_multimodal.sh
+++ b/examples/backends/trtllm/launch/disagg_multimodal.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Environment variables with defaults.
+# Each setting keeps a non-empty value already set by the caller and
+# otherwise falls back to the default shown here.
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
+# Engine config files handed to each worker through --extra-engine-args.
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"}
+# Used as CUDA_VISIBLE_DEVICES for the prefill and decode worker commands below.
+export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
+export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
+export MODALITY=${MODALITY:-"multimodal"}
+
+# Set up the cleanup trap.
+# cleanup signals the two background processes (frontend and prefill worker)
+# and then waits for them. Output and failures from kill/wait are suppressed,
+# so cleanup itself always completes.
+cleanup() {
+    echo "Cleaning up background processes..."
+    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
+    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
+    echo "Cleanup complete."
+}
+# Run cleanup when the script exits and when it receives SIGINT or SIGTERM.
+trap cleanup EXIT INT TERM
+
+
+# Run the frontend in the background on HTTP port 8000 and record its PID
+# so cleanup can stop it.
+python3 -m dynamo.frontend --http-port 8000 &
+DYNAMO_PID=$!
+
+# Run the prefill worker in the background and record its PID so cleanup
+# can stop it.
+CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args  "$PREFILL_ENGINE_ARGS" \
+  --modality "$MODALITY" \
+  --disaggregation-mode prefill &
+PREFILL_PID=$!
+
+# Run the decode worker in the foreground; the script blocks on this command
+# until the worker exits.
+CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args  "$DECODE_ENGINE_ARGS" \
+  --modality "$MODALITY" \
+  --disaggregation-mode decode
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -196,6 +196,7 @@ markers = [
    "trtllm: marks tests as requiring trtllm",
    "trtllm_marker: marks tests as requiring trtllm",
    "sglang: marks tests as requiring sglang",
+    "multimodal: marks tests as multimodal (image/video) tests",
    "slow: marks tests as known to be slow",
    "h100: marks tests to run on H100",
    "kvbm: marks tests for KV behavior and model determinism",

--- a/tests/README.md
+++ b/tests/README.md
@@ -68,7 +68,7 @@ Markers are required for all tests. They are used for test selection in CI and l
 | Category                | Marker(s)                | Description                        |
 |-------------------------|--------------------------|------------------------------------|
 | Lifecycle [required]    | pre_merge, post_merge, nightly,  weekly, release   | When the test should run           |
-| Test Type [required]    | unit, integration, e2e, benchmark, stress   | Nature of the test                 |
+| Test Type [required]    | unit, integration, e2e, benchmark, stress, multimodal   | Nature of the test                 |
 | Hardware [required]     | gpu_0, gpu_1, gpu_2,  gpu_4, gpu_8, h100      | Number/type of GPUs required       |
 | Component/Framework     | vllm, trtllm, sglang, kvbm, planner, router    | Backend or component specificity   |
 | Other                   | slow, skip, xfail        | Special handling                   |

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -17,6 +17,7 @@ from tests.utils.payload_builder import (
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
+    multimodal_payload_default,
 )

 logger = logging.getLogger(__name__)
@@ -105,6 +106,17 @@ trtllm_configs = {
            completion_payload_default(),
        ],
    ),
+    "disaggregated_multimodal": TRTLLMConfig(
+        name="disaggregated_multimodal",
+        directory=trtllm_dir,
+        script_name="disagg_multimodal.sh",
+        marks=[pytest.mark.gpu_2, pytest.mark.trtllm_marker, pytest.mark.multimodal],
+        model="Qwen/Qwen2-VL-7B-Instruct",
+        models_port=8000,
+        timeout=900,
+        delayed_start=60,
+        request_payloads=[multimodal_payload_default()],
+    ),
 }



--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -66,6 +66,48 @@ def completion_payload_default(
    )


+def multimodal_payload_default(
+    image_url: str = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+    text: str = "Describe the image",
+    repeat_count: int = 1,
+    expected_response: Optional[List[str]] = None,
+    expected_log: Optional[List[str]] = None,
+    max_tokens: int = 160,
+    temperature: Optional[float] = None,
+    stream: bool = False,
+) -> ChatPayload:
+    """Create a multimodal chat payload with image and text content.
+
+    Args:
+        image_url: URL of the image to include in the request
+        text: Text prompt to accompany the image
+        repeat_count: Number of times to repeat the request
+        expected_response: List of strings expected in the response
+        expected_log: List of regex patterns expected in logs
+        max_tokens: Maximum tokens to generate
+        temperature: Sampling temperature (optional)
+        stream: Whether to stream the response
+
+    Returns:
+        ChatPayload configured for multimodal requests
+    """
+    return chat_payload(
+        content=[
+            {"type": "text", "text": text},
+            {
+                "type": "image_url",
+                "image_url": {"url": image_url},
+            },
+        ],
+        repeat_count=repeat_count,
+        expected_response=expected_response or ["image"],
+        expected_log=expected_log or [],
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stream=stream,
+    )
+
+
 def metric_payload_default(
    min_num_requests: int,
    repeat_count: int = 1,