Unverified commit a8e5328e, authored by Indrajit Bhosale, committed by GitHub
Browse files

ci: TRT-LLM multimodal CI (#4118)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent 0b8b7ffb
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launches a disaggregated TRT-LLM multimodal deployment: the Dynamo frontend,
# one prefill worker and one decode worker. Every setting below can be
# overridden through the environment variable of the same name.

# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"}
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export MODALITY=${MODALITY:-"multimodal"}

# PIDs of the launched processes. Initialised empty so cleanup is safe even
# when it fires before every process has been started.
DYNAMO_PID=""
PREFILL_PID=""
DECODE_PID=""

# Setup cleanup trap
cleanup() {
    # Disarm the traps first: a signal handler is followed by the EXIT trap,
    # and cleanup must not run twice.
    trap - EXIT INT TERM
    echo "Cleaning up background processes..."
    # PIDs are deliberately unquoted so that empty ones expand to nothing.
    kill $DYNAMO_PID $PREFILL_PID $DECODE_PID 2>/dev/null || true
    wait $DYNAMO_PID $PREFILL_PID $DECODE_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT
# Exit with the conventional 128+signal status after cleaning up.
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM

# run frontend
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$PREFILL_ENGINE_ARGS" \
    --modality "$MODALITY" \
    --disaggregation-mode prefill &
PREFILL_PID=$!

# run decode worker
# Started in the background and waited on: bash defers trapped signals while a
# foreground command is running, but the `wait` builtin returns immediately
# when a trapped signal arrives, so cleanup runs promptly on INT/TERM.
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$DECODE_ENGINE_ARGS" \
    --modality "$MODALITY" \
    --disaggregation-mode decode &
DECODE_PID=$!

# Block until the decode worker exits; its exit status becomes the script's.
wait "$DECODE_PID"
......@@ -196,6 +196,7 @@ markers = [
"trtllm: marks tests as requiring trtllm",
"trtllm_marker: marks tests as requiring trtllm",
"sglang: marks tests as requiring sglang",
"multimodal: marks tests as multimodal (image/video) tests",
"slow: marks tests as known to be slow",
"h100: marks tests to run on H100",
"kvbm: marks tests for KV behavior and model determinism",
......
......@@ -68,7 +68,7 @@ Markers are required for all tests. They are used for test selection in CI and l
| Category | Marker(s) | Description |
|-------------------------|--------------------------|------------------------------------|
| Lifecycle [required] | pre_merge, post_merge, nightly, weekly, release | When the test should run |
| Test Type [required] | unit, integration, e2e, benchmark, stress | Nature of the test |
| Test Type [required] | unit, integration, e2e, benchmark, stress, multimodal | Nature of the test |
| Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required |
| Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity |
| Other | slow, skip, xfail | Special handling |
......
......@@ -17,6 +17,7 @@ from tests.utils.payload_builder import (
chat_payload_default,
completion_payload_default,
metric_payload_default,
multimodal_payload_default,
)
logger = logging.getLogger(__name__)
......@@ -105,6 +106,17 @@ trtllm_configs = {
completion_payload_default(),
],
),
"disaggregated_multimodal": TRTLLMConfig(
name="disaggregated_multimodal",
directory=trtllm_dir,
script_name="disagg_multimodal.sh",
marks=[pytest.mark.gpu_2, pytest.mark.trtllm_marker, pytest.mark.multimodal],
model="Qwen/Qwen2-VL-7B-Instruct",
models_port=8000,
timeout=900,
delayed_start=60,
request_payloads=[multimodal_payload_default()],
),
}
......
......@@ -66,6 +66,48 @@ def completion_payload_default(
)
def multimodal_payload_default(
    image_url: str = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
    text: str = "Describe the image",
    repeat_count: int = 1,
    expected_response: Optional[List[str]] = None,
    expected_log: Optional[List[str]] = None,
    max_tokens: int = 160,
    temperature: Optional[float] = None,
    stream: bool = False,
) -> ChatPayload:
    """Create a multimodal chat payload with image and text content.

    The message content is a two-part list: a ``text`` part carrying the
    prompt followed by an ``image_url`` part pointing at the image.

    Args:
        image_url: URL of the image to include in the request.
        text: Text prompt to accompany the image.
        repeat_count: Number of times to repeat the request.
        expected_response: List of strings expected in the response.
            ``None`` selects the default ``["image"]``; an explicitly passed
            list (including an empty one) is forwarded unchanged.
        expected_log: List of regex patterns expected in logs. ``None``
            selects an empty list.
        max_tokens: Maximum tokens to generate.
        temperature: Sampling temperature (optional).
        stream: Whether to stream the response.

    Returns:
        ChatPayload configured for multimodal requests.
    """
    # Compare against None rather than relying on truthiness, so a caller
    # that passes [] on purpose is not silently handed the default.
    if expected_response is None:
        expected_response = ["image"]
    if expected_log is None:
        expected_log = []

    return chat_payload(
        content=[
            {"type": "text", "text": text},
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            },
        ],
        repeat_count=repeat_count,
        expected_response=expected_response,
        expected_log=expected_log,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=stream,
    )
def metric_payload_default(
min_num_requests: int,
repeat_count: int = 1,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment