Unverified commit a8e5328e, authored by Indrajit Bhosale and committed by GitHub
Browse files

ci: TRT-LLM multimodal CI (#4118)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent 0b8b7ffb
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Configuration. Each setting keeps any non-empty value inherited from the
# caller's environment and otherwise falls back to the default given here.
: "${DYNAMO_HOME:=/workspace}"
: "${MODEL_PATH:=Qwen/Qwen2-VL-7B-Instruct}"
: "${SERVED_MODEL_NAME:=Qwen/Qwen2-VL-7B-Instruct}"
# The engine-config defaults are built from DYNAMO_HOME, so it is resolved first.
: "${PREFILL_ENGINE_ARGS:=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml}"
: "${DECODE_ENGINE_ARGS:=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml}"
: "${PREFILL_CUDA_VISIBLE_DEVICES:=0}"
: "${DECODE_CUDA_VISIBLE_DEVICES:=1}"
: "${MODALITY:=multimodal}"
export DYNAMO_HOME MODEL_PATH SERVED_MODEL_NAME \
    PREFILL_ENGINE_ARGS DECODE_ENGINE_ARGS \
    PREFILL_CUDA_VISIBLE_DEVICES DECODE_CUDA_VISIBLE_DEVICES \
    MODALITY

# Stop the two background processes (frontend and prefill worker) and reap them.
# Failures are ignored so the handler still completes if a process has already
# exited or was never started.
cleanup() {
    echo "Cleaning up background processes..."
    # PIDs are intentionally left unquoted: an unset PID simply drops out of
    # the argument list instead of being passed as an empty string.
    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM

# Frontend: serves HTTP on port 8000, runs in the background.
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# Prefill worker: background process restricted to PREFILL_CUDA_VISIBLE_DEVICES.
CUDA_VISIBLE_DEVICES="$PREFILL_CUDA_VISIBLE_DEVICES" python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$PREFILL_ENGINE_ARGS" \
    --modality "$MODALITY" \
    --disaggregation-mode prefill &
PREFILL_PID=$!

# Decode worker: restricted to DECODE_CUDA_VISIBLE_DEVICES and run in the
# foreground, so the script stays alive until it exits; the EXIT trap then
# tears down the background processes.
CUDA_VISIBLE_DEVICES="$DECODE_CUDA_VISIBLE_DEVICES" python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$DECODE_ENGINE_ARGS" \
    --modality "$MODALITY" \
    --disaggregation-mode decode
...@@ -196,6 +196,7 @@ markers = [ ...@@ -196,6 +196,7 @@ markers = [
"trtllm: marks tests as requiring trtllm", "trtllm: marks tests as requiring trtllm",
"trtllm_marker: marks tests as requiring trtllm", "trtllm_marker: marks tests as requiring trtllm",
"sglang: marks tests as requiring sglang", "sglang: marks tests as requiring sglang",
"multimodal: marks tests as multimodal (image/video) tests",
"slow: marks tests as known to be slow", "slow: marks tests as known to be slow",
"h100: marks tests to run on H100", "h100: marks tests to run on H100",
"kvbm: marks tests for KV behavior and model determinism", "kvbm: marks tests for KV behavior and model determinism",
......
...@@ -68,7 +68,7 @@ Markers are required for all tests. They are used for test selection in CI and l ...@@ -68,7 +68,7 @@ Markers are required for all tests. They are used for test selection in CI and l
| Category | Marker(s) | Description | | Category | Marker(s) | Description |
|-------------------------|--------------------------|------------------------------------| |-------------------------|--------------------------|------------------------------------|
| Lifecycle [required] | pre_merge, post_merge, nightly, weekly, release | When the test should run | | Lifecycle [required] | pre_merge, post_merge, nightly, weekly, release | When the test should run |
| Test Type [required] | unit, integration, e2e, benchmark, stress | Nature of the test | | Test Type [required] | unit, integration, e2e, benchmark, stress, multimodal | Nature of the test |
| Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required | | Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required |
| Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity | | Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity |
| Other | slow, skip, xfail | Special handling | | Other | slow, skip, xfail | Special handling |
......
...@@ -17,6 +17,7 @@ from tests.utils.payload_builder import ( ...@@ -17,6 +17,7 @@ from tests.utils.payload_builder import (
chat_payload_default, chat_payload_default,
completion_payload_default, completion_payload_default,
metric_payload_default, metric_payload_default,
multimodal_payload_default,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -105,6 +106,17 @@ trtllm_configs = { ...@@ -105,6 +106,17 @@ trtllm_configs = {
completion_payload_default(), completion_payload_default(),
], ],
), ),
"disaggregated_multimodal": TRTLLMConfig(
name="disaggregated_multimodal",
directory=trtllm_dir,
script_name="disagg_multimodal.sh",
marks=[pytest.mark.gpu_2, pytest.mark.trtllm_marker, pytest.mark.multimodal],
model="Qwen/Qwen2-VL-7B-Instruct",
models_port=8000,
timeout=900,
delayed_start=60,
request_payloads=[multimodal_payload_default()],
),
} }
......
...@@ -66,6 +66,48 @@ def completion_payload_default( ...@@ -66,6 +66,48 @@ def completion_payload_default(
) )
def multimodal_payload_default(
    image_url: str = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
    text: str = "Describe the image",
    repeat_count: int = 1,
    expected_response: Optional[List[str]] = None,
    expected_log: Optional[List[str]] = None,
    max_tokens: int = 160,
    temperature: Optional[float] = None,
    stream: bool = False,
) -> ChatPayload:
    """Build the default chat payload that pairs a text prompt with one image.

    The message content is a two-part list: a ``text`` part followed by an
    ``image_url`` part. Everything else is forwarded unchanged to
    ``chat_payload``.

    Args:
        image_url: URL of the image attached to the request.
        text: Prompt text sent alongside the image.
        repeat_count: Forwarded to ``chat_payload`` as ``repeat_count``.
        expected_response: Strings expected in the response. When ``None``
            or empty, ``["image"]`` is used.
        expected_log: Patterns expected in the logs. When ``None`` or empty,
            an empty list is used.
        max_tokens: Forwarded to ``chat_payload`` as ``max_tokens``.
        temperature: Forwarded to ``chat_payload`` as ``temperature``.
        stream: Forwarded to ``chat_payload`` as ``stream``.

    Returns:
        The ``ChatPayload`` produced by ``chat_payload`` for this request.
    """
    # Fallbacks use truthiness, so an explicitly empty list is treated the
    # same as None.
    if not expected_response:
        expected_response = ["image"]
    if not expected_log:
        expected_log = []

    text_part = {"type": "text", "text": text}
    image_part = {"type": "image_url", "image_url": {"url": image_url}}

    return chat_payload(
        content=[text_part, image_part],
        repeat_count=repeat_count,
        expected_response=expected_response,
        expected_log=expected_log,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=stream,
    )
def metric_payload_default( def metric_payload_default(
min_num_requests: int, min_num_requests: int,
repeat_count: int = 1, repeat_count: int = 1,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment