Unverified commit 22fb3398, authored by KrishnanPrash and committed by GitHub
Browse files

ci(trtllm): fix aggregated_multimodal_router test (#7460)


Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
parent 45be2fdc
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launch script: Aggregated Multimodal serving with an MM Router Worker.
#
# Request path:
#   Frontend --> MM Router Worker --> TRT-LLM Worker
#                (KV-aware routing)   (aggregated multimodal)
#
# The MM Router Worker sits between the frontend and TRT-LLM. It computes
# mm_hash for images and routes each request to the best worker based on
# KV cache overlap.
#
# Every setting below can be overridden from the caller's environment.

# Abort on the first failing foreground command; on any exit, signal the whole
# process group so the background services started below are taken down too.
set -e
trap 'echo Cleaning up...; kill 0' EXIT

# Resolve this script's real directory so the shared helpers are found
# regardless of the caller's working directory.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Presumably provides print_launch_banner and wait_any_exit used below.
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# ---- Configuration: keep caller-provided values, otherwise use defaults -----
# DYNAMO_HOME must be settled first: the AGG_ENGINE_ARGS default is built on it.
: "${DYNAMO_HOME:=/workspace}"
: "${MODEL_PATH:=Qwen/Qwen3-VL-2B-Instruct}"
: "${SERVED_MODEL_NAME:=Qwen/Qwen3-VL-2B-Instruct}"
: "${AGG_ENGINE_ARGS:=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml}"
: "${MODALITY:=multimodal}"
: "${MODEL_TYPE:=qwen3_vl}"
: "${BLOCK_SIZE:=32}"
export DYNAMO_HOME MODEL_PATH SERVED_MODEL_NAME AGG_ENGINE_ARGS MODALITY MODEL_TYPE BLOCK_SIZE

# Not exported; in this script it is only passed to the launch banner.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner --multimodal "Launching Aggregated Multimodal + MM Router" "$MODEL_PATH" "$HTTP_PORT"

# ---- TRT-LLM worker ---------------------------------------------------------
# The "__internal" suffix on the served model name hides this worker from
# frontend discovery.
trtllm_worker_args=(
    --model-path "$MODEL_PATH"
    --served-model-name "${SERVED_MODEL_NAME}__internal"
    --extra-engine-args "$AGG_ENGINE_ARGS"
    --modality "$MODALITY"
    --publish-events-and-metrics
    --kv-block-size "$BLOCK_SIZE"
)
python3 -m dynamo.trtllm "${trtllm_worker_args[@]}" &

# ---- MM Router Worker -------------------------------------------------------
# Registers with the real model name and does the KV-aware routing internally.
# It is started from DYNAMO_HOME inside a subshell, so the directory change
# does not affect the rest of this script.
mm_router_args=(
    --model "$MODEL_PATH"
    --model-type "$MODEL_TYPE"
    --namespace dynamo
    --component mm_router
    --endpoint generate
    --downstream-component tensorrt_llm
    --downstream-endpoint generate
    --block-size "$BLOCK_SIZE"
)
(
    cd "$DYNAMO_HOME" &&
        python3 -m examples.backends.trtllm.mm_router_worker "${mm_router_args[@]}"
) &

# ---- Frontend ---------------------------------------------------------------
# Plain round-robin to mm_router; KV routing happens inside mm_router, not here.
python3 -m dynamo.frontend --router-mode round-robin &

# Hand control to the shared helper that waits on the background services.
wait_any_exit
...@@ -231,11 +231,11 @@ def _compute_tokens_per_image( ...@@ -231,11 +231,11 @@ def _compute_tokens_per_image(
processor_output: dict, processor: Any, model_type: str processor_output: dict, processor: Any, model_type: str
) -> list[int]: ) -> list[int]:
"""Compute the number of visual tokens for each image from processor output.""" """Compute the number of visual tokens for each image from processor output."""
if model_type == "qwen2_vl": if model_type in ("qwen2_vl", "qwen3_vl"):
grid_thw = processor_output.get("image_grid_thw") grid_thw = processor_output.get("image_grid_thw")
if grid_thw is None: if grid_thw is None:
raise ValueError( raise ValueError(
"image_grid_thw not found in processor output for Qwen2-VL" f"image_grid_thw not found in processor output for {model_type}"
) )
merge_size = getattr(processor.image_processor, "merge_size", 2) merge_size = getattr(processor.image_processor, "merge_size", 2)
...@@ -254,8 +254,14 @@ def _get_replacement_id(model_path: str) -> int: ...@@ -254,8 +254,14 @@ def _get_replacement_id(model_path: str) -> int:
try: try:
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
replacement_id = config.vocab_size + 1 # Some models (e.g. Qwen3-VL) store vocab_size in text_config, not top-level.
logger.info(f"Got vocab_size={config.vocab_size} from AutoConfig") vocab_size = getattr(config, "vocab_size", None)
if vocab_size is None and hasattr(config, "text_config"):
vocab_size = getattr(config.text_config, "vocab_size", None)
if vocab_size is None:
raise AttributeError("vocab_size not found in config or config.text_config")
replacement_id = vocab_size + 1
logger.info(f"Got vocab_size={vocab_size} from AutoConfig")
return replacement_id return replacement_id
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(
......
...@@ -30,11 +30,11 @@ from tests.utils.managed_process import ManagedProcess ...@@ -30,11 +30,11 @@ from tests.utils.managed_process import ManagedProcess
from tests.utils.payloads import check_models_api from tests.utils.payloads import check_models_api
from tests.utils.port_utils import allocate_ports from tests.utils.port_utils import allocate_ports
TRTLLM_MM_MODEL = "Qwen/Qwen2-VL-2B-Instruct" TRTLLM_MM_MODEL = "Qwen/Qwen3-VL-2B-Instruct"
TRTLLM_MM_MODEL_TYPE = "qwen2_vl" TRTLLM_MM_MODEL_TYPE = "qwen3_vl"
BLOCK_SIZE = 32 BLOCK_SIZE = 32
NAMESPACE = "test-mm" NAMESPACE = "test-mm"
# Broad guardrails for TRT-LLM + Qwen2-VL-2B under block size 32. # Broad guardrails for TRT-LLM + Qwen3-VL-2B under block size 32.
THREE_IMAGE_TOTAL_BLOCKS_RANGE = (80, 520) THREE_IMAGE_TOTAL_BLOCKS_RANGE = (80, 520)
SINGLE_IMAGE_TOTAL_BLOCKS_RANGE = (20, 260) SINGLE_IMAGE_TOTAL_BLOCKS_RANGE = (20, 260)
......
...@@ -190,18 +190,23 @@ trtllm_configs = { ...@@ -190,18 +190,23 @@ trtllm_configs = {
"aggregated_multimodal_router": TRTLLMConfig( "aggregated_multimodal_router": TRTLLMConfig(
name="aggregated_multimodal_router", name="aggregated_multimodal_router",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg_multimodal.sh", script_name="agg_multimodal_router.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.trtllm, pytest.mark.trtllm,
pytest.mark.multimodal, pytest.mark.multimodal,
pytest.mark.nightly, pytest.mark.pre_merge,
], ],
model="Qwen/Qwen2-VL-7B-Instruct", model="Qwen/Qwen3-VL-2B-Instruct",
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
timeout=900, timeout=900,
delayed_start=60, delayed_start=60,
request_payloads=[multimodal_payload_default()], request_payloads=[
multimodal_payload_default(
text="Describe what you see in this image.",
expected_response=["mountain", "rock", "trees", "road"],
)
],
), ),
# TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
# Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU) # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment