ci(trtllm): fix aggregated_multimodal_router test (#7460)

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>

ci(trtllm): fix aggregated_multimodal_router test (#7460)
Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
22fb3398 · KrishnanPrash · GitHub · 45be2fdc · 22fb3398 · 22fb3398
Unverified commit 22fb3398 — authored Mar 17, 2026 by KrishnanPrash; committed by GitHub on Mar 17, 2026.
4 changed files
--- a/examples/backends/trtllm/launch/agg_multimodal_router.sh
+++ b/examples/backends/trtllm/launch/agg_multimodal_router.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Launch script for Aggregated Multimodal with MM Router Worker
+#
+# Architecture:
+#   Frontend  -->  MM Router Worker  -->  TRT-LLM Worker
+#                  (KV-aware routing)     (aggregated multimodal)
+#
+# The MM Router Worker sits between frontend and TRT-LLM, computing
+# mm_hash for images and routing to the best worker based on KV cache overlap.
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-VL-2B-Instruct"}
+export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml"}
+export MODALITY=${MODALITY:-"multimodal"}
+export MODEL_TYPE=${MODEL_TYPE:-"qwen3_vl"}
+export BLOCK_SIZE=${BLOCK_SIZE:-32}
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --multimodal "Launching Aggregated Multimodal + MM Router" "$MODEL_PATH" "$HTTP_PORT"
+# TRT-LLM worker: "__internal" suffix hides it from frontend discovery.
+python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "${SERVED_MODEL_NAME}__internal" \
+  --extra-engine-args "$AGG_ENGINE_ARGS" \
+  --modality "$MODALITY" \
+  --publish-events-and-metrics \
+  --kv-block-size "$BLOCK_SIZE" &
+# MM Router Worker: registers with the real model name; does KV-aware routing internally.
+(cd "$DYNAMO_HOME" && python3 -m examples.backends.trtllm.mm_router_worker \
+  --model "$MODEL_PATH" \
+  --model-type "$MODEL_TYPE" \
+  --namespace dynamo \
+  --component mm_router \
+  --endpoint generate \
+  --downstream-component tensorrt_llm \
+  --downstream-endpoint generate \
+  --block-size "$BLOCK_SIZE") &
+# Frontend: round-robin to mm_router (KV routing happens inside mm_router, not here).
+python3 -m dynamo.frontend --router-mode round-robin &
+wait_any_exit
--- a/examples/backends/trtllm/mm_router_worker/mm_processor.py
+++ b/examples/backends/trtllm/mm_router_worker/mm_processor.py
@@ -231,11 +231,11 @@ def _compute_tokens_per_image(
    processor_output: dict, processor: Any, model_type: str
 ) -> list[int]:
    """Compute the number of visual tokens for each image from processor output."""
-    if model_type == "qwen2_vl":
+    if model_type in ("qwen2_vl", "qwen3_vl"):
        grid_thw = processor_output.get("image_grid_thw")
        if grid_thw is None:
            raise ValueError(
-                "image_grid_thw not found in processor output for Qwen2-VL"
+                f"image_grid_thw not found in processor output for {model_type}"
            )
        merge_size = getattr(processor.image_processor, "merge_size", 2)
@@ -254,8 +254,14 @@ def _get_replacement_id(model_path: str) -> int:
    try:
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        replacement_id = config.vocab_size + 1
+        # Some models (e.g. Qwen3-VL) store vocab_size in text_config, not top-level.
-        logger.info(f"Got vocab_size={config.vocab_size} from AutoConfig")
+        vocab_size = getattr(config, "vocab_size", None)
+        if vocab_size is None and hasattr(config, "text_config"):
+            vocab_size = getattr(config.text_config, "vocab_size", None)
+        if vocab_size is None:
+            raise AttributeError("vocab_size not found in config or config.text_config")
+        replacement_id = vocab_size + 1
+        logger.info(f"Got vocab_size={vocab_size} from AutoConfig")
        return replacement_id
    except Exception as e:
        raise RuntimeError(

--- a/tests/mm_router/test_mm_router_e2e.py
+++ b/tests/mm_router/test_mm_router_e2e.py
@@ -30,11 +30,11 @@ from tests.utils.managed_process import ManagedProcess
 from tests.utils.payloads import check_models_api
 from tests.utils.port_utils import allocate_ports
-TRTLLM_MM_MODEL = "Qwen/Qwen2-VL-2B-Instruct"
+TRTLLM_MM_MODEL = "Qwen/Qwen3-VL-2B-Instruct"
-TRTLLM_MM_MODEL_TYPE = "qwen2_vl"
+TRTLLM_MM_MODEL_TYPE = "qwen3_vl"
 BLOCK_SIZE = 32
 NAMESPACE = "test-mm"
-# Broad guardrails for TRT-LLM + Qwen2-VL-2B under block size 32.
+# Broad guardrails for TRT-LLM + Qwen3-VL-2B under block size 32.
 THREE_IMAGE_TOTAL_BLOCKS_RANGE = (80, 520)
 SINGLE_IMAGE_TOTAL_BLOCKS_RANGE = (20, 260)

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -190,18 +190,23 @@ trtllm_configs = {
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
-        script_name="agg_multimodal.sh",
+        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
-            pytest.mark.nightly,
+            pytest.mark.pre_merge,
        ],
-        model="Qwen/Qwen2-VL-7B-Instruct",
+        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
-        request_payloads=[multimodal_payload_default()],
+        request_payloads=[
+            multimodal_payload_default(
+                text="Describe what you see in this image.",
+                expected_response=["mountain", "rock", "trees", "road"],
+            )
+        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)