fix: Fix sglang multimodal test (#3862)

Signed-off-by: krishung5 <krish@nvidia.com>

fix: Fix sglang multimodal test (#3862)
Signed-off-by: krishung5 <krish@nvidia.com>
77cecf4e · Kris Hung · GitHub · cbe0b177 · 77cecf4e · 77cecf4e
Unverified Commit 77cecf4e authored Oct 24, 2025 by Kris Hung Committed by GitHub Oct 24, 2025
4 changed files
--- a/components/backends/sglang/launch/multimodal_agg.sh
+++ b/components/backends/sglang/launch/multimodal_agg.sh
@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
            MODEL_NAME=$2
            shift 2
            ;;
+        --served-model-name)
+            SERVED_MODEL_NAME=$2
+            shift 2
+            ;;
        --chat-template)
            PROVIDED_CHAT_TEMPLATE=$2
            shift 2
@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --model <model_name> Specify the model to use (default: $MODEL_NAME)"
+            echo "  --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
            echo "  --chat-template <template> Specify the SGLang chat template to use (default: $CHAT_TEMPLATE)"
            echo "  -h, --help           Show this help message"
            exit 0
@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
    CHAT_TEMPLATE="$PROVIDED_CHAT_TEMPLATE"
 fi
-# Get the directory where this script is located
+# Prepare served-model-name argument if provided
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVED_MODEL_ARG=""
-SGLANG_BACKEND_DIR="$SCRIPT_DIR/src"
+if [[ -n "$SERVED_MODEL_NAME" ]]; then
+    SERVED_MODEL_ARG="--served-model-name $SERVED_MODEL_NAME"
+fi
 # run ingress
 python3 -m dynamo.frontend --http-port=8000 &
 DYNAMO_PID=$!
 # run SGLang multimodal processor
-python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
+python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
 # run SGLang multimodal encode worker
-CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
 # run SGLang multimodal inference worker
 # TODO: Remove disable-radix-cache once the issue is fixed.
@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
  --multimodal-worker \
  --model-path "$MODEL_NAME" \
+  $SERVED_MODEL_ARG \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \

--- a/components/backends/sglang/launch/multimodal_disagg.sh
+++ b/components/backends/sglang/launch/multimodal_disagg.sh
@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
            MODEL_NAME=$2
            shift 2
            ;;
+        --served-model-name)
+            SERVED_MODEL_NAME=$2
+            shift 2
+            ;;
        --chat-template)
            PROVIDED_CHAT_TEMPLATE=$2
            shift 2
@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --model <model_name> Specify the model to use (default: $MODEL_NAME)"
+            echo "  --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
            echo "  --chat-template <template> Specify the SGLang chat template to use (default: $CHAT_TEMPLATE)"
            echo "  -h, --help           Show this help message"
            exit 0
@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
    CHAT_TEMPLATE="$PROVIDED_CHAT_TEMPLATE"
 fi
-# Get the directory where this script is located
+# Prepare served-model-name argument if provided
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVED_MODEL_ARG=""
-SGLANG_BACKEND_DIR="$SCRIPT_DIR/src"
+if [[ -n "$SERVED_MODEL_NAME" ]]; then
+    SERVED_MODEL_ARG="--served-model-name $SERVED_MODEL_NAME"
+fi
 # run ingress
 python3 -m dynamo.frontend --http-port=8000 &
 DYNAMO_PID=$!
 # run SGLang multimodal processor
-python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
+python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
 # run SGLang multimodal encode worker
-CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
 # run SGLang multimodal prefill worker
 # TODO: Remove disable-radix-cache once the issue is fixed.
@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
  --multimodal-worker \
  --model-path "$MODEL_NAME" \
+  $SERVED_MODEL_ARG \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
@@ -83,6 +90,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
 CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
  --multimodal-worker \
  --model-path "$MODEL_NAME" \
+  $SERVED_MODEL_ARG \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \

--- a/components/src/dynamo/sglang/multimodal_utils/multimodal_encode_utils.py
+++ b/components/src/dynamo/sglang/multimodal_utils/multimodal_encode_utils.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import logging
+from pathlib import Path
 from typing import Any, Dict, Optional
 import torch
@@ -15,6 +16,75 @@ class SupportedModels:
    QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct"
+def normalize_model_name(model_name: str) -> str:
+    """
+    Extract and normalize a model name from various formats, including HuggingFace cache paths.
+    Args:
+        model_name: Model identifier which can be:
+            - A simple model name: "Qwen/Qwen2.5-VL-7B-Instruct"
+            - A HuggingFace cache path (absolute or relative):
+              "/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/..."
+            - A local path to a model directory
+    Returns:
+        "organization/model-name" for org/model names and cache paths, the
+        directory name for an existing local directory, and the input
+        unchanged when no pattern matches.
+    Examples:
+        >>> normalize_model_name("Qwen/Qwen2.5-VL-7B-Instruct")
+        'Qwen/Qwen2.5-VL-7B-Instruct'
+        >>> normalize_model_name("/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/...")
+        'Qwen/Qwen2.5-VL-7B-Instruct'
+    """
+    # Handle HuggingFace cache paths first: a relative cache path also contains
+    # "/" without a leading "/", so it must not be mistaken for "org/model".
+    _, marker, after_marker = model_name.partition("models--")
+    if marker:
+        # Only the cache directory component encodes the repo id; drop anything
+        # after it (snapshots/, blobs/, ...) BEFORE splitting on "--" so that a
+        # "--" in a later path component cannot leak into the result.
+        repo_dir = after_marker.split("/", 1)[0]
+        # The last "--" separates org from model; earlier ones stay in the org.
+        org, separator, model = repo_dir.rpartition("--")
+        if separator and org and model:
+            return f"{org}/{model}"
+    # If it's already a simple model name (org/model format), return as-is
+    if "/" in model_name and not model_name.startswith("/"):
+        return model_name
+    # Handle local directory paths - extract the last directory name
+    path = Path(model_name)
+    if path.exists() and path.is_dir():
+        return path.name
+    # If no pattern matches, return the original name
+    return model_name
+def is_model_supported(model_name: str, supported_model: str) -> bool:
+    """
+    Tell whether a model identifier refers to a given supported model.
+    Both identifiers are passed through normalize_model_name and lower-cased
+    before being compared for equality.
+    Args:
+        model_name: The identifier to check (plain name, cache path, local path, ...)
+        supported_model: The supported model identifier to compare against
+    Returns:
+        True if the two normalized, lower-cased names are equal, False otherwise
+    """
+    candidate, reference = (
+        normalize_model_name(identifier).lower()
+        for identifier in (model_name, supported_model)
+    )
+    return candidate == reference
 def get_qwen_image_features(
    vision_encoder: torch.nn.Module, image_embeds: Dict[str, Any]
 ) -> torch.Tensor:
@@ -71,11 +141,15 @@ def encode_image_embeddings(
    """
    with torch.no_grad():
        # Route through the correct encoder based on model
-        if model_name == SupportedModels.QWEN_2_5_VL_7B:
+        if is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B):
            embeddings = get_qwen_image_features(vision_encoder, image_embeds)
        else:
-            raise NotImplementedError(f"Model not supported: {model_name}")
+            # Provide more helpful error message with normalized model name
+            normalized_name = normalize_model_name(model_name)
+            raise NotImplementedError(
+                f"Model not supported: {normalized_name} (original: {model_name})"
+            )
        # Normalize output shape
        if isinstance(embeddings, (tuple, list)):

--- a/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
@@ -49,6 +49,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
        super().__init__(component, engine=None, config=config)
        self.pd_worker_client = pd_worker_client
        self.model = config.server_args.model_path
+        self.served_model_name = config.server_args.served_model_name
        self.image_loader = ImageLoader(cache_size=CACHE_SIZE_MAXIMUM)
@@ -124,7 +125,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
            image_embeds = self.image_processor(images=image, return_tensors="pt")
            precomputed_embeddings = encode_image_embeddings(
-                model_name=self.model,
+                model_name=self.served_model_name,
                image_embeds=image_embeds,
                vision_encoder=self.vision_model,
                projector=None,