Unverified Commit 77cecf4e authored by Kris Hung's avatar Kris Hung Committed by GitHub
Browse files

fix: Fix sglang multimodal test (#3862)


Signed-off-by: krishung5 <krish@nvidia.com>
parent cbe0b177
...@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do ...@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
MODEL_NAME=$2 MODEL_NAME=$2
shift 2 shift 2
;; ;;
--served-model-name)
SERVED_MODEL_NAME=$2
shift 2
;;
--chat-template) --chat-template)
PROVIDED_CHAT_TEMPLATE=$2 PROVIDED_CHAT_TEMPLATE=$2
shift 2 shift 2
...@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do ...@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "Options:" echo "Options:"
echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)" echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
echo " --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
echo " --chat-template <template> Specify the SGLang chat template to use (default: $CHAT_TEMPLATE)" echo " --chat-template <template> Specify the SGLang chat template to use (default: $CHAT_TEMPLATE)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
exit 0 exit 0
...@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then ...@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
CHAT_TEMPLATE="$PROVIDED_CHAT_TEMPLATE" CHAT_TEMPLATE="$PROVIDED_CHAT_TEMPLATE"
fi fi
# Get the directory where this script is located # Prepare served-model-name argument if provided
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SERVED_MODEL_ARG=""
SGLANG_BACKEND_DIR="$SCRIPT_DIR/src" if [[ -n "$SERVED_MODEL_NAME" ]]; then
SERVED_MODEL_ARG="--served-model-name $SERVED_MODEL_NAME"
fi
# run ingress # run ingress
python3 -m dynamo.frontend --http-port=8000 & python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run SGLang multimodal processor # run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" & python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
# run SGLang multimodal encode worker # run SGLang multimodal encode worker
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
# run SGLang multimodal inference worker # run SGLang multimodal inference worker
# TODO: Remove disable-radix-cache once the issue is fixed. # TODO: Remove disable-radix-cache once the issue is fixed.
...@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod ...@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--multimodal-worker \ --multimodal-worker \
--model-path "$MODEL_NAME" \ --model-path "$MODEL_NAME" \
$SERVED_MODEL_ARG \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
......
...@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do ...@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
MODEL_NAME=$2 MODEL_NAME=$2
shift 2 shift 2
;; ;;
--served-model-name)
SERVED_MODEL_NAME=$2
shift 2
;;
--chat-template) --chat-template)
PROVIDED_CHAT_TEMPLATE=$2 PROVIDED_CHAT_TEMPLATE=$2
shift 2 shift 2
...@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do ...@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "Options:" echo "Options:"
echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)" echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
echo " --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
echo " --chat-template <template> Specify the SGLang chat template to use (default: $CHAT_TEMPLATE)" echo " --chat-template <template> Specify the SGLang chat template to use (default: $CHAT_TEMPLATE)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
exit 0 exit 0
...@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then ...@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
CHAT_TEMPLATE="$PROVIDED_CHAT_TEMPLATE" CHAT_TEMPLATE="$PROVIDED_CHAT_TEMPLATE"
fi fi
# Get the directory where this script is located # Prepare served-model-name argument if provided
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SERVED_MODEL_ARG=""
SGLANG_BACKEND_DIR="$SCRIPT_DIR/src" if [[ -n "$SERVED_MODEL_NAME" ]]; then
SERVED_MODEL_ARG="--served-model-name $SERVED_MODEL_NAME"
fi
# run ingress # run ingress
python3 -m dynamo.frontend --http-port=8000 & python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run SGLang multimodal processor # run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" & python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
# run SGLang multimodal encode worker # run SGLang multimodal encode worker
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
# run SGLang multimodal prefill worker # run SGLang multimodal prefill worker
# TODO: Remove disable-radix-cache once the issue is fixed. # TODO: Remove disable-radix-cache once the issue is fixed.
...@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod ...@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--multimodal-worker \ --multimodal-worker \
--model-path "$MODEL_NAME" \ --model-path "$MODEL_NAME" \
$SERVED_MODEL_ARG \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
...@@ -83,6 +90,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ ...@@ -83,6 +90,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--multimodal-worker \ --multimodal-worker \
--model-path "$MODEL_NAME" \ --model-path "$MODEL_NAME" \
$SERVED_MODEL_ARG \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import logging import logging
from pathlib import Path
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import torch import torch
...@@ -15,6 +16,75 @@ class SupportedModels: ...@@ -15,6 +16,75 @@ class SupportedModels:
QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct" QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct"
def normalize_model_name(model_name: str) -> str:
    """
    Extract and normalize a model name from various identifier formats.

    Args:
        model_name: Model identifier which can be:
            - A simple model name: "Qwen/Qwen2.5-VL-7B-Instruct"
            - An absolute HuggingFace cache path:
              "/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/..."
            - A local path to a model directory

    Returns:
        For a simple name, the input unchanged. For a cache path, the name in
        "organization/model-name" form. For an existing local directory, the
        directory's final component. Otherwise the input unchanged.

    Examples:
        >>> normalize_model_name("Qwen/Qwen2.5-VL-7B-Instruct")
        'Qwen/Qwen2.5-VL-7B-Instruct'
        >>> normalize_model_name(
        ...     "/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/abc"
        ... )
        'Qwen/Qwen2.5-VL-7B-Instruct'
    """
    # A relative string containing a slash is treated as an "org/model"
    # identifier and returned untouched.
    if "/" in model_name and not model_name.startswith("/"):
        return model_name

    # HuggingFace cache layout: .../models--ORG--MODEL-NAME/...
    if "models--" in model_name:
        # Isolate the single directory component that follows "models--"
        # BEFORE splitting on "--". Splitting first would let a "--" that
        # appears in a later path component (e.g. a sub-directory under
        # snapshots/) be mistaken for the org/model separator.
        cache_component = model_name.split("models--", 1)[1].split("/", 1)[0]
        segments = cache_component.split("--")
        if len(segments) >= 2:
            # Last segment is the model name; everything before it is the
            # org, rejoined with the same "--" separator it was split on.
            org = "--".join(segments[:-1])
            model = segments[-1]
            return f"{org}/{model}"

    # Existing local directory: use its final path component as the name.
    path = Path(model_name)
    if path.exists() and path.is_dir():
        return path.name

    # No pattern matched; hand back the caller's string unchanged.
    return model_name
def is_model_supported(model_name: str, supported_model: str) -> bool:
    """
    Report whether two model identifiers refer to the same model.

    Both identifiers are passed through ``normalize_model_name`` and
    lower-cased before being compared, so the comparison is case-insensitive
    and tolerant of whatever formats that normalizer understands.

    Args:
        model_name: The identifier to check (may be a path, cache name, etc.)
        supported_model: The supported model identifier to compare against

    Returns:
        True when the normalized, lower-cased identifiers are equal,
        False otherwise
    """
    candidate, reference = (
        normalize_model_name(identifier).lower()
        for identifier in (model_name, supported_model)
    )
    return candidate == reference
def get_qwen_image_features( def get_qwen_image_features(
vision_encoder: torch.nn.Module, image_embeds: Dict[str, Any] vision_encoder: torch.nn.Module, image_embeds: Dict[str, Any]
) -> torch.Tensor: ) -> torch.Tensor:
...@@ -71,11 +141,15 @@ def encode_image_embeddings( ...@@ -71,11 +141,15 @@ def encode_image_embeddings(
""" """
with torch.no_grad(): with torch.no_grad():
# Route through the correct encoder based on model # Route through the correct encoder based on model
if model_name == SupportedModels.QWEN_2_5_VL_7B: if is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B):
embeddings = get_qwen_image_features(vision_encoder, image_embeds) embeddings = get_qwen_image_features(vision_encoder, image_embeds)
else: else:
raise NotImplementedError(f"Model not supported: {model_name}") # Provide more helpful error message with normalized model name
normalized_name = normalize_model_name(model_name)
raise NotImplementedError(
f"Model not supported: {normalized_name} (original: {model_name})"
)
# Normalize output shape # Normalize output shape
if isinstance(embeddings, (tuple, list)): if isinstance(embeddings, (tuple, list)):
......
...@@ -49,6 +49,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler): ...@@ -49,6 +49,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
super().__init__(component, engine=None, config=config) super().__init__(component, engine=None, config=config)
self.pd_worker_client = pd_worker_client self.pd_worker_client = pd_worker_client
self.model = config.server_args.model_path self.model = config.server_args.model_path
self.served_model_name = config.server_args.served_model_name
self.image_loader = ImageLoader(cache_size=CACHE_SIZE_MAXIMUM) self.image_loader = ImageLoader(cache_size=CACHE_SIZE_MAXIMUM)
...@@ -124,7 +125,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler): ...@@ -124,7 +125,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
image_embeds = self.image_processor(images=image, return_tensors="pt") image_embeds = self.image_processor(images=image, return_tensors="pt")
precomputed_embeddings = encode_image_embeddings( precomputed_embeddings = encode_image_embeddings(
model_name=self.model, model_name=self.served_model_name,
image_embeds=image_embeds, image_embeds=image_embeds,
vision_encoder=self.vision_model, vision_encoder=self.vision_model,
projector=None, projector=None,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment