Unverified commit ddbb4f50, authored by Kris Hung, committed by GitHub
Browse files

chore: Clean up for sglang multimodal example (#3420)


Signed-off-by: krishung5 <krish@nvidia.com>
parent 73b0cdb4
......@@ -12,7 +12,7 @@ cleanup() {
trap cleanup EXIT INT TERM
# run clear_namespace
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
python3 -m dynamo.sglang.clear_namespace --namespace dynamo
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
......
......@@ -12,7 +12,7 @@ cleanup() {
trap cleanup EXIT INT TERM
# run clear_namespace
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
python3 -m dynamo.sglang.clear_namespace --namespace dynamo
# run ingress
python -m dynamo.frontend --router-mode kv --http-port=8000 &
......
......@@ -12,7 +12,7 @@ cleanup() {
trap cleanup EXIT INT TERM
# run clear_namespace
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
python3 -m dynamo.sglang.clear_namespace --namespace dynamo
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
......
......@@ -12,7 +12,7 @@ cleanup() {
trap cleanup EXIT INT TERM
# run clear_namespace
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
python3 -m dynamo.sglang.clear_namespace --namespace dynamo
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap
# Trap handler: signal the background processes whose PIDs were recorded in
# DYNAMO_PID / PREFILL_PID, then wait for them to exit.
# NOTE(review): only DYNAMO_PID is assigned in the visible part of this script;
# confirm where (or whether) PREFILL_PID is set.
cleanup() {
    echo "Cleaning up background processes..."
    # The PID variables are unquoted, so an unset or empty one expands to no
    # argument at all. stderr is discarded, and `|| true` keeps `set -e` from
    # aborting the handler when kill fails (e.g. the process is already gone).
    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    # Reap the signalled processes; a failing wait is ignored the same way.
    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
......@@ -45,8 +52,12 @@ fi
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SGLANG_BACKEND_DIR="$SCRIPT_DIR/src"
# run clear_namespace
python3 -m dynamo.sglang.clear_namespace --namespace dynamo
# run ingress
python -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$!
# run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
......@@ -55,6 +66,8 @@ python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --cha
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
# run SGLang multimodal inference worker
# TODO: Remove disable-radix-cache once the issue is fixed.
# See https://github.com/sgl-project/sglang/pull/11203.
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--multimodal-worker \
--model-path "$MODEL_NAME" \
......@@ -62,6 +75,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--tp 1 \
--trust-remote-code \
--skip-tokenizer-init \
--disable-radix-cache \
--disaggregation-transfer-backend nixl &
# Wait for all background processes to complete
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap
# Trap handler: signal the background processes whose PIDs were recorded in
# DYNAMO_PID / PREFILL_PID, then wait for them to exit.
# NOTE(review): only DYNAMO_PID is assigned in the visible part of this script;
# confirm that PREFILL_PID is set when the prefill worker is launched.
cleanup() {
    echo "Cleaning up background processes..."
    # The PID variables are unquoted, so an unset or empty one expands to no
    # argument at all. stderr is discarded, and `|| true` keeps `set -e` from
    # aborting the handler when kill fails (e.g. the process is already gone).
    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    # Reap the signalled processes; a failing wait is ignored the same way.
    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
......@@ -45,8 +52,12 @@ fi
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SGLANG_BACKEND_DIR="$SCRIPT_DIR/src"
# run clear_namespace
python3 -m dynamo.sglang.clear_namespace --namespace dynamo
# run ingress
python -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$!
# run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
......@@ -55,6 +66,8 @@ python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" --cha
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --model-path "$MODEL_NAME" --chat-template "$CHAT_TEMPLATE" &
# run SGLang multimodal prefill worker
# TODO: Remove disable-radix-cache once the issue is fixed.
# See https://github.com/sgl-project/sglang/pull/11203.
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--multimodal-worker \
--model-path "$MODEL_NAME" \
......@@ -65,6 +78,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disaggregation-mode prefill \
--disaggregation-bootstrap-port 12345 \
--host 0.0.0.0 \
--disable-radix-cache \
--disaggregation-transfer-backend nixl &
# run SGLang multimodal decode worker
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.sglang.multimodal_utils.multimodal_chat_processor import (
multimodal_request_to_sglang,
process_sglang_stream_response,
)
from dynamo.sglang.multimodal_utils.multimodal_encode_utils import (
encode_image_embeddings,
)
from dynamo.sglang.multimodal_utils.multimodal_image_loader import ImageLoader
# Names exported by a star-import of this module; each one is imported at the
# top of this module from a `dynamo.sglang.multimodal_utils` submodule.
__all__ = [
    "multimodal_request_to_sglang",
    "process_sglang_stream_response",
    "encode_image_embeddings",
    "ImageLoader",
]
......@@ -8,32 +8,6 @@ from sglang.srt.parser.conversation import chat_templates
logger = logging.getLogger(__name__)
def clean_addcriterion(text: str) -> str:
    """
    Remove a leading ``addCriterion`` artifact from the text output.

    To prevent addCriterion from appearing in outputs, an assistant placeholder
    must be added to the conversation. However, adding the assistant placeholder
    causes subsequent requests to fail with shape mismatch errors on the engine
    side. The root cause is still under investigation, so this temporary
    workaround is in place to maintain functionality.

    Args:
        text: Text output to clean.

    Returns:
        ``text`` without a leading ``" addCriterion"`` or ``"addCriterion"`` and
        without the whitespace that followed it; ``text`` unchanged when it
        starts with neither prefix.
    """
    for prefix in (" addCriterion", "addCriterion"):
        if text.startswith(prefix):
            # Derive the cut position from the prefix itself rather than a
            # hand-counted offset, so the two can never drift apart.
            cleaned_text = text[len(prefix):].lstrip()
            logger.debug(
                f"🛠️ HACK: Removed '{prefix}' prefix: '{text[:20]}...' -> '{cleaned_text[:20]}...'"
            )
            return cleaned_text
    return text
def multimodal_request_to_sglang(raw_request, tokenizer, chat_template):
conv = chat_templates[chat_template].copy()
conv.messages = []
......@@ -53,6 +27,7 @@ def multimodal_request_to_sglang(raw_request, tokenizer, chat_template):
elif msg.role == "assistant":
conv.append_message(conv.roles[1], msg.content)
conv.append_message(conv.roles[1], "")
logger.debug(f"conv: {conv}")
# Tokenize and prepare input_ids
......@@ -109,11 +84,11 @@ def detokenize_sglang_response(response_data, tokenizer):
# Ensure response_data is a dictionary
if not isinstance(response_data, dict):
return clean_addcriterion(str(response_data))
return str(response_data)
# Get text content - detokenize if needed
if "text" in response_data and response_data["text"]:
return clean_addcriterion(response_data["text"])
return response_data["text"]
elif "token_ids" in response_data and response_data["token_ids"]:
token_ids = response_data["token_ids"]
if isinstance(token_ids, list) and token_ids:
......@@ -122,7 +97,7 @@ def detokenize_sglang_response(response_data, tokenizer):
logger.debug(
f"Detokenized {len(token_ids)} tokens to: '{text_content}'"
)
return clean_addcriterion(text_content)
return text_content
# Return empty string if no content to detokenize
return ""
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Union
from typing import Any, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field
from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
import dynamo.nixl_connect as connect
TokenIdType = int
# ============================================================================
# Standard LLM Protocol Types
# ============================================================================
# TODO: move these to common for all LLMs once we adopt dynamo-run
# derived from lib/llm/src/protocols/common/preprocessor.rs
class StopConditions(BaseModel):
......@@ -59,3 +64,70 @@ class DisaggPreprocessedRequest(BaseModel):
request: Union[PreprocessedRequest, ChatCompletionRequest]
sampling_params: dict
data_parallel_rank: Optional[int] = None
# ============================================================================
# Multimodal Protocol Types
# ============================================================================
class TextContent(BaseModel):
    """Text entry of a chat message's content list."""

    # Discriminator; must be the literal "text".
    type: Literal["text"]
    # The text payload.
    text: str
class ImageURLDetail(BaseModel):
    """Holder for an image URL, used as the ``image_url`` field of ``ImageContent``."""

    # Location of the image, as a plain string (no validation is declared here).
    url: str
class ImageContent(BaseModel):
    """Image entry of a chat message's content list."""

    # Discriminator; must be the literal "image_url".
    type: Literal["image_url"]
    # Nested object carrying the image URL.
    image_url: ImageURLDetail
class VideoURLDetail(BaseModel):
    """Holder for a video URL, used as the ``video_url`` field of ``VideoContent``."""

    # Location of the video, as a plain string (no validation is declared here).
    url: str
class VideoContent(BaseModel):
    """Video entry of a chat message's content list."""

    # Discriminator; must be the literal "video_url".
    type: Literal["video_url"]
    # Nested object carrying the video URL.
    video_url: VideoURLDetail
MessageContent = Union[TextContent, ImageContent, VideoContent]
class ChatMessage(BaseModel):
    """One chat message: a role plus a list of content entries."""

    # Speaker of the message; restricted to these three values.
    role: Literal["user", "system", "assistant"]
    # Each entry is a TextContent, ImageContent or VideoContent (see MessageContent).
    content: List[MessageContent]
class MultiModalRequest(BaseModel):
    """Chat request carrying a model name, multimodal messages and optional settings."""

    # Lets pydantic accept field types it has no built-in validator for.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # Name of the model the request targets.
    model: str
    # Conversation messages, in the order given by the caller.
    messages: List[ChatMessage]
    # Optional; None when the caller did not supply a value.
    max_tokens: Optional[int] = None
    # Optional; None when the caller did not supply a value.
    temperature: Optional[float] = None
    # Defaults to False.
    stream: Optional[bool] = False
class MultiModalInput(BaseModel):
    """Optional image and/or video URL attached to a request; both default to None."""

    image_url: Optional[str] = None
    video_url: Optional[str] = None
class SglangMultimodalRequest(BaseModel):
    """A ``PreprocessedRequest`` bundled with its multimodal input and embedding metadata."""

    # Lets pydantic accept field types it has no built-in validator for.
    # NOTE(review): presumably needed for connect.RdmaMetadata; confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # The underlying preprocessed request.
    request: PreprocessedRequest
    # Defaults to an empty MultiModalInput (both URLs None) rather than None.
    multimodal_input: Optional[MultiModalInput] = Field(default_factory=MultiModalInput)
    # NOTE(review): element layout is not declared here (List[Any]);
    # presumably per-image (t, h, w) grid sizes. Confirm against the producer.
    image_grid_thw: Optional[List[Any]] = None
    # Shape of the embeddings as a 3- or 4-tuple of ints, when known.
    embeddings_shape: Optional[
        Union[Tuple[int, int, int], Tuple[int, int, int, int]]
    ] = None
    # RDMA metadata object from dynamo.nixl_connect.
    # NOTE(review): presumably describes how to fetch the embeddings; confirm.
    serialized_request: Optional[connect.RdmaMetadata] = None
class DisaggSglangMultimodalRequest(BaseModel):
    """A ``SglangMultimodalRequest`` plus sampling parameters and an optional data-parallel rank.

    Has the same field names as ``DisaggPreprocessedRequest`` in this module.
    """

    # The wrapped multimodal request.
    request: SglangMultimodalRequest
    # Free-form dict; its keys are not constrained by this model.
    sampling_params: dict
    # Optional; None when no rank is specified.
    data_parallel_rank: Optional[int] = None
......@@ -12,10 +12,9 @@ import dynamo.nixl_connect as connect
from dynamo._core import Client, Component
from dynamo.runtime import DistributedRuntime
from dynamo.sglang.args import Config
from dynamo.sglang.multimodal_utils import ImageLoader, encode_image_embeddings
from dynamo.sglang.protocol import SglangMultimodalRequest
from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
from dynamo.sglang.utils.multimodal_encode_utils import encode_image_embeddings
from dynamo.sglang.utils.multimodal_image_loader import ImageLoader
from dynamo.sglang.utils.multimodal_protocol import SglangMultimodalRequest
logger = logging.getLogger(__name__)
......
......@@ -11,16 +11,16 @@ from transformers import AutoTokenizer
from dynamo._core import Client, Component
from dynamo.sglang.args import Config
from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
from dynamo.sglang.utils.multimodal_chat_processor import (
from dynamo.sglang.multimodal_utils import (
multimodal_request_to_sglang,
process_sglang_stream_response,
)
from dynamo.sglang.utils.multimodal_protocol import (
from dynamo.sglang.protocol import (
MultiModalInput,
MultiModalRequest,
SglangMultimodalRequest,
)
from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
logger = logging.getLogger(__name__)
......
......@@ -12,11 +12,11 @@ import torch
import dynamo.nixl_connect as connect
from dynamo._core import Client, Component
from dynamo.sglang.args import Config, DisaggregationMode
from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
from dynamo.sglang.utils.multimodal_protocol import (
from dynamo.sglang.protocol import (
DisaggSglangMultimodalRequest,
SglangMultimodalRequest,
)
from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
logger = logging.getLogger(__name__)
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from typing import Any, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel, ConfigDict, Field
import dynamo.nixl_connect as connect
from dynamo.sglang.protocol import PreprocessedRequest
TokenIdType = int
class TextContent(BaseModel):
    """Text entry of a chat message's content list."""

    # Discriminator; must be the literal "text".
    type: Literal["text"]
    # The text payload.
    text: str
class ImageURLDetail(BaseModel):
    """Holder for an image URL, used as the ``image_url`` field of ``ImageContent``."""

    # Location of the image, as a plain string (no validation is declared here).
    url: str
class ImageContent(BaseModel):
    """Image entry of a chat message's content list."""

    # Discriminator; must be the literal "image_url".
    type: Literal["image_url"]
    # Nested object carrying the image URL.
    image_url: ImageURLDetail
class VideoURLDetail(BaseModel):
    """Holder for a video URL, used as the ``video_url`` field of ``VideoContent``."""

    # Location of the video, as a plain string (no validation is declared here).
    url: str
class VideoContent(BaseModel):
    """Video entry of a chat message's content list."""

    # Discriminator; must be the literal "video_url".
    type: Literal["video_url"]
    # Nested object carrying the video URL.
    video_url: VideoURLDetail
MessageContent = Union[TextContent, ImageContent, VideoContent]
class ChatMessage(BaseModel):
    """One chat message: a role plus a list of content entries."""

    # Speaker of the message; restricted to these three values.
    role: Literal["user", "system", "assistant"]
    # Each entry is a TextContent, ImageContent or VideoContent (see MessageContent).
    content: List[MessageContent]
class MultiModalRequest(BaseModel):
    """Chat request carrying a model name, multimodal messages and optional settings."""

    # Lets pydantic accept field types it has no built-in validator for.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # Name of the model the request targets.
    model: str
    # Conversation messages, in the order given by the caller.
    messages: List[ChatMessage]
    # Optional; None when the caller did not supply a value.
    max_tokens: Optional[int] = None
    # Optional; None when the caller did not supply a value.
    temperature: Optional[float] = None
    # Defaults to False.
    stream: Optional[bool] = False
class MultiModalInput(BaseModel):
    """Optional image and/or video URL attached to a request; both default to None."""

    image_url: Optional[str] = None
    video_url: Optional[str] = None
class SglangMultimodalRequest(BaseModel):
    """A ``PreprocessedRequest`` bundled with its multimodal input and embedding metadata."""

    # Lets pydantic accept field types it has no built-in validator for.
    # NOTE(review): presumably needed for connect.RdmaMetadata; confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # The underlying preprocessed request (type imported from dynamo.sglang.protocol).
    request: PreprocessedRequest
    # Defaults to an empty MultiModalInput (both URLs None) rather than None.
    multimodal_input: Optional[MultiModalInput] = Field(default_factory=MultiModalInput)
    # NOTE(review): element layout is not declared here (List[Any]);
    # presumably per-image (t, h, w) grid sizes. Confirm against the producer.
    image_grid_thw: Optional[List[Any]] = None
    # Shape of the embeddings as a 3- or 4-tuple of ints, when known.
    embeddings_shape: Optional[
        Union[Tuple[int, int, int], Tuple[int, int, int, int]]
    ] = None
    # RDMA metadata object from dynamo.nixl_connect.
    # NOTE(review): presumably describes how to fetch the embeddings; confirm.
    serialized_request: Optional[connect.RdmaMetadata] = None
class DisaggSglangMultimodalRequest(BaseModel):
    """A ``SglangMultimodalRequest`` plus sampling parameters and an optional data-parallel rank."""

    # The wrapped multimodal request.
    request: SglangMultimodalRequest
    # Free-form dict; its keys are not constrained by this model.
    sampling_params: dict
    # Optional; None when no rank is specified.
    data_parallel_rank: Optional[int] = None
......@@ -133,6 +133,7 @@ addopts = [
"--ignore-glob=*/llm/tensorrtllm*",
"--ignore-glob=docs/*",
"--ignore-glob=components/src/dynamo/sglang/request_handlers/*",
"--ignore-glob=components/src/dynamo/sglang/multimodal_utils/*",
"--ignore-glob=components/backends/sglang/slurm_jobs/*",
# FIXME: Get relative/generic blob paths to work here
]
......
......@@ -12,7 +12,7 @@ cleanup() {
trap cleanup EXIT INT TERM
# run clear_namespace
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
python3 -m dynamo.sglang.clear_namespace --namespace dynamo
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment