"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "51c103b2c8f69a99a147116753457808e457725b"
Unverified Commit b1c18bb1 authored by Zhuangcheng Gu's avatar Zhuangcheng Gu Committed by GitHub
Browse files

feat(sglang): add video input support for aggregated serving (#7941)


Signed-off-by: Chokoyo <40918450+Chokoyo@users.noreply.github.com>
parent 73f8557a
...@@ -18,6 +18,27 @@ from dynamo.sglang.publisher import DynamoSglangPublisher ...@@ -18,6 +18,27 @@ from dynamo.sglang.publisher import DynamoSglangPublisher
from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
def _extract_media_urls(mm_data: Dict[str, Any], media_key: str) -> list[str] | None:
"""Normalize multimodal URL items from the frontend wire format."""
items = mm_data.get(media_key)
if not items:
return None
urls: list[str] = []
for item in items:
if isinstance(item, str):
urls.append(item)
continue
if isinstance(item, dict):
url = item.get("Url")
if isinstance(url, str):
urls.append(url)
return urls or None
class DecodeWorkerHandler(BaseWorkerHandler): class DecodeWorkerHandler(BaseWorkerHandler):
"""Handler for decode workers in both aggregated and disaggregated serving modes.""" """Handler for decode workers in both aggregated and disaggregated serving modes."""
...@@ -157,18 +178,11 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -157,18 +178,11 @@ class DecodeWorkerHandler(BaseWorkerHandler):
async for out in self._process_text_stream(decode, context): async for out in self._process_text_stream(decode, context):
yield out yield out
else: else:
# Extract image URLs for multimodal requests. SGLang's mm_data_processor # Extract image/video URLs for multimodal requests. SGLang's mm_data_processor
# handles loading/preprocessing, and the scheduler does vision encoding. # handles loading/preprocessing, and the scheduler does vision encoding.
image_data: list[str] | None = None mm_data = request.get("multi_modal_data", {})
image_items = request.get("multi_modal_data", {}).get("image_url") image_data = _extract_media_urls(mm_data, "image_url")
if image_items: video_data = _extract_media_urls(mm_data, "video_url")
image_data = []
for item in image_items:
if isinstance(item, str):
image_data.append(item)
elif isinstance(item, dict) and "Url" in item:
image_data.append(item["Url"])
image_data = image_data or None
trace_header = build_trace_headers(context) if self.enable_trace else None trace_header = build_trace_headers(context) if self.enable_trace else None
...@@ -179,6 +193,7 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -179,6 +193,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
agg = await self.engine.async_generate( agg = await self.engine.async_generate(
**input_param, **input_param,
image_data=image_data, image_data=image_data,
video_data=video_data,
sampling_params=sampling_params, sampling_params=sampling_params,
stream=True, stream=True,
return_routed_experts=return_routed_experts, return_routed_experts=return_routed_experts,
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import pytest
from dynamo.sglang.request_handlers.llm.decode_handler import _extract_media_urls
# Module-level marks: pytest applies every entry in this list to each test in this file.
pytestmark = [
pytest.mark.unit,
pytest.mark.sglang,
pytest.mark.gpu_0,
pytest.mark.profiled_vram_gib(0),
pytest.mark.pre_merge,
]
def test_extract_media_urls_supports_string_and_wire_items():
    """Plain strings and ``{"Url": ...}`` items are kept in order; other dicts are dropped."""
    local_uri = "file:///tmp/test.mp4"
    remote_uri = "https://example.com/test.mp4"
    payload = {
        "video_url": [
            local_uri,
            {"Url": remote_uri},
            {"ignored": "value"},
        ]
    }

    extracted = _extract_media_urls(payload, "video_url")

    assert extracted == [local_uri, remote_uri]
def test_extract_media_urls_returns_none_for_missing_or_invalid_items():
    """A missing key and a list without any usable URL both yield ``None``."""
    absent = _extract_media_urls({}, "image_url")
    assert absent is None

    unusable = _extract_media_urls({"image_url": [{"ignored": "value"}]}, "image_url")
    assert unusable is None
...@@ -44,7 +44,7 @@ Dynamo provides support for improving latency and throughput for vision-and-lang ...@@ -44,7 +44,7 @@ Dynamo provides support for improving latency and throughput for vision-and-lang
|-------|-------|-------|-------| |-------|-------|-------|-------|
| **[vLLM](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-vllm.md)** | ✅ | 🧪 | 🧪 | | **[vLLM](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-vllm.md)** | ✅ | 🧪 | 🧪 |
| **[TRT-LLM](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-trtllm.md)** | ✅ | ❌ | ❌ | | **[TRT-LLM](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-trtllm.md)** | ✅ | ❌ | ❌ |
| **[SGLang](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-sglang.md)** | ✅ | | ❌ | | **[SGLang](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-sglang.md)** | ✅ | 🧪 | ❌ |
**Status:** ✅ Supported | 🧪 Experimental | ❌ Not supported **Status:** ✅ Supported | 🧪 Experimental | ❌ Not supported
......
...@@ -12,7 +12,7 @@ This document provides a comprehensive guide for multimodal inference using SGLa ...@@ -12,7 +12,7 @@ This document provides a comprehensive guide for multimodal inference using SGLa
|----------|--------------|------------|---------------|-------| |----------|--------------|------------|---------------|-------|
| **Image** | HTTP/HTTPS URL | Yes | Yes | Vision encoder generates embeddings | | **Image** | HTTP/HTTPS URL | Yes | Yes | Vision encoder generates embeddings |
| **Image** | Data URL (Base64) | No | No | | | **Image** | Data URL (Base64) | No | No | |
| **Video** | HTTP/HTTPS URL | No | No | | | **Video** | HTTP/HTTPS/`file://` URL | Yes | No | Aggregated only |
| **Audio** | HTTP/HTTPS URL | No | No | | | **Audio** | HTTP/HTTPS URL | No | No | |
### Supported URL Formats ### Supported URL Formats
...@@ -20,6 +20,7 @@ This document provides a comprehensive guide for multimodal inference using SGLa ...@@ -20,6 +20,7 @@ This document provides a comprehensive guide for multimodal inference using SGLa
| Format | Example | Description | | Format | Example | Description |
|--------|---------|-------------| |--------|---------|-------------|
| **HTTP/HTTPS** | `http://example.com/image.jpg` | Remote media files | | **HTTP/HTTPS** | `http://example.com/image.jpg` | Remote media files |
| **file://** | `file:///tmp/test.mp4` | Local files accessible to the backend |
## Deployment Patterns ## Deployment Patterns
...@@ -68,19 +69,19 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) ...@@ -68,19 +69,19 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
### Workflow ### Workflow
The `DecodeWorkerHandler` receives multimodal requests with image URLs and passes them directly to SGLang's engine. SGLang's internal `mm_data_processor` handles image fetching, loading, encoding, and token expansion. The `DecodeWorkerHandler` receives multimodal requests with image/video URLs and passes them directly to SGLang's engine. SGLang's internal `mm_data_processor` handles image/video fetching, loading, encoding, and token expansion.
```mermaid ```mermaid
flowchart LR flowchart LR
HTTP --> worker HTTP --> worker
worker --tokenized text + image_urls--> SGLang[SGLang Engine] worker --tokenized text + image/video URLs--> SGLang[SGLang Engine]
``` ```
### Launch ### Launch
```bash ```bash
cd $DYNAMO_HOME/examples/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/agg.sh --model Qwen/Qwen2.5-VL-7B-Instruct --chat-template qwen2-vl ./launch/agg_vision.sh --model-path Qwen/Qwen2-VL-7B-Instruct
``` ```
**Client:** **Client:**
...@@ -112,6 +113,35 @@ curl http://localhost:8000/v1/chat/completions \ ...@@ -112,6 +113,35 @@ curl http://localhost:8000/v1/chat/completions \
}' | jq }' | jq
``` ```
Video requests use the same aggregated path:
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2-VL-7B-Instruct",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe the video in detail"
},
{
"type": "video_url",
"video_url": {
"url": "https://samplelib.com/mp4/sample-5s.mp4"
}
}
]
}
],
"max_tokens": 50,
"stream": false
}' | jq
```
## E/PD Serving (Encode Separate) ## E/PD Serving (Encode Separate)
### Components ### Components
......
...@@ -21,7 +21,7 @@ This document provides a comprehensive compatibility matrix for key Dynamo featu ...@@ -21,7 +21,7 @@ This document provides a comprehensive compatibility matrix for key Dynamo featu
| **SLA-Based Planner** | ✅ | ✅ | ✅ | [Planner Doc][planner] | | **SLA-Based Planner** | ✅ | ✅ | ✅ | [Planner Doc][planner] |
| **KV Block Manager** | 🚧 | ✅ | ✅ | [KVBM Doc][kvbm] | | **KV Block Manager** | 🚧 | ✅ | ✅ | [KVBM Doc][kvbm] |
| **Multimodal (Image)** | ✅ | ✅ | ✅ | [Multimodal Doc][mm] | | **Multimodal (Image)** | ✅ | ✅ | ✅ | [Multimodal Doc][mm] |
| **Multimodal (Video)** | | | ✅ | [Multimodal Doc][mm] | | **Multimodal (Video)** | 🚧 | | ✅ | [Multimodal Doc][mm] |
| **Multimodal (Audio)** | | | 🚧 | [Multimodal Doc][mm] | | **Multimodal (Audio)** | | | 🚧 | [Multimodal Doc][mm] |
| **Request Migration** | ✅ | 🚧 | ✅ | [Migration Doc][migration] | | **Request Migration** | ✅ | 🚧 | ✅ | [Migration Doc][migration] |
| **Request Cancellation** | 🚧 | ✅ | ✅ | Backend READMEs | | **Request Cancellation** | 🚧 | ✅ | ✅ | Backend READMEs |
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
# Aggregated multimodal (vision + LLM) serving. # Aggregated multimodal (image/video + LLM) serving.
# GPUs: 1 # GPUs: 1
set -e set -e
...@@ -12,7 +12,8 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" ...@@ -12,7 +12,8 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values # Default values
MODEL="Qwen/Qwen3-VL-8B-Instruct" # TODO: Update default to Qwen3-VL-2B-Instruct after SGLang 0.5.10+ upgrade.
MODEL="Qwen/Qwen2-VL-7B-Instruct"
CHAT_TEMPLATE="" CHAT_TEMPLATE=""
ENABLE_OTEL=false ENABLE_OTEL=false
...@@ -61,7 +62,7 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -61,7 +62,7 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL" "$HTTP_PORT" print_launch_banner --multimodal "Launching Aggregated Vision Serving" "$MODEL" "$HTTP_PORT"
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
...@@ -74,7 +75,8 @@ if [ -n "$CHAT_TEMPLATE" ]; then ...@@ -74,7 +75,8 @@ if [ -n "$CHAT_TEMPLATE" ]; then
TEMPLATE_ARGS+=(--chat-template "$CHAT_TEMPLATE") TEMPLATE_ARGS+=(--chat-template "$CHAT_TEMPLATE")
fi fi
# run worker with vision model (SGLang auto-detects chat template from HF tokenizer) # run worker with a vision model (SGLang auto-detects chat template from HF tokenizer)
# The SGLang engine handles image/video loading and vision encoding internally.
OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path "$MODEL" \ --model-path "$MODEL" \
......
...@@ -42,6 +42,9 @@ class SGLangConfig(EngineConfig): ...@@ -42,6 +42,9 @@ class SGLangConfig(EngineConfig):
sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join( sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/sglang" WORKSPACE_DIR, "examples/backends/sglang"
) )
# Remote sample video referenced by the "video_agg_qwen" request payload.
REMOTE_VIDEO_TEST_URI = (
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
)
# SGLang test configurations # SGLang test configurations
# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached) # NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
...@@ -307,6 +310,45 @@ sglang_configs = { ...@@ -307,6 +310,45 @@ sglang_configs = {
) )
], ],
), ),
"video_agg_qwen": SGLangConfig(
# Tests aggregated video inference using DecodeWorkerHandler
# with in-process vision encoding (no separate encode worker).
# Reuses agg_vision.sh because image and video share the same aggregated
# multimodal SGLang request path.
name="video_agg_qwen",
directory=sglang_dir,
script_name="agg_vision.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(13.3), # same as multimodal_e_pd_qwen
pytest.mark.timeout(360),
pytest.mark.pre_merge,
],
model="Qwen/Qwen2-VL-7B-Instruct",
script_args=[
"--model-path",
"Qwen/Qwen2-VL-7B-Instruct",
"--mem-fraction-static",
"0.8",
],
timeout=360,
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
chat_payload(
[
{"type": "text", "text": "Describe the video in detail"},
{
"type": "video_url",
"video_url": {"url": REMOTE_VIDEO_TEST_URI},
},
],
repeat_count=1,
expected_response=["guitar", "tablet", "draw"],
temperature=0.0,
max_tokens=100,
)
],
),
"embedding_agg": SGLangConfig( "embedding_agg": SGLangConfig(
name="embedding_agg", name="embedding_agg",
directory=sglang_dir, directory=sglang_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment