feat(sglang): add video input support for aggregated serving (#7941)

Signed-off-by: Chokoyo <40918450+Chokoyo@users.noreply.github.com>

feat(sglang): add video input support for aggregated serving (#7941)
Signed-off-by: Chokoyo <40918450+Chokoyo@users.noreply.github.com>
Commit b1c18bb1 (unverified) · author: Zhuangcheng Gu · committer: GitHub · other hash shown on the page: 73f8557a
Authored Apr 07, 2026 by Zhuangcheng Gu; committed Apr 07, 2026 by GitHub
7 changed files
--- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
@@ -18,6 +18,27 @@ from dynamo.sglang.publisher import DynamoSglangPublisher
 from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
+def _extract_media_urls(mm_data: Dict[str, Any], media_key: str) -> list[str] | None:
+    """Normalize multimodal URL items from the frontend wire format.
+
+    Args:
+        mm_data: Multimodal payload mapping. The value stored under
+            ``media_key`` is iterated item by item.
+        media_key: Key to read from ``mm_data``; the decode handler passes
+            ``"image_url"`` and ``"video_url"``.
+
+    Returns:
+        The extracted URLs in input order, or ``None`` when the key is
+        missing, its value is empty, or no item yields a string URL.
+    """
+    items = mm_data.get(media_key)
+    if not items:
+        return None
+    urls: list[str] = []
+    for item in items:
+        # A plain string item is used as the URL itself.
+        if isinstance(item, str):
+            urls.append(item)
+            continue
+        # A dict item carries the URL under the "Url" key; a dict without a
+        # string "Url" value, or an item of any other type, is skipped.
+        if isinstance(item, dict):
+            url = item.get("Url")
+            if isinstance(url, str):
+                urls.append(url)
+    # Collapse an empty result to None so callers get one "no media" value.
+    return urls or None
 class DecodeWorkerHandler(BaseWorkerHandler):
    """Handler for decode workers in both aggregated and disaggregated serving modes."""
@@ -157,18 +178,11 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                async for out in self._process_text_stream(decode, context):
                    yield out
        else:
-            # Extract image URLs for multimodal requests. SGLang's mm_data_processor
+            # Extract image/video URLs for multimodal requests. SGLang's mm_data_processor
            # handles loading/preprocessing, and the scheduler does vision encoding.
-            image_data: list[str] | None = None
+            mm_data = request.get("multi_modal_data", {})
-            image_items = request.get("multi_modal_data", {}).get("image_url")
+            image_data = _extract_media_urls(mm_data, "image_url")
-            if image_items:
+            video_data = _extract_media_urls(mm_data, "video_url")
-                image_data = []
-                for item in image_items:
-                    if isinstance(item, str):
-                        image_data.append(item)
-                    elif isinstance(item, dict) and "Url" in item:
-                        image_data.append(item["Url"])
-                image_data = image_data or None
            trace_header = build_trace_headers(context) if self.enable_trace else None
@@ -179,6 +193,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
            agg = await self.engine.async_generate(
                **input_param,
                image_data=image_data,
+                video_data=video_data,
                sampling_params=sampling_params,
                stream=True,
                return_routed_experts=return_routed_experts,

--- a/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py
+++ b/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+from dynamo.sglang.request_handlers.llm.decode_handler import _extract_media_urls
+# Module-level marks: pytest applies every entry to all tests in this file.
+pytestmark = [
+    pytest.mark.unit,
+    pytest.mark.sglang,
+    pytest.mark.gpu_0,
+    pytest.mark.profiled_vram_gib(0),
+    pytest.mark.pre_merge,
+]
+def test_extract_media_urls_supports_string_and_wire_items():
+    """Plain strings and ``{"Url": ...}`` dicts are both extracted, in order."""
+    mm_data = {
+        "video_url": [
+            "file:///tmp/test.mp4",
+            {"Url": "https://example.com/test.mp4"},
+            # No "Url" key, so this item must be dropped from the result.
+            {"ignored": "value"},
+        ]
+    }
+    assert _extract_media_urls(mm_data, "video_url") == [
+        "file:///tmp/test.mp4",
+        "https://example.com/test.mp4",
+    ]
+def test_extract_media_urls_returns_none_for_missing_or_invalid_items():
+    """A missing key, or a list with no usable URL, yields ``None``."""
+    # Key absent from the payload.
+    assert _extract_media_urls({}, "image_url") is None
+    # Key present, but its only item has no "Url" entry.
+    assert (
+        _extract_media_urls({"image_url": [{"ignored": "value"}]}, "image_url") is None
+    )
--- a/docs/features/multimodal/README.md
+++ b/docs/features/multimodal/README.md
@@ -44,7 +44,7 @@ Dynamo provides support for improving latency and throughput for vision-and-lang
 |-------|-------|-------|-------|
 | **[vLLM](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-vllm.md)** | ✅ | 🧪  | 🧪 |
 | **[TRT-LLM](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-trtllm.md)** | ✅ | ❌ | ❌ |
-| **[SGLang](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-sglang.md)** | ✅ | ❌ | ❌ |
+| **[SGLang](https://github.com/ai-dynamo/dynamo/blob/main/docs/features/multimodal/multimodal-sglang.md)** | ✅ | 🧪 | ❌ |
 **Status:** ✅ Supported | 🧪 Experimental | ❌ Not supported

--- a/docs/features/multimodal/multimodal-sglang.md
+++ b/docs/features/multimodal/multimodal-sglang.md
@@ -12,7 +12,7 @@ This document provides a comprehensive guide for multimodal inference using SGLa
 |----------|--------------|------------|---------------|-------|
 | **Image** | HTTP/HTTPS URL | Yes | Yes | Vision encoder generates embeddings |
 | **Image** | Data URL (Base64) | No | No |  |
-| **Video** | HTTP/HTTPS URL | No | No |  |
+| **Video** | HTTP/HTTPS/`file://` URL | Yes | No | Aggregated only |
 | **Audio** | HTTP/HTTPS URL | No | No |  |
 ### Supported URL Formats
@@ -20,6 +20,7 @@ This document provides a comprehensive guide for multimodal inference using SGLa
 | Format | Example | Description |
 |--------|---------|-------------|
 | **HTTP/HTTPS** | `http://example.com/image.jpg` | Remote media files |
+| **file://** | `file:///tmp/test.mp4` | Local files accessible to the backend |
 ## Deployment Patterns
@@ -68,19 +69,19 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
 ### Workflow
-The `DecodeWorkerHandler` receives multimodal requests with image URLs and passes them directly to SGLang's engine. SGLang's internal `mm_data_processor` handles image fetching, loading, encoding, and token expansion.
+The `DecodeWorkerHandler` receives multimodal requests with image/video URLs and passes them directly to SGLang's engine. SGLang's internal `mm_data_processor` handles image/video fetching, loading, encoding, and token expansion.
 ```mermaid
 flowchart LR
  HTTP --> worker
-  worker --tokenized text + image_urls--> SGLang[SGLang Engine]
+  worker --tokenized text + image/video URLs--> SGLang[SGLang Engine]
 ```
 ### Launch
 ```bash
 cd $DYNAMO_HOME/examples/backends/sglang
-./launch/agg.sh --model Qwen/Qwen2.5-VL-7B-Instruct --chat-template qwen2-vl
+./launch/agg_vision.sh --model-path Qwen/Qwen2-VL-7B-Instruct
 ```
 **Client:**
@@ -112,6 +113,35 @@ curl http://localhost:8000/v1/chat/completions \
  }' | jq
 ```
+Video requests use the same aggregated path:
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen2-VL-7B-Instruct",
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "Describe the video in detail"
+          },
+          {
+            "type": "video_url",
+            "video_url": {
+              "url": "https://samplelib.com/mp4/sample-5s.mp4"
+            }
+          }
+        ]
+      }
+    ],
+    "max_tokens": 50,
+    "stream": false
+  }' | jq
+```
 ## E/PD Serving (Encode Separate)
 ### Components

--- a/docs/reference/feature-matrix.md
+++ b/docs/reference/feature-matrix.md
@@ -21,7 +21,7 @@ This document provides a comprehensive compatibility matrix for key Dynamo featu
 | **SLA-Based Planner** | ✅ | ✅ | ✅ | [Planner Doc][planner] |
 | **KV Block Manager** | 🚧 | ✅ | ✅ | [KVBM Doc][kvbm] |
 | **Multimodal (Image)** | ✅ | ✅ | ✅ | [Multimodal Doc][mm] |
-| **Multimodal (Video)** | | | ✅ | [Multimodal Doc][mm] |
+| **Multimodal (Video)** | 🚧 | | ✅ | [Multimodal Doc][mm] |
 | **Multimodal (Audio)** | | | 🚧 | [Multimodal Doc][mm] |
 | **Request Migration** | ✅ | 🚧 | ✅ | [Migration Doc][migration] |
 | **Request Cancellation** | 🚧 | ✅ | ✅ | Backend READMEs |

--- a/examples/backends/sglang/launch/agg_vision.sh
+++ b/examples/backends/sglang/launch/agg_vision.sh
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Aggregated multimodal (vision + LLM) serving.
+# Aggregated multimodal (image/video + LLM) serving.
 # GPUs: 1
 set -e
@@ -12,7 +12,8 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source "$SCRIPT_DIR/../../../common/launch_utils.sh"
 # Default values
-MODEL="Qwen/Qwen3-VL-8B-Instruct"
+# TODO: Update default to Qwen3-VL-2B-Instruct after SGLang 0.5.10+ upgrade.
+MODEL="Qwen/Qwen2-VL-7B-Instruct"
 CHAT_TEMPLATE=""
 ENABLE_OTEL=false
@@ -61,7 +62,7 @@ if [ "$ENABLE_OTEL" = true ]; then
 fi
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL" "$HTTP_PORT"
+print_launch_banner --multimodal "Launching Aggregated Vision Serving" "$MODEL" "$HTTP_PORT"
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
@@ -74,7 +75,8 @@ if [ -n "$CHAT_TEMPLATE" ]; then
    TEMPLATE_ARGS+=(--chat-template "$CHAT_TEMPLATE")
 fi
-# run worker with vision model (SGLang auto-detects chat template from HF tokenizer)
+# run worker with a vision model (SGLang auto-detects chat template from HF tokenizer)
+# The SGLang engine handles image/video loading and vision encoding internally.
 OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
 python3 -m dynamo.sglang \
  --model-path "$MODEL" \

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -42,6 +42,9 @@ class SGLangConfig(EngineConfig):
 sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/sglang"
 )
+REMOTE_VIDEO_TEST_URI = (
+    "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
+)
 # SGLang test configurations
 # NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
@@ -307,6 +310,45 @@ sglang_configs = {
            )
        ],
    ),
+    "video_agg_qwen": SGLangConfig(
+        # Tests aggregated video inference using DecodeWorkerHandler
+        # with in-process vision encoding (no separate encode worker).
+        # Reuses agg_vision.sh because image and video share the same aggregated
+        # multimodal SGLang request path.
+        name="video_agg_qwen",
+        directory=sglang_dir,
+        script_name="agg_vision.sh",
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.profiled_vram_gib(13.3),  # same as multimodal_e_pd_qwen
+            pytest.mark.timeout(360),
+            pytest.mark.pre_merge,
+        ],
+        model="Qwen/Qwen2-VL-7B-Instruct",
+        script_args=[
+            "--model-path",
+            "Qwen/Qwen2-VL-7B-Instruct",
+            "--mem-fraction-static",
+            "0.8",
+        ],
+        timeout=360,
+        frontend_port=DefaultPort.FRONTEND.value,
+        request_payloads=[
+            chat_payload(
+                [
+                    {"type": "text", "text": "Describe the video in detail"},
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": REMOTE_VIDEO_TEST_URI},
+                    },
+                ],
+                repeat_count=1,
+                expected_response=["guitar", "tablet", "draw"],
+                temperature=0.0,
+                max_tokens=100,
+            )
+        ],
+    ),
    "embedding_agg": SGLangConfig(
        name="embedding_agg",
        directory=sglang_dir,