Unverified Commit ddee21cb authored by KrishnanPrash's avatar KrishnanPrash Committed by GitHub
Browse files

feat: SGLang aggregated multimodal support (#5450)

parent abe9d127
......@@ -141,12 +141,26 @@ class DecodeWorkerHandler(BaseWorkerHandler):
async for out in self._process_text_stream(decode, context):
yield out
else:
# Extract image URLs for multimodal requests. SGLang's mm_data_processor
# handles loading/preprocessing, and the scheduler does vision encoding.
image_data = None
image_items = request.get("multi_modal_data", {}).get("image_url")
if image_items:
image_data = []
for item in image_items:
if isinstance(item, str):
image_data.append(item)
elif isinstance(item, dict) and "Url" in item:
image_data.append(item["Url"])
image_data = image_data or None
trace_header = (
self._get_trace_header(context) if self.enable_trace else None
)
agg = await self.engine.async_generate(
**input_param,
image_data=image_data,
sampling_params=sampling_params,
stream=True,
external_trace_header=trace_header,
......
......@@ -17,7 +17,7 @@ limitations under the License.
# SGLang Multimodal
This document provides a comprehensive guide for multimodal inference using SGLang backend in Dynamo. SGLang multimodal uses specialized **E/PD or E/P/D** flows with **NIXL (RDMA)** for zero-copy tensor transfer.
This document provides a comprehensive guide for multimodal inference using the SGLang backend in Dynamo. SGLang multimodal supports **EPD**, **E/PD**, and **E/P/D** flows, with NIXL (RDMA) for zero-copy tensor transfer in disaggregated modes.
## Support Matrix
......@@ -36,12 +36,12 @@ This document provides a comprehensive guide for multimodal inference using SGLa
## Deployment Patterns
SGLang supports E/PD and E/P/D patterns only (always has a separate encode worker). See [Multimodal Architecture Patterns](index.md#architecture-patterns) for detailed explanations.
SGLang supports EPD, E/PD, and E/P/D patterns. See [Multimodal Architecture Patterns](index.md#architecture-patterns) for detailed explanations.
| Pattern | Supported | Launch Script | Notes |
|---------|-----------|---------------|-------|
| EPD (Simple Aggregated) | ❌ | N/A | Not supported |
| E/PD (Encode Separate) | ✅ | `multimodal_agg.sh` | Vision encoder separate |
| EPD (Simple Aggregated) | ✅ | `agg.sh` | Internal encoding |
| E/PD (Encode Separate) | ✅ | `multimodal_epd.sh` | Vision encoder separate |
| E/P/D (Full Disaggregation) | ✅ | `multimodal_disagg.sh` | KV cache via bootstrap |
| EP/D (Traditional Disaggregated) | ❌ | N/A | Not supported |
......@@ -74,6 +74,58 @@ You can find the [latest release](https://github.com/ai-dynamo/dynamo/releases/l
git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
```
## EPD Serving (Simple Aggregated)
### Components
- worker: [DecodeWorkerHandler](../../components/src/dynamo/sglang/request_handlers/llm/decode_handler.py) handles encoding, prefilling, and decoding in a single process.
### Workflow
The `DecodeWorkerHandler` receives multimodal requests with image URLs and passes them directly to SGLang's engine. SGLang's internal `mm_data_processor` handles image fetching, loading, encoding, and token expansion.
```mermaid
flowchart LR
HTTP --> worker
worker --tokenized text + image_urls--> SGLang[SGLang Engine]
```
### Launch
```bash
cd $DYNAMO_HOME/examples/backends/sglang
./launch/agg.sh --model Qwen/Qwen2.5-VL-7B-Instruct --chat-template qwen2-vl
```
**Client:**
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe the image."
},
{
"type": "image_url",
"image_url": {
"url": "http://images.cocodataset.org/test2017/000000155781.jpg"
}
}
]
}
],
"max_tokens": 50,
"stream": false
}' | jq
```
## E/PD Serving (Encode Separate)
### Components
......@@ -105,7 +157,7 @@ flowchart LR
```bash
cd $DYNAMO_HOME/examples/backends/sglang
./launch/multimodal_agg.sh
./launch/multimodal_epd.sh
```
**Client:**
......@@ -344,6 +396,7 @@ Supported templates: `qwen2-vl`, `llama-3`, `vicuna`, etc.
| Use Case | NIXL Used? | Data Transfer | Notes |
|----------|------------|---------------|-------|
| EPD (Simple Aggregated) | No | N/A | All processing internal to SGLang |
| E/PD (Encode Separate) | Yes | Encoder → PD (embeddings) | Vision encoder separate |
| E/P/D (Full Disaggregation) | Yes | Encoder → Prefill (embeddings) | KV cache via SGLang bootstrap |
......
......@@ -155,10 +155,11 @@ sglang_configs = {
)
],
),
"multimodal_agg_qwen": SGLangConfig(
name="multimodal_agg_qwen",
"multimodal_epd_qwen": SGLangConfig(
# E/PD architecture: Encode worker (GPU 0) + Prefill/Decode worker (GPU 1)
name="multimodal_epd_qwen",
directory=sglang_dir,
script_name="multimodal_agg.sh",
script_name="multimodal_epd.sh",
marks=[pytest.mark.gpu_2, pytest.mark.nightly],
model="Qwen/Qwen2.5-VL-7B-Instruct",
delayed_start=0,
......@@ -184,6 +185,46 @@ sglang_configs = {
)
],
),
"multimodal_agg_qwen": SGLangConfig(
# Tests single-process aggregated multimodal inference using DecodeWorkerHandler
# with in-process vision encoding (no separate encode worker)
name="multimodal_agg_qwen",
directory=sglang_dir,
script_name="agg.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(300),
],
model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=[
"--model-path",
"Qwen/Qwen2.5-VL-7B-Instruct",
"--chat-template",
"qwen2-vl",
],
delayed_start=0,
timeout=360,
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is in this image?"},
{
"type": "image_url",
"image_url": {
"url": "http://images.cocodataset.org/test2017/000000155781.jpg"
},
},
],
repeat_count=1,
expected_response=["image"],
temperature=0.0,
max_tokens=100,
)
],
),
"embedding_agg": SGLangConfig(
name="embedding_agg",
directory=sglang_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment