fix: restore E/P/D multimodal disagg serving and add Qwen3-VL-30B-A3B support (#6533)

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>

fix: restore E/P/D multimodal disagg serving and add Qwen3-VL-30B-A3B support (#6533)
Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
8e236375 · KrishnanPrash · GitHub · 5c64ffc3 · 8e236375 · 8e236375
Unverified commit 8e236375, authored Feb 24, 2026 by KrishnanPrash; committed by GitHub on Feb 24, 2026.
7 changed files
--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -1041,8 +1041,8 @@ class BaseWorkerHandler(ABC):
                prompt_tokens + completion_tokens if prompt_tokens is not None else None
            ),
            "prompt_tokens_details": (
-                {"cached_tokens": request_output.num_cached_tokens}
-                if request_output.num_cached_tokens
+                {"cached_tokens": num_cached}
+                if (num_cached := getattr(request_output, "num_cached_tokens", None))
                else None
            ),
        }

--- a/components/src/dynamo/vllm/multimodal_handlers/multimodal_pd_worker_handler.py
+++ b/components/src/dynamo/vllm/multimodal_handlers/multimodal_pd_worker_handler.py
@@ -199,22 +199,6 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
        logger.debug(f"Prepared multimodal data size: {len(multi_modal_data['image'])}")
        logger.debug("Multimodal data keys: %s", list(multi_modal_data.keys()))

-    # ── Response serialization ───────────────────────────────────────
-
-    @staticmethod
-    def _serialize_response(response) -> str:
-        """Build a JSON-serialized ``MyRequestOutput`` from an engine response."""
-        return MyRequestOutput(
-            request_id=response.request_id,
-            prompt=response.prompt,
-            prompt_token_ids=response.prompt_token_ids,
-            prompt_logprobs=response.prompt_logprobs,
-            outputs=response.outputs,
-            finished=response.finished,
-            metrics=response.metrics,
-            kv_transfer_params=response.kv_transfer_params,
-        ).model_dump_json()
-
    @staticmethod
    def _format_engine_output(
        response, num_output_tokens_so_far: int
@@ -346,13 +330,16 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
                f"— ensure the same adapter is loaded on the decode worker."
            )

+        num_output_tokens_so_far = 0
        async for (
            decode_response
        ) in await self.decode_worker_client.round_robin(  # type: ignore[union-attr]
            request.model_dump_json()
        ):
            output = MyRequestOutput.model_validate_json(decode_response.data())  # type: ignore[attr-defined]
-            yield self._serialize_response(output)
+            yield self._format_engine_output(output, num_output_tokens_so_far)
+            if output.outputs:
+                num_output_tokens_so_far = len(output.outputs[0].token_ids)

    # ── Public entry point ───────────────────────────────────────────


--- a/components/src/dynamo/vllm/multimodal_utils/model.py
+++ b/components/src/dynamo/vllm/multimodal_utils/model.py
@@ -38,6 +38,8 @@ class SupportedModels:
    QWEN_2_5_VL_3B = "Qwen/Qwen2.5-VL-3B-Instruct"
    QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct"
    QWEN_2_5_VL_32B = "Qwen/Qwen2.5-VL-32B-Instruct"
+    QWEN_3_VL_2B = "Qwen/Qwen3-VL-2B-Instruct"
+    QWEN_3_VL_30B_A3B = "Qwen/Qwen3-VL-30B-A3B-Instruct"
    QWEN_3_VL_30B_A3B_FP8 = "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
    QWEN_3_VL_8B_FP8 = "Qwen/Qwen3-VL-8B-Instruct-FP8"
    LLAVA_NEXT_VIDEO_7B = "llava-hf/LLaVA-NeXT-Video-7B-hf"
@@ -118,6 +120,8 @@ QWEN_VL_MODELS = [
    SupportedModels.QWEN_2_5_VL_3B,
    SupportedModels.QWEN_2_5_VL_7B,
    SupportedModels.QWEN_2_5_VL_32B,
+    SupportedModels.QWEN_3_VL_2B,
+    SupportedModels.QWEN_3_VL_30B_A3B,
    SupportedModels.QWEN_3_VL_30B_A3B_FP8,
    SupportedModels.QWEN_3_VL_8B_FP8,
 ]

--- a/components/src/dynamo/vllm/tests/multimodal_handlers/test_vllm_multimodal_pd_worker_handler.py
+++ b/components/src/dynamo/vllm/tests/multimodal_handlers/test_vllm_multimodal_pd_worker_handler.py
@@ -15,7 +15,6 @@ from dynamo.common.memory.multimodal_embedding_cache_manager import (
 )
 from dynamo.vllm.multimodal_handlers import multimodal_pd_worker_handler as mod
 from dynamo.vllm.multimodal_utils.protocol import (
-    MyRequestOutput,
    PatchedTokensPrompt,
    vLLMMultimodalRequest,
 )
@@ -105,7 +104,7 @@ def _make_vllm_request(request_id: str = "req-1") -> vLLMMultimodalRequest:


 def _make_engine_response(request_id: str = "req-1", finished: bool = True):
-    """Create a mock engine response with the fields _serialize_response needs."""
+    """Create a mock engine response with the fields _format_engine_output needs."""
    resp = MagicMock()
    resp.request_id = request_id
    resp.prompt = "test"
@@ -274,16 +273,28 @@ class TestGenerateDisagg:

        handler.engine_client.generate = fake_generate

-        decode_output = MyRequestOutput(
-            request_id="req-1",
-            prompt="test",
-            prompt_token_ids=[1, 2, 3],
-            outputs=[],
-            finished=True,
-            kv_transfer_params={"block_ids": [0, 1]},
+        decode_json = json.dumps(
+            {
+                "request_id": "req-1",
+                "prompt": "test",
+                "prompt_token_ids": [1, 2, 3],
+                "outputs": [
+                    {
+                        "index": 0,
+                        "text": "",
+                        "token_ids": [42],
+                        "cumulative_logprob": None,
+                        "logprobs": None,
+                        "finish_reason": "stop",
+                        "stop_reason": None,
+                    }
+                ],
+                "finished": True,
+                "kv_transfer_params": {"block_ids": [0, 1]},
+            }
        )
        decode_resp = MagicMock()
-        decode_resp.data.return_value = decode_output.model_dump_json()
+        decode_resp.data.return_value = decode_json

        async def fake_round_robin(payload):
            async def _stream():
@@ -299,6 +310,6 @@ class TestGenerateDisagg:
            chunks.append(chunk)

        assert len(chunks) == 1
-        parsed = json.loads(chunks[0])
-        assert parsed["request_id"] == "req-1"
-        assert parsed["finished"] is True
+        assert isinstance(chunks[0], dict)
+        assert chunks[0]["token_ids"] == [42]
+        assert chunks[0]["finish_reason"] == "stop"
--- a/components/src/dynamo/vllm/worker_factory.py
+++ b/components/src/dynamo/vllm/worker_factory.py
@@ -187,19 +187,19 @@ class WorkerFactory:
        if kv_publisher:
            handler.kv_publisher = kv_publisher

-        # Register model with the frontend so it can route requests
-        model_type = parse_endpoint_types(config.endpoint_types)
-        model_input = (
-            ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
-        )
-        await self.register_vllm_model(
-            model_input,
-            model_type,
-            generate_endpoint,
-            config,
-            engine_client,
-            vllm_config,
-        )
+        if not config.multimodal_decode_worker:
+            model_type = parse_endpoint_types(config.endpoint_types)
+            model_input = (
+                ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
+            )
+            await self.register_vllm_model(
+                model_input,
+                model_type,
+                generate_endpoint,
+                config,
+                engine_client,
+                vllm_config,
+            )

        metrics_labels = [("model", config.served_model_name or config.model)]
        try:

--- a/examples/backends/vllm/launch/disagg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_epd.sh
@@ -71,7 +71,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py
 # Start prefill worker (also handles encode routing via --route-to-encoder)
 echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
 VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
-CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
+CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --multimodal-worker --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &

 # Start decode worker
 echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -276,13 +276,13 @@ vllm_configs = {
            completion_payload_default(),
        ],
    ),
-    "multimodal_disagg_qwen2vl_2b_e_pd": VLLMConfig(
-        name="multimodal_disagg_qwen2vl_2b_e_pd",
+    "multimodal_disagg_qwen3vl_2b_e_pd": VLLMConfig(
+        name="multimodal_disagg_qwen3vl_2b_e_pd",
        directory=vllm_dir,
        script_name="disagg_multimodal_e_pd.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
-        model="Qwen/Qwen2-VL-2B-Instruct",
-        script_args=["--model", "Qwen/Qwen2-VL-2B-Instruct", "--single-gpu"],
+        model="Qwen/Qwen3-VL-2B-Instruct",
+        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
        request_payloads=[
            chat_payload(
                [
@@ -335,13 +335,21 @@ vllm_configs = {
            )
        ],
    ),
-    "multimodal_agg_llava_epd": VLLMConfig(
-        name="multimodal_agg_llava_epd",
+    "multimodal_disagg_qwen3vl_2b_epd": VLLMConfig(
+        name="multimodal_disagg_qwen3vl_2b_epd",
        directory=vllm_dir,
-        script_name="agg_multimodal_epd.sh",
-        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
-        model="llava-hf/llava-1.5-7b-hf",
-        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
+        script_name="disagg_multimodal_epd.sh",
+        marks=[pytest.mark.gpu_2, pytest.mark.pre_merge],
+        model="Qwen/Qwen3-VL-2B-Instruct",
+        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct"],
+        env={
+            "DYN_ENCODE_WORKER_GPU": "0",
+            "DYN_PREFILL_WORKER_GPU": "0",
+            "DYN_DECODE_WORKER_GPU": "1",
+            "DYN_ENCODE_GPU_MEM": "0.4",
+            "DYN_PREFILL_GPU_MEM": "0.4",
+            "DYN_DECODE_GPU_MEM": "0.85",
+        },
        request_payloads=[
            chat_payload(
                [
@@ -355,39 +363,12 @@ vllm_configs = {
                    },
                ],
                repeat_count=1,
-                expected_response=["purple"],
+                expected_response=["green"],
                temperature=0.0,
                max_tokens=100,
            )
        ],
    ),
-    "multimodal_agg_qwen_epd": VLLMConfig(
-        name="multimodal_agg_qwen_epd",
-        directory=vllm_dir,
-        script_name="agg_multimodal_epd.sh",
-        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
-        model="Qwen/Qwen2.5-VL-7B-Instruct",
-        delayed_start=0,
-        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
-        timeout=360,
-        request_payloads=[
-            chat_payload(
-                [
-                    {
-                        "type": "text",
-                        "text": "What colors are in the following image? Respond only with the colors.",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": MULTIMODAL_IMG_URL},
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["purple"],
-                max_tokens=100,
-            )
-        ],
-    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,