refactor: remove dedicated multimodal PD worker (#7556)

Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com>

refactor: remove dedicated multimodal PD worker (#7556)
Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com>
4b1d442b · GuanLuo · GitHub · b00514c2 · 4b1d442b · 4b1d442b
Unverified Commit 4b1d442b authored Mar 31, 2026 by GuanLuo Committed by GitHub Apr 01, 2026
13 changed files
--- a/components/src/dynamo/common/constants.py
+++ b/components/src/dynamo/common/constants.py
@@ -12,6 +12,7 @@ class DisaggregationMode(Enum):
    AGGREGATED = "agg"
    PREFILL = "prefill"
    DECODE = "decode"
+    ENCODE = "encode"


 class EmbeddingTransferMode(Enum):

--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -153,21 +153,11 @@ def update_dynamo_config_with_engine(
    # Capture user-provided --endpoint before defaults overwrite it
    user_endpoint = dynamo_config.endpoint

-    if dynamo_config.route_to_encoder:
-        dynamo_config.component = "processor"
-        dynamo_config.endpoint = "generate"
-    elif dynamo_config.multimodal_encode_worker:
-        dynamo_config.component = "encoder"
-        dynamo_config.endpoint = "generate"
-    elif dynamo_config.multimodal_decode_worker:
-        dynamo_config.component = "decoder"
-        dynamo_config.endpoint = "generate"
-    elif (
-        dynamo_config.multimodal_worker
-        and dynamo_config.disaggregation_mode == DisaggregationMode.PREFILL
-    ):
-        dynamo_config.component = "backend"
+    # Multi-modal related component/endpoint resolution
+    if dynamo_config.disaggregation_mode == DisaggregationMode.ENCODE:
+        dynamo_config.component = "encode"
        dynamo_config.endpoint = "generate"
+    # Standard component/endpoint resolution
    elif dynamo_config.disaggregation_mode == DisaggregationMode.PREFILL:
        dynamo_config.component = "prefill"
        dynamo_config.endpoint = "generate"

--- a/components/src/dynamo/vllm/backend_args.py
+++ b/components/src/dynamo/vllm/backend_args.py
@@ -226,6 +226,9 @@ class DynamoVllmConfig(ConfigBase):
           Raise if legacy booleans are also set.
        2. If legacy --is-prefill-worker or --is-decode-worker is set,
           emit DeprecationWarning and translate to enum.
+        3. If legacy multimodal flags are set, translate to enum,
+           emit DeprecationWarning and translate to enum, raise if conflicting
+           with --disaggregation-mode.
        3. Apply default (AGGREGATED) if nothing was provided.
        4. Sync boolean fields from the resolved enum value.
        """
@@ -263,6 +266,14 @@ class DynamoVllmConfig(ConfigBase):
                )
                self.disaggregation_mode = DisaggregationMode.DECODE

+        # Porting multimodal legacy flags
+        if (
+            self.multimodal_decode_worker
+            or self.multimodal_encode_worker
+            or self.multimodal_worker
+        ):
+            self._resolve_disaggregation_model_from_legacy_multimodal_flags()
+
        # Apply default if neither new flag nor legacy flags were provided
        if self.disaggregation_mode is None:
            self.disaggregation_mode = DisaggregationMode.AGGREGATED
@@ -271,6 +282,64 @@ class DynamoVllmConfig(ConfigBase):
        self.is_prefill_worker = self.disaggregation_mode == DisaggregationMode.PREFILL
        self.is_decode_worker = self.disaggregation_mode == DisaggregationMode.DECODE

+    def _resolve_disaggregation_model_from_legacy_multimodal_flags(self) -> None:
+        """
+        Resolve disaggregation mode from legacy multimodal flags, emit DeprecationWarning
+        and raise ValueError if conflicting with --disaggregation-mode.
+
+        Transformation rules:
+        1. If --multimodal-decode-worker is set, use DisaggregationMode.DECODE.
+        2. If --multimodal-encode-worker is set, use DisaggregationMode.ENCODE.
+        3. If --multimodal-worker is set, default to DisaggregationMode.AGGREGATED unless
+           --disaggregation-mode is set.
+        """
+        if self.multimodal_decode_worker:
+            warnings.warn(
+                "--multimodal-decode-worker is deprecated, use --disaggregation-mode=decode and --enable-multimodal",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            if (
+                self.disaggregation_mode is not None
+                and self.disaggregation_mode != DisaggregationMode.DECODE
+            ):
+                raise ValueError(
+                    f"Cannot set --multimodal-decode-worker while --disaggregation-mode is not '{DisaggregationMode.DECODE.value}'"
+                )
+            self.disaggregation_mode = DisaggregationMode.DECODE
+        if self.multimodal_encode_worker:
+            warnings.warn(
+                "--multimodal-encode-worker is deprecated, use --disaggregation-mode=encode and --enable-multimodal",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            if (
+                self.disaggregation_mode is not None
+                and self.disaggregation_mode != DisaggregationMode.ENCODE
+            ):
+                raise ValueError(
+                    f"Cannot set --multimodal-encode-worker while --disaggregation-mode is not '{DisaggregationMode.ENCODE.value}'"
+                )
+            self.disaggregation_mode = DisaggregationMode.ENCODE
+        if self.multimodal_worker:
+            warnings.warn(
+                "--multimodal-worker is deprecated, use --disaggregation-mode=agg or --disaggregation-mode=prefill and --enable-multimodal",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            if (
+                self.disaggregation_mode is not None
+                and self.disaggregation_mode != DisaggregationMode.AGGREGATED
+                and self.disaggregation_mode != DisaggregationMode.PREFILL
+            ):
+                raise ValueError(
+                    f"Cannot set --multimodal-worker while --disaggregation-mode is not '{DisaggregationMode.AGGREGATED.value}' or '{DisaggregationMode.PREFILL.value}'"
+                )
+            # only set 'self.disaggregation_mode' if it is not already set, '--multimodal-worker' may be specified with
+            # '--disaggregation-mode=prefill' as prefill workers in P/D disaggregation or without for aggregation.
+            if self.disaggregation_mode is None:
+                self.disaggregation_mode = DisaggregationMode.AGGREGATED
+
    def _count_multimodal_roles(self) -> int:
        """Return the number of multimodal worker roles set (0 or 1 allowed).


--- a/components/src/dynamo/vllm/multimodal_handlers/__init__.py
+++ b/components/src/dynamo/vllm/multimodal_handlers/__init__.py
@@ -2,13 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 from dynamo.vllm.multimodal_handlers.encode_worker_handler import EncodeWorkerHandler
-from dynamo.vllm.multimodal_handlers.multimodal_pd_worker_handler import (
-    MultimodalPDWorkerHandler,
-)
-from dynamo.vllm.multimodal_handlers.worker_handler import MultimodalDecodeWorkerHandler

 __all__ = [
    "EncodeWorkerHandler",
-    "MultimodalPDWorkerHandler",
-    "MultimodalDecodeWorkerHandler",
 ]
--- a/components/src/dynamo/vllm/multimodal_handlers/multimodal_pd_worker_handler.py
+++ b/components/src/dynamo/vllm/multimodal_handlers/multimodal_pd_worker_handler.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import copy
-import logging
-import uuid
-from typing import Any, Optional
-
-import torch
-from vllm.inputs.data import TokensPrompt
-from vllm.v1.engine.async_llm import AsyncLLM
-
-from dynamo.common.memory.multimodal_embedding_cache_manager import (
-    MultimodalEmbeddingCacheManager,
-)
-from dynamo.common.multimodal.embedding_transfer import (
-    LocalEmbeddingReceiver,
-    NixlReadEmbeddingReceiver,
-    NixlWriteEmbeddingReceiver,
-)
-from dynamo.common.utils import nvtx_utils as _nvtx
-from dynamo.common.utils.otel_tracing import build_trace_headers
-from dynamo.common.utils.time_section import time_and_log_code_section
-from dynamo.runtime import Client, DistributedRuntime
-
-from ..args import Config
-from ..constants import DisaggregationMode, EmbeddingTransferMode
-from ..handlers import BaseWorkerHandler, build_sampling_params
-from ..multimodal_utils import (
-    MyRequestOutput,
-    PatchedTokensPrompt,
-    vLLMMultimodalRequest,
-)
-from ..multimodal_utils.model import is_qwen_vl_model
-from ..multimodal_utils.prefill_worker_utils import MultiModalEmbeddingLoader
-
-logger = logging.getLogger(__name__)
-
-IMAGE_URL_KEY = "image_url"
-
-
-class MultimodalPDWorkerHandler(BaseWorkerHandler[dict, dict]):
-    """Prefill/Decode or Prefill-only worker for multimodal serving"""
-
-    def __init__(
-        self,
-        runtime,
-        engine_client: AsyncLLM,
-        config: Config,
-        encode_worker_client: Optional[Client] = None,
-        decode_worker_client: Optional[Client] = None,
-        shutdown_event=None,
-        generate_endpoint=None,
-    ):
-        # Get default_sampling_params from config
-        default_sampling_params = (
-            config.engine_args.create_model_config().get_diff_sampling_param()
-        )
-
-        # Call BaseWorkerHandler.__init__ with proper parameters
-        super().__init__(
-            runtime,
-            config,
-            engine_client,
-            default_sampling_params,
-            enable_multimodal=config.enable_multimodal,
-            generate_endpoint=generate_endpoint,
-            shutdown_event=shutdown_event,
-        )
-
-        self.config = config
-        self.decode_worker_client = decode_worker_client
-        self.enable_disagg = config.disaggregation_mode == DisaggregationMode.PREFILL
-
-        # Initialize multimodal-specific components
-        logger.info("Multimodal PD Worker startup started.")
-
-        # Embedding loader consist of two main components:
-        # 1) An remote encode worker client and matching embedding receiver,
-        #    which can request remote encode and handle the transfer of embeddings
-        #    from the encode worker to this prefill worker.
-        # 2) A local embedding cache manager, which can store previously fetched embeddings
-        #    and used to determine whether remote encode is necessary for a given mm data.
-        self.encode_worker_client = encode_worker_client  # type: ignore
-        if config.embedding_transfer_mode == EmbeddingTransferMode.LOCAL:
-            self.embedding_receiver = LocalEmbeddingReceiver()  # type: ignore
-        elif config.embedding_transfer_mode == EmbeddingTransferMode.NIXL_WRITE:
-            self.embedding_receiver = NixlWriteEmbeddingReceiver()  # type: ignore
-        elif config.embedding_transfer_mode == EmbeddingTransferMode.NIXL_READ:
-            # [gluo FIXME] can't use pre-registered tensor as NIXL requires descriptors
-            # to be at matching size, need to overwrite nixl connect library
-            self.embedding_receiver = NixlReadEmbeddingReceiver(max_items=0)  # type: ignore
-        else:
-            raise ValueError(
-                f"Invalid embedding transfer mode: {config.embedding_transfer_mode}"
-            )
-        self.embedding_cache_manager: MultimodalEmbeddingCacheManager | None = None
-        if config.multimodal_embedding_cache_capacity_gb > 0:
-            capacity_bytes = int(
-                config.multimodal_embedding_cache_capacity_gb * 1024**3
-            )
-            self.embedding_cache_manager = MultimodalEmbeddingCacheManager(
-                capacity_bytes
-            )
-        self.embedding_loader: MultiModalEmbeddingLoader = MultiModalEmbeddingLoader(
-            encode_worker_client=self.encode_worker_client,  # type: ignore
-            receiver=self.embedding_receiver,
-            embedding_cache_manager=self.embedding_cache_manager,
-        )
-
-        logger.info("Multimodal PD Worker has been initialized")
-
-    async def async_init(self, runtime: DistributedRuntime):
-        """Async initialization for connector that requires async setup"""
-        logger.info("Multimodal PD Worker async initialization completed.")
-
-    def _parse_frontend_request(
-        self, raw_request: dict
-    ) -> tuple[vLLMMultimodalRequest, list[str]]:
-        """Parse a raw frontend dict into a vLLMMultimodalRequest and image URLs.
-
-        The Rust frontend sends a dict with ``token_ids`` and
-        ``multi_modal_data`` (containing image URLs). This method extracts
-        those fields into a structured request. No I/O is performed here;
-        embedding fetching is handled separately by ``_load_multimodal_data``.
-        """
-        request_id = str(uuid.uuid4().hex)
-
-        image_urls: list[str] = []
-        mm_data = raw_request.get("multi_modal_data")
-        if mm_data is not None:
-            for item in mm_data.get(IMAGE_URL_KEY, []):
-                if isinstance(item, dict) and "Url" in item:
-                    image_urls.append(item["Url"])
-                elif isinstance(item, dict) and "Decoded" in item:
-                    image_urls.append(item["Decoded"])
-
-        sampling_params = build_sampling_params(
-            raw_request, self.default_sampling_params
-        )
-
-        request = vLLMMultimodalRequest(
-            engine_prompt=PatchedTokensPrompt(
-                prompt_token_ids=raw_request["token_ids"]
-            ),
-            sampling_params=sampling_params,
-            request_id=request_id,
-            model=raw_request.get("model"),
-        )
-
-        return request, image_urls
-
-    # ── Multimodal data loading ──────────────────────────────────────
-
-    async def _load_multimodal_data(
-        self, image_urls: list[str], request_id: str, context=None
-    ) -> dict[str, Any]:
-        """Fetch embeddings from encode workers and load into an engine-ready dict.
-
-        Returns an empty dict when no encode worker is configured or no images
-        are present.
-        """
-
-        return await self.embedding_loader.load_multimodal_embeddings(
-            image_urls,
-            request_id,
-            model=self.config.model,
-            context=context,
-        )
-
-    # ── Request metadata finalization ────────────────────────────────
-
-    def _finalize_request_metadata(
-        self,
-        request: vLLMMultimodalRequest,
-        multi_modal_data: dict[str, Any],
-    ) -> None:
-        """Attach model-specific metadata to the request for the decode worker.
-
-        For Qwen VL (mRoPE) models, captures image grid dimensions and
-        embedding shapes so the decode worker can reconstruct
-        ``multi_modal_data`` consistently for multiple images.
-        """
-        if is_qwen_vl_model(self.config.model) and isinstance(
-            multi_modal_data.get("image"), dict
-        ):
-            image_data = multi_modal_data["image"]
-            image_grid_thw = image_data.get("image_grid_thw")
-            image_embeds = image_data.get("image_embeds")
-            if image_grid_thw is not None:
-                request.image_grid_thw = (
-                    image_grid_thw.tolist()
-                    if isinstance(image_grid_thw, torch.Tensor)
-                    else image_grid_thw
-                )
-            if image_embeds is not None:
-                request.embeddings_shape = list(image_embeds.shape)
-        # prune empty multimodal data, vLLM will expect multi_modal_uuids if the mm items are empty
-        # i.e. ValueError: multi_modal_data['image'] is empty but multi_modal_uuids['image'] is missing.
-        for key, value in multi_modal_data.items():
-            if not isinstance(value, torch.Tensor):
-                if not value:
-                    del multi_modal_data[key]
-                else:
-                    logger.debug(
-                        f"Prepared multimodal data key {key}, number of items: {len(multi_modal_data[key])}"
-                    )
-
-        logger.debug("Multimodal data keys: %s", list(multi_modal_data.keys()))
-
-    @staticmethod
-    def _format_engine_output(
-        response, num_output_tokens_so_far: int
-    ) -> dict[str, Any]:
-        """Format a vLLM RequestOutput as an LLMEngineOutput-compatible dict.
-
-        This produces the same incremental dict format that the regular
-        (non-multimodal) handler yields, which the Rust frontend expects
-        after model registration.
-        """
-        if not response.outputs:
-            return {
-                "finish_reason": "error: No outputs from vLLM engine",
-                "token_ids": [],
-            }
-
-        output = response.outputs[0]
-        out: dict[str, Any] = {
-            "token_ids": output.token_ids[num_output_tokens_so_far:],
-        }
-
-        if output.finish_reason:
-            # Inline normalization: map vLLM's "abort" to Dynamo's "cancelled"
-            finish_reason = output.finish_reason
-            if finish_reason.startswith("abort"):
-                finish_reason = "cancelled"
-            out["finish_reason"] = finish_reason
-            out["completion_usage"] = BaseWorkerHandler._build_completion_usage(
-                request_output=response,
-            )
-        if output.stop_reason:
-            out["stop_reason"] = output.stop_reason
-
-        return out
-
-    # ── Aggregated generation (prefill + decode locally) ─────────────
-
-    async def _generate_agg(
-        self,
-        request: vLLMMultimodalRequest,
-        multi_modal_data: dict[str, Any],
-        rng_ttft=None,
-        context=None,
-    ):
-        """Run prefill and decode on this worker (aggregated mode)."""
-        lora_request = self._resolve_lora_request(request.model)
-        trace_headers = build_trace_headers(context) if context else None
-        gen = self.engine_client.generate(
-            prompt=TokensPrompt(
-                prompt_token_ids=request.engine_prompt["prompt_token_ids"],
-                multi_modal_data=multi_modal_data,
-            ),
-            sampling_params=request.sampling_params,
-            request_id=request.request_id,
-            lora_request=lora_request,
-            trace_headers=trace_headers,
-        )
-
-        num_output_tokens_so_far = 0
-        first_token = True
-        try:
-            async for response in gen:
-                if first_token:
-                    if rng_ttft is not None:
-                        _nvtx.end_range(rng_ttft)
-                    first_token = False
-                logger.debug(
-                    f"Response kv_transfer_params: {response.kv_transfer_params}"
-                )
-                logger.debug(
-                    f"length of expanded prompt ids: {len(response.prompt_token_ids)}"
-                )
-                chunk = self._format_engine_output(response, num_output_tokens_so_far)
-
-                # Capture token count BEFORE yield — vLLM may mutate
-                # response.outputs[0].token_ids in-place while we're suspended.
-                if response.outputs:
-                    num_output_tokens_so_far = len(response.outputs[0].token_ids)
-                yield chunk
-        finally:
-            if first_token:
-                if rng_ttft is not None:
-                    _nvtx.end_range(rng_ttft)
-
-    # ── Disaggregated generation (prefill here, decode remote) ───────
-
-    async def _generate_disagg(
-        self,
-        request: vLLMMultimodalRequest,
-        multi_modal_data: dict[str, Any],
-        rng_ttft=None,
-        context=None,
-    ):
-        """Prefill locally, then forward to a remote decode worker."""
-        with _nvtx.annotate(
-            "mm:pd:disagg_prefill", color="darkred"
-        ), time_and_log_code_section(
-            f"[PREFILL] request: {request.request_id} prefill time"
-        ):
-            # Prepare prefill-only request
-            prefill_only_request = copy.deepcopy(request)
-            extra_args = prefill_only_request.sampling_params.extra_args or {}
-            extra_args["kv_transfer_params"] = {"do_remote_decode": True}
-            prefill_only_request.sampling_params.extra_args = extra_args
-            prefill_only_request.sampling_params.max_tokens = 1
-            prefill_only_request.sampling_params.min_tokens = 1
-            logger.debug("Prefill request: %s", prefill_only_request)
-
-            lora_request = self._resolve_lora_request(request.model)
-            trace_headers = build_trace_headers(context) if context else None
-            gen = self.engine_client.generate(
-                prompt=TokensPrompt(
-                    prompt_token_ids=prefill_only_request.engine_prompt[
-                        "prompt_token_ids"
-                    ],
-                    multi_modal_data=multi_modal_data,
-                ),
-                sampling_params=prefill_only_request.sampling_params,
-                request_id=prefill_only_request.request_id,
-                lora_request=lora_request,
-                trace_headers=trace_headers,
-            )
-
-            # Drain prefill generator (max_tokens=1, expect a single response)
-            async for prefill_response in gen:
-                pass
-        if rng_ttft is not None:
-            _nvtx.end_range(rng_ttft)
-
-        # Qwen VL (mRoPE): keep the ORIGINAL unexpanded prompt.
-        # The decode worker passes multi_modal_data which causes vLLM to
-        # expand the prompt identically to prefill, ensuring block counts match.
-        #
-        # Other models: use the expanded prompt from prefill response.
-        # They don't pass multi_modal_data in decode, so they need the
-        # already-expanded prompt to match the KV cache layout.
-        if not is_qwen_vl_model(self.config.model):
-            request.engine_prompt[
-                "prompt_token_ids"
-            ] = prefill_response.prompt_token_ids
-
-        logger.debug(
-            f"Prefill response kv_transfer_params: {prefill_response.kv_transfer_params}"
-        )
-        extra_args = request.sampling_params.extra_args or {}
-        extra_args["kv_transfer_params"] = prefill_response.kv_transfer_params
-        extra_args.pop("serialized_request", None)
-        request.sampling_params.extra_args = extra_args
-        logger.debug("Decode request: %s", request)
-
-        # Serialized request is lightweight: token IDs, sampling params with
-        # kv_transfer_params, and small Qwen metadata (image_grid_thw,
-        # embeddings_shape).  Heavy multimodal data was consumed locally by
-        # engine_client.generate() and multimodal_inputs was cleared by
-        # `_finalize_request_metadata`.
-        #
-        # request.model (LoRA name) is preserved in the serialized request
-        # so the decode worker can resolve the same LoRA adapter.
-        if lora_request and request.model:
-            logger.debug(
-                f"Forwarding disaggregated decode with LoRA '{request.model}' "
-                f"— ensure the same adapter is loaded on the decode worker."
-            )
-
-        with (
-            _nvtx.annotate("mm:pd:disagg_remote_decode", color="purple"),
-            time_and_log_code_section(
-                f"[PREFILL] request: {request.request_id} remote decode time"
-            ) as decode_timer,
-        ):
-            num_output_tokens_so_far = 0
-            if self.decode_worker_client is None:
-                raise RuntimeError("Decode worker client is not configured.")
-            async for (decode_response) in await self.decode_worker_client.round_robin(
-                request.model_dump_json(), context=context
-            ):
-                output = MyRequestOutput.model_validate_json(decode_response.data())
-                yield self._format_engine_output(output, num_output_tokens_so_far)
-                if output.outputs:
-                    if num_output_tokens_so_far == 0:
-                        decode_timer.stop_interval()  # Log time to first decode response
-                    num_output_tokens_so_far = len(output.outputs[0].token_ids)
-
-    # ── Public entry point ───────────────────────────────────────────
-
-    async def generate(self, raw_request: dict, context):
-        """Parse the request, load multimodal data, and run inference."""
-        rng_pd = _nvtx.start_range("mm:pd_worker_generate", color="green")
-        rng_ttft = _nvtx.start_range("mm:pd:ttft", color="orange")
-
-        with time_and_log_code_section("[REQUEST] embedding processing time"):
-            rng_parse = _nvtx.start_range("mm:pd:parse_request", color="cyan")
-            request, image_urls = self._parse_frontend_request(raw_request)
-            logger.debug(f"Received PD request: {{ id: {request.request_id} }}.")
-            _nvtx.end_range(rng_parse)
-
-            rng_load = _nvtx.start_range("mm:pd:load_multimodal", color="yellow")
-            multi_modal_data = await self._load_multimodal_data(
-                image_urls, request.request_id, context
-            )
-            _nvtx.end_range(rng_load)
-
-            self._finalize_request_metadata(request, multi_modal_data)
-
-        if self.enable_disagg and self.decode_worker_client:
-            rng_disagg = _nvtx.start_range("mm:pd:generate_disagg", color="red")
-            async for chunk in self._generate_disagg(
-                request, multi_modal_data, rng_ttft, context=context
-            ):
-                yield chunk
-            _nvtx.end_range(rng_disagg)
-        else:
-            rng_agg = _nvtx.start_range("mm:pd:generate_agg", color="red")
-            async for chunk in self._generate_agg(
-                request, multi_modal_data, rng_ttft, context=context
-            ):
-                yield chunk
-            _nvtx.end_range(rng_agg)
-
-        _nvtx.end_range(rng_pd)
--- a/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
+++ b/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import logging
-from typing import AsyncIterator
-
-from vllm.inputs.data import TokensPrompt
-
-import dynamo.nixl_connect as connect
-from dynamo.common.utils import nvtx_utils as _nvtx
-from dynamo.common.utils.otel_tracing import build_trace_headers
-from dynamo.common.utils.time_section import time_and_log_code_section
-from dynamo.runtime import DistributedRuntime
-
-from ..args import Config
-from ..constants import DisaggregationMode
-from ..handlers import BaseWorkerHandler
-from ..multimodal_utils import MyRequestOutput, vLLMMultimodalRequest
-from ..multimodal_utils.model import construct_qwen_decode_mm_data, is_qwen_vl_model
-
-logger = logging.getLogger(__name__)
-
-
-class MultimodalDecodeWorkerHandler(BaseWorkerHandler[vLLMMultimodalRequest, str]):
-    """Decode worker for disaggregated multimodal serving"""
-
-    def __init__(
-        self,
-        runtime,
-        engine_client,
-        config: Config,
-        shutdown_event=None,
-        generate_endpoint=None,
-    ):
-        # Get default_sampling_params from config
-        default_sampling_params = (
-            config.engine_args.create_model_config().get_diff_sampling_param()
-        )
-
-        # Call BaseWorkerHandler.__init__ with proper parameters
-        super().__init__(
-            runtime,
-            config,
-            engine_client,
-            default_sampling_params,
-            enable_multimodal=config.enable_multimodal,
-            generate_endpoint=generate_endpoint,
-            shutdown_event=shutdown_event,
-        )
-
-        self.config = config
-        self.enable_disagg = config.disaggregation_mode == DisaggregationMode.PREFILL
-
-    async def async_init(self, runtime: DistributedRuntime):
-        """Async initialization - connector needs async setup"""
-        self._connector = connect.Connector()
-        logger.info("Multimodal Decode Worker async initialization completed.")
-
-    async def generate(
-        self, request: vLLMMultimodalRequest, context
-    ) -> AsyncIterator[str]:
-        rng_decode = _nvtx.start_range("mm:decode_worker_generate", color="blue")
-        logger.debug(f"Got raw request: {request}")
-        if not isinstance(request, vLLMMultimodalRequest):
-            if isinstance(request, str):
-                request = vLLMMultimodalRequest.model_validate_json(request)
-            else:
-                request = vLLMMultimodalRequest.model_validate(request)
-        with time_and_log_code_section(
-            f"[DECODE] request: {request.request_id} preprocessing time"
-        ):
-            logger.debug(f"Received decode request: {{ id: {request.request_id} }}.")
-
-            # For Qwen VL models with mRoPE, we need to pass multi_modal_data containing
-            # image_grid_thw for position embeddings calculation. The decode worker
-            # receives the ORIGINAL unexpanded prompt (with placeholders), and vLLM
-            # will expand it using the multi_modal_data, ensuring the block count
-            # matches what prefill computed.
-            #
-            # We pass unique placeholder embeddings (seeded by request_id) since the
-            # actual embeddings are already in the KV cache from prefill. The unique
-            # values prevent incorrect prefix cache matches between different images.
-            multi_modal_data = None
-            if is_qwen_vl_model(self.config.model):
-                image_grid_thw = getattr(request, "image_grid_thw", None)
-                embeddings_shape = getattr(request, "embeddings_shape", None)
-                if image_grid_thw is None or embeddings_shape is None:
-                    logger.warning(
-                        "Missing Qwen VL decode fields (image_grid_thw/embeddings_shape); "
-                        "skipping multi_modal_data construction."
-                    )
-                else:
-                    multi_modal_data = construct_qwen_decode_mm_data(
-                        image_grid_thw, embeddings_shape, request.request_id
-                    )
-            lora_request = self._resolve_lora_request(request.model)
-            trace_headers = build_trace_headers(context) if context else None
-
-        with time_and_log_code_section(
-            f"[DECODE] request: {request.request_id} generate time"
-        ) as gen_timer:
-            gen = self.engine_client.generate(
-                prompt=TokensPrompt(
-                    prompt_token_ids=request.engine_prompt["prompt_token_ids"],
-                    multi_modal_data=multi_modal_data,
-                ),
-                sampling_params=request.sampling_params,
-                request_id=request.request_id,
-                lora_request=lora_request,
-                trace_headers=trace_headers,
-            )
-
-            rng_first = _nvtx.start_range("mm:decode:first_token", color="darkred")
-            first_token = True
-            try:
-                async for response in gen:
-                    if first_token:
-                        gen_timer.stop_interval()  # Log time to first response
-                        _nvtx.end_range(rng_first)
-                        first_token = False
-                    logger.debug(
-                        f"Response kv_transfer_params: {response.kv_transfer_params}"
-                    )
-                    yield MyRequestOutput(
-                        request_id=response.request_id,
-                        prompt=response.prompt,
-                        prompt_token_ids=response.prompt_token_ids,
-                        prompt_logprobs=response.prompt_logprobs,
-                        outputs=response.outputs,
-                        finished=response.finished,
-                        metrics=response.metrics,
-                        kv_transfer_params=response.kv_transfer_params,
-                    ).model_dump_json()
-            finally:
-                if first_token:
-                    _nvtx.end_range(rng_first)
-                _nvtx.end_range(rng_decode)
--- a/components/src/dynamo/vllm/tests/multimodal_utils/test_vllm_prefill_worker_utils.py
+++ b/components/src/dynamo/vllm/tests/multimodal_utils/test_vllm_prefill_worker_utils.py
@@ -26,7 +26,7 @@ MODEL = "test-model"
 DTYPE = torch.float16


-class TestMultimodalEmbeddingsLoader:
+class TestMultimodalEmbeddingLoader:
    @pytest.mark.asyncio
    async def test_all_cached(self):
        """All URLs cached -> no encode worker call, returns accumulated mm_data."""

--- a/components/src/dynamo/vllm/tests/test_backend_args.py
+++ b/components/src/dynamo/vllm/tests/test_backend_args.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Unit tests for vLLM backend arguments.
+
+[gluo NOTE] currently the test cover is being added as part of multimodal related test coverage,
+need to add more tests to cover different code paths of DynamoVllmConfig.
+"""
+
+
+import pytest
+
+from dynamo.vllm.backend_args import DisaggregationMode, DynamoVllmConfig
+
+pytestmark = [
+    pytest.mark.unit,
+    pytest.mark.vllm,
+    pytest.mark.pre_merge,
+    pytest.mark.gpu_0,
+]
+
+
+def create_config() -> DynamoVllmConfig:
+    """
+    Create a config with default values. This is needed as the config
+    is instantiated by the argparse parser with dynamically generated fields,
+    so we need to create a config with default values manually if not using
+    from_cli_args() method.
+
+    All multimodal flags are False, disaggregation mode is None.
+    Returns:
+        DynamoVllmConfig: A config with default values.
+    """
+    config = DynamoVllmConfig()
+    config.disaggregation_mode = None
+    config.multimodal_worker = False
+    config.multimodal_encode_worker = False
+    config.multimodal_decode_worker = False
+    return config
+
+
+class TestResolveDisaggregationModeFromLegacyMultimodalFlags:
+    """
+    Test suite for resolving disaggregation mode when legacy multimodal flags are set.
+    """
+
+    @pytest.mark.parametrize(
+        "mode",
+        [
+            None,  # Not specified
+            DisaggregationMode.AGGREGATED,
+            # DisaggregationMode.PREFILL, # test in 'test_prefill_worker' below
+            DisaggregationMode.DECODE,
+            DisaggregationMode.ENCODE,
+        ],
+    )
+    def test_agg_worker(self, mode):
+        config = create_config()
+        config.disaggregation_mode = mode
+        config.multimodal_worker = True
+        with pytest.warns(DeprecationWarning):
+            if mode is None or mode == DisaggregationMode.AGGREGATED:
+                config._resolve_disaggregation_model_from_legacy_multimodal_flags()
+                assert config.disaggregation_mode == DisaggregationMode.AGGREGATED
+            else:
+                with pytest.raises(ValueError):
+                    config._resolve_disaggregation_model_from_legacy_multimodal_flags()
+
+    # special case of 'test_agg_worker' above, test the prefill worker case
+    def test_prefill_worker(self):
+        config = create_config()
+        config.disaggregation_mode = DisaggregationMode.PREFILL
+        config.multimodal_worker = True
+        with pytest.warns(DeprecationWarning):
+            config._resolve_disaggregation_model_from_legacy_multimodal_flags()
+            assert config.disaggregation_mode == DisaggregationMode.PREFILL
+
+    @pytest.mark.parametrize(
+        "mode",
+        [
+            None,  # Not specified
+            DisaggregationMode.AGGREGATED,
+            DisaggregationMode.PREFILL,
+            DisaggregationMode.DECODE,
+            DisaggregationMode.ENCODE,
+        ],
+    )
+    def test_encode_worker(self, mode):
+        config = create_config()
+        config.disaggregation_mode = mode
+        config.multimodal_encode_worker = True
+        with pytest.warns(DeprecationWarning):
+            if mode is None or mode == DisaggregationMode.ENCODE:
+                config._resolve_disaggregation_model_from_legacy_multimodal_flags()
+                assert config.disaggregation_mode == DisaggregationMode.ENCODE
+            else:
+                with pytest.raises(ValueError):
+                    config._resolve_disaggregation_model_from_legacy_multimodal_flags()
+
+    @pytest.mark.parametrize(
+        "mode",
+        [
+            None,  # Not specified
+            DisaggregationMode.AGGREGATED,
+            DisaggregationMode.PREFILL,
+            DisaggregationMode.DECODE,
+            DisaggregationMode.ENCODE,
+        ],
+    )
+    def test_decode_worker(self, mode):
+        config = create_config()
+        config.disaggregation_mode = mode
+        config.multimodal_decode_worker = True
+        with pytest.warns(DeprecationWarning):
+            if mode is None or mode == DisaggregationMode.DECODE:
+                config._resolve_disaggregation_model_from_legacy_multimodal_flags()
+                assert config.disaggregation_mode == DisaggregationMode.DECODE
+            else:
+                with pytest.raises(ValueError):
+                    config._resolve_disaggregation_model_from_legacy_multimodal_flags()
--- a/components/src/dynamo/vllm/tests/test_vllm_worker_factory.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_worker_factory.py
@@ -33,53 +33,6 @@ def _make_config(**overrides) -> Mock:
    return Mock(**defaults)


-class TestHandles:
-    """Test WorkerFactory.handles() config detection."""
-
-    # Legacy worker config
-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    def test_multimodal_encode_worker(self, route_to_encode: bool) -> None:
-        # 'route_to_encoder' can be passed, the worker creation may ignore it.
-        config = _make_config(
-            multimodal_encode_worker=True, route_to_encoder=route_to_encode
-        )
-        assert WorkerFactory.handles(config)
-
-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    def test_multimodal_worker(self, route_to_encode: bool) -> None:
-        config = _make_config(multimodal_worker=True, route_to_encoder=route_to_encode)
-        assert WorkerFactory.handles(config)
-
-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    def test_multimodal_decode_worker(self, route_to_encode: bool) -> None:
-        config = _make_config(
-            multimodal_decode_worker=True, route_to_encoder=route_to_encode
-        )
-        assert WorkerFactory.handles(config)
-
-    # Tests for no standalone encode worker setting
-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    def test_no_multimodal_flags(self, route_to_encode: bool) -> None:
-        config = _make_config(route_to_encoder=route_to_encode)
-        assert WorkerFactory.handles(config)
-
-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    def test_prefill(self, route_to_encode: bool) -> None:
-        config = _make_config(
-            disaggregation_mode=DisaggregationMode.PREFILL,
-            route_to_encoder=route_to_encode,
-        )
-        assert WorkerFactory.handles(config)
-
-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    def test_decode(self, route_to_encode: bool) -> None:
-        config = _make_config(
-            disaggregation_mode=DisaggregationMode.DECODE,
-            route_to_encoder=route_to_encode,
-        )
-        assert WorkerFactory.handles(config)
-
-
 @pytest.mark.asyncio
 class TestCreate:
    """Test WorkerFactory.create() routing."""
@@ -136,13 +89,11 @@ class TestCreate:

        factory._create_decode_worker.assert_called_once()  # type: ignore[union-attr]

-    # Tests with legacy worker config.
    @pytest.mark.parametrize("route_to_encode", [True, False])
-    async def test_routes_to_multimodal_encode(
-        self, factory: WorkerFactory, route_to_encode: bool
-    ) -> None:
+    async def test_encode(self, factory: WorkerFactory, route_to_encode: bool) -> None:
        config = _make_config(
-            multimodal_encode_worker=True, route_to_encoder=route_to_encode
+            disaggregation_mode=DisaggregationMode.ENCODE,
+            route_to_encoder=route_to_encode,
        )
        shutdown_event = asyncio.Event()

@@ -150,30 +101,6 @@ class TestCreate:

        factory._create_multimodal_encode_worker.assert_called_once()  # type: ignore[union-attr]

-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    async def test_routes_to_multimodal_worker(
-        self, factory: WorkerFactory, route_to_encode: bool
-    ) -> None:
-        config = _make_config(multimodal_worker=True, route_to_encoder=route_to_encode)
-        shutdown_event = asyncio.Event()
-
-        await factory.create(Mock(), config, shutdown_event, [])
-
-        factory._create_multimodal_worker.assert_called_once()  # type: ignore[union-attr]
-
-    @pytest.mark.parametrize("route_to_encode", [True, False])
-    async def test_routes_multimodal_decode_worker(
-        self, factory: WorkerFactory, route_to_encode: bool
-    ) -> None:
-        config = _make_config(
-            multimodal_decode_worker=True, route_to_encoder=route_to_encode
-        )
-        shutdown_event = asyncio.Event()
-
-        await factory.create(Mock(), config, shutdown_event, [])
-
-        factory._create_multimodal_worker.assert_called_once()  # type: ignore[union-attr]
-
    async def test_passes_snapshot_engine(self, factory: WorkerFactory) -> None:
        config = _make_config(multimodal_worker=True)
        runtime = Mock()
@@ -195,7 +122,7 @@ class TestCreate:
            snapshot_engine=snapshot_engine,
        )

-        factory._create_multimodal_worker.assert_called_once_with(  # type: ignore[union-attr]
+        factory._create_decode_worker.assert_called_once_with(  # type: ignore[union-attr]
            runtime,
            config,
            shutdown_event,

--- a/components/src/dynamo/vllm/tests/multimodal_handlers/test_vllm_multimodal_pd_worker_handler.py
+++ b/components/src/dynamo/vllm/tests/multimodal_handlers/test_vllm_multimodal_pd_worker_handler.py
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-"""Unit tests for MultimodalPDWorkerHandler."""
+"""Unit tests for WorkerHandler in combination with multimodal handling."""
+# [gluo FIXME] This suite of tests is added for MultimodalPDWorkerHandler,
+# which is now removed. Yet the concept of this tests is still valid that
+# we need to have unit tests for the worker handlers.
+# Need to revisit the tests and update them to test the worker handlers.

 import json
 from collections import defaultdict
@@ -10,10 +14,10 @@ from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 import torch

+import dynamo.vllm.handlers as mod
 from dynamo.common.memory.multimodal_embedding_cache_manager import (
    MultimodalEmbeddingCacheManager,
 )
-from dynamo.vllm.multimodal_handlers import multimodal_pd_worker_handler as mod
 from dynamo.vllm.multimodal_utils.protocol import (
    PatchedTokensPrompt,
    vLLMMultimodalRequest,
@@ -64,17 +68,17 @@ def _make_handler(
    config: MagicMock | None = None,
    encode_worker_client: MagicMock | None = None,
    decode_worker_client: MagicMock | None = None,
-) -> mod.MultimodalPDWorkerHandler:
+) -> mod.DecodeWorkerHandler:
    """Construct a handler with BaseWorkerHandler.__init__ bypassed."""
    if config is None:
        config = _make_config()
    with patch.object(mod.BaseWorkerHandler, "__init__", return_value=None):
-        return mod.MultimodalPDWorkerHandler(
+        return mod.DecodeWorkerHandler(
            runtime=MagicMock(),
-            engine_client=MagicMock(),
            config=config,
+            engine_client=MagicMock(),
+            default_sampling_params={},
            encode_worker_client=encode_worker_client,
-            decode_worker_client=decode_worker_client,
        )


@@ -123,6 +127,7 @@ def _make_engine_response(request_id: str = "req-1", finished: bool = True):
 # ── Tests ────────────────────────────────────────────────────────────


+@pytest.mark.skip(reason="Need to revisit tests, see comment at top of the file")
 class TestInit:
    def test_embedding_cache_created_when_capacity_set(self):
        capacity_gb = 0.1
@@ -136,6 +141,7 @@ class TestInit:
        assert handler.embedding_cache_manager._capacity_bytes == expected_bytes


+@pytest.mark.skip(reason="Need to revisit tests, see comment at top of the file")
 class TestParseFrontendRequest:
    def test_extracts_token_ids_and_sampling_params(self):
        """Parses token_ids and sampling_params from raw frontend dict."""
@@ -159,6 +165,7 @@ class TestParseFrontendRequest:
        assert image_urls == ["http://a.png", "http://b.png"]


+@pytest.mark.skip(reason="Need to revisit tests, see comment at top of the file")
 class TestLoadMultimodalData:
    @pytest.mark.asyncio
    async def test_no_encode_client_returns_empty(self):
@@ -209,6 +216,7 @@ class TestLoadMultimodalData:
        assert mock_load.call_args.kwargs["model"] == handler.config.model


+@pytest.mark.skip(reason="Need to revisit tests, see comment at top of the file")
 class TestGenerateAgg:
    @pytest.mark.asyncio
    async def test_streams_serialized_responses(self):
@@ -238,6 +246,7 @@ class TestGenerateAgg:
        assert chunks[0]["finish_reason"] == "stop"


+@pytest.mark.skip(reason="Need to revisit tests, see comment at top of the file")
 class TestGenerateDisagg:
    @pytest.mark.asyncio
    async def test_prefills_then_forwards_to_decode(self):

--- a/components/src/dynamo/vllm/worker_factory.py
+++ b/components/src/dynamo/vllm/worker_factory.py
@@ -22,11 +22,7 @@ from .args import Config
 from .constants import DisaggregationMode
 from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
 from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
-from .multimodal_handlers import (
-    EncodeWorkerHandler,
-    MultimodalDecodeWorkerHandler,
-    MultimodalPDWorkerHandler,
-)
+from .multimodal_handlers import EncodeWorkerHandler
 from .publisher import StatLoggerFactory

 logger = logging.getLogger(__name__)
@@ -58,65 +54,6 @@ class WorkerFactory:
        self.setup_fpm_relay = setup_fpm_relay_fn
        self.setup_metrics_collection = setup_metrics_collection_fn

-    @staticmethod
-    def handles(config: Config) -> bool:
-        """Return True if this factory handles the given config."""
-        try:
-            WorkerFactory._validate_config(config)
-            return True
-        except (ValueError, NotImplementedError) as e:
-            logger.error(
-                f"WorkerFactory cannot handle config: {e}, provided config: {WorkerFactory._config_str(config)}"
-            )
-            return False
-
-    @staticmethod
-    def _config_str(config: Config) -> str:
-        """Helper function to format config for logging."""
-        return (
-            "{ "
-            f"multimodal_worker: {config.multimodal_worker}, "
-            f"multimodal_decode_worker: {config.multimodal_decode_worker}, "
-            f"multimodal_encode_worker: {config.multimodal_encode_worker}, "
-            f"disaggregation_mode: {config.disaggregation_mode}, "
-            f"route_to_encoder: {config.route_to_encoder}"
-            " }"
-        )
-
-    @staticmethod
-    def _validate_config(config: Config) -> None:
-        # [gluo FIXME] We are validating config combination for
-        # the transition away from "legacy" E/PD creation, which uses specialized
-        # P/D classes.
-        # In the future, we should rely on Dynamo runtime for P/D orchestration,
-        # thus the P/D worker in 'handlers.py' should soon be extended to support
-        # remote encode workflow, i.e. aware of encode worker client and perform remote
-        # encode when needed.
-        # Until then, we have validation on disaggregation mode and multimodal settings
-        # to guide user to use legacy mode for unsupported combination (see FIXME below).
-        legacy_multimodal_llm_worker = (
-            config.multimodal_worker or config.multimodal_decode_worker
-        )
-        if legacy_multimodal_llm_worker:
-            # [gluo] Sanity check, may be removed once legacy mode is removed.
-            # In the legacy mode, the specialized worker have P -> (optional D),
-            # so multimodal worker can be AGGREGATED or PREFILL, while
-            # multimodal decode worker can only be DECODE.
-            if (
-                config.multimodal_decode_worker
-                and config.disaggregation_mode == DisaggregationMode.PREFILL
-            ):
-                raise ValueError(
-                    "Multimodal decode worker with PREFILL disaggregation mode is not supported."
-                )
-            if (
-                config.multimodal_worker
-                and config.disaggregation_mode == DisaggregationMode.DECODE
-            ):
-                raise ValueError(
-                    "Multimodal worker with DECODE disaggregation mode is not supported."
-                )
-
    async def create(
        self,
        runtime: DistributedRuntime,
@@ -126,38 +63,12 @@ class WorkerFactory:
        snapshot_engine: Optional[EngineSetupResult] = None,
    ) -> None:
        """Create the appropriate multimodal worker based on config flags."""
-        WorkerFactory._validate_config(config)

-        # Standalone encode worker
-        if config.multimodal_encode_worker:
+        if config.disaggregation_mode == DisaggregationMode.ENCODE:
            await self._create_multimodal_encode_worker(
                runtime, config, shutdown_event, shutdown_endpoints
            )
-            return
-
-        # [gluo WIP] This conditional should only be within worker creation,
-        # put here as some LLM worker setting is not compatible with
-        # standalone encode worker, so check supported combinations early.
-        # LLM connects to standalone encode worker
-        legacy_multimodal_llm_worker = (
-            config.multimodal_worker or config.multimodal_decode_worker
-        )
-
-        # Create P/D worker, internally may use remote encode worker for multimodal work
-        if legacy_multimodal_llm_worker:
-            await self._create_multimodal_worker(
-                runtime,
-                config,
-                shutdown_event,
-                shutdown_endpoints,
-                snapshot_engine=snapshot_engine,
-            )
-            return
-
-        # [gluo FIXME] currently refactoring DecodeWorkerHandler from main.py for
-        # the use case of only disaggregating encode worker, so adding only decode
-        # worker creation for now, which is used in DisaggregationMode.AGGREGATED.
-        if config.disaggregation_mode == DisaggregationMode.PREFILL:
+        elif config.disaggregation_mode == DisaggregationMode.PREFILL:
            await self._create_prefill_worker(
                runtime,
                config,
@@ -166,6 +77,7 @@ class WorkerFactory:
                snapshot_engine=snapshot_engine,
            )
        else:
+            # AGGREGATED or DECODE
            await self._create_decode_worker(
                runtime,
                config,
@@ -175,162 +87,6 @@ class WorkerFactory:
            )
        return

-    async def _create_multimodal_worker(
-        self,
-        runtime: DistributedRuntime,
-        config: Config,
-        shutdown_event: asyncio.Event,
-        shutdown_endpoints: list,  # mutated in place
-        snapshot_engine: Optional[EngineSetupResult] = None,
-    ) -> None:
-        """
-        Initialize multimodal worker component.
-
-        Supports:
-        - --multimodal-worker: PD worker that may receive embeddings from encoder
-        - --multimodal-decode-worker: Decode-only worker
-
-        Modes:
-        - Aggregated (P+D): Prefill and decode on same worker
-        - Disaggregated (P→D): Prefill forwards to separate decode worker
-        """
-        generate_endpoint = runtime.endpoint(
-            f"{config.namespace}.{config.component}.{config.endpoint}"
-        )
-        clear_endpoint = runtime.endpoint(
-            f"{config.namespace}.{config.component}.clear_kv_blocks"
-        )
-        shutdown_endpoints[:] = [generate_endpoint, clear_endpoint]
-
-        lora_enabled = config.engine_args.enable_lora
-        if lora_enabled:
-            load_lora_endpoint = runtime.endpoint(
-                f"{config.namespace}.{config.component}.load_lora"
-            )
-            unload_lora_endpoint = runtime.endpoint(
-                f"{config.namespace}.{config.component}.unload_lora"
-            )
-            list_loras_endpoint = runtime.endpoint(
-                f"{config.namespace}.{config.component}.list_loras"
-            )
-            shutdown_endpoints.extend(
-                [load_lora_endpoint, unload_lora_endpoint, list_loras_endpoint]
-            )
-        # Use pre-created engine if provided (checkpoint mode), otherwise create new
-        if snapshot_engine is not None:
-            (
-                engine_client,
-                vllm_config,
-                _default_sampling_params,
-                prometheus_temp_dir,
-                _component_gauges,
-            ) = snapshot_engine
-        else:
-            (
-                engine_client,
-                vllm_config,
-                _default_sampling_params,
-                prometheus_temp_dir,
-                _component_gauges,
-            ) = self.setup_vllm_engine(config)
-
-        # Set up encode worker client when routing to encoder is enabled
-        encode_worker_client = await self._maybe_get_encode_worker_client(
-            runtime, config
-        )
-
-        # Set up decode worker client for disaggregated mode
-        decode_worker_client = None
-        if config.disaggregation_mode == DisaggregationMode.PREFILL:
-            decode_worker_client = await runtime.endpoint(
-                f"{config.namespace}.decoder.generate"
-            ).client()
-            await decode_worker_client.wait_for_instances()
-            logger.info("Connected to decode worker for disaggregated mode")
-
-        # Choose handler based on worker type
-        handler: MultimodalDecodeWorkerHandler | MultimodalPDWorkerHandler
-        if config.multimodal_decode_worker:
-            handler = MultimodalDecodeWorkerHandler(
-                runtime,
-                engine_client,
-                config,
-                shutdown_event,
-                generate_endpoint=generate_endpoint,
-            )
-        else:
-            handler = MultimodalPDWorkerHandler(
-                runtime,
-                engine_client,
-                config,
-                encode_worker_client,
-                decode_worker_client,
-                shutdown_event,
-                generate_endpoint=generate_endpoint,
-            )
-        handler.add_temp_dir(prometheus_temp_dir)
-
-        await handler.async_init(runtime)
-
-        # Set up KV event publisher for prefix caching if enabled
-        kv_publisher = self.setup_kv_event_publisher(
-            config, generate_endpoint, vllm_config
-        )
-        if kv_publisher:
-            handler.kv_publisher = kv_publisher  # type: ignore[attr-defined, union-attr]
-
-        if not config.multimodal_decode_worker:
-            model_type = parse_endpoint_types(config.endpoint_types)
-            model_input = (
-                ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
-            )
-            await self.register_vllm_model(
-                model_input,
-                model_type,
-                generate_endpoint,
-                config,
-                engine_client,
-                vllm_config,
-            )
-
-        metrics_labels = [("model", config.served_model_name or config.model)]
-        try:
-            serve_tasks = [
-                generate_endpoint.serve_endpoint(
-                    handler.generate,
-                    metrics_labels=metrics_labels,
-                ),
-                clear_endpoint.serve_endpoint(
-                    handler.clear_kv_blocks,
-                    metrics_labels=metrics_labels,
-                ),
-            ]
-
-            if lora_enabled:
-                serve_tasks.extend(
-                    [
-                        load_lora_endpoint.serve_endpoint(
-                            handler.load_lora,
-                            metrics_labels=metrics_labels,
-                        ),
-                        unload_lora_endpoint.serve_endpoint(
-                            handler.unload_lora,
-                            metrics_labels=metrics_labels,
-                        ),
-                        list_loras_endpoint.serve_endpoint(
-                            handler.list_loras,
-                            metrics_labels=metrics_labels,
-                        ),
-                    ]
-                )
-
-            await asyncio.gather(*serve_tasks)
-        except Exception as e:
-            logger.error(f"Failed to serve endpoints: {e}")
-            raise
-        finally:
-            handler.cleanup()
-
    async def _create_multimodal_encode_worker(
        self,
        runtime: DistributedRuntime,
@@ -780,11 +536,12 @@ class WorkerFactory:
    ) -> Optional[Any]:
        """Helper function to get encode worker client if routing to encoder is enabled."""
        if config.route_to_encoder:
+            # [gluo NOTE] hardcoded component name
            encode_worker_client = await runtime.endpoint(
-                f"{config.namespace}.encoder.generate"
+                f"{config.namespace}.encode.generate"
            ).client()
            logger.info("Waiting for Encoder Worker Instances ...")
            await encode_worker_client.wait_for_instances()
-            logger.info("Connected to encoder workers")
+            logger.info("Connected to encode workers")
            return encode_worker_client
        return None
--- a/docs/api/nixl-connect/README.md
+++ b/docs/api/nixl-connect/README.md
@@ -140,13 +140,13 @@ flowchart LR

 #### Code Examples

-See [MultimodalPDWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) or [MultimodalDecodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) from our Multimodal example,
-for how they coordinate directly with the Encode Worker by creating a [`WritableOperation`](writable-operation.md),
+See [NixlReadEmbeddingSender](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/common/multimodal/embedding_transfer.py),
+for how they coordinate directly with the Encode Worker by creating a [`ReadableOperation`](readable-operation.md),
 sending the operation's metadata via Dynamo's round-robin dispatcher, and awaiting the operation for completion before making use of the transferred data.

-See [MultimodalEncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) from our Multimodal example,
+See [NixlReadEmbeddingReceiver](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/common/multimodal/embedding_transfer.py),
 for how the resulting embeddings are registered with the NIXL subsystem by creating a [`Descriptor`](descriptor.md),
-a [`WriteOperation`](write-operation.md) is created using the metadata provided by the requesting worker,
+a [`ReadOperation`](read-operation.md) is created using the metadata provided by the requesting worker,
 and the worker awaits for the data transfer to complete for yielding a response.



--- a/docs/features/multimodal/multimodal-vllm.md
+++ b/docs/features/multimodal/multimodal-vllm.md
@@ -66,13 +66,13 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))

 **Components:**

- workers: [EncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) for encoding and [MultimodalPDWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) for prefilling and decoding.
+- workers: [EncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) for encoding and [DecodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/handlers.py) for prefilling and decoding.
 - processor: Tokenizes the prompt and passes it to the EncodeWorkerHandler.
 - frontend: HTTP endpoint to handle incoming requests.

 **Workflow:**

-The EncodeWorkerHandler encodes the image and passes the embeddings to the MultimodalPDWorkerHandler via NATS and RDMA. The work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface.
+The EncodeWorkerHandler encodes the image and passes the embeddings to the DecodeWorkerHandler via NATS and RDMA. The work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface.

 ```mermaid
 flowchart LR
@@ -130,7 +130,7 @@ curl http://localhost:8000/v1/chat/completions \

 **Components:**

- workers: [EncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) for encoding, [MultimodalDecodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) for decoding, and [MultimodalPDWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) for prefilling.
+- workers: [EncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) for encoding, [DecodeWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/handlers.py) for decoding, and [PrefillWorkerHandler](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/vllm/handlers.py) for prefilling.
 - processor: Tokenizes the prompt and passes it to the EncodeWorkerHandler.
 - frontend: HTTP endpoint to handle incoming requests.