feat: NIXL Based RDMA Support w/ Multimodal Example (#1060)

75e774d4 · J Wyman · GitHub · 9acaa8d1 · 75e774d4 · 75e774d4
Unverified Commit 75e774d4 authored May 27, 2025 by J Wyman Committed by GitHub May 27, 2025
11 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-exclude: ^(src/grpc_generated|.*\.patch$)
+exclude: ^(src/grpc_generated|.*\.patch$|.*/connect/.*\.py)
 repos:
 - repo: https://github.com/timothycrosley/isort
  rev: 5.12.0

--- a/examples/multimodal/components/worker.py
+++ b/examples/multimodal/components/worker.py
@@ -19,10 +19,11 @@ import os
 import signal
 from typing import Optional

+import connect
 import torch
 from components.disagg_router import PyDisaggregatedRouter
-from components.encode_worker import EncodeWorker
-from components.prefill_worker import PrefillWorker
+from components.encode_worker import VllmEncodeWorker
+from components.prefill_worker import VllmPrefillWorker
 from transformers import LlavaForConditionalGeneration
 from utils.logging import check_required_workers
 from utils.nixl import NixlMetadataStore
@@ -53,11 +54,11 @@ logger = logging.getLogger(__name__)
    resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
    workers=1,
 )
-class VllmWorker:
+class VllmDecodeWorker:
    # For disaggregated serving, we need to link the prefill worker to the vllm worker
-    prefill_worker = depends(PrefillWorker)
+    prefill_worker = depends(VllmPrefillWorker)
    # For aggregated serving, we need to link the encode worker to the vllm worker.
-    encode_worker = depends(EncodeWorker)
+    encode_worker = depends(VllmEncodeWorker)

    def __init__(self):
        self.client = None
@@ -141,7 +142,11 @@ class VllmWorker:
                vision_tower.vision_model.embeddings.position_embedding.num_embeddings
            )
        else:
-            enc_comp_ns, enc_comp_name = EncodeWorker.dynamo_address()  # type: ignore
+            EMBEDDINGS_SHAPE = (1, 577, 4096)
+            EMBEDDINGS_DTYPE = torch.float16
+            EMBEDDINGS_DEVICE = "cuda"
+
+            enc_comp_ns, enc_comp_name = VllmEncodeWorker.dynamo_address()  # type: ignore
            self.encode_worker_client = (
                await runtime.namespace(enc_comp_ns)
                .component(enc_comp_name)
@@ -149,9 +154,22 @@ class VllmWorker:
                .client()
            )

+            self._connector = connect.Connector(runtime=runtime, namespace=enc_comp_ns)
+            await self._connector.initialize()
+
+            # Create a longer-lived buffer for receiving the image embeddings.
+            embeddings = torch.empty(
+                EMBEDDINGS_SHAPE, dtype=EMBEDDINGS_DTYPE, device=EMBEDDINGS_DEVICE
+            )
+            descriptor = connect.Descriptor(embeddings)
+            # Register the descriptor w/ NIXL (this is optional, if not done here the connect subsytem will take care of this automatically).
+            descriptor.register_memory(self._connector)
+            self._embeddings_descriptor = (embeddings, descriptor)
+
            await check_required_workers(self.encode_worker_client, self.min_workers)
            self.disaggregated_router = None
-        logger.info("VllmWorker has been initialized")
+
+        logger.info("Initialization complete.")

    def shutdown_vllm_engine(self, signum, frame):
        """Shutdown the background loop"""
@@ -159,7 +177,7 @@ class VllmWorker:
        loop = asyncio.get_event_loop()
        try:
            self.engine_client.close()
-            logger.info("VllmWorker shutdown complete")
+            logger.info("Shutdown complete.")
        except Exception as e:
            logger.error(f"Error during shutdown: {e}")
        finally:
@@ -177,8 +195,18 @@ class VllmWorker:

    @endpoint()
    async def generate(self, request: vLLMMultimodalRequest):
-        image_features = None
+        request_id = request.request_id
+        image_url = request.image_url
+        logger.info(
+            f"Received multimodal request {{ id: {request_id}, image_url: '{image_url}' }}."
+        )
+        embeddings = None
        if self.do_remote_prefill:
+            logger.debug(
+                f"Disaggregated: request {{ id: {request_id}, image_url: '{image_url}' }}"
+                " prefill worker will populate the decode model's key-value cache ahead of time;"
+                " no direct encode worker interaction required."
+            )
            if self.disaggregated_router is not None:
                async with PrefillQueue.get_instance(
                    nats_server=self._prefill_queue_nats_server,
@@ -195,21 +223,21 @@ class VllmWorker:
                disagg_router_decision = True

            if self.do_remote_prefill and disagg_router_decision:
+                logger.debug(
+                    f"Prefilling remotely for request {{ id: {request_id}, image_url: '{image_url}' }} with length {len(request.engine_prompt['prompt_token_ids'])}"
+                )
                remote_prefill_params = RemotePrefillParams(
                    is_remote_prefill=True,
                    remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
                    # Pass the image url as part of the RemotePrefillParams, which will be passed to the prefill worker via RemotePrefillRequest
                    multimodal_data_source={
-                        "image_url": request.image_url,
+                        "image_url": image_url,
                    },
                )
-                logger.info(
-                    f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
-                )
            else:
                remote_prefill_params = None
-                logger.info(
-                    f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
+                logger.debug(
+                    f"Prefilling locally for request {{ id: {request_id}, image_url: '{image_url}' }} with length {len(request.engine_prompt['prompt_token_ids'])}"
                )

            # The decode worker will pre-allocate the memory based on the prompt token length for the prefill worker to transfer the kv cache.
@@ -231,33 +259,61 @@ class VllmWorker:
            )

        else:
-            # For aggregated serving, the vllm worker will directly send the encode request to the encode worker.
+            logger.debug(
+                f"Aggregated: request {{ id: {request_id}, image_url: '{image_url}' }}"
+                " no prefill worker available, embeddings directly from encode worker."
+            )
+            # Extract the pre-allocated, reusable image embeddings tensor and its descriptor.
+            # Doing this avoids unnessesary memory de/registration with NIXL.
+            embeddings, descriptor = self._embeddings_descriptor
+
+            with self._connector.create_writable(descriptor) as writable:
+                # Extract serialized metadata about the operation from the writable operation,
+                # and use it to create a new EncodeRequest.
+                encode_request = EncodeRequest(
+                    request_id=request_id,
+                    image_url=image_url,
+                    serialized_request=writable.to_serialized(),
+                )
+                logger.debug(f"Encode request: {encode_request.model_dump_json()}")
                encode_generator = await self.encode_worker_client.round_robin(
-                EncodeRequest(
-                    image_url=request.image_url,
-                ).model_dump_json()
+                    encode_request.model_dump_json()
                )
-            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
                async for encode_response in encode_generator:
                    encode_output = EncodeResponse.model_validate_json(
                        encode_response.data()
                    )
-                image_features = torch.tensor(
-                    encode_output.image_features, device=device, dtype=torch.float16
+                    logger.info(
+                        f"Received response: {{ id: {encode_output.request_id} }}"
                    )

+                # Wait for the write operation to complete.
+                # This will block until the write operation is complete.
+                # This await should be a no-op since we've already received a response from the encode worker.
+                await writable.wait_for_completion()
+                # At this point, the `embeddings` tensor is filled with the image embeddings from the remote encode worker.
+
            remote_prefill_params = None
            logger.info(
-                f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
+                f"Prefilling locally for request {{ id: {request_id}, image_url: '{image_url}' }} with length {len(request.engine_prompt['prompt_token_ids'])}"
            )
            prompt_ids = request.engine_prompt["prompt_token_ids"]

        # rust HTTP requires Delta streaming
        request.sampling_params.output_kind = RequestOutputKind.DELTA

-        if image_features is not None:
-            multi_modal_data = {"image": image_features}
+        # When using aggregated serving, the encode worker will have provided the key-value cache updates via the prefill worker.
+        # When using disaggregated serving, the encode worker will have provided the key-value cache updates via the encode worker.
+        if embeddings is not None:
+            logger.debug(
+                "Aggregated: embedding data from encode worker provided via multi-modal data to decode model."
+            )
+            multi_modal_data = {"image": embeddings}
        else:
+            logger.debug(
+                "Disaggregated: no embedding data required as prefill will have provided key-value cache updates via encode worker."
+            )
            multi_modal_data = None

        async for response in self.engine_client.generate(
@@ -269,6 +325,9 @@ class VllmWorker:
            request_id=request.request_id,
            remote_prefill_params=remote_prefill_params,
        ):
+            logger.debug(
+                f"Yeilding response {{ id: {response.request_id}, prompt: '{response.prompt}' }}"
+            )
            yield MyRequestOutput(
                request_id=response.request_id,
                prompt=response.prompt,

--- a/examples/multimodal/components/encode_worker.py
+++ b/examples/multimodal/components/encode_worker.py
@@ -15,8 +15,10 @@

 import logging
 from io import BytesIO
+from queue import Queue
 from typing import AsyncIterator

+import connect
 import requests
 import torch
 from PIL import Image
@@ -24,10 +26,25 @@ from transformers import AutoImageProcessor, LlavaForConditionalGeneration
 from utils.protocol import EncodeRequest, EncodeResponse
 from utils.vllm import parse_vllm_args

-from dynamo.sdk import endpoint, service
+from dynamo.sdk import async_on_start, endpoint, service

 logger = logging.getLogger(__name__)

+try:
+    import cupy as array_module
+
+    if not array_module.cuda.is_available():
+        raise ImportError("CUDA is not available.")
+    DEVICE = "cuda"
+    logger.info("Using cupy for array operations (GPU mode).")
+except ImportError as e:
+    logger.warning(f"Failed to import cupy, falling back to numpy: {e}.")
+    import numpy as array_module
+
+    DEVICE = "cpu"
+
+CACHE_SIZE_MAXIMUM = 8
+

 @service(
    dynamo={
@@ -36,7 +53,7 @@ logger = logging.getLogger(__name__)
    resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
    workers=1,
 )
-class EncodeWorker:
+class VllmEncodeWorker:
    def __init__(self) -> None:
        class_name = self.__class__.__name__
        self.engine_args = parse_vllm_args(class_name, "")
@@ -50,9 +67,50 @@ class EncodeWorker:
            self.MODEL_ID, device_map="auto", torch_dtype=torch.float16
        ).eval()

+        self._image_cache: dict[str, Image.Image] = {}
+        self._cache_queue: Queue[str] = Queue(maxsize=CACHE_SIZE_MAXIMUM)
+
    @endpoint()
    async def encode(self, request: EncodeRequest) -> AsyncIterator[EncodeResponse]:
-        image = self.open_image(request.image_url)
+        logger.debug(
+            f"Received encode request: {{ id: {request.request_id}, image_url: '{request.image_url}' }}."
+        )
+
+        request_id = request.request_id
+        image_url = request.image_url.lower()
+
+        # The following steps encode the requested image and provided useful embeddings.
+        # 1. Open the image from the provided URL.
+        # 2. Process the image using the image processor.
+        # 3. Run the image through the vision model's vision tower.
+        # 4. Run the results of the vision tower through the multi-modal projector.
+        # 5. Create a descriptor for the embeddings.
+        # 6. Create a write operation using the serialized request and the descriptor.
+        # 7. Await for the write operation to complete.
+        # 8. Yield the encode response.
+
+        # Either retrieve the image from the cache or download it and then cache it.
+        if request.image_url in self._image_cache:
+            image = self._image_cache[image_url]
+            logger.debug(
+                f"Image found in cache for request: {{ id: {request_id}, image_url: '{image_url}' }}."
+            )
+        else:
+            image = self.open_image(image_url)
+            logger.debug(
+                f"Downloading/opening image for request: {{ id: {request_id}, image_url: '{image_url}' }}."
+            )
+            # Cache the image for future use, and evict the oldest image if the cache is full.
+            if self._cache_queue.full():
+                oldest_image_url = self._cache_queue.get()
+                del self._image_cache[oldest_image_url]
+
+            self._image_cache[request.image_url] = image
+            self._cache_queue.put(request.image_url)
+
+        logger.debug(
+            f"Processing image for request: {{ id: {request_id}, image_url: '{image_url}' }}"
+        )
        image_embeds = self.image_processor(images=image, return_tensors="pt")

        with torch.no_grad():
@@ -60,22 +118,56 @@ class EncodeWorker:
            vision_outputs = self.vision_model.vision_tower(
                image_embeds["pixel_values"].to(self.vision_model.device)
            )
+            logger.debug("Vision model completed.")
+
+            embeddings = vision_outputs.last_hidden_state
+            embeddings = self.vision_model.multi_modal_projector(embeddings)
+
+            logger.debug(
+                f"Embeddings: {{ shape: {embeddings.shape}, dtype: {embeddings.dtype}, device: {embeddings.device}, ptr: {embeddings.data_ptr()}, elements: {{ count: {embeddings.numel()}, size: {embeddings.element_size()} }} }}."
+            )
+
+            if request.serialized_request is None:
+                logger.error(
+                    f"Request serialized_request is None for request: {{ id: {request_id}, image_url: '{image_url}' }}."
+                )
+
+            # Create a descriptor for the embeddings, this will register the memory with the connector (and the NIXL runtime).
+            descriptor = connect.Descriptor(embeddings)
+            # Create a write operation using the serialized request and the descriptor.
+            # This will begin the RDMA transfer of the embeddings to the remote worker.
+            write_op = await self._connector.begin_write(
+                descriptor,
+                request.serialized_request,
+            )
+            # Await for the write operation to complete.
+            # This will block until the data has been written to the remote worker or an error occurs.
+            await write_op.wait_for_completion()

-            image_features = vision_outputs.last_hidden_state
-            image_features = self.vision_model.multi_modal_projector(image_features)
            yield EncodeResponse(
-                image_features=image_features.tolist()
+                request_id=request.request_id,
            ).model_dump_json()

+    @async_on_start()
+    async def on_start(self):
+        logger.info("Startup started.")
+        # Create and initialize a dynamo connector for this worker.
+        # We'll needs this to move data between this worker and remote workers efficiently.
+        self._connector = connect.Connector()
+        await self._connector.initialize()
+        logger.info("Startup completed.")
+
    def open_image(self, image: str) -> Image.Image:
        # TODO: Have a seperate field for url and non url - and avoid auto detection
        try:
+            # Acquire the image and convert it to the format (RGB) the image processor model expects.
            if image.startswith("http") or image.startswith("https"):
                response = requests.get(image)
                image_data = Image.open(BytesIO(response.content)).convert("RGB")
            else:
                image_data = Image.open(image).convert("RGB")
+
+            return image_data
        except Exception as e:
            logger.error(f"Error opening image: {e}")
            raise e
-        return image_data
--- a/examples/multimodal/components/prefill_worker.py
+++ b/examples/multimodal/components/prefill_worker.py
@@ -20,8 +20,9 @@ import os
 import signal
 import sys

+import connect
 import torch
-from components.encode_worker import EncodeWorker
+from components.encode_worker import VllmEncodeWorker
 from pydantic import BaseModel
 from utils.logging import check_required_workers
 from utils.nixl import NixlMetadataStore
@@ -38,6 +39,11 @@ from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, servic

 logger = logging.getLogger(__name__)

+# Constants for the shape and dtype of the embeddings tensor.
+EMBEDDINGS_SHAPE = (1, 577, 4096)
+EMBEDDINGS_DTYPE = torch.float16
+EMBEDDINGS_DEVICE = "cuda"
+

 class RequestType(BaseModel):
    text: str
@@ -50,8 +56,8 @@ class RequestType(BaseModel):
    resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
    workers=1,
 )
-class PrefillWorker:
-    encode_worker = depends(EncodeWorker)
+class VllmPrefillWorker:
+    encode_worker = depends(VllmEncodeWorker)

    def __init__(self):
        class_name = self.__class__.__name__
@@ -95,7 +101,7 @@ class PrefillWorker:
            raise RuntimeError("Failed to initialize engine client")
        runtime = dynamo_context["runtime"]

-        enc_comp_ns, enc_comp_name = EncodeWorker.dynamo_address()  # type: ignore
+        enc_comp_ns, enc_comp_name = VllmEncodeWorker.dynamo_address()  # type: ignore
        self.encode_worker_client = (
            await runtime.namespace(enc_comp_ns)
            .component(enc_comp_name)
@@ -103,6 +109,20 @@ class PrefillWorker:
            .client()
        )

+        self._connector = connect.Connector(runtime=runtime, namespace=enc_comp_ns)
+        await self._connector.initialize()
+
+        # Create a longer-lived buffer for receiving the image embeddings.
+        embeddings = torch.empty(
+            EMBEDDINGS_SHAPE,
+            dtype=EMBEDDINGS_DTYPE,
+            device=EMBEDDINGS_DEVICE,
+        )
+        descriptor = connect.Descriptor(embeddings)
+        # Register the descriptor w/ NIXL (this is optional, if not done here the connect subsytem will take care of this automatically).
+        descriptor.register_memory(self._connector)
+        self._embeddings_descriptor = (embeddings, descriptor)
+
        await check_required_workers(self.encode_worker_client, self.min_workers)

        metadata = self.engine_client.nixl_metadata
@@ -119,19 +139,19 @@ class PrefillWorker:
                sys.exit(1)

        task.add_done_callback(prefill_queue_handler_cb)
-        logger.info("PrefillWorker initialized")
+        logger.info("Initialization complete.")

    def shutdown_vllm_engine(self, signum, frame):
        """Shutdown the background loop"""
-        logger.info(f"Received signal {signum}, shutting down")
+        logger.info(f"Shutdown started, signal {signum} received.")
        loop = asyncio.get_event_loop()
        try:
            self.engine_client.close()
-            logger.info("PrefillWorker shutdown complete")
        except Exception as e:
            logger.error(f"Error during shutdown: {e}")
        finally:
            loop.stop()
+        logger.info("Shutdown complete.")

    async def prefill_queue_handler(self):
        logger.info("Prefill queue handler entered")
@@ -166,16 +186,42 @@ class PrefillWorker:
        if request.multimodal_data_source["image_url"] is None:
            raise ValueError("No image url provided for prefill request")

+        request_id = request.request_id
+        engine_id = request.engine_id
+        image_url = request.multimodal_data_source["image_url"]
+
+        logger.info(
+            f"Received prefill request {{ id: {request_id}, engine_id: {engine_id}, image_url: '{image_url}' }}."
+        )
+
+        # Extract the pre-allocated, reusable image embeddings tensor and its descriptor.
+        # Doing this avoids unnessesary memory de/registration with NIXL.
+        embeddings, descriptor = self._embeddings_descriptor
+
+        # Create a new writable operation from the descriptor.
+        with self._connector.create_writable(descriptor) as writable:
+            # Extract serialized metadata about the operation from the writable operation,
+            # and use it to create a new EncodeRequest.
            encode_generator = await self.encode_worker_client.round_robin(
                EncodeRequest(
-                image_url=request.multimodal_data_source["image_url"],
+                    request_id=request_id,
+                    image_url=image_url,
+                    serialized_request=writable.to_serialized(),
                ).model_dump_json()
            )
            async for encode_response in encode_generator:
-            encode_output = EncodeResponse.model_validate_json(encode_response.data())
-            image_features = torch.tensor(
-                encode_output.image_features, device="cpu", dtype=torch.float16
+                encode_output = EncodeResponse.model_validate_json(
+                    encode_response.data(),
                )
+                logger.debug(
+                    f"Received response: {{ id: {encode_output.request_id} }}."
+                )
+
+            # Wait for the write operation to complete.
+            # This will block until the write operation is complete.
+            # This await should be a no-op since we've already received a response from the encode worker.
+            await writable.wait_for_completion()
+            # At this point, the `embeddings` tensor is filled with the image embeddings from the remote encode worker.

            sampling_params = request.sampling_params
            sampling_params.max_tokens = 1
@@ -184,26 +230,26 @@ class PrefillWorker:
            remote_prefill_params = RemotePrefillParams(
                is_remote_decode=True,
                decode_block_ids=request.block_ids,
-            decode_engine_id=request.engine_id,
+                decode_engine_id=engine_id,
                decode_computed_block_ids=request.computed_block_ids,
            )

            # TODO check if metadata has changed
            # and reload - currently only loading once
-        if request.engine_id not in self._loaded_metadata:
+            if engine_id not in self._loaded_metadata:
                remote_metadata = await self._metadata_store.get(request.engine_id)
                await self.engine_client.add_remote_nixl_metadata(remote_metadata)
                logger.info(
-                f"Loaded nixl metadata from engine {request.engine_id} into "
+                    f"Loaded nixl metadata from engine {engine_id} into "
                    f"engine {self.engine_client.nixl_metadata.engine_id}"
                )
-            self._loaded_metadata.add(request.engine_id)
+                self._loaded_metadata.add(engine_id)

            # To make sure the decode worker can pre-allocate the memory with the correct size for the prefill worker to transfer the kv cache,
            # some placeholder dummy tokens were inserted based on the embedding size in the worker.py.
            # The structure of the prompt is "\nUSER: <image> <dummy_tokens>\n<user_prompt>\nASSISTANT:", need to remove the dummy tokens after the image token.
            IMAGE_TOKEN_ID = 32000
-        embedding_size = image_features.shape[1]
+            embedding_size = embeddings.shape[1]
            padding_size = embedding_size - 1
            image_token_index = request.prompt_token_ids.index(IMAGE_TOKEN_ID)
            dummy_token_index = image_token_index + 1
@@ -213,10 +259,10 @@ class PrefillWorker:
            )

            async for _ in self.engine_client.generate(
-            request_id=request.request_id,
+                request_id=request_id,
                prompt=TokensPrompt(
                    prompt_token_ids=prompt_token_ids,
-                multi_modal_data={"image": image_features},
+                    multi_modal_data={"image": embeddings},
                ),
                sampling_params=sampling_params,
                remote_prefill_params=remote_prefill_params,

--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
@@ -19,7 +19,7 @@ import uuid
 from enum import Enum
 from typing import AsyncIterator, Tuple, Union

-from components.worker import VllmWorker
+from components.decode_worker import VllmDecodeWorker
 from transformers import AutoTokenizer
 from utils.chat_processor import ChatProcessor, CompletionsProcessor, ProcessMixIn
 from utils.logging import check_required_workers
@@ -53,7 +53,7 @@ class Processor(ProcessMixIn):
    vLLM pre and post processing
    """

-    worker = depends(VllmWorker)
+    worker = depends(VllmDecodeWorker)

    def __init__(self):
        class_name = self.__class__.__name__
@@ -83,7 +83,7 @@ class Processor(ProcessMixIn):
    @async_on_start
    async def async_init(self):
        runtime = dynamo_context["runtime"]
-        comp_ns, comp_name = VllmWorker.dynamo_address()  # type: ignore
+        comp_ns, comp_name = VllmDecodeWorker.dynamo_address()  # type: ignore
        self.worker_client = (
            await runtime.namespace(comp_ns)
            .component(comp_name)

--- a/examples/multimodal/configs/agg.yaml
+++ b/examples/multimodal/configs/agg.yaml
@@ -21,7 +21,7 @@ Processor:
  router: round-robin
  common-configs: [model, block-size, max-model-len]

-VllmWorker:
+VllmDecodeWorker:
  enforce-eager: true
  max-num-batched-tokens: 16384
  enable-prefix-caching: true
@@ -33,7 +33,7 @@ VllmWorker:
      gpu: 1
  common-configs: [model, block-size, max-model-len]

-EncodeWorker:
+VllmEncodeWorker:
  tensor-parallel-size: 1
  router: random
  ServiceArgs:

--- a/examples/multimodal/configs/disagg.yaml
+++ b/examples/multimodal/configs/disagg.yaml
@@ -22,7 +22,7 @@ Processor:
  router: round-robin
  common-configs: [model, block-size]

-VllmWorker:
+VllmDecodeWorker:
  remote-prefill: true
  conditional-disagg: true
  max-local-prefill-length: 10
@@ -33,7 +33,7 @@ VllmWorker:
      gpu: 1
  common-configs: [model, block-size, max-model-len, kv-transfer-config]

-PrefillWorker:
+VllmPrefillWorker:
  max-num-batched-tokens: 16384
  ServiceArgs:
    workers: 1
@@ -41,7 +41,7 @@ PrefillWorker:
      gpu: 1
  common-configs: [model, block-size, max-model-len, kv-transfer-config]

-EncodeWorker:
+VllmEncodeWorker:
  tensor-parallel-size: 1
  router: random
  ServiceArgs:

--- a/examples/multimodal/connect/__init__.py
+++ b/examples/multimodal/connect/__init__.py
--- a/examples/multimodal/graphs/agg.py
+++ b/examples/multimodal/graphs/agg.py
@@ -6,6 +6,7 @@
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,9 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from components.encode_worker import EncodeWorker
+from components.decode_worker import VllmDecodeWorker
+from components.encode_worker import VllmEncodeWorker
 from components.frontend import Frontend
 from components.processor import Processor
-from components.worker import VllmWorker

-Frontend.link(Processor).link(VllmWorker).link(EncodeWorker)
+Frontend.link(Processor).link(VllmDecodeWorker).link(VllmEncodeWorker)
--- a/examples/multimodal/graphs/disagg.py
+++ b/examples/multimodal/graphs/disagg.py
@@ -13,10 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from components.encode_worker import EncodeWorker
+from components.decode_worker import VllmDecodeWorker
+from components.encode_worker import VllmEncodeWorker
 from components.frontend import Frontend
-from components.prefill_worker import PrefillWorker
+from components.prefill_worker import VllmPrefillWorker
 from components.processor import Processor
-from components.worker import VllmWorker

-Frontend.link(Processor).link(VllmWorker).link(PrefillWorker).link(EncodeWorker)
+Frontend.link(Processor).link(VllmDecodeWorker).link(VllmPrefillWorker).link(
+    VllmEncodeWorker
+)
--- a/examples/multimodal/utils/protocol.py
+++ b/examples/multimodal/utils/protocol.py
@@ -17,6 +17,7 @@
 import json
 from typing import Any, List, Optional

+import connect
 import msgspec
 from pydantic import BaseModel, ConfigDict, field_validator
 from pydantic_core import core_schema
@@ -111,12 +112,13 @@ class EncodeRequest(BaseModel):

    model_config = ConfigDict(arbitrary_types_allowed=True)
    image_url: str
+    request_id: str
+    serialized_request: Optional[connect.SerializedRequest] = None


 class EncodeResponse(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    image_features: List[List[List[float]]]
+    request_id: str


 class MyRequestOutput(BaseModel):
@@ -129,7 +131,6 @@ class MyRequestOutput(BaseModel):
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
-
    request_id: str
    prompt: Optional[str] = None
    prompt_token_ids: Optional[List[int]] = None