refactor: remove old examples with old UX (#1899)

f00d700e · Alec · GitHub · c7080419 · c7080419 · c7080419
Unverified Commit f00d700e authored Jul 14, 2025 by Alec Committed by GitHub Jul 14, 2025
20 changed files
--- a/examples/llm/graphs/disagg_router.py
+++ b/examples/llm/graphs/disagg_router.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from components.frontend import Frontend
-from components.kv_router import Router
-from components.planner_service import Planner
-from components.prefill_worker import PrefillWorker
-from components.processor import Processor
-from components.worker import VllmWorker
-Frontend.link(Processor).link(Router).link(VllmWorker).link(PrefillWorker)
-Frontend.link(Planner)
--- a/examples/llm/multinode-examples.md
+++ b/examples/llm/multinode-examples.md
-../../docs/examples/multinode.md
\ No newline at end of file
--- a/examples/llm/utils/chat_processor.py
+++ b/examples/llm/utils/chat_processor.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import time
-from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_checkable
-from vllm.config import ModelConfig
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.chat_utils import ConversationMessage
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    CompletionRequest,
-    RequestResponseMetadata,
-)
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import RequestPrompt
-from vllm.inputs.data import TokensPrompt
-from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-@runtime_checkable
-class ProcessMixInRequired(Protocol):
-    engine_args: AsyncEngineArgs
-    chat_processor: "ChatProcessor | None"
-    completions_processor: "CompletionsProcessor | None"
-    model_config: ModelConfig
-    default_sampling_params: SamplingParams
-class ProcessMixIn(ProcessMixInRequired):
-    """
-    Mixin for pre and post processing for vLLM
-    Requires engine_args, engine_client, processor, model_config to be initialized
-    """
-    engine_args: AsyncEngineArgs
-    chat_processor: "ChatProcessor | None"
-    completions_processor: "CompletionsProcessor | None"
-    model_config: ModelConfig
-    default_sampling_params: SamplingParams
-    def __init__(self):
-        pass
-    def _get_processor(
-        self, raw_request: Union[CompletionRequest, ChatCompletionRequest]
-    ):
-        # Determine the processor type based on the request structure
-        return (
-            self.chat_processor
-            if isinstance(raw_request, ChatCompletionRequest)
-            else self.completions_processor
-        )
-    async def _parse_raw_request(
-        self, raw_request: Union[CompletionRequest, ChatCompletionRequest]
-    ):
-        processor = self._get_processor(raw_request)
-        if processor is None:
-            raise RuntimeError("Processor has not been initialized")
-        request = processor.parse_raw_request(raw_request)
-        preprocess_result = await processor.preprocess(raw_request)
-        default_max_tokens = self.model_config.max_model_len - len(
-            preprocess_result.engine_prompt["prompt_token_ids"]
-        )
-        sampling_params = request.to_sampling_params(
-            default_max_tokens,
-            self.model_config.logits_processor_pattern,
-            self.default_sampling_params,
-        )
-        return (
-            request,
-            preprocess_result.conversation,
-            preprocess_result.request_prompt,
-            preprocess_result.engine_prompt,
-            sampling_params,
-        )
-    async def _stream_response(self, request, generator, request_id, conversation):
-        processor = self._get_processor(request)
-        if processor is None:
-            raise RuntimeError("processor has not been initialized")
-        return processor.stream_response(
-            request,
-            generator,
-            request_id,
-            conversation,
-        )
-class PreprocessResult:
-    def __init__(
-        self,
-        conversation: Optional[ConversationMessage],
-        request_prompt: RequestPrompt,
-        engine_prompt: TokensPrompt,
-    ):
-        self.conversation = conversation
-        self.request_prompt = request_prompt
-        self.engine_prompt = engine_prompt
-class ChatProcessor:
-    def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
-        self.tokenizer = tokenizer
-        self.model_config = model_config
-        self.openai_serving = OpenAIServingChat(
-            engine_client=None,
-            model_config=model_config,
-            models=None,
-            request_logger=None,
-            response_role="assistant",
-            chat_template=None,
-            chat_template_content_format="auto",
-        )
-    def parse_raw_request(
-        self, raw_request: ChatCompletionRequest
-    ) -> ChatCompletionRequest:
-        return ChatCompletionRequest.parse_obj(raw_request)
-    async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
-        request = self.parse_raw_request(raw_request)
-        (
-            conversation,
-            request_prompts,
-            engine_prompts,
-        ) = await self.openai_serving._preprocess_chat(
-            request,
-            self.tokenizer,
-            request.messages,
-            chat_template=request.chat_template or self.tokenizer.chat_template,
-            chat_template_content_format=self.openai_serving.chat_template_content_format,
-            add_generation_prompt=request.add_generation_prompt,
-            continue_final_message=request.continue_final_message,
-            tool_dicts=None,
-            documents=request.documents,
-            chat_template_kwargs=request.chat_template_kwargs,
-            tool_parser=self.openai_serving.tool_parser,
-            truncate_prompt_tokens=request.truncate_prompt_tokens,
-            add_special_tokens=request.add_special_tokens,
-        )
-        return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0])
-    async def stream_response(
-        self,
-        request: ChatCompletionRequest,
-        result_generator: AsyncIterator,
-        request_id: str,
-        conversation: List,
-    ):
-        request_metadata = RequestResponseMetadata(request_id=request_id)
-        if not request.stream:
-            raise ValueError("Only streaming responses are supported")
-        async for raw_response in self.openai_serving.chat_completion_stream_generator(
-            request,
-            result_generator,
-            request_id,
-            request.model,
-            conversation,
-            self.tokenizer,
-            request_metadata,
-        ):
-            if raw_response.startswith("data: [DONE]"):
-                break
-            response = json.loads(raw_response.lstrip("data: "))
-            yield response
-class CompletionsProcessor:
-    def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
-        self.tokenizer = tokenizer
-        self.model_config = model_config
-        self.openai_serving = OpenAIServingCompletion(
-            engine_client=None,
-            model_config=model_config,
-            models=None,
-            request_logger=None,
-        )
-    def parse_raw_request(self, raw_request: CompletionRequest) -> CompletionRequest:
-        return CompletionRequest.parse_obj(raw_request)
-    async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
-        request = self.parse_raw_request(raw_request)
-        (
-            request_prompts,
-            engine_prompts,
-        ) = await self.openai_serving._preprocess_completion(
-            request,
-            self.tokenizer,
-            input_or_inputs=request.prompt,
-            truncate_prompt_tokens=request.truncate_prompt_tokens,
-            add_special_tokens=request.add_special_tokens,
-        )
-        return PreprocessResult(None, request_prompts[0], engine_prompts[0])
-    async def stream_response(
-        self,
-        request: CompletionRequest,
-        result_generator: AsyncIterator,
-        request_id: str,
-        conversation: Optional[List[ConversationMessage]] = None,
-    ):
-        request_metadata = RequestResponseMetadata(request_id=request_id)
-        if not request.stream:
-            raise ValueError("Only streaming responses are supported")
-        async for raw_response in self.openai_serving.completion_stream_generator(
-            request,
-            result_generator,
-            request_id,
-            int(time.time()),  # created_time
-            request.model,
-            1,  # num_prompts
-            self.tokenizer,
-            request_metadata,
-        ):
-            if raw_response.startswith("data: [DONE]"):
-                break
-            response = json.loads(raw_response.lstrip("data: "))
-            yield response
--- a/examples/llm/utils/check_worker.py
+++ b/examples/llm/utils/check_worker.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import asyncio
-import logging
-from dynamo._core import Client
-logger = logging.getLogger(__name__)
-async def check_required_workers(
-    workers_client: Client, required_workers: int, on_change=True, poll_interval=0.5
-):
-    """Wait until the minimum number of workers are ready."""
-    worker_ids = workers_client.instance_ids()
-    num_workers = len(worker_ids)
-    while num_workers < required_workers:
-        await asyncio.sleep(poll_interval)
-        worker_ids = workers_client.instance_ids()
-        new_count = len(worker_ids)
-        if (not on_change) or new_count != num_workers:
-            logger.info(
-                f"Waiting for more workers to be ready.\n"
-                f" Current: {new_count},"
-                f" Required: {required_workers}"
-            )
-        num_workers = new_count
-    print(f"Workers ready: {worker_ids}")
-    return worker_ids
--- a/examples/llm/utils/clear_namespace.py
+++ b/examples/llm/utils/clear_namespace.py
-#  SPDX-FileCopyrightText: Copyright (c) 2020 Atalaya Tech. Inc
-#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#  SPDX-License-Identifier: Apache-2.0
-#  #
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#  #
-#  http://www.apache.org/licenses/LICENSE-2.0
-#  #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#  Modifications Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
-import argparse
-import asyncio
-import logging
-import os
-from utils.prefill_queue import PrefillQueue
-from dynamo.runtime import DistributedRuntime, EtcdKvCache, dynamo_worker
-from dynamo.runtime.logging import configure_dynamo_logging
-configure_dynamo_logging()
-logger = logging.getLogger(__name__)
-@dynamo_worker()
-async def clear_namespace(runtime: DistributedRuntime, namespace: str):
-    etcd_kv_cache = await EtcdKvCache.create(
-        runtime.etcd_client(),
-        f"/{namespace}/",
-        {},
-    )
-    await etcd_kv_cache.clear_all()
-    logger.info(f"Cleared /{namespace} in EtcdKvCache")
-    prefill_queue_nats_server = os.getenv("NATS_SERVER", "nats://localhost:4222")
-    prefill_queue_stream_name = f"{namespace}_prefill_queue"
-    async with PrefillQueue.get_instance(
-        nats_server=prefill_queue_nats_server,
-        stream_name=prefill_queue_stream_name,
-        dequeue_timeout=3,
-    ) as prefill_queue:
-        cleared_count = await prefill_queue.clear_queue()
-        logger.info(
-            f"Cleared {cleared_count} requests from prefill queue{prefill_queue_stream_name}"
-        )
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--namespace", type=str, required=True)
-    args = parser.parse_args()
-    asyncio.run(clear_namespace(args.namespace))
--- a/examples/llm/utils/nats_queue.py
+++ b/examples/llm/utils/nats_queue.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import asyncio
-from contextlib import asynccontextmanager
-from typing import ClassVar, Optional
-from dynamo._core import NatsQueue
-class NATSQueue:
-    _instance: ClassVar[Optional["NATSQueue"]] = None
-    _lock: ClassVar[asyncio.Lock] = asyncio.Lock()
-    def __init__(
-        self,
-        stream_name: str = "default",
-        nats_server: str = "nats://localhost:4222",
-        dequeue_timeout: float = 1,
-    ):
-        self.nats_q = NatsQueue(stream_name, nats_server, dequeue_timeout)
-    @classmethod
-    @asynccontextmanager
-    async def get_instance(
-        cls,
-        *,
-        stream_name: str = "default",
-        nats_server: str = "nats://localhost:4222",
-        dequeue_timeout: float = 1,
-    ):
-        """Get or create a singleton instance of NATSq"""
-        # TODO: check if this _lock is needed with GIL
-        async with cls._lock:
-            if cls._instance is None:
-                cls._instance = cls(
-                    stream_name=stream_name,
-                    nats_server=nats_server,
-                    dequeue_timeout=dequeue_timeout,
-                )
-                await cls._instance.connect()
-            try:
-                yield cls._instance
-            except Exception:
-                if cls._instance:
-                    await cls._instance.close()
-                cls._instance = None
-                raise
-    # TODO: check to see if this can be replaced by something like get_instance().close()
-    @classmethod
-    async def shutdown(cls):
-        """Explicitly close the singleton instance if it exists"""
-        async with cls._lock:
-            if cls._instance:
-                await cls._instance.close()
-                cls._instance = None
-    async def connect(self):
-        await self.nats_q.connect()
-    async def ensure_connection(self):
-        await self.nats_q.ensure_connection()
-    async def close(self):
-        await self.nats_q.close()
-    # TODO: is enqueue/dequeue_object a better name for a general queue?
-    async def enqueue_task(self, task_data: bytes) -> None:
-        await self.nats_q.enqueue_task(task_data)
-    async def dequeue_task(self, timeout: Optional[float] = None) -> Optional[bytes]:
-        return await self.nats_q.dequeue_task(timeout)
-    async def get_queue_size(self) -> int:
-        return await self.nats_q.get_queue_size()
-    async def clear_queue(self) -> int:
-        try:
-            cleared_count = 0
-            # Continue until we can't dequeue any more messages
-            while True:
-                # use a small timeout
-                message = await self.dequeue_task(timeout=0.1)
-                if message is None:
-                    break
-                cleared_count += 1
-            return cleared_count
-        except Exception as e:
-            raise RuntimeError(f"Failed to clear queue: {e}")
--- a/examples/llm/utils/nixl.py
+++ b/examples/llm/utils/nixl.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import os
-from contextlib import contextmanager
-import msgspec
-from vllm.distributed.device_communicators.nixl import NixlMetadata
-from dynamo.runtime import DistributedRuntime
-METADATA_DIR = "/tmp/nixl"
-logger = logging.getLogger(__name__)
-@contextmanager
-def temp_metadata_file(engine_id, metadata: NixlMetadata):
-    os.makedirs(METADATA_DIR, exist_ok=True)
-    path = f"{METADATA_DIR}/{engine_id}.nixl_meta"
-    with open(path, "wb") as f:
-        encoded = msgspec.msgpack.encode(metadata)
-        logger.info(f"Size of encoded metadata: {len(encoded)}")
-        f.write(encoded)
-    try:
-        yield path
-    finally:
-        if os.path.exists(path):
-            os.remove(path)
-def find_remote_metadata(engine_id):
-    # find and load metadata from METADATA_DIR that do not match engine_id
-    remote_metadata = []
-    for file in os.listdir(METADATA_DIR):
-        if file.endswith(".nixl_meta"):
-            if file.split(".")[0] != engine_id:
-                with open(os.path.join(METADATA_DIR, file), "rb") as f:
-                    remote_metadata.append(
-                        msgspec.msgpack.decode(f.read(), type=NixlMetadata)
-                    )
-    return remote_metadata
-class NixlMetadataStore:
-    NIXL_METADATA_KEY = "nixl_metadata"
-    def __init__(self, namespace: str, runtime: DistributedRuntime) -> None:
-        self._namespace = namespace
-        # TODO Remove metadata from etcd on delete
-        self._stored: set[str] = set()
-        self._cached: dict[str, NixlMetadata] = {}
-        self._client = runtime.etcd_client()
-        if self._client is None:
-            raise Exception("Cannot be used with static workers")
-        self._key_prefix = f"{self._namespace}/{NixlMetadataStore.NIXL_METADATA_KEY}"
-    async def put(self, engine_id, metadata: NixlMetadata):
-        serialized_metadata = msgspec.msgpack.encode(metadata)
-        key = "/".join([self._key_prefix, engine_id])
-        # create with primary lease so that the kv entry will be deleted when the worker shutdowns
-        try:
-            # TODO: should we create a series of function in etcd client to use primary lease?
-            await self._client.kv_create_or_validate(
-                key, serialized_metadata, self._client.primary_lease_id()
-            )
-        except Exception as e:
-            logger.warning(f"A different metadata exists for engine {engine_id}: {e}")
-        self._stored.add(engine_id)
-    async def get(self, engine_id) -> NixlMetadata:
-        try:
-            if engine_id in self._cached:
-                return self._cached[engine_id]
-            key = "/".join([self._key_prefix, engine_id])
-            key_values = await self._client.kv_get_prefix(key)
-            deserialized_metadata = None
-            for item in key_values:
-                deserialized_metadata = msgspec.msgpack.decode(
-                    item["value"], type=NixlMetadata
-                )
-                break
-            if deserialized_metadata is None:
-                raise Exception("metadata not found in etcd")
-            self._cached[engine_id] = deserialized_metadata
-            # TODO watch for changes and update cache
-            # self._client.add_watch_callback(
-            #     key,
-            #     self._watch_callback,
-            # )
-        except Exception as e:
-            raise Exception("Error retrieving metadata for engine {engine_id}") from e
-        return deserialized_metadata
--- a/examples/llm/utils/prefill_queue.py
+++ b/examples/llm/utils/prefill_queue.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Optional
-import msgspec
-from utils.nats_queue import NATSQueue
-from vllm.remote_prefill import RemotePrefillRequest
-class PrefillQueue(NATSQueue):
-    """
-    A wrapper of NATSQueue for PrefillRequest.
-    The stream name is forced to be "prefill_queue".
-    """
-    def __init__(
-        self,
-        stream_name="prefill_queue",
-        nats_server: str = "nats://localhost:4222",
-        dequeue_timeout: float = 1,
-    ):
-        super().__init__(
-            stream_name=stream_name,
-            nats_server=nats_server,
-            dequeue_timeout=dequeue_timeout,
-        )
-    async def enqueue_prefill_request(
-        self, prefill_request: RemotePrefillRequest
-    ) -> None:
-        encoded_request = msgspec.json.encode(prefill_request)
-        await self.enqueue_task(encoded_request)
-    async def dequeue_prefill_request(self) -> Optional[RemotePrefillRequest]:
-        encoded_request = await self.dequeue_task()
-        if encoded_request is not None:
-            prefill_request = msgspec.json.decode(
-                encoded_request, type=RemotePrefillRequest
-            )
-            return prefill_request
-        else:
-            return None
--- a/examples/llm/utils/protocol.py
+++ b/examples/llm/utils/protocol.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-from typing import Any, List, Optional
-import msgspec
-from pydantic import BaseModel, ConfigDict, field_validator
-from pydantic_core import core_schema
-from typing_extensions import NotRequired
-from vllm.inputs.data import TokensPrompt
-from vllm.outputs import CompletionOutput
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import PromptLogprobs, RequestMetrics
-class Request(BaseModel):
-    prompt: str
-    sampling_params: dict
-class Tokens(BaseModel):
-    tokens: list[int]
-class LocalBlockHashes(BaseModel):
-    hashes: list[int]
-    tokens: list[int]
-    num_tokens: int
-class PrefillRequest(Request):
-    request_id: str
-class Response(BaseModel):
-    text: str
-class PrefillResponse(BaseModel):
-    prefilled: bool
-# Hack to override the type of multi_modal_data in TokensPrompt
-# as pydantic doesn't understand generic types
-# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
-# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
-# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
-class PatchedTokensPrompt(TokensPrompt):
-    multi_modal_data: NotRequired[Optional[Any]]  # type: ignore
-# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it
-# Sampling params is a mspspec struct
-# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
-SamplingParams.__get_pydantic_core_schema__ = classmethod(
-    lambda cls, source, handler: core_schema.any_schema()
-)
-class vLLMGenerateRequest(BaseModel):
-    """
-    Serializable class of all the fields vLLM engine requires for inference
-    """
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    engine_prompt: PatchedTokensPrompt
-    sampling_params: SamplingParams
-    request_id: str
-    prefix_hit_rate: Optional[float] = 0.0
-    @field_validator("sampling_params", mode="before")
-    @classmethod
-    def parse_sampling_params(cls, v: Any) -> SamplingParams:
-        if isinstance(v, str):
-            v = json.loads(v)
-        if isinstance(v, dict):
-            return SamplingParams(**v)
-        return v
-    model_config = ConfigDict(
-        json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)}
-    )
-class MyRequestOutput(BaseModel):
-    """
-    RequestOutput from vLLM is not serializable by default
-    https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85
-    This class is used to serialize the RequestOutput and any recursively defined types
-    We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
-    """
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    request_id: str
-    prompt: Optional[str] = None
-    prompt_token_ids: Optional[List[int]] = None
-    prompt_logprobs: Optional[PromptLogprobs] = None
-    outputs: List[CompletionOutput]
-    finished: bool
-    metrics: Optional[RequestMetrics] = None
-    # lora_request: Optional[LoRARequest] = None
-    # encoder_prompt: Optional[str] = None
-    # encoder_prompt_token_ids: Optional[List[int]] = None
-    # num_cached_tokens: Optional[int] = None
-    # multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
--- a/examples/llm/utils/vllm.py
+++ b/examples/llm/utils/vllm.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# TODO: rename to avoid ambiguity with vllm package
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
-from dynamo.sdk.lib.config import ServiceConfig
-class RouterType:
-    RANDOM = "random"
-    ROUND_ROBIN = "round-robin"
-    KV = "kv"
-    KV_LOAD = "kv-load"
-    APPROX_KV = "approx-kv"
-def parse_vllm_args(service_name, prefix) -> AsyncEngineArgs:
-    config = ServiceConfig.get_instance()
-    vllm_args = config.as_args(service_name, prefix=prefix)
-    parser = FlexibleArgumentParser()
-    parser.add_argument(
-        "--router",
-        type=str,
-        choices=[
-            RouterType.RANDOM,
-            RouterType.ROUND_ROBIN,
-            RouterType.KV,
-            RouterType.KV_LOAD,
-            RouterType.APPROX_KV,
-        ],
-        default=RouterType.RANDOM,
-        help="Router type to use for scheduling requests to workers",
-    )
-    parser.add_argument(
-        "--router-num-threads",
-        type=int,
-        default=4,
-        help="Number of threads to use for the router to process the requests",
-    )
-    parser.add_argument(
-        "--remote-prefill", action="store_true", help="Enable remote prefill"
-    )
-    parser.add_argument(
-        "--conditional-disagg",
-        action="store_true",
-        help="Use disaggregated router to decide whether to prefill locally or remotely",
-    )
-    parser.add_argument(
-        "--max-local-prefill-length",
-        type=int,
-        default=1000,
-        help="Maximum length for local prefill. If remote prefill is enabled and the prefill length is greater than this value the request will be sent for remote prefill, otherwise prefill phase will run locally.",
-    )
-    parser.add_argument(
-        "--max-prefill-queue-size",
-        type=int,
-        default=3,
-        help="Maximum queue size for remote prefill. If the prefill queue size is greater than this value, prefill phase of the incoming request will be executed locally.",
-    )
-    parser = AsyncEngineArgs.add_cli_args(parser)
-    args = parser.parse_args(vllm_args)
-    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine_args.router = args.router
-    engine_args.router_num_threads = args.router_num_threads
-    engine_args.remote_prefill = args.remote_prefill
-    engine_args.conditional_disagg = args.conditional_disagg
-    engine_args.max_local_prefill_length = args.max_local_prefill_length
-    engine_args.max_prefill_queue_size = args.max_prefill_queue_size
-    return engine_args
--- a/examples/llm_hello_world/llm_hello_world.py
+++ b/examples/llm_hello_world/llm_hello_world.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import asyncio
-import logging
-import random
-from fastapi import FastAPI
-from fastapi.responses import StreamingResponse
-from pydantic import BaseModel
-from dynamo.sdk import (
-    DYNAMO_IMAGE,
-    AbstractService,
-    abstract_endpoint,
-    api,
-    depends,
-    endpoint,
-    service,
-)
-logger = logging.getLogger(__name__)
-class ChatRequest(BaseModel):
-    text: str
-"""
-Pipeline Architecture:
-Users/Clients (HTTP)
-      │
-      ▼
-┌─────────────┐
-│  Frontend   │  HTTP API endpoint (/v1/chat/completions)
-└─────────────┘
-      │ dynamo/runtime
-      ▼
-┌─────────────┐
-│   Router    │  Routes requests to appropriate worker
-└─────────────┘
-      │ dynamo/runtime
-      ▼
-┌─────────────┐
-│   Worker    │  Generates text using LLM
-└─────────────┘
-"""
-class WorkerInterface(AbstractService):
-    """Interface for LLM workers."""
-    @abstract_endpoint  # enforces that the service implements the method, but also that it is properly decorated
-    async def generate(self, request: ChatRequest):
-        pass
-class RouterInterface(AbstractService):
-    """Interface for request routers."""
-    @abstract_endpoint
-    async def generate(self, request: ChatRequest):
-        pass
-@service(
-    dynamo={"namespace": "llm-hello-world"},
-    image=DYNAMO_IMAGE,
-)
-class VllmWorker(WorkerInterface):
-    @endpoint()
-    async def generate(self, request: ChatRequest):
-        # Convert to Spongebob case (randomly capitalize letters)
-        for token in request.text.split():
-            spongebob_token = "".join(
-                c.upper() if random.random() < 0.5 else c.lower() for c in token
-            )
-            yield spongebob_token
-@service(
-    dynamo={"namespace": "llm-hello-world"},
-    image=DYNAMO_IMAGE,
-)
-class TRTLLMWorker(WorkerInterface):
-    @endpoint()
-    async def generate(self, request: ChatRequest):
-        # Convert to SHOUTING case
-        for token in request.text.split():
-            yield token.upper()
-@service(
-    dynamo={"namespace": "llm-hello-world"},
-    image=DYNAMO_IMAGE,
-)
-class SlowRouter(RouterInterface):
-    worker = depends(WorkerInterface)  # Will be overridden by link()
-    @endpoint()
-    async def generate(self, request: ChatRequest):
-        print("Routing slow")
-        async for response in self.worker.generate(request.model_dump_json()):
-            await asyncio.sleep(1)  # Simulate slow routing with a 1-second delay
-            yield response
-@service(
-    dynamo={"namespace": "llm-hello-world"},
-    image=DYNAMO_IMAGE,
-)
-class FastRouter(RouterInterface):
-    worker = depends(WorkerInterface)  # Will be overridden by link()
-    @endpoint()
-    async def generate(self, request: ChatRequest):
-        print("Routing fast")
-        async for response in self.worker.generate(request.model_dump_json()):
-            await asyncio.sleep(0.1)  # Simulate fast routing with a 0.1-second delay
-            yield response
-app = FastAPI()
-@service(
-    dynamo={"namespace": "llm-hello-world"},
-    image=DYNAMO_IMAGE,
-    app=app,
-)
-class Frontend:
-    router = depends(RouterInterface)  # Will be overridden by link()
-    @api()
-    async def generate(self, request: ChatRequest):
-        print(f"Received request: {request}")
-        async def content_generator():
-            async for response in self.router.generate(request.model_dump_json()):
-                print(f"Received response: {response}")
-                # Format as SSE
-                yield f"data: {response}\n\n"
-        return StreamingResponse(
-            content_generator(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "X-Accel-Buffering": "no",
-            },
-        )
-# Mix and match pipelines (Tests)
-# Frontend.link(SlowRouter).link(TRTLLMWorker) # type: ignore[attr-defined]
-# slow_pipeline = Frontend.link(SlowRouter).link(VllmWorker) # type: ignore[attr-defined]
-Frontend.link(FastRouter).link(VllmWorker)  # type: ignore[attr-defined]
-"""
-Example usage:
-fast_pipeline = Frontend.link(FastRouter).link(TRTLLMWorker)
-# slow_pipeline = Frontend.link(SlowRouter).link(VllmWorker)
-# mixed_pipeline = Frontend.link(FastRouter).link(VllmWorker)
-# Basic setup with VLLM worker and slow router
-The interface-based design allows for:
-1. Easy swapping of implementations (VLLM vs TRT-LLM)
-2. Different routing strategies (slow vs fast)
-3. Type safety through interface contracts
-"""
--- a/examples/llm_hello_world/llm_types.py
+++ b/examples/llm_hello_world/llm_types.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from pydantic import BaseModel
-class ChatRequest(BaseModel):
-    text: str
--- a/examples/vllm_v1/README.md
+++ b/examples/vllm_v1/README.md
--- a/examples/vllm_v1/components/args.py
+++ b/examples/vllm_v1/components/args.py
--- a/examples/vllm_v1/components/handlers.py
+++ b/examples/vllm_v1/components/handlers.py
--- a/examples/vllm_v1/components/main.py
+++ b/examples/vllm_v1/components/main.py
--- a/examples/vllm_v1/components/protocol.py
+++ b/examples/vllm_v1/components/protocol.py
--- a/examples/vllm_v1/components/publisher.py
+++ b/examples/vllm_v1/components/publisher.py
--- a/examples/vllm_v1/deepseek-r1.md
+++ b/examples/vllm_v1/deepseek-r1.md
--- a/examples/vllm_v1/deploy/agg.yaml
+++ b/examples/vllm_v1/deploy/agg.yaml