Commit 5bcdb734 authored by Neelay Shah, committed by GitHub

refactor: rename vllm_nixl to vllm and make default (#100)

parent a7c35dcf
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import os
from typing import AsyncIterator
import uvloop
from common.base_engine import BaseVllmEngine
from common.parser import parse_vllm_args
from common.protocol import MyRequestOutput, vLLMGenerateRequest
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger
from vllm.sampling_params import RequestOutputKind
from dynamo.llm import KvMetricsPublisher
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
vllm_logger.info(f"VLLM_KV_CAPI_PATH: {os.environ['VLLM_KV_CAPI_PATH']}")
class VllmEngine(BaseVllmEngine):
"""
vLLM Inference Engine
"""
def __init__(
self, engine_args: AsyncEngineArgs, metrics_publisher: KvMetricsPublisher
):
self.metrics_publisher = metrics_publisher
self.engine_args = engine_args
super().__init__(engine_args)
async def initialize(self):
await super().initialize()
assert self.engine_client is not None, "engine_client was not initialized"
self.engine_client.set_metrics_publisher(self.metrics_publisher)
@dynamo_endpoint(vLLMGenerateRequest, MyRequestOutput)
async def generate(self, request) -> AsyncIterator:
assert (
self.engine_client is not None
), "engine_client was not initialized, must call initialize() first"
sampling_params = request.sampling_params
# rust HTTP requires Delta streaming
sampling_params.output_kind = RequestOutputKind.DELTA
async for response in self.engine_client.generate(
request.engine_prompt, sampling_params, request.request_id
):
# MyRequestOutput takes care of serializing the response as
# vLLM's RequestOutput is not serializable by default
yield MyRequestOutput(
request_id=response.request_id,
prompt=response.prompt,
prompt_token_ids=response.prompt_token_ids,
prompt_logprobs=response.prompt_logprobs,
outputs=response.outputs,
finished=response.finished,
).model_dump_json()
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Serve the dynamo.vllm.generate endpoint.
"""
worker_component = runtime.namespace("dynamo").component("vllm")
await worker_component.create_service()
worker_endpoint = worker_component.endpoint("generate")
VLLM_WORKER_ID = worker_endpoint.lease_id()
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")
VLLM_KV_NAMESPACE = "dynamo"
os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)
VLLM_KV_COMPONENT = "vllm"
os.environ["VLLM_KV_COMPONENT"] = str(VLLM_KV_COMPONENT)
metrics_publisher = KvMetricsPublisher()
vllm_engine = VllmEngine(engine_args, metrics_publisher)
await vllm_engine.initialize()
# Initially send dummy metrics to kick-start the publisher;
# vLLM will not update the stats until a forward pass is triggered
metrics_publisher.publish(
0, # request_active_slots
1024, # request_total_slots
0, # kv_active_blocks
1024, # kv_total_blocks
0, # num_requests_waiting
0.0, # gpu_cache_usage_perc
0.0, # gpu_prefix_cache_hit_rate
)
await asyncio.gather(
worker_endpoint.serve_endpoint(vllm_engine.generate),
metrics_publisher.create_endpoint(worker_component),
)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import json
from typing import AsyncGenerator
import uvloop
from common.parser import parse_vllm_args
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionStreamResponse,
CompletionRequest,
CompletionResponse,
CompletionStreamResponse,
ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmEngine:
def __init__(
self, engine_client: EngineClient, model_config: ModelConfig
):
self.engine_client = engine_client
self.model_config = model_config
# Ensure served_model_name matches the OpenAI model name.
# Use --served-model-name to set this explicitly; otherwise it falls back to --model.
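# Hypothetical invocation (module path and paths are illustrative only): serve a
# local checkpoint under its canonical OpenAI name so clients can request it by that name:
#   python3 -m monolith.worker --model /data/DeepSeek-R1-Distill-Llama-8B \
#       --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B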
models = OpenAIServingModels(
engine_client=engine_client,
model_config=model_config,
base_model_paths=[
BaseModelPath(
name=model_config.served_model_name,
model_path=model_config.model,
)
],
)
self.chat_serving = OpenAIServingChat(
engine_client=self.engine_client,
model_config=self.model_config,
models=models,
response_role="assistant",
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
self.completion_serving = OpenAIServingCompletion(
engine_client=self.engine_client,
model_config=self.model_config,
models=models,
request_logger=None,
)
@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, request):
result = await self.chat_serving.create_chat_completion(request)
if isinstance(result, AsyncGenerator):
async for raw_response in result:
if raw_response.startswith("data: [DONE]"):
break
response = json.loads(raw_response.lstrip("data: "))
yield response
# We should always be streaming so should never get here
elif isinstance(result, ChatCompletionResponse):
raise RuntimeError("ChatCompletionResponse support not implemented")
elif isinstance(result, ErrorResponse):
error = result.dict()
raise RuntimeError(
f"Error {error['code']}: {error['message']} "
f"(type: {error['type']}, param: {error['param']})"
)
else:
raise TypeError(f"Unexpected response type: {type(result)}")
@dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completions(self, request):
result = await self.completion_serving.create_completion(request)
if isinstance(result, AsyncGenerator):
async for raw_response in result:
if raw_response.startswith("data: [DONE]"):
break
response = json.loads(raw_response.lstrip("data: "))
yield response
# We should always be streaming so should never get here
elif isinstance(result, CompletionResponse):
raise RuntimeError("CompletionResponse support not implemented")
elif isinstance(result, ErrorResponse):
error = result.dict()
raise RuntimeError(
f"Error {error['code']}: {error['message']} "
f"(type: {error['type']}, param: {error['param']})"
)
else:
raise TypeError(f"Unexpected response type: {type(result)}")
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Instantiate a `vllm` component and serve the `chat/completions` and `completions` endpoints.
A `Component` can serve multiple endpoints.
"""
component = runtime.namespace("dynamo").component("vllm")
await component.create_service()
chat_endpoint = component.endpoint("chat/completions")
completions_endpoint = component.endpoint("completions")
async with build_async_engine_client_from_engine_args(engine_args) as engine_client:
model_config = await engine_client.get_model_config()
engine = VllmEngine(engine_client, model_config)
await asyncio.gather(
chat_endpoint.serve_endpoint(engine.generate_chat),
completions_endpoint.serve_endpoint(engine.generate_completions),
)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field
from huggingface_hub import snapshot_download
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
@dataclass
class NvAsyncEngineArgs(AsyncEngineArgs):
model_path: str = field(default="")
def parse_vllm_args() -> NvAsyncEngineArgs:
parser = FlexibleArgumentParser()
parser = AsyncEngineArgs.add_cli_args(parser)
parser.add_argument(
"--model-path",
type=str,
default="",
)
args = parser.parse_args()
if args.model_path == "":
if os.environ.get("HF_TOKEN"):
args.model_path = snapshot_download(args.model)
else:
raise ValueError(
"Please set HF_TOKEN environment variable "
"or pass --model-path to load the model"
)
return NvAsyncEngineArgs.from_cli_args(args)
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
PORT=8080
# list models
echo "\n\n### Listing models"
curl http://localhost:$PORT/v1/models
# create completion
echo "\n\n### Creating completions"
curl -X 'POST' \
"http://localhost:$PORT/v1/chat/completions" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"messages": [
{
"role":"user",
"content":"what is deep learning?"
}
],
"max_tokens": 64,
"stream": true,
"temperature": 0.7,
"top_p": 0.9,
"frequency_penalty": 0.1,
"presence_penalty": 0.2,
"top_k": 5
}'
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import uvloop
from preprocessor.common import parse_vllm_args
from dynamo.runtime import (
DistributedRuntime,
ModelDeploymentCard,
OAIChatPreprocessor,
dynamo_worker,
)
uvloop.install()
@dynamo_worker()
async def preprocessor(runtime: DistributedRuntime, model_name: str, model_path: str):
# create model deployment card
mdc = await ModelDeploymentCard.from_local_path(model_path, model_name)
# create preprocessor endpoint
component = runtime.namespace("dynamo").component("preprocessor")
await component.create_service()
endpoint = component.endpoint("generate")
# create backend endpoint
backend = runtime.namespace("dynamo").component("backend").endpoint("generate")
# start preprocessor service with next backend
chat = OAIChatPreprocessor(mdc, endpoint, next=backend)
await chat.start()
if __name__ == "__main__":
args = parse_vllm_args()
asyncio.run(preprocessor(args.model, args.model_path))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import inspect
import uuid
from contextlib import AsyncContextDecorator
from typing import Any
import uvloop
from preprocessor.common import NvAsyncEngineArgs, parse_vllm_args
from vllm import SamplingParams
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.outputs import CompletionOutput
from dynamo.runtime import (
Backend,
DistributedRuntime,
ModelDeploymentCard,
dynamo_endpoint,
dynamo_worker,
)
finish_reason_map = {
None: None,
"stop": "stop",
"abort": "cancelled",
"length": "length",
"error": "error",
}
class DeltaState:
"""
The vLLM AsyncEngine returns the full internal state of each slot per forward pass.
The OpenAI ChatCompletionResponseDelta object only requires the delta, so this object
is used to track the state of the last forward pass to calculate the delta.
"""
def __init__(self):
self.token_ids = None
self.last_token_count = 0
def delta(self, choice):
self.token_ids = choice.token_ids
tokens_produced = len(choice.token_ids) - self.last_token_count
self.last_token_count = len(choice.token_ids)
return choice.token_ids[-tokens_produced:]
class VllmEngine(AsyncContextDecorator):
"""
Request handler for the generate endpoint
"""
def __init__(self, engine_args: NvAsyncEngineArgs, mdc: ModelDeploymentCard):
self.mdc = mdc
self.engine_args = engine_args
print("vllm backend started")
async def __aenter__(self):
await self.async_init()
return self
async def __aexit__(self, exc_type, exc_value, traceback):
# Close the underlying engine client context before exiting
await self._engine_context.__aexit__(exc_type, exc_value, traceback)
print("vllm backend exited")
async def async_init(self):
self._engine_context = build_async_engine_client_from_engine_args(
self.engine_args, False
)
if self._engine_context is not None:
self.engine_client = await self._engine_context.__aenter__()
else:
raise RuntimeError("Failed to initialize engine client")
def to_backend_output(self, response: CompletionOutput, delta_token_ids: list[int]):
return {
"token_ids": delta_token_ids,
"tokens": [],
"finish_reason": finish_reason_map.get(response.finish_reason, "stop"),
"cum_log_probs": response.cumulative_logprob,
"text": None,
}
def to_sampling_params(self, request) -> SamplingParams:
sampling_params_names = inspect.signature(SamplingParams).parameters.keys()
sampling_params = {
k: v
for k, v in request.get("sampling_options", {}).items()
if k in sampling_params_names and v is not None
}
return SamplingParams(**sampling_params)
@dynamo_endpoint(Any, CompletionOutput)
async def generate(self, request):
state = DeltaState()
request_id = str(uuid.uuid4())
sampling_params = self.to_sampling_params(request)
inputs = {"prompt_token_ids": request["token_ids"]}
stream = self.engine_client.generate(
inputs, sampling_params, request_id=request_id
)
async for request_output in stream:
for choice in request_output.outputs:
delta_token_ids = state.delta(choice)
yield self.to_backend_output(choice, delta_token_ids)
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: NvAsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace("dynamo").component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
mdc = await ModelDeploymentCard.from_local_path(
engine_args.model_path, engine_args.model
)
async with VllmEngine(engine_args, mdc) as engine:
backend = Backend(mdc, endpoint)
await backend.start(engine.generate)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must use a single GPU for workers as CUDA_VISIBLE_DEVICES is set to a fixed value
# - Must use a single node
set -xe
if [ $# -lt 3 ]; then
echo "Usage: $0 <number_of_workers> <routing_strategy> <log_dir_name> [model_name] [model_args] [chat_endpoint_name] [completions_endpoint_name]"
echo "Error: Must specify at least number_of_workers, routing_strategy, and log_dir_name"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: model_args (quoted string with model arguments)"
echo "Optional: chat_endpoint_name (default: dynamo.vllm.chat/completions)"
echo "Optional: completions_endpoint_name (default: dynamo.vllm.completions)"
exit 1
fi
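# Example invocation (hypothetical values; script name is a placeholder):
# launch 2 workers with random routing, logging under /logs/run1
#   bash <this_script>.sh 2 random run1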
# If the model is already cached locally, this avoids checking the HF Hub
export HF_HUB_OFFLINE=1
export GLOO_SOCKET_IFNAME=lo
# Required for Qwen2.5 R1 Distilled in order to set --block-size 64 and --kv-cache-dtype fp8
uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.5/
export VLLM_ATTENTION_BACKEND=FLASHINFER
NUM_WORKERS=$1
ROUTING_STRATEGY=$2
LOG_DIR_NAME=$3
MODEL_NAME=${4:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
CUSTOM_MODEL_ARGS=$5
CHAT_ENDPOINT_NAME=${6:-"dynamo.vllm.chat/completions"}
COMPLETIONS_ENDPOINT_NAME=${7:-"dynamo.vllm.completions"}
VALID_STRATEGIES=("random")
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
INIT_CMD="source /opt/dynamo/venv/bin/activate && cd $WORKDIR"
# Default model args
DEFAULT_MODEL_ARGS="--model $MODEL_NAME \
--tokenizer $MODEL_NAME \
--enable-prefix-caching \
--block-size 64"
# Use custom model args if provided, otherwise use default
if [ -n "$CUSTOM_MODEL_ARGS" ]; then
MODEL_ARGS="$CUSTOM_MODEL_ARGS"
echo "Using custom model arguments"
else
MODEL_ARGS="$DEFAULT_MODEL_ARGS"
echo "Using default model arguments"
fi
# Create logs directory if it doesn't exist
LOGS_DIR="/logs/$LOG_DIR_NAME"
mkdir -p $LOGS_DIR
chmod -R 775 $LOGS_DIR
if [[ ! " ${VALID_STRATEGIES[@]} " =~ " ${ROUTING_STRATEGY} " ]]; then
echo "Error: Invalid routing strategy. Must be one of: ${VALID_STRATEGIES[*]}"
exit 1
fi
########################################################
# HTTP Server
########################################################
HTTP_CMD="DYN_LOG=DEBUG http |& tee $LOGS_DIR/http.log"
tmux new-session -d -s "$SESSION_NAME-http"
tmux send-keys -t "$SESSION_NAME-http" "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && \
llmctl http remove chat $MODEL_NAME && \
llmctl http remove completions $MODEL_NAME && \
llmctl http add chat $MODEL_NAME $CHAT_ENDPOINT_NAME && \
llmctl http add completions $MODEL_NAME $COMPLETIONS_ENDPOINT_NAME && \
llmctl http list |& tee $LOGS_DIR/llmctl.log"
tmux new-session -d -s "$SESSION_NAME-llmctl"
tmux send-keys -t "$SESSION_NAME-llmctl" "$INIT_CMD && $LLMCTL_CMD" C-m
########################################################
# Workers
########################################################
WORKER_CMD="RUST_LOG=info python3 -m monolith.worker $MODEL_ARGS"
for i in $(seq 1 $NUM_WORKERS); do
tmux new-session -d -s "$SESSION_NAME-$i"
done
for i in $(seq 1 $NUM_WORKERS); do
tmux send-keys -t "$SESSION_NAME-$i" "$INIT_CMD && CUDA_VISIBLE_DEVICES=$((i-1)) $WORKER_CMD |& tee $LOGS_DIR/worker-$i.log" C-m
done
#!/bin/bash -e
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must have at least 2 GPUs since CUDA_VISIBLE_DEVICES is hard-coded to 0 and 1
# - Must use a single node
if [ $# -gt 2 ]; then
echo "Usage: $0 [model_name] [endpoint_name]"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: endpoint_name (default: dynamo.vllm.generate)"
exit 1
fi
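# Example invocation (hypothetical; script name is a placeholder, passing the defaults explicitly):
#   bash <this_script>.sh deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.generate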
MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
ENDPOINT_NAME=${2:-"dynamo.vllm.generate"}
SESSION_NAME="vllm_disagg"
WORKDIR="$(dirname $0)/.."
INIT_CMD="cd $WORKDIR"
########################################################
# TMUX SESSION SETUP
########################################################
# Start new session
tmux new-session -d -s "$SESSION_NAME"
# Split into 4 equal panes
tmux split-window -h
tmux split-window -v
tmux select-pane -t 0
tmux split-window -v
########################################################
# HTTP Server
########################################################
HTTP_HOST="localhost"
HTTP_PORT=8080
HTTP_CMD="DYN_LOG=DEBUG http --host ${HTTP_HOST} --port ${HTTP_PORT}"
tmux select-pane -t 0
tmux send-keys "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && llmctl http remove chat-model $MODEL_NAME && \
llmctl http add chat-model $MODEL_NAME $ENDPOINT_NAME && \
llmctl http list chat-model"
tmux select-pane -t 1
tmux send-keys "$INIT_CMD && $LLMCTL_CMD" C-m
CURL_CMD="curl ${HTTP_HOST}:${HTTP_PORT}/v1/chat/completions \
-H \"Content-Type: application/json\" \
-d '{
\"model\": \"$MODEL_NAME\",
\"messages\": [
{\"role\": \"user\", \"content\": \"What is the capital of France?\"}
],
\"stream\": true,
\"max_tokens\": 10
}'"
# Prepare a curl command for a quick test, but don't execute it since the server
# needs to spin up first.
tmux send-keys "$CURL_CMD"
########################################################
# Processor
########################################################
# skip
########################################################
# Router
########################################################
# skip
########################################################
# Prefill
########################################################
PREFILL_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \
python3 -m disaggregated.prefill_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"DynamoNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_rank\":0,\"kv_parallel_size\":2}'"
tmux select-pane -t 2
tmux send-keys "$INIT_CMD && $PREFILL_CMD" C-m
########################################################
# Decode
########################################################
DECODE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1 \
python3 -m disaggregated.decode_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"DynamoNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_rank\":1,\"kv_parallel_size\":2}'"
tmux select-pane -t 3
tmux send-keys "$INIT_CMD && $DECODE_CMD" C-m
tmux attach-session -t "$SESSION_NAME"
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must use a single GPU for workers as CUDA_VISIBLE_DEVICES is set to a fixed value
# - Must use a single node
if [ $# -lt 2 ]; then
echo "Usage: $0 <number_of_workers> <log_dir_name> [model_name] [model_args] [chat_endpoint_name] [completions_endpoint_name]"
echo "Error: Must specify at least number_of_workers and log_dir_name"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: model_args (quoted string with model arguments)"
echo "Optional: chat_endpoint_name (default: dynamo.process.chat/completions)"
echo "Optional: completions_endpoint_name (default: dynamo.process.completions)"
exit 1
fi
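# Example invocation (hypothetical values; script name is a placeholder):
# launch 2 workers, logging under /logs/kv_run
#   bash <this_script>.sh 2 kv_run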
# Uncomment if the model is already cached locally
# export HF_HUB_OFFLINE=1
# https://github.com/vllm-project/vllm/issues/10734#issuecomment-2507201353
# Fix for: torch.distributed.DistBackendError: File name too long
# export GLOO_SOCKET_IFNAME=lo
NUM_WORKERS=$1
LOG_DIR_NAME=$2
MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
CUSTOM_MODEL_ARGS=$4
CHAT_ENDPOINT_NAME=${5:-"dynamo.process.chat/completions"}
COMPLETIONS_ENDPOINT_NAME=${6:-"dynamo.process.completions"}
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
INIT_CMD="cd $WORKDIR"
# Default model args
DEFAULT_MODEL_ARGS="--model $MODEL_NAME \
--tokenizer $MODEL_NAME \
--enable-prefix-caching \
--block-size 64"
# Use custom model args if provided, otherwise use default
if [ -n "$CUSTOM_MODEL_ARGS" ]; then
MODEL_ARGS="$CUSTOM_MODEL_ARGS"
echo "Using custom model arguments"
else
MODEL_ARGS="$DEFAULT_MODEL_ARGS"
echo "Using default model arguments"
fi
# Create logs directory if it doesn't exist
LOGS_DIR="/logs/$LOG_DIR_NAME"
mkdir -p $LOGS_DIR
chmod -R 775 $LOGS_DIR
########################################################
# HTTP Server
########################################################
HTTP_CMD="DYN_LOG=DEBUG http |& tee $LOGS_DIR/http.log"
tmux new-session -d -s "$SESSION_NAME-http"
tmux send-keys -t "$SESSION_NAME-http" "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && \
llmctl http remove chat $MODEL_NAME && \
llmctl http remove completions $MODEL_NAME && \
llmctl http add chat $MODEL_NAME $CHAT_ENDPOINT_NAME && \
llmctl http add completions $MODEL_NAME $COMPLETIONS_ENDPOINT_NAME && \
llmctl http list |& tee $LOGS_DIR/llmctl.log"
tmux new-session -d -s "$SESSION_NAME-llmctl"
tmux send-keys -t "$SESSION_NAME-llmctl" "$INIT_CMD && $LLMCTL_CMD" C-m
########################################################
# Processor
########################################################
PROCESSOR_CMD="RUST_LOG=info python3 -m kv_router.processor $MODEL_ARGS |& tee $LOGS_DIR/processor.log"
tmux new-session -d -s "$SESSION_NAME-processor"
tmux send-keys -t "$SESSION_NAME-processor" "$INIT_CMD && $PROCESSOR_CMD" C-m
########################################################
# Router
########################################################
ROUTER_CMD="RUST_LOG=info python3 -m kv_router.router \
--min-workers $NUM_WORKERS |& tee $LOGS_DIR/router.log"
tmux new-session -d -s "$SESSION_NAME-router"
tmux send-keys -t "$SESSION_NAME-router" "$INIT_CMD && $ROUTER_CMD" C-m
########################################################
# Workers
########################################################
WORKER_CMD="RUST_LOG=info python3 -m kv_router.worker $MODEL_ARGS"
for i in $(seq 1 $NUM_WORKERS); do
tmux new-session -d -s "$SESSION_NAME-$i"
done
for i in $(seq 1 $NUM_WORKERS); do
tmux send-keys -t "$SESSION_NAME-$i" "$INIT_CMD && CUDA_VISIBLE_DEVICES=$((i-1)) $WORKER_CMD |& tee $LOGS_DIR/worker-$i.log" C-m
done
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
## Overview
Pipeline Architecture:
```
Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Backend │
└─────────────┘
```
## Unified serve
1. Launch all three services with a single command:
```bash
cd /workspace/examples/python_rs/llm/vllm
dynamo-sdk serve sdk_basic_service.basic:Frontend
```
2. Send a request to the frontend using curl:
```bash
curl -X 'POST' \
'http://localhost:3000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"text": "test"
}'
```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pydantic import BaseModel
from dynamo.sdk import api, depends, dynamo_endpoint, service
"""
Pipeline Architecture:
Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Backend │
└─────────────┘
"""
class RequestType(BaseModel):
text: str
class ResponseType(BaseModel):
text: str
@service(
resources={"cpu": "2"},
traffic={"timeout": 30},
dynamo={
"enabled": True,
"namespace": "inference",
},
workers=3,
)
class Backend:
def __init__(self) -> None:
print("Starting backend")
@dynamo_endpoint()
async def generate(self, req: RequestType):
"""Generate tokens."""
req_text = req.text
print(f"Backend received: {req_text}")
text = f"{req_text}-back"
for token in text.split():
yield f"Backend: {token}"
@service(
resources={"cpu": "2"},
traffic={"timeout": 30},
dynamo={"enabled": True, "namespace": "inference"},
)
class Middle:
backend = depends(Backend)
def __init__(self) -> None:
print("Starting middle")
@dynamo_endpoint()
async def generate(self, req: RequestType):
"""Forward requests to backend."""
req_text = req.text
print(f"Middle received: {req_text}")
text = f"{req_text}-mid"
next_request = RequestType(text=text).model_dump_json()
async for response in self.backend.generate(next_request):
print(f"Middle received response: {response}")
yield f"Middle: {response}"
@service(resources={"cpu": "1"}, traffic={"timeout": 60}) # Regular HTTP API
class Frontend:
middle = depends(Middle)
def __init__(self) -> None:
print("Starting frontend")
@api
async def generate(self, text):
"""Stream results from the pipeline."""
print(f"Frontend received: {text}")
print(f"Frontend received type: {type(text)}")
txt = RequestType(text=text)
print(f"Frontend sending: {type(txt)}")
async for response in self.middle.generate(txt.model_dump_json()):
yield f"Frontend: {response}"
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
Run this example using the command below:
```bash
cd /workspace/examples/python_rs/llm/vllm
dynamo-sdk serve sdk_kv_router.frontend:Frontend
```
Send a request to the HTTP service:
```bash
curl -X 'POST' \
'http://localhost:3000/chat_completion' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"msg": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}'
```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sdk_kv_router.processor import Processor
from dynamo.sdk import DYNAMO_IMAGE, api, depends, service
@service(traffic={"timeout": 10000}, image=DYNAMO_IMAGE)
class Frontend:
processor = depends(Processor)
def __init__(self):
print("frontend init")
@api
async def chat_completion(self, msg: str):
# Call the generate method
generator = self.processor.generate(
{
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"messages": [{"role": "user", "content": msg}],
"stream": True,
"max_tokens": 10,
}
)
# Now iterate over the async generator
async for response in generator:
print("client response_data:", response)
yield response
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import uuid
from typing import AsyncIterator
import bentoml
from sdk_kv_router.router import Router
from sdk_kv_router.worker import VllmEngine
with bentoml.importing():
from transformers import AutoTokenizer
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer
from common.chat_processor import ChatProcessor, ProcessMixIn
from common.protocol import MyRequestOutput, Tokens, vLLMGenerateRequest
from dynamo.sdk import depends, dynamo_context, dynamo_endpoint, service
@service(
dynamo={
"enabled": True,
"namespace": "dynamo",
},
resources={"cpu": "10", "memory": "20Gi"},
workers=1,
)
class Processor(ProcessMixIn):
"""
vLLM pre and post processing
"""
workers = depends(VllmEngine)
router = depends(Router)
def __init__(self):
model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
self.engine_args = AsyncEngineArgs(
model=model,
tokenizer=model,
enable_prefix_caching=True,
block_size=64,
max_model_len=16384,
)
self.model_config = self.engine_args.create_model_config()
self.tokenizer = self._create_tokenizer()
self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
def _create_tokenizer(self) -> AnyTokenizer:
"""Create a TokenizerGroup using engine arguments similar to VLLM's approach"""
model_path = self.engine_args.model
# Create the base tokenizer with VLLM's typical settings
base_tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
padding_side="left",
truncation_side="left",
use_fast=True, # VLLM might use the fast tokenizer for efficiency
)
return base_tokenizer
async def generate_responses(
self, engine_generator
) -> AsyncIterator[RequestOutput]:
async for resp in engine_generator:
# Deserialize the response from the engine
# Creates correct vLLM objects for each field
output = MyRequestOutput.model_validate_json(resp.data())
yield RequestOutput(
request_id=output.request_id,
prompt=output.prompt,
prompt_token_ids=output.prompt_token_ids,
prompt_logprobs=output.prompt_logprobs,
outputs=output.outputs,
finished=output.finished,
metrics=output.metrics,
)
@dynamo_endpoint()
async def generate(self, raw_request: ChatCompletionRequest):
request_id = str(uuid.uuid4())
(
request,
conversation,
prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
worker_id = None
async for worker in self.router.generate(
Tokens(tokens=engine_prompt["prompt_token_ids"]).model_dump_json()
):
worker_id = worker
break
runtime = dynamo_context["runtime"]
comp_ns, comp_name = VllmEngine.dynamo_address() # type: ignore
worker_client = (
await runtime.namespace(comp_ns)
.component(comp_name)
.endpoint("generate")
.client()
)
if worker_id == "":
engine_generator = await worker_client.generate(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json()
)
else:
engine_generator = await worker_client.direct(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json(),
uuid.UUID(worker_id).int,
)
output = self.generate_responses(engine_generator)
async for response in await self._stream_response(
request, output, request_id, conversation
):
yield response