refactor: remove old examples with old UX (#1899)

f00d700e · Alec · GitHub · c7080419 · c7080419 · c7080419
Unverified Commit f00d700e authored Jul 14, 2025 by Alec Committed by GitHub Jul 14, 2025
20 changed files
--- a/examples/llm/components/planner.py
+++ b/examples/llm/components/planner.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import asyncio
-import json
-import logging
-import os
-import time
-from datetime import datetime
-from typing import Any, List
-import numpy as np
-from rich.console import Console
-from rich.table import Table
-from tensorboardX import SummaryWriter
-from utils.prefill_queue import PrefillQueue
-from dynamo.llm import KvMetricsAggregator
-from dynamo.planner import KubernetesConnector, LocalConnector
-from dynamo.planner.defaults import LoadPlannerDefaults
-from dynamo.runtime import DistributedRuntime, dynamo_worker
-from dynamo.runtime.logging import configure_dynamo_logging
-configure_dynamo_logging()
-logger = logging.getLogger(__name__)
-# Grace period, counted in make_adjustments() passes, during which decode workers are
-# not scaled down after a decode worker has been added. This leaves time for the new
-# decode worker to populate its KV cache.
-# NOTE: make_adjustments() decrements the counter at the end of the same pass that
-# sets it, so (when no pass returns early) scale-down is skipped on the next two passes.
-NEW_DECODE_WORKER_GRACE_PERIOD = 3
-# Look-ahead, in adjustment intervals, used before adding a prefill worker. The change
-# in queue size observed over the current interval (last sample minus first sample) is
-# extrapolated this many intervals ahead; a prefill worker is only added when the
-# extrapolated queue size is still above --prefill-queue-scale-up-threshold.
-# This accounts for the time prefill workers need to start.
-NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD = 3
-class Planner:
-    def __init__(self, runtime: DistributedRuntime, args: argparse.Namespace):
-        """Set up the connector, metric bookkeeping and TensorBoard writer.
-
-        Args:
-            runtime: distributed runtime used to look up components and clients.
-            args: parsed planner options. ``args.log_dir`` is filled in with a
-                timestamped default when it is ``None``.
-
-        Raises:
-            ValueError: if ``args.environment`` is neither "local" nor "kubernetes".
-        """
-        self.runtime = runtime
-        self.args = args
-        self.namespace = args.namespace
-        # Reject unknown environments before any connector is created.
-        if args.environment not in ("local", "kubernetes"):
-            raise ValueError(f"Invalid environment: {args.environment}")
-        self.connector = (
-            LocalConnector(args.namespace, runtime)
-            if args.environment == "local"
-            else KubernetesConnector(args.namespace)
-        )
-        nats_default = "nats://localhost:4222"
-        self._prefill_queue_nats_server = os.getenv("NATS_SERVER", nats_default)
-        self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"
-        # Clients are created lazily by get_workers_info().
-        self.prefill_client: Any | None = None
-        self.workers_client: Any | None = None
-        self.p_endpoints: List[int] = []
-        self.d_endpoints: List[int] = []
-        self.decode_worker_remaining_grace_period = 0
-        if args.log_dir is None:
-            timestamp = datetime.now().strftime("%m%d_%H%M%S")
-            args.log_dir = f"logs/{timestamp}"
-        self.writer = SummaryWriter(args.log_dir)
-        logger.info(f"Components present in namespace: {args.namespace}")
-        self.init_time = time.time()
-        # Repeated metric messages go to DEBUG in no-operation mode, INFO otherwise.
-        if args.no_operation:
-            self._repeating_log_func = logger.debug
-        else:
-            self._repeating_log_func = logger.info
-    async def set_metric_aggregator(self):
-        """Create the "VllmWorker" component service and attach a KV metrics aggregator to it."""
-        # TODO: separate KV metrics and prefill metrics
-        planner_namespace = self.runtime.namespace(self.namespace)
-        worker_component = planner_namespace.component("VllmWorker")
-        await worker_component.create_service()
-        self.metrics_aggregator = KvMetricsAggregator(worker_component)
-    async def get_workers_info(self):
-        """Return the instance ids of the prefill and decode workers.
-
-        The endpoint clients are created on first use and cached on the instance.
-
-        Returns:
-            A ``(p_endpoints, d_endpoints)`` tuple. ``p_endpoints`` is an empty list
-            when the prefill worker lookup fails for any reason.
-
-        Raises:
-            RuntimeError: if the decode worker lookup fails; the original exception
-                is attached as ``__cause__``.
-        """
-        try:
-            if self.prefill_client is None:
-                self.prefill_client = (
-                    await self.runtime.namespace(self.namespace)
-                    .component("PrefillWorker")
-                    .endpoint("mock")
-                    .client()
-                )
-                # TODO: remove this sleep after rust client() is blocking until watching state
-                await asyncio.sleep(0.1)
-            # TODO: use etcd events instead of pulling instance_ids
-            p_endpoints = self.prefill_client.instance_ids()
-        except Exception:
-            # Best effort: a failed prefill lookup is treated as "no prefill workers".
-            p_endpoints = []
-            self._repeating_log_func(
-                "No prefill workers found, operating in aggregated mode"
-            )
-        try:
-            if self.workers_client is None:
-                self.workers_client = (
-                    await self.runtime.namespace(self.namespace)
-                    .component("VllmWorker")
-                    .endpoint("generate")
-                    .client()
-                )
-                # TODO: remove this sleep after rust client() is blocking until watching state
-                await asyncio.sleep(0.1)
-            # TODO: use etcd events instead of pulling instance_ids
-            d_endpoints = self.workers_client.instance_ids()
-        except Exception as e:
-            # Chain explicitly so the traceback shows the lookup failure as the direct cause.
-            raise RuntimeError(f"Failed to get decode worker endpoints: {e}") from e
-        return p_endpoints, d_endpoints
-    async def reset_adjustment_interval(self):
-        """Start a new adjustment interval: refresh worker lists and clear collected samples."""
-        elapsed = time.time() - self.init_time
-        self._repeating_log_func(
-            f"Reset metrics for new adjustment interval at t={elapsed:.1f}s"
-        )
-        prefill_ids, decode_ids = await self.get_workers_info()
-        self.p_endpoints = prefill_ids
-        self.d_endpoints = decode_ids
-        self._repeating_log_func(
-            f"Number of prefill workers: {len(prefill_ids)}, number of decode workers: {len(decode_ids)}"
-        )
-        # Samples gathered by collect_metrics() during the interval that starts now.
-        self.metrics_collection_time, self.prefill_queue_load, self.kv_load = [], [], []
-        self.last_adjustment_time = time.time()
-    async def collect_metrics(self):
-        """Sample the prefill queue size, per-endpoint KV load and worker counts.
-
-        Samples are appended to ``self.prefill_queue_load`` / ``self.kv_load`` and
-        mirrored to the TensorBoard writer. Failures while sampling the prefill queue
-        or while processing KV metrics are logged and do not abort the collection.
-        An error from ``metrics_aggregator.get_metrics()`` or ``get_workers_info()``
-        still propagates to the caller.
-        """
-        self._repeating_log_func(
-            f"Collecting metrics at t={time.time() - self.init_time:.1f}s"
-        )
-        # collect prefill queue load
-        try:
-            async with PrefillQueue.get_instance(
-                nats_server=self._prefill_queue_nats_server,
-                stream_name=self._prefill_queue_stream_name,
-            ) as prefill_queue:
-                prefill_queue_size = await prefill_queue.get_queue_size()
-                measure_time = time.time() - self.init_time
-            self.prefill_queue_load.append(prefill_queue_size)
-            self._repeating_log_func(
-                f"Collected prefill queue size at t={measure_time:.1f}s: {int(prefill_queue_size)}"
-            )
-            self.writer.add_scalar(
-                "prefill_queue_size", prefill_queue_size, measure_time
-            )
-        except Exception as e:
-            self._repeating_log_func(
-                f"Failed to collect prefill queue size metrics: {e}"
-            )
-        # collect kv load
-        total_active_requests: int = 0
-        total_queued_requests: int = 0
-        metrics = await self.metrics_aggregator.get_metrics()
-        try:
-            prev_kv_load_len = len(self.kv_load)
-            for endpoint in metrics.endpoints:
-                kv_load = getattr(endpoint, "gpu_cache_usage_perc", 0.0)
-                num_requests_waiting = getattr(endpoint, "num_requests_waiting", 0)
-                total_queued_requests += num_requests_waiting
-                request_active_slots = getattr(endpoint, "request_active_slots", None)
-                if request_active_slots:
-                    total_active_requests += request_active_slots
-                    if num_requests_waiting > 0:
-                        # estimate kv load after waiting requests are scheduled based on current isl/osl
-                        # TODO: use actual isl/osl estimation after the request_active_slot bug in disaggg is fixed
-                        # Currently, we assume each request uses 0.02 kv cache
-                        # kv_load = kv_load * (request_active_slots + num_requests_waiting) / request_active_slots
-                        kv_load = kv_load + 0.02 * num_requests_waiting
-                self.kv_load.append(kv_load)
-            # Only the samples appended by this call.
-            new_kv_loads = self.kv_load[prev_kv_load_len:]
-            measure_time = time.time() - self.init_time
-            self._repeating_log_func(
-                f"Collected kv load at t={measure_time:.1f}s: {new_kv_loads} (act/pnd req: {total_active_requests}/{total_queued_requests})"
-            )
-            # np.mean of an empty list returns NaN and emits a RuntimeWarning, so the
-            # average is only recorded when at least one endpoint reported metrics.
-            if new_kv_loads:
-                average_kv_load = np.mean(new_kv_loads)
-                self.writer.add_scalar("average_kv_load", average_kv_load, measure_time)
-            self.writer.add_scalar(
-                "total_queued_requests", total_queued_requests, measure_time
-            )
-        except Exception as e:
-            self._repeating_log_func(f"Failed to collect kv load metrics: {e}")
-        p_endpoints, d_endpoints = await self.get_workers_info()
-        self.writer.add_scalar(
-            "num_prefill_workers", len(p_endpoints), time.time() - self.init_time
-        )
-        self.writer.add_scalar(
-            "num_decode_workers", len(d_endpoints), time.time() - self.init_time
-        )
-        curr_gpu_usage = (
-            len(p_endpoints) * self.args.prefill_engine_num_gpu
-            + len(d_endpoints) * self.args.decode_engine_num_gpu
-        )
-        self.writer.add_scalar("num_gpu", curr_gpu_usage, time.time() - self.init_time)
-        # run() uses the last timestamp to decide when the next pull is due.
-        self.metrics_collection_time.append(time.time())
-    async def make_adjustments(self):
-        """Scale prefill/decode workers based on the samples of the current interval.
-
-        Nothing is changed when the number of prefill or decode workers differs from
-        the numbers recorded at the start of the interval. Otherwise scale-down is
-        evaluated first, then scale-up (prefill before decode), subject to
-        ``args.min_endpoint`` and ``args.max_gpu_budget``.
-
-        Missing data is handled explicitly:
-          * with no prefill workers registered, the total queue size is used as the
-            per-worker load instead of dividing by zero;
-          * with no samples for a metric, its average is NaN, so every threshold
-            comparison for that metric is False and no action is taken.
-        """
-        # Note: all adjustments are blocking. Non-blocking adjustment and metric pulling
-        # make the optimization problem too complex and should not be needed in most cases.
-        logger.info(f"Making adjustments at t={time.time() - self.init_time:.1f}s")
-        # check if decode/prefill workers is still the same
-        # note that we only check length as endpoint ids might change
-        new_p_endpoints, new_d_endpoints = await self.get_workers_info()
-        if len(new_p_endpoints) != len(self.p_endpoints) or len(new_d_endpoints) != len(
-            self.d_endpoints
-        ):
-            logger.info("Decode/prefill workers changed, no adjustments will be made")
-            return
-        # compute current gpu usage
-        curr_gpu_usage = (
-            len(self.p_endpoints) * self.args.prefill_engine_num_gpu
-            + len(self.d_endpoints) * self.args.decode_engine_num_gpu
-        )
-        logger.info(f"Current engines use {curr_gpu_usage} GPUs")
-        no_data = float("nan")
-        if self.prefill_queue_load:
-            # max(..., 1) avoids a division by zero when no prefill worker is registered.
-            avg_prefill_queue_load = float(np.mean(self.prefill_queue_load)) / max(
-                len(self.p_endpoints), 1
-            )
-        else:
-            avg_prefill_queue_load = no_data
-        avg_kv_load = float(np.mean(self.kv_load)) if self.kv_load else no_data
-        # first check if we need to scale down any workers
-        if (
-            avg_prefill_queue_load < self.args.prefill_queue_scale_down_threshold
-            and len(self.p_endpoints) > self.args.min_endpoint
-        ):
-            logger.info(
-                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is below threshold ({self.args.prefill_queue_scale_down_threshold:.2f}), scaling down prefill workers"
-            )
-            success = await self.connector.remove_component("PrefillWorker")
-            if success:
-                curr_gpu_usage -= self.args.prefill_engine_num_gpu
-            else:
-                logger.info("Failed to scale down prefill worker")
-        if (
-            avg_kv_load < self.args.decode_kv_scale_down_threshold
-            and len(self.d_endpoints) > self.args.min_endpoint
-        ):
-            if self.decode_worker_remaining_grace_period > 0:
-                logger.info(
-                    f"Decode worker remaining grace period is {self.decode_worker_remaining_grace_period}, skipping scale down"
-                )
-            else:
-                logger.info(
-                    f"Average kv load ({avg_kv_load:.2f}) is below threshold ({self.args.decode_kv_scale_down_threshold:.2f}), scaling down decode workers"
-                )
-                success = await self.connector.remove_component("VllmWorker")
-                if success:
-                    curr_gpu_usage -= self.args.decode_engine_num_gpu
-                else:
-                    logger.info("Failed to scale down decode worker")
-        # check if we need to scale up workers
-        # we first check for prefill worker because prefill queueing can also lead
-        # to high kv load on decode workers
-        if (
-            avg_prefill_queue_load > self.args.prefill_queue_scale_up_threshold
-            and curr_gpu_usage + self.args.prefill_engine_num_gpu
-            <= self.args.max_gpu_budget
-        ):
-            logger.info(
-                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is above threshold ({self.args.prefill_queue_scale_up_threshold:.2f})"
-            )
-            # check prefill queue size trend:
-            # (prefill_queue_load is non-empty here, otherwise the average would be NaN)
-            prefill_queue_size_change = (
-                self.prefill_queue_load[-1] - self.prefill_queue_load[0]
-            )
-            predicted_prefill_future_queue_size = (
-                self.prefill_queue_load[-1]
-                + prefill_queue_size_change * NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD
-            )
-            if (
-                predicted_prefill_future_queue_size
-                > self.args.prefill_queue_scale_up_threshold
-            ):
-                logger.info(
-                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is also above threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), scaling up prefill workers"
-                )
-                success = await self.connector.add_component("PrefillWorker")
-                if success:
-                    curr_gpu_usage += self.args.prefill_engine_num_gpu
-                else:
-                    logger.info("Failed to scale up prefill worker")
-            else:
-                logger.info(
-                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is below threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), skipping prefill worker scaling"
-                )
-        if (
-            avg_kv_load > self.args.decode_kv_scale_up_threshold
-            and curr_gpu_usage + self.args.decode_engine_num_gpu
-            <= self.args.max_gpu_budget
-        ):
-            logger.info(
-                f"Average kv load ({avg_kv_load:.2f}) is above threshold ({self.args.decode_kv_scale_up_threshold:.2f}), scaling up decode workers"
-            )
-            success = await self.connector.add_component("VllmWorker")
-            if success:
-                curr_gpu_usage += self.args.decode_engine_num_gpu
-                # Block decode scale-down while the new worker warms up.
-                self.decode_worker_remaining_grace_period = (
-                    NEW_DECODE_WORKER_GRACE_PERIOD
-                )
-            else:
-                logger.info("Failed to scale up decode worker")
-        # no adjustment needed, just log the current metrics
-        if (
-            avg_prefill_queue_load > self.args.prefill_queue_scale_down_threshold
-            and avg_prefill_queue_load < self.args.prefill_queue_scale_up_threshold
-        ):
-            logger.info(
-                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is within threshold, no prefill worker scaling needed"
-            )
-        if (
-            avg_kv_load > self.args.decode_kv_scale_down_threshold
-            and avg_kv_load < self.args.decode_kv_scale_up_threshold
-        ):
-            logger.info(
-                f"Average kv load ({avg_kv_load:.2f}) is within threshold, no decode worker scaling needed"
-            )
-        logger.info(f"Engines after adjustment use {curr_gpu_usage} GPUs")
-        # The grace period is counted in adjustment passes, including the pass that set it.
-        if self.decode_worker_remaining_grace_period > 0:
-            self.decode_worker_remaining_grace_period -= 1
-    async def run(self):
-        """Run the planner loop forever: pull metrics periodically and adjust once per interval."""
-        await self.set_metric_aggregator()
-        if self._repeating_log_func == logger.debug:
-            logger.info(
-                "Running in no-operation mode - detailed metrics will be logged at DEBUG level"
-            )
-        await self.reset_adjustment_interval()
-        while True:
-            now = time.time()
-            # Pull metrics on the first iteration of an interval, then once per pulling interval.
-            samples = self.metrics_collection_time
-            if not samples or now - samples[-1] >= self.args.metric_pulling_interval:
-                await self.collect_metrics()
-            # `now` was taken before collecting, so collection time does not shift the check.
-            interval_elapsed = now - self.last_adjustment_time
-            if interval_elapsed >= self.args.adjustment_interval:
-                if not self.args.no_operation:
-                    # adjustments are awaited (blocking) to avoid overcompensation
-                    await self.make_adjustments()
-                await self.reset_adjustment_interval()
-            # Poll at a tenth of the pulling interval instead of busy waiting.
-            await asyncio.sleep(self.args.metric_pulling_interval / 10)
-# @dynamo_worker()
-# TODO: make the planner work via CLI invocation as well
-async def start_planner(runtime: DistributedRuntime, args: argparse.Namespace):
-    """Print the components registered under ``args.namespace`` and run the planner.
-
-    This coroutine only returns if ``Planner.run()`` does; as written that loop never exits.
-    """
-    planner = Planner(runtime, args)
-    console = Console()
-    table = Table()
-    for heading, colour in (("Component", "cyan"), ("Endpoint", "green")):
-        table.add_column(heading, style=colour)
-    entries = await runtime.etcd_client().kv_get_prefix(args.namespace)
-    for entry in entries:
-        try:
-            payload = json.loads(entry["value"].decode("utf-8"))
-            if "component" not in payload:
-                continue
-            table.add_row(payload["component"], payload["endpoint"])
-        except Exception:
-            # Some entries may not be valid JSON or might be binary data
-            continue
-    console.print(table)
-    await planner.run()
-if __name__ == "__main__":
-    # Each entry is (flag, argparse keyword arguments). The default of every option is
-    # the LoadPlannerDefaults attribute named after the flag (dashes -> underscores).
-    _ARGUMENT_SPECS = (
-        # Common planner arguments
-        ("--namespace", {"type": str, "help": "Namespace planner will look at"}),
-        (
-            "--environment",
-            {
-                "type": str,
-                "help": "Environment to run the planner in (local, kubernetes)",
-            },
-        ),
-        (
-            "--no-operation",
-            {
-                "action": "store_true",
-                "help": "Do not make any adjustments, just observe the metrics",
-            },
-        ),
-        ("--log-dir", {"type": str, "help": "Tensorboard logging directory"}),
-        (
-            "--adjustment-interval",
-            {"type": int, "help": "Interval in seconds between scaling adjustments"},
-        ),
-        ("--max-gpu-budget", {"type": int, "help": "Maximum number of GPUs to use"}),
-        (
-            "--min-endpoint",
-            {
-                "type": int,
-                "help": "Minimum number of endpoints to keep for prefill/decode workers",
-            },
-        ),
-        (
-            "--metric-pulling-interval",
-            {"type": int, "help": "Interval in seconds between metric pulls"},
-        ),
-        (
-            "--decode-engine-num-gpu",
-            {"type": int, "help": "Number of GPUs per decode engine"},
-        ),
-        (
-            "--prefill-engine-num-gpu",
-            {"type": int, "help": "Number of GPUs per prefill engine"},
-        ),
-        # Load-planner specific arguments
-        (
-            "--decode-kv-scale-up-threshold",
-            {
-                "type": float,
-                "help": "KV cache utilization threshold to scale up decode workers",
-            },
-        ),
-        (
-            "--decode-kv-scale-down-threshold",
-            {
-                "type": float,
-                "help": "KV cache utilization threshold to scale down decode workers",
-            },
-        ),
-        (
-            "--prefill-queue-scale-up-threshold",
-            {
-                "type": float,
-                "help": "Queue utilization threshold to scale up prefill workers, this threshold is per prefill worker",
-            },
-        ),
-        (
-            "--prefill-queue-scale-down-threshold",
-            {
-                "type": float,
-                "help": "Queue utilization threshold to scale down prefill workers, this threshold is per prefill worker",
-            },
-        ),
-    )
-    parser = argparse.ArgumentParser()
-    for _flag, _options in _ARGUMENT_SPECS:
-        _default = getattr(LoadPlannerDefaults, _flag.lstrip("-").replace("-", "_"))
-        parser.add_argument(_flag, default=_default, **_options)
-    args = parser.parse_args()
-    asyncio.run(dynamo_worker()(start_planner)(args))
--- a/examples/llm/components/planner_service.py
+++ b/examples/llm/components/planner_service.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import logging
-from pydantic import BaseModel
-from components.planner import start_planner  # type: ignore[attr-defined]
-from dynamo.planner.defaults import LoadPlannerDefaults
-from dynamo.runtime.logging import configure_dynamo_logging
-from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
-from dynamo.sdk.core.protocol.interface import ComponentType
-from dynamo.sdk.lib.config import ServiceConfig
-from dynamo.sdk.lib.image import DYNAMO_IMAGE
-logger = logging.getLogger(__name__)
-class RequestType(BaseModel):
-    """Request payload accepted by the planner's placeholder ``generate`` endpoint."""
-    # Request text; the placeholder endpoint does not read it.
-    text: str
-@service(
-    dynamo={
-        "namespace": "dynamo",
-        "component_type": ComponentType.PLANNER,
-    },
-    resources={"cpu": "10", "memory": "20Gi"},
-    workers=1,
-    image=DYNAMO_IMAGE,
-)
-class Planner:
-    """Service wrapper that builds planner arguments from the service config and starts the planner."""
-
-    def __init__(self):
-        """Read the "Planner" config section and build the ``argparse.Namespace`` for start_planner.
-
-        Every option falls back to the LoadPlannerDefaults attribute of the same name
-        (dashes replaced by underscores) when it is absent from the config.
-        """
-        configure_dynamo_logging(service_name="Planner")
-        logger.info("Starting planner")
-        self.runtime = dynamo_context["runtime"]
-        config = ServiceConfig.get_instance()
-        # Get namespace directly from dynamo_context as it contains the active namespace
-        self.namespace = dynamo_context["namespace"]
-        config_instance = config.get("Planner", {})
-        option_keys = (
-            "environment",
-            "no-operation",
-            "log-dir",
-            "adjustment-interval",
-            "metric-pulling-interval",
-            "max-gpu-budget",
-            "min-endpoint",
-            "decode-kv-scale-up-threshold",
-            "decode-kv-scale-down-threshold",
-            "prefill-queue-scale-up-threshold",
-            "prefill-queue-scale-down-threshold",
-            "decode-engine-num-gpu",
-            "prefill-engine-num-gpu",
-        )
-        resolved = {}
-        for key in option_keys:
-            attr = key.replace("-", "_")
-            resolved[attr] = config_instance.get(key, getattr(LoadPlannerDefaults, attr))
-        self.args = argparse.Namespace(namespace=self.namespace, **resolved)
-
-    @async_on_start
-    async def async_init(self):
-        """Wait 30 seconds, then run the planner from the start-up hook."""
-        import asyncio
-
-        # NOTE(review): presumably this delay lets the other components register first - confirm.
-        await asyncio.sleep(30)
-        logger.info("Calling start_planner")
-        # NOTE(review): start_planner() awaits the planner's run loop, so the log line
-        # below is only reached if that loop returns - confirm this is intended.
-        await start_planner(self.runtime, self.args)
-        logger.info("Planner started")
-
-    @endpoint()
-    async def generate(self, request: RequestType):
-        """Dummy endpoint to satisfy that each component has an endpoint"""
-        yield "mock endpoint"
--- a/examples/llm/components/prefill_worker.py
+++ b/examples/llm/components/prefill_worker.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import asyncio
-import logging
-import os
-import signal
-import sys
-from pydantic import BaseModel
-from utils.nixl import NixlMetadataStore
-from utils.prefill_queue import PrefillQueue
-from utils.vllm import parse_vllm_args
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args,
-)
-from vllm.inputs.data import TokensPrompt
-from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
-from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
-logger = logging.getLogger(__name__)
-class RequestType(BaseModel):
-    """Simple text request model declared by the prefill worker module."""
-    # Request text.
-    text: str
-@service(
-    dynamo={
-        "namespace": "dynamo",
-    },
-    resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
-    workers=1,
-)
-class PrefillWorker:
-    def __init__(self):
-        """Parse the vLLM engine arguments and force the settings the prefill worker requires."""
-        class_name = self.__class__.__name__
-        self.engine_args = parse_vllm_args(class_name, "")
-        self._loaded_metadata = set()
-        self.initialized = False
-        # (engine argument, forced value, message logged when the value is overridden)
-        forced_settings = (
-            (
-                "enable_chunked_prefill",
-                False,
-                "Chunked prefill is not supported yet, setting to False",
-            ),
-            (
-                "pipeline_parallel_size",
-                1,
-                "Pipeline parallel size is not supported yet, setting to 1",
-            ),
-            (
-                "disable_async_output_proc",
-                True,
-                "Async output processing is not supported yet, setting to True",
-            ),
-            ("enforce_eager", True, "Prefill must be done eagerly, setting to True"),
-            (
-                "enable_prefix_caching",
-                False,
-                "Prefix caching is not supported yet in prefill worker, setting to False",
-            ),
-        )
-        for arg_name, forced_value, notice in forced_settings:
-            current = getattr(self.engine_args, arg_name)
-            # Boolean flags are compared by identity so that None/0/1 are overridden too;
-            # the integer setting is compared by value.
-            if isinstance(forced_value, bool):
-                already_set = current is forced_value
-            else:
-                already_set = current == forced_value
-            if already_set:
-                continue
-            logger.info(notice)
-            setattr(self.engine_args, arg_name, forced_value)
-    @async_on_start
-    async def async_init(self):
-        """Start the vLLM engine client, publish its NIXL metadata and start the queue handler.
-
-        Also installs SIGTERM/SIGINT handlers that schedule ``graceful_shutdown``.
-
-        Raises:
-            RuntimeError: if no engine context could be built.
-        """
-        self._engine_context = build_async_engine_client_from_engine_args(
-            self.engine_args
-        )
-        if self._engine_context is not None:
-            self.engine_client = await self._engine_context.__aenter__()
-        else:
-            raise RuntimeError("Failed to initialize engine client")
-        runtime = dynamo_context["runtime"]
-        metadata = self.engine_client.nixl_metadata
-        self._metadata_store = NixlMetadataStore("dynamo", runtime)
-        await self._metadata_store.put(metadata.engine_id, metadata)
-        # Set the shutdown state before the handler task exists, since the handler reads it.
-        self.shutdown_requested = False
-        self._shutdown_task = None
-        self.task = asyncio.create_task(self.prefill_queue_handler())
-
-        def prefill_queue_handler_cb(fut):
-            # fut.result() raises CancelledError for a cancelled task, which
-            # `except Exception` does not catch; handle that case explicitly.
-            if fut.cancelled():
-                logger.info("prefill queue handler was cancelled")
-                return
-            try:
-                fut.result()
-                logger.info("prefill queue handler exited successfully")
-            except Exception as e:
-                logger.error(f"[ERROR] prefill queue handler failed: {e!r}")
-                sys.exit(1)
-
-        self.task.add_done_callback(prefill_queue_handler_cb)
-        # Set up signal handler for graceful shutdown
-        # TODO: move to dynamo sdk
-        loop = asyncio.get_running_loop()
-
-        def signal_handler():
-            # A shutdown is already in progress; ignore repeated signals.
-            if self._shutdown_task is not None:
-                return
-            # Schedule the shutdown coroutine instead of calling it directly, and keep
-            # a reference: the event loop only holds weak references to tasks.
-            self._shutdown_task = asyncio.create_task(self.graceful_shutdown(runtime))
-
-        for sig in (signal.SIGTERM, signal.SIGINT):
-            loop.add_signal_handler(sig, signal_handler)
-        logger.info("PrefillWorker initialized")
-    async def graceful_shutdown(self, runtime):
-        """Ask the prefill queue handler to stop, wait for it, then shut down the runtime.
-
-        Args:
-            runtime: object whose ``shutdown()`` is called once the handler task is done.
-        """
-        logger.info("Received shutdown signal, shutting down DistributedRuntime")
-        # Raise the flag polled by prefill_queue_handler(); the handler itself drains
-        # pending work and closes the vLLM engine before it exits.
-        self.shutdown_requested = True
-        # timeout=None: wait for the handler task without a time limit.
-        # NOTE(review): the handler stops the event loop via shutdown_vllm_engine();
-        # confirm the lines below still run after that.
-        await asyncio.wait_for(self.task, timeout=None)
-        # then shut down the distributed runtime
-        runtime.shutdown()
-        logger.info("DistributedRuntime shutdown complete")
-    def shutdown_vllm_engine(self):
-        """Close the engine client and stop the current event loop."""
-        logger.info("Shutting down vllm engine")
-        loop = asyncio.get_event_loop()
-        try:
-            self.engine_client.close()
-            logger.info("PrefillWorker shutdown complete")
-        except Exception as e:
-            # A failing close() is logged only; the loop is stopped regardless.
-            logger.error(f"Error during shutdown: {e}")
-        finally:
-            loop.stop()
-    async def prefill_queue_handler(self):
-        """Pull prefill requests from the queue and run them until shutdown is requested.
-
-        After each dequeue attempt the shutdown flag is checked; once it is set, the
-        handler waits for the engine to finish its pending requests, shuts the engine
-        down and returns.
-        """
-        logger.info("Prefill queue handler entered")
-        nats_server = os.getenv("NATS_SERVER", "nats://localhost:4222")
-        namespace, _ = PrefillWorker.dynamo_address()  # type: ignore
-        stream_name = f"{namespace}_prefill_queue"
-        logger.info(f"Prefill queue: {nats_server}:{stream_name}")
-        self.initialized = True
-        # TODO: integrate prefill_queue to a dynamo endpoint
-        async with PrefillQueue.get_instance(
-            nats_server=nats_server,
-            stream_name=stream_name,
-        ) as prefill_queue:
-            logger.info("prefill queue handler started")
-            while True:
-                # TODO: this might add a small overhead to pull prefill from nats
-                # need to test and check how much overhead it is
-                pending = await prefill_queue.dequeue_prefill_request()
-                if pending is not None:
-                    logger.info(f"Dequeued prefill request: {pending.request_id}")
-                    # Drain the generator; its outputs are not used here.
-                    async for _ in self.generate(pending):
-                        pass
-                if not self.shutdown_requested:
-                    continue
-                logger.info(
-                    "Shutdown requested, checking if engine has any pending prefill sending requests"
-                )
-                # Poll once per second until the engine reports no unfinished requests.
-                while await self.engine_client.has_unfinished_requests():
-                    logger.info(
-                        "Engine has pending prefill sending requests, rechecking in 1 second..."
-                    )
-                    await asyncio.sleep(1)
-                self.shutdown_vllm_engine()
-                break
-    async def generate(self, request: RemotePrefillRequest):
-        """Run ``request`` through the local engine as a one-token prefill.
-        The request's sampling params are modified in place so that exactly
-        one token is produced. The decode engine's nixl metadata is fetched
-        from the metadata store the first time that engine id is seen. Yields
-        ``None`` once per engine output.
-        """
-        params = request.sampling_params
-        params.max_tokens = 1
-        params.min_tokens = 1
-        decode_engine_id = request.engine_id
-        prefill_params = RemotePrefillParams(
-            is_remote_decode=True,
-            decode_block_ids=request.block_ids,
-            decode_engine_id=decode_engine_id,
-            decode_computed_block_ids=request.computed_block_ids,
-        )
-        # TODO check if metadata has changed
-        # and reload - currently only loading once
-        if decode_engine_id not in self._loaded_metadata:
-            nixl_metadata = await self._metadata_store.get(decode_engine_id)
-            await self.engine_client.add_remote_nixl_metadata(nixl_metadata)
-            logger.info(
-                f"Loaded nixl metadata from engine {decode_engine_id} into "
-                f"engine {self.engine_client.nixl_metadata.engine_id}"
-            )
-            self._loaded_metadata.add(decode_engine_id)
-        outputs = self.engine_client.generate(
-            request_id=request.request_id,
-            prompt=TokensPrompt(prompt_token_ids=request.prompt_token_ids),
-            sampling_params=params,
-            remote_prefill_params=prefill_params,
-        )
-        # The engine outputs themselves are discarded; callers only iterate.
-        async for _ in outputs:
-            yield
-    @endpoint()
-    async def mock(self, req: RequestType):
-        """Placeholder endpoint: yield one string that echoes ``req``."""
-        reply = f"mock_response: {req}"
-        yield reply
--- a/examples/llm/components/processor.py
+++ b/examples/llm/components/processor.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import asyncio
-import logging
-import uuid
-from enum import Enum
-from typing import Any, AsyncIterator, Dict, List, Tuple, Union
-from components.kv_router import Router
-from components.worker import VllmWorker
-from transformers import AutoTokenizer
-from utils.chat_processor import ChatProcessor, CompletionsProcessor, ProcessMixIn
-from utils.check_worker import check_required_workers
-from utils.protocol import LocalBlockHashes, MyRequestOutput, vLLMGenerateRequest
-from utils.vllm import RouterType, parse_vllm_args
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
-from vllm.outputs import RequestOutput
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-from dynamo.llm import KvMetricsAggregator, compute_block_hash_for_seq_py
-from dynamo.runtime import EtcdKvCache
-from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
-logger = logging.getLogger(__name__)
-class RequestType(Enum):
-    """Request flavours the processor tells apart when formatting output."""
-    # Chat-completion request.
-    CHAT = "chat"
-    # Plain text-completion request.
-    COMPLETION = "completion"
-@service(
-    dynamo={
-        "namespace": "dynamo",
-    },
-    resources={"cpu": "10", "memory": "20Gi"},
-    workers=1,
-)
-class Processor(ProcessMixIn):
-    """
-    vLLM pre and post processing
-    Requests are placed on an asyncio queue and picked up by a pool of
-    background tasks that parse them, choose a worker according to the router
-    mode read from etcd, and stream the engine output back to the caller.
-    """
-    worker = depends(VllmWorker)
-    router = depends(Router)
-    def __init__(self):
-        """Parse the engine args and build the tokenizer, processors and queue."""
-        class_name = self.__class__.__name__
-        self.engine_args = parse_vllm_args(class_name, "")
-        self.model_config = self.engine_args.create_model_config()
-        self.default_sampling_params = self.model_config.get_diff_sampling_param()
-        self.tokenizer = self._create_tokenizer(self.engine_args)
-        self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
-        self.completions_processor = CompletionsProcessor(
-            self.tokenizer, self.model_config
-        )
-        self.min_workers = 1
-        # Requests wait here until one of the queue worker tasks picks them up.
-        self.request_queue: asyncio.Queue[Dict[str, Any]] = asyncio.Queue()
-        # request_id -> future resolved with the async generator of responses.
-        self.request_futures: Dict[str, asyncio.Future] = {}
-        self.num_worker_tasks = (
-            self.engine_args.router_num_threads
-        )  # Number of worker tasks to process the queue
-        self.worker_tasks: List[asyncio.Task] = []
-        print(f"Processor init: {self.engine_args.router}")
-    def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
-        """Load the tokenizer for ``engine_args.model`` via ``AutoTokenizer``.
-        Uses left padding/truncation, the fast tokenizer and
-        ``trust_remote_code=True``.
-        """
-        model_path = engine_args.model
-        # Create the base tokenizer with VLLM's typical settings
-        base_tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            trust_remote_code=True,
-            padding_side="left",
-            truncation_side="left",
-            use_fast=True,  # VLLM might use the fast tokenizer for efficiency
-        )
-        return base_tokenizer
-    @async_on_start
-    async def async_init(self):
-        """Create the runtime clients, the metrics aggregator and the queue workers.
-        A router client is only created when the configured router mode is one
-        of the KV-based modes.
-        """
-        runtime = dynamo_context["runtime"]
-        comp_ns, comp_name = VllmWorker.dynamo_address()  # type: ignore
-        self.worker_client = (
-            await runtime.namespace(comp_ns)
-            .component(comp_name)
-            .endpoint("generate")
-            .client()
-        )
-        self.use_router = self.engine_args.router in (
-            RouterType.KV,
-            RouterType.KV_LOAD,
-            RouterType.APPROX_KV,
-        )
-        if self.use_router:
-            router_ns, router_name = Router.dynamo_address()  # type: ignore
-            self.router_client = (
-                await runtime.namespace(router_ns)
-                .component(router_name)
-                .endpoint("generate")
-                .client()
-            )
-        await check_required_workers(self.worker_client, self.min_workers)
-        kv_listener = runtime.namespace("dynamo").component("VllmWorker")
-        await kv_listener.create_service()
-        self.metrics_aggregator = KvMetricsAggregator(kv_listener)
-        # presumably the dict seeds the cache's initial values -- TODO confirm
-        self.etcd_kv_cache = await EtcdKvCache.create(
-            runtime.etcd_client(),
-            f"/{comp_ns}/processor/",
-            {"router": self.engine_args.router},
-        )
-        # Start multiple worker tasks to process the queue
-        self._start_worker_tasks()
-    def _start_worker_tasks(self):
-        """Start multiple worker tasks to process the queue concurrently"""
-        # Clear any existing worker tasks
-        for task in self.worker_tasks:
-            if not task.done():
-                task.cancel()
-        self.worker_tasks = []
-        # Create new worker tasks
-        for i in range(self.num_worker_tasks):
-            task = asyncio.create_task(self._process_queue(worker_id=i))
-            self.worker_tasks.append(task)
-        logger.info(f"Started {self.num_worker_tasks} queue worker tasks")
-    async def _process_queue(self, worker_id: int):
-        """Background task to process the request queue
-        Runs until cancelled. Errors from a single request are logged and do
-        not stop the worker.
-        """
-        logger.info(f"Queue worker {worker_id} started")
-        while True:
-            try:
-                # Get the next request from the queue
-                request_data = await self.request_queue.get()
-                # Process the request
-                try:
-                    await self._process_request(request_data)
-                except Exception as e:
-                    logger.error(f"Worker {worker_id}: Error processing request: {e}")
-                finally:
-                    # Mark the task as done
-                    self.request_queue.task_done()
-            except asyncio.CancelledError:
-                logger.info(f"Queue worker {worker_id} was cancelled")
-                break
-            except Exception as e:
-                logger.error(
-                    f"Worker {worker_id}: Unexpected error in queue processing: {e}"
-                )
-                # Sleep briefly to avoid tight error loops
-                await asyncio.sleep(0.1)
-    async def _get_kv_load(self):
-        """Return ``{worker_id: gpu_cache_usage_perc}`` from the aggregated metrics.
-        Workers whose metrics lack the attribute are reported as ``0.0``.
-        """
-        metrics = await self.metrics_aggregator.get_metrics()
-        kv_load = {}
-        for end_point in metrics.endpoints:
-            worker_id = end_point.worker_id
-            kv_load[worker_id] = getattr(end_point, "gpu_cache_usage_perc", 0.0)
-        return kv_load
-    async def _get_pending_requests(self):
-        """Return ``{worker_id: num_requests_waiting}`` from the aggregated metrics.
-        Workers whose metrics lack the attribute are reported as ``0``.
-        """
-        metrics = await self.metrics_aggregator.get_metrics()
-        pending_requests = {}
-        for end_point in metrics.endpoints:
-            worker_id = end_point.worker_id
-            # Read from the loop variable; the module-level ``endpoint`` name is
-            # the SDK decorator and never carries this attribute.
-            pending_requests[worker_id] = getattr(
-                end_point, "num_requests_waiting", 0
-            )
-        return pending_requests
-    async def _generate(
-        self,
-        raw_request: Union[CompletionRequest, ChatCompletionRequest],
-        request_type: RequestType,
-    ):
-        """Queue ``raw_request`` and stream back the responses produced for it.
-        A future is registered under a fresh request id; a queue worker
-        resolves it with an async generator, which is then relayed to the
-        caller. The future is unregistered when the stream ends or fails.
-        """
-        request_id = str(uuid.uuid4())
-        logger.debug(f"Got raw request: {raw_request}")
-        # Create a future for this request
-        future: asyncio.Future[AsyncIterator[Any]] = asyncio.Future()
-        self.request_futures[request_id] = future
-        # Enqueue the request with minimal processing
-        await self.request_queue.put(
-            {
-                "request_id": request_id,
-                "raw_request": raw_request,
-                "request_type": request_type,
-            }
-        )
-        try:
-            # Wait for the future to complete and yield the results
-            generator = await future
-            async for response in generator:
-                yield response
-        finally:
-            # Clean up the future when done
-            if request_id in self.request_futures:
-                del self.request_futures[request_id]
-    async def _process_request(self, request_data: Dict[str, Any]):
-        """Process a single request from the queue
-        Parses the raw request and resolves the request's future with an async
-        generator that routes the request and streams the responses. On
-        failure the exception is set on the future instead.
-        """
-        request_id = request_data["request_id"]
-        raw_request = request_data["raw_request"]
-        request_type = request_data["request_type"]
-        try:
-            # Parse the raw request here instead of in _generate
-            (
-                request,
-                conversation,
-                prompt,
-                engine_prompt,
-                sampling_params,
-            ) = await self._parse_raw_request(raw_request)
-            # Create an async generator function to process this request
-            async def process_and_stream():
-                # TODO: queue request at processor when engines are full
-                # The router mode is re-read from etcd for every request.
-                router_mode = (await self.etcd_kv_cache.get("router")).decode()
-                self.use_router = router_mode in (
-                    RouterType.KV,
-                    RouterType.KV_LOAD,
-                    RouterType.APPROX_KV,
-                )
-                prefix_hit_rate = 0.0  # Default value
-                if self.use_router:
-                    # NOTE(review): self.router_client only exists when the
-                    # startup router mode was KV-based -- confirm that the mode
-                    # cannot switch to a KV mode at runtime.
-                    token_ids = engine_prompt["prompt_token_ids"]
-                    router_generator = await self.router_client.generate(
-                        LocalBlockHashes(
-                            hashes=compute_block_hash_for_seq_py(
-                                token_ids, self.engine_args.block_size
-                            ),
-                            tokens=token_ids,
-                            num_tokens=len(token_ids),
-                        ).model_dump_json()
-                    )
-                    decision = await router_generator.__anext__()
-                    worker_id, prefix_hit_rate = decision.data()
-                    prefix_hit_rate = float(prefix_hit_rate)
-                # Create request object once with default prefix_hit_rate
-                request_obj = vLLMGenerateRequest(
-                    engine_prompt=engine_prompt,
-                    sampling_params=sampling_params,
-                    request_id=request_id,
-                    prefix_hit_rate=prefix_hit_rate,
-                ).model_dump_json()
-                if self.use_router:
-                    # An empty worker id means the router made no specific pick.
-                    if worker_id == "":
-                        engine_generator = await self.worker_client.generate(
-                            request_obj
-                        )
-                    else:
-                        engine_generator = await self.worker_client.direct(
-                            request_obj, int(worker_id)
-                        )
-                elif router_mode == RouterType.RANDOM:
-                    engine_generator = await self.worker_client.generate(request_obj)
-                elif router_mode == RouterType.ROUND_ROBIN:
-                    engine_generator = await self.worker_client.round_robin(request_obj)
-                else:
-                    # Fail with a clear message instead of an UnboundLocalError
-                    # on engine_generator below.
-                    raise ValueError(f"Unsupported router mode: {router_mode}")
-                output_generator = self._generate_responses(
-                    engine_generator, request_type
-                )
-                # Stream responses directly to the caller
-                async for response in await self._stream_response(
-                    request, output_generator, request_id, conversation
-                ):
-                    yield response
-            # Set the future result to our async generator
-            if request_id in self.request_futures:
-                self.request_futures[request_id].set_result(process_and_stream())
-        except Exception as e:
-            logger.error(f"Error processing request {request_id}: {e}")
-            # Set exception on the future if it still exists
-            if (
-                request_id in self.request_futures
-                and not self.request_futures[request_id].done()
-            ):
-                self.request_futures[request_id].set_exception(e)
-    async def _generate_responses(
-        self, engine_generator: AsyncIterator[RequestOutput], request_type: RequestType
-    ) -> AsyncIterator[Union[RequestOutput, Tuple[int, RequestOutput]]]:
-        """Rebuild vLLM ``RequestOutput`` objects from the engine's JSON stream.
-        Chat requests yield the ``RequestOutput`` itself; completion requests
-        yield ``(prompt_idx, RequestOutput)``, where ``prompt_idx`` is always 0.
-        Raises:
-            NotImplementedError: for any other request type.
-        """
-        prompt_idx = 0
-        async for resp in engine_generator:
-            # Deserialize the response from the engine
-            # Creates correct vLLM objects for each field
-            output = MyRequestOutput.model_validate_json(resp.data())
-            # OpenAIServingChat.chat_completion_stream_generator() method expects a RequestOutput object
-            request_output = RequestOutput(
-                request_id=output.request_id,
-                prompt=output.prompt,
-                prompt_token_ids=output.prompt_token_ids,
-                prompt_logprobs=output.prompt_logprobs,
-                outputs=output.outputs,
-                finished=output.finished,
-                metrics=output.metrics,
-            )
-            if request_type == RequestType.CHAT:
-                # For chat requests, yield the request_output directly.
-                yield request_output
-            elif request_type == RequestType.COMPLETION:
-                # Completion requests can have multiple prompts and stream generator requires the prompt index
-                yield (prompt_idx, request_output)
-            else:
-                raise NotImplementedError(
-                    f"Request type {request_type} not implemented"
-                )
-    @endpoint(name="chat/completions")
-    async def chat_completions(self, raw_request: ChatCompletionRequest):
-        """Chat completions endpoint: stream the responses for ``raw_request``."""
-        async for response in self._generate(raw_request, RequestType.CHAT):
-            yield response
-    # @endpoint()
-    # async def completions(self, raw_request: CompletionRequest):
-    #     async for response in self._generate(raw_request, RequestType.COMPLETION):
-    #         yield response
--- a/examples/llm/components/worker.py
+++ b/examples/llm/components/worker.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import asyncio
-import logging
-import os
-import signal
-from components.disagg_router import PyDisaggregatedRouter
-from components.prefill_worker import PrefillWorker
-from utils.nixl import NixlMetadataStore
-from utils.prefill_queue import PrefillQueue
-from utils.protocol import MyRequestOutput, vLLMGenerateRequest
-from utils.vllm import RouterType, parse_vllm_args
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args,
-)
-from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
-from vllm.sampling_params import RequestOutputKind
-from dynamo.llm import ForwardPassMetrics, KvStats, WorkerMetricsPublisher, WorkerStats
-from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
-logger = logging.getLogger(__name__)
-@service(
-    dynamo={
-        "namespace": "dynamo",
-    },
-    resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
-    workers=1,
-)
-class VllmWorker:
-    """Worker that serves ``generate`` requests from a vLLM engine client.
-    Depending on the engine args, prefill runs locally or is handed to a
-    remote prefill worker through the prefill queue.
-    """
-    prefill_worker = depends(PrefillWorker)
-    def __init__(self):
-        """Parse the engine args and adjust them for remote prefill / KV routing."""
-        self.client = None
-        self.disaggregated_router: PyDisaggregatedRouter = None  # type: ignore
-        class_name = self.__class__.__name__
-        self.engine_args = parse_vllm_args(class_name, "")
-        self.do_remote_prefill = self.engine_args.remote_prefill
-        self._prefill_queue_nats_server = os.getenv(
-            "NATS_SERVER", "nats://localhost:4222"
-        )
-        self.namespace, _ = VllmWorker.dynamo_address()  # type: ignore
-        self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"
-        logger.info(
-            f"Prefill queue: {self._prefill_queue_nats_server}:{self._prefill_queue_stream_name}"
-        )
-        # Remote prefill forces these engine settings.
-        if self.engine_args.remote_prefill:
-            if self.engine_args.enable_chunked_prefill is not False:
-                logger.info("Chunked prefill is not supported yet, setting to False")
-                self.engine_args.enable_chunked_prefill = False
-            if self.engine_args.preemption_mode != "swap":
-                logger.info("Preemption mode is not supported yet, setting to swap")
-                self.engine_args.preemption_mode = "swap"
-            if self.engine_args.pipeline_parallel_size != 1:
-                logger.info("Pipeline parallel size is not supported yet, setting to 1")
-                self.engine_args.pipeline_parallel_size = 1
-        if self.engine_args.router in (RouterType.KV, RouterType.APPROX_KV):
-            if not self.engine_args.enable_prefix_caching:
-                logger.info(
-                    "When using KV router, prefix caching must be enabled, setting to True"
-                )
-                self.engine_args.enable_prefix_caching = True
-            # Export the worker identity through the environment.
-            VLLM_WORKER_ID = dynamo_context["endpoints"][0].lease_id()
-            os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
-            os.environ["VLLM_KV_NAMESPACE"] = "dynamo"
-            os.environ["VLLM_KV_COMPONENT"] = class_name
-        self.metrics_publisher = WorkerMetricsPublisher()
-        # NOTE(review): async_init later installs loop-level handlers for the
-        # same signals, which presumably supersede these -- confirm.
-        signal.signal(signal.SIGTERM, self.shutdown_vllm_engine)
-        signal.signal(signal.SIGINT, self.shutdown_vllm_engine)
-    @async_on_start
-    async def async_init(self):
-        """Start the engine client, publish initial metrics and wire up shutdown.
-        Raises:
-            RuntimeError: if no engine context could be built.
-        """
-        self._engine_context = build_async_engine_client_from_engine_args(
-            self.engine_args
-        )
-        if self._engine_context is not None:
-            self.engine_client = await self._engine_context.__aenter__()
-        else:
-            raise RuntimeError("Failed to initialize engine client")
-        self.engine_client.set_metrics_publisher(self.metrics_publisher)
-        # Initially send dummy metrics to kick start,
-        # vLLM will not update stat until forward pass is triggered
-        worker_stats = WorkerStats(
-            0,  # request_active_slots
-            1024,  # request_total_slots
-            0,  # num_requests_waiting
-            None,  # data_parallel_rank
-        )
-        kv_stats = KvStats(
-            0,  # kv_active_blocks
-            1024,  # kv_total_blocks
-            0.0,  # gpu_cache_usage_perc
-            0.0,  # gpu_prefix_cache_hit_rate
-        )
-        metrics = ForwardPassMetrics(
-            worker_stats=worker_stats,
-            kv_stats=kv_stats,
-            spec_decode_stats=None,
-        )
-        self.metrics_publisher.publish(metrics)
-        # Keep a strong reference: the event loop only holds weak references to
-        # tasks, so an unreferenced task may be garbage collected mid-flight.
-        self._metrics_endpoint_task = asyncio.create_task(
-            self.create_metrics_publisher_endpoint()
-        )
-        self._metrics_endpoint_task.add_done_callback(self._on_metrics_endpoint_done)
-        runtime = dynamo_context["runtime"]
-        if self.engine_args.remote_prefill:
-            # Publish this engine's nixl metadata to the metadata store.
-            metadata = self.engine_client.nixl_metadata
-            metadata_store = NixlMetadataStore("dynamo", runtime)
-            await metadata_store.put(metadata.engine_id, metadata)
-        if self.engine_args.conditional_disagg:
-            self.disaggregated_router = PyDisaggregatedRouter(
-                runtime,
-                self.namespace,
-                max_local_prefill_length=self.engine_args.max_local_prefill_length,
-                max_prefill_queue_size=self.engine_args.max_prefill_queue_size,
-            )
-            await self.disaggregated_router.async_init()
-        else:
-            self.disaggregated_router = None
-        # Set up signal handler for graceful shutdown
-        # TODO: move to dynamo sdk
-        loop = asyncio.get_running_loop()
-        def signal_handler():
-            # Schedule the shutdown coroutine instead of calling it directly;
-            # the task is stored so it cannot be garbage collected early.
-            self._shutdown_task = asyncio.create_task(self.graceful_shutdown(runtime))
-        for sig in (signal.SIGTERM, signal.SIGINT):
-            loop.add_signal_handler(sig, signal_handler)
-        logger.info("VllmWorker has been initialized")
-    def _on_metrics_endpoint_done(self, task):
-        """Log how the metrics publisher endpoint task ended."""
-        # cancelled() must be checked first: exception() raises on a cancelled task.
-        if task.cancelled():
-            logger.warning("metrics publisher endpoint task was cancelled")
-        elif task.exception() is not None:
-            logger.error(f"metrics publisher endpoint failed: {task.exception()}")
-        else:
-            logger.info("metrics publisher endpoint created")
-    async def graceful_shutdown(self, runtime):
-        """Shut down the distributed runtime in response to a signal."""
-        logger.info("Received shutdown signal, shutting down DistributedRuntime")
-        runtime.shutdown()
-        logger.info("DistributedRuntime shutdown complete")
-    def shutdown_vllm_engine(self, signum, frame):
-        """Signal handler: close the engine client and stop the event loop.
-        A failure while closing is logged; the loop is stopped regardless.
-        """
-        logger.info(f"Received signal {signum}, shutting down")
-        loop = asyncio.get_event_loop()
-        try:
-            self.engine_client.close()
-            logger.info("VllmWorker shutdown complete")
-        except Exception as e:
-            logger.error(f"Error during shutdown: {e}")
-        finally:
-            loop.stop()
-    async def create_metrics_publisher_endpoint(self):
-        """Create the metrics publisher endpoint on this worker's component."""
-        component = dynamo_context["component"]
-        logger.info("Creating metrics publisher endpoint with primary lease")
-        await self.metrics_publisher.create_endpoint(component)
-    def get_remote_prefill_request_callback(self):
-        """Return a coroutine function that enqueues a remote prefill request."""
-        # TODO: integrate prefill_queue to dynamo endpoint
-        async def callback(request: RemotePrefillRequest):
-            async with PrefillQueue.get_instance(
-                nats_server=self._prefill_queue_nats_server,
-                stream_name=self._prefill_queue_stream_name,
-            ) as prefill_queue:
-                await prefill_queue.enqueue_prefill_request(request)
-        return callback
-    # TODO: use the same child lease for metrics publisher endpoint and generate endpoint
-    @endpoint()
-    async def generate(self, request: vLLMGenerateRequest):
-        """Generate for ``request``, prefilling remotely or locally.
-        Prefill is remote only when remote prefill is enabled and the
-        disaggregated router (if one is configured) agrees. Each engine output
-        is yielded as a JSON-serialised ``MyRequestOutput``.
-        """
-        # TODO: consider prefix hit when deciding prefill locally or remotely
-        if self.disaggregated_router is not None:
-            async with PrefillQueue.get_instance(
-                nats_server=self._prefill_queue_nats_server,
-                stream_name=self._prefill_queue_stream_name,
-            ) as prefill_queue:
-                prefill_queue_size = await prefill_queue.get_queue_size()
-            disagg_router_decision = await self.disaggregated_router.prefill_remote(
-                len(request.engine_prompt["prompt_token_ids"]),
-                request.prefix_hit_rate,
-                prefill_queue_size,
-            )
-        else:
-            # always prefill remotely if no disaggregated router is provided
-            disagg_router_decision = True
-        if self.do_remote_prefill and disagg_router_decision:
-            remote_prefill_params = RemotePrefillParams(
-                is_remote_prefill=True,
-                remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
-            )
-            logger.info(
-                f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
-            )
-        else:
-            remote_prefill_params = None
-            logger.info(
-                f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
-            )
-        # rust HTTP requires Delta streaming
-        request.sampling_params.output_kind = RequestOutputKind.DELTA
-        async for response in self.engine_client.generate(
-            prompt=request.engine_prompt,
-            sampling_params=request.sampling_params,
-            request_id=request.request_id,
-            remote_prefill_params=remote_prefill_params,
-        ):
-            yield MyRequestOutput(
-                request_id=response.request_id,
-                prompt=response.prompt,
-                prompt_token_ids=response.prompt_token_ids,
-                prompt_logprobs=response.prompt_logprobs,
-                outputs=response.outputs,
-                finished=response.finished,
-            ).model_dump_json()
--- a/examples/llm/configs/agg.yaml
+++ b/examples/llm/configs/agg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-Common:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  block-size: 64
-  max-model-len: 16384
-Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-Processor:
-  router: round-robin
-  router-num-threads: 4
-  common-configs: [model, block-size, max-model-len]
-VllmWorker:
-  enforce-eager: true
-  max-num-batched-tokens: 16384
-  enable-prefix-caching: true
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '1'
-  common-configs: [model, block-size, max-model-len]
-Planner:
-  environment: local
-  no-operation: true
\ No newline at end of file
--- a/examples/llm/configs/agg_router.yaml
+++ b/examples/llm/configs/agg_router.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-Common:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  router: kv
-  block-size: 64
-  max-model-len: 16384
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-Processor:
-  common-configs: [model, block-size, max-model-len, router]
-Router:
-  min-workers: 1
-  softmax-sample: true
-  common-configs: [model, block-size, router]
-VllmWorker:
-  enforce-eager: true
-  max-num-batched-tokens: 16384
-  enable-prefix-caching: true
-  tensor-parallel-size: 1
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '1'
-  common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
-Planner:
-  environment: local
-  no-operation: true
\ No newline at end of file
--- a/examples/llm/configs/disagg.yaml
+++ b/examples/llm/configs/disagg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-Common:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  block-size: 64
-  max-model-len: 16384
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-Processor:
-  router: round-robin
-  common-configs: [model, block-size]
-VllmWorker:
-  remote-prefill: true
-  conditional-disagg: true
-  max-local-prefill-length: 10
-  max-prefill-queue-size: 2
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '1'
-  common-configs: [model, block-size, max-model-len, kv-transfer-config]
-PrefillWorker:
-  max-num-batched-tokens: 16384
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '1'
-  common-configs: [model, block-size, max-model-len, kv-transfer-config]
-Planner:
-  environment: local
-  no-operation: true
--- a/examples/llm/configs/disagg_router.yaml
+++ b/examples/llm/configs/disagg_router.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-Common:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  block-size: 64
-  max-model-len: 16384
-  router: kv
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-Processor:
-  common-configs: [model, block-size, max-model-len, router]
-Router:
-  min-workers: 1
-  common-configs: [model, block-size, router]
-VllmWorker:
-  max-num-batched-tokens: 16384
-  remote-prefill: true
-  conditional-disagg: true
-  max-local-prefill-length: 10
-  max-prefill-queue-size: 2
-  tensor-parallel-size: 1
-  enable-prefix-caching: true
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '1'
-  common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
-PrefillWorker:
-  max-num-batched-tokens: 16384
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '1'
-  common-configs: [model, block-size, max-model-len, kv-transfer-config]
-Planner:
-  environment: local
-  no-operation: true
\ No newline at end of file
--- a/examples/llm/configs/multinode-405b.yaml
+++ b/examples/llm/configs/multinode-405b.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This configuration file is used in the multinode-examples.md file
-# to start the 405B model on 3 nodes.
-Frontend:
-  served_model_name: nvidia/Llama-3.1-405B-Instruct-FP8
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-Processor:
-  model: nvidia/Llama-3.1-405B-Instruct-FP8
-  block-size: 64
-  max-model-len: 8192
-  router: kv
-Router:
-  model: nvidia/Llama-3.1-405B-Instruct-FP8
-  min-workers: 1
-VllmWorker:
-  model: nvidia/Llama-3.1-405B-Instruct-FP8
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  block-size: 64
-  max-model-len: 8192
-  max-num-seqs: 16
-  remote-prefill: true
-  conditional-disagg: true
-  max-local-prefill-length: 10
-  max-prefill-queue-size: 2
-  gpu-memory-utilization: 0.95
-  tensor-parallel-size: 8
-  router: kv
-  quantization: modelopt
-  enable-prefix-caching: true
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '8'
-PrefillWorker:
-  model: nvidia/Llama-3.1-405B-Instruct-FP8
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  block-size: 64
-  max-model-len: 8192
-  max-num-seqs: 16
-  gpu-memory-utilization: 0.95
-  tensor-parallel-size: 8
-  quantization: modelopt
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '8'
\ No newline at end of file
--- a/examples/llm/configs/multinode_agg_r1.yaml
+++ b/examples/llm/configs/multinode_agg_r1.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-Common:
-  model: deepseek-ai/DeepSeek-R1
-  block-size: 64
-  max-model-len: 16384
-Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-Processor:
-  router: round-robin
-  common-configs: [model, block-size, max-model-len]
-VllmWorker:
-  enforce-eager: true
-  max-num-batched-tokens: 16384
-  enable-prefix-caching: true
-  router: random
-  tensor-parallel-size: 16
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '1'
-  common-configs: [model, block-size, max-model-len]
--- a/examples/llm/configs/mutinode_disagg_r1.yaml
+++ b/examples/llm/configs/mutinode_disagg_r1.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-Common:
-  model: deepseek-ai/DeepSeek-R1
-  block-size: 64
-  max-model-len: 16384
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  tensor-parallel-size: 16
-  disable-log-requests: true
-Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-Processor:
-  router: round-robin
-  common-configs: [model, block-size]
-VllmWorker:
-  remote-prefill: true
-  conditional-disagg: false
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '16'
-  common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size, disable-log-requests]
-PrefillWorker:
-  max-num-batched-tokens: 16384
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: '16'
-  common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size, disable-log-requests]
--- a/examples/llm/deploy/agg.yaml
+++ b/examples/llm/deploy/agg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: llm-agg
-spec:
-  envs:
-    - name: DYN_DEPLOYMENT_CONFIG
-      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
-  services:
-    Frontend:
-      dynamoNamespace: llm-agg
-      componentType: main
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.agg:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-    Processor:
-      dynamoNamespace: llm-agg
-      componentType: worker
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.agg:Processor
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Processor
-    VllmWorker:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: llm-agg
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.agg:VllmWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmWorker
--- a/examples/llm/deploy/agg_router.yaml
+++ b/examples/llm/deploy/agg_router.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: agg-router
-spec:
-  envs:
-    - name: DYN_DEPLOYMENT_CONFIG
-      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
-  services:
-    Frontend:
-      dynamoNamespace: llm-agg-router
-      componentType: main
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.agg_router:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-    Processor:
-      dynamoNamespace: llm-agg-router
-      componentType: worker
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.agg_router:Processor
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Processor
-    Router:
-      dynamoNamespace: llm-agg-router
-      componentType: worker
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.agg_router:Router
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Router
-    VllmWorker:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: llm-agg-router
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.agg_router:VllmWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmWorker
--- a/examples/llm/deploy/disagg.yaml
+++ b/examples/llm/deploy/disagg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: llm-disagg
-spec:
-  envs:
-    - name: DYN_DEPLOYMENT_CONFIG
-      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
-  services:
-    Frontend:
-      dynamoNamespace: llm-disagg
-      componentType: main
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-    Processor:
-      dynamoNamespace: llm-disagg
-      componentType: worker
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:Processor
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Processor
-    VllmWorker:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: llm-disagg
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:VllmWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmWorker
-    PrefillWorker:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: llm-disagg
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:PrefillWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - PrefillWorker
--- a/examples/llm/deploy/disagg_router.yaml
+++ b/examples/llm/deploy/disagg_router.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: disagg-router
-spec:
-  envs:
-    - name: DYN_DEPLOYMENT_CONFIG
-      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
-  services:
-    Frontend:
-      dynamoNamespace: llm-disagg-router
-      componentType: main
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_router:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-    Processor:
-      dynamoNamespace: llm-disagg-router
-      componentType: worker
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_router:Processor
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Processor
-    Router:
-      dynamoNamespace: llm-disagg-router
-      componentType: worker
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_router:Router
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Router
-    VllmWorker:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: llm-disagg-router
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_router:VllmWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmWorker
-    PrefillWorker:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: llm-disagg-router
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/llm
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_router:PrefillWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - PrefillWorker
--- a/examples/llm/graphs/__init__.py
+++ b/examples/llm/graphs/__init__.py
--- a/examples/llm/graphs/agg.py
+++ b/examples/llm/graphs/agg.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from components.frontend import Frontend
-from components.planner_service import Planner
-from components.processor import Processor
-from components.worker import VllmWorker
-Frontend.link(Processor).link(VllmWorker)
-Frontend.link(Planner)
--- a/examples/llm/graphs/agg_router.py
+++ b/examples/llm/graphs/agg_router.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from components.frontend import Frontend
-from components.kv_router import Router
-from components.planner_service import Planner
-from components.processor import Processor
-from components.worker import VllmWorker
-Frontend.link(Processor).link(Router).link(VllmWorker)
-Frontend.link(Planner)
--- a/examples/llm/graphs/disagg.py
+++ b/examples/llm/graphs/disagg.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from components.frontend import Frontend
-from components.planner_service import Planner
-from components.prefill_worker import PrefillWorker
-from components.processor import Processor
-from components.worker import VllmWorker
-Frontend.link(Processor).link(VllmWorker).link(PrefillWorker)
-Frontend.link(Planner)