refactor: remove old examples with old UX (#1899)

f00d700e · Alec · GitHub · c7080419 · c7080419 · c7080419
Unverified Commit f00d700e authored Jul 14, 2025 by Alec Committed by GitHub Jul 14, 2025
20 changed files
--- a/examples/hello_world/multinode_example/components/utils.py
+++ b/examples/hello_world/multinode_example/components/utils.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import asyncio
-import logging
-
-from pydantic import BaseModel
-
-from dynamo._core import Client
-
-logger = logging.getLogger(__name__)
-
-
-class GeneralRequest(BaseModel):
-    prompt: str = "user input"
-    request_id: str = "id_string"
-
-
-class GeneralResponse(BaseModel):
-    worker_output: str = "generated output"
-    request_id: str = "id_string"
-
-
-async def check_required_workers(
-    workers_client: Client,
-    required_workers: int,
-    on_change=True,
-    poll_interval=5,
-    tag="",
-):
-    """Wait until the minimum number of workers are ready."""
-    worker_ids = workers_client.endpoint_ids()
-    num_workers = len(worker_ids)
-    new_count = -1  # Force to log "waiting for worker" once
-    while num_workers < required_workers:
-        if (not on_change) or new_count != num_workers:
-            num_workers = new_count if new_count >= 0 else num_workers
-            logger.info(
-                f" {tag} Waiting for more workers to be ready.\n"
-                f" Current: {num_workers},"
-                f" Required: {required_workers}"
-            )
-        await asyncio.sleep(poll_interval)
-        worker_ids = workers_client.endpoint_ids()
-        new_count = len(worker_ids)
-
-    return worker_ids
--- a/examples/hello_world/multinode_example/components/worker.py
+++ b/examples/hello_world/multinode_example/components/worker.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import socket
-
-from components.utils import GeneralRequest, GeneralResponse
-
-from dynamo.sdk import DYNAMO_IMAGE, dynamo_endpoint, service
-
-logger = logging.getLogger(__name__)
-
-
-@service(
-    dynamo={
-        "enabled": True,
-        "namespace": "dynamo-demo",
-    },
-    image=DYNAMO_IMAGE,
-    resources={"cpu": "10", "memory": "20Gi"},
-    workers=1,
-)
-class DummyWorker:
-    def __init__(self):
-        self.hostname = socket.gethostname()
-
-    @dynamo_endpoint()
-    async def generate(self, request: GeneralRequest):
-        logger.info(f"{self.hostname}: Worker invoked")
-        yield GeneralResponse(
-            request_id=request.request_id,
-            worker_output=request.prompt + "_GeneratedBy_" + self.hostname,
-        ).model_dump_json()
--- a/examples/hello_world/multinode_example/configs/multi_worker.yaml
+++ b/examples/hello_world/multinode_example/configs/multi_worker.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-Processor:
-  min_worker: 2
-  router: round-robin
--- a/examples/hello_world/multinode_example/configs/one_worker.yaml
+++ b/examples/hello_world/multinode_example/configs/one_worker.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-Processor:
-  min_worker: 1
-  router: random
--- a/examples/llm/README.md
+++ b/examples/llm/README.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# LLM Deployment Examples
-
-This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations.
-
-## Use the Latest Release
-
-We recommend using the latest stable release of dynamo to avoid breaking changes:
-
-[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest)
-
-You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with:
-
-```bash
-git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
-```
-
-## Components
-
-- workers: Prefill and decode workers handle the actual LLM inference
-- router: Handles API requests and routes them to appropriate workers based on the specified strategy
-- frontend: OpenAI-compatible HTTP server that handles incoming requests
-
-## Deployment Architectures
-
-### Aggregated
-Single-instance deployment where both prefill and decode are done by the same worker.
-
-### Disaggregated
-Distributed deployment where prefill and decode are done by separate workers that can scale independently.
-
-```mermaid
-sequenceDiagram
-    participant D as VllmWorker
-    participant Q as PrefillQueue
-    participant P as PrefillWorker
-
-    Note over D: Request is routed to decode
-    D->>D: Decide if prefill should be done locally or remotely
-
-        D->>D: Allocate KV blocks
-        D->>Q: Put RemotePrefillRequest on the queue
-
-        P->>Q: Pull request from the queue
-        P-->>D: Read cached KVs from Decode
-
-        D->>D: Decode other requests
-        P->>P: Run prefill
-        P-->>D: Write prefilled KVs into allocated blocks
-        P->>D: Send completion notification
-        Note over D: Notification received when prefill is done
-        D->>D: Schedule decoding
-```
-
-## Getting Started
-
-1. Choose a deployment architecture based on your requirements
-2. Configure the components as needed
-3. Deploy using the provided scripts
-
-### Prerequisites
-
-Start required services (etcd and NATS) using [Docker Compose](../../deploy/metrics/docker-compose.yml)
-```bash
-docker compose -f deploy/metrics/docker-compose.yml up -d
-```
-
-### Build docker
-
-```bash
-# On an x86 machine
-./container/build.sh --framework vllm
-
-# On an ARM machine (ex: GB200)
-./container/build.sh --framework vllm --platform linux/arm64
-```
-
-> [!NOTE]
-> Building a vLLM docker image for ARM machines currently involves building vLLM from source,
-> which has known issues with being slow and requiring a lot of system RAM:
-> https://github.com/vllm-project/vllm/issues/8878
->
-> You can tune the number of parallel build jobs for building VLLM from source
-> on ARM based on your available cores and system RAM with `VLLM_MAX_JOBS`.
->
-> For example, on an ARM machine with low system resources:
-> `./container/build.sh --framework vllm --platform linux/arm64 --build-arg VLLM_MAX_JOBS=2`
->
-> For example, on a GB200, which has a very high CPU core count and ample memory:
-> `./container/build.sh --framework vllm --platform linux/arm64 --build-arg VLLM_MAX_JOBS=64`
->
-> When vLLM has pre-built ARM wheels published, this process can be improved.
-
-### Run container
-
-```
-./container/run.sh -it --framework vllm
-```
-
-## Run Deployment
-
-This figure shows an overview of the major components to deploy:
-
-```
-                                                 +----------------+
-                                          +------| prefill worker |-------+
-                                   notify |      |                |       |
-                                 finished |      +----------------+       | pull
-                                          v                               v
-+------+      +-----------+      +------------------+    push     +---------------+
-| HTTP |----->| processor |----->| decode/monolith  |------------>| prefill queue |
-|      |<-----|           |<-----|      worker      |             |               |
-+------+      +-----------+      +------------------+             +---------------+
-                  |    ^                  |
-       query best |    | return           | publish kv events
-           worker |    | worker_id        v
-                  |    |         +------------------+
-                  |    +---------|     kv-router    |
-                  +------------->|                  |
-                                 +------------------+
-
-```
-
-> [!NOTE]
-> The planner component is enabled by default for all deployment architectures but is set to no-op mode. This means the planner observes metrics but doesn't take scaling actions. To enable active scaling, you can add `--Planner.no-operation=false` to your `dynamo serve` command. For more details, see the [Planner documentation](../../components/planner/README.md).
-
-### Example architectures
-_Note_: For a non-dockerized deployment, first export `DYNAMO_HOME` to point to the dynamo repository root, e.g. `export DYNAMO_HOME=$(pwd)`
-
-#### Aggregated serving
-```bash
-cd $DYNAMO_HOME/examples/llm
-dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml
-```
-
-#### Aggregated serving with KV Routing
-```bash
-cd $DYNAMO_HOME/examples/llm
-dynamo serve graphs.agg_router:Frontend -f ./configs/agg_router.yaml
-```
-
-#### Disaggregated serving
-```bash
-cd $DYNAMO_HOME/examples/llm
-dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
-```
-
-#### Disaggregated serving with KV Routing
-```bash
-cd $DYNAMO_HOME/examples/llm
-dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
-```
-
-### Client
-
-In another terminal:
-```bash
-# this test request has around 200 tokens isl
-
-curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"   -d '{
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-    "messages": [
-    {
-        "role": "user",
-        "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
-    }
-    ],
-    "stream":false,
-    "max_tokens": 30
-  }'
-
-```
-
-### Multi-node deployment
-
-See [multinode.md](../../docs/examples/multinode.md) for more details.
-
-### Close deployment
-
-See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section to learn about how to close the deployment.
-
-## Deploy to Kubernetes
-
-These examples can be deployed to a Kubernetes cluster using [Dynamo Cloud](../../docs/guides/dynamo_deploy/dynamo_cloud.md) and the Dynamo CLI.
-
-### Prerequisites
-
-You must have first followed the instructions in [deploy/cloud/helm/README.md](../../deploy/cloud/helm/README.md) to install Dynamo Cloud on your Kubernetes cluster.
-
-**Note**: The `KUBE_NS` variable in the following steps must match the Kubernetes namespace where you installed Dynamo Cloud. You must also expose the `dynamo-store` service externally. This will be the endpoint the CLI uses to interface with Dynamo Cloud.
-
-### Deployment Steps
-
-For detailed deployment instructions, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). The following are the specific commands for the LLM examples:
-
-```bash
-# Set your project root directory
-export PROJECT_ROOT=$(pwd)
-
-# Configure environment variables (see operator_deployment.md for details)
-export KUBE_NS=dynamo-cloud
-export DYNAMO_CLOUD=http://localhost:8080  # If using port-forward
-# OR
-# export DYNAMO_CLOUD=https://dynamo-cloud.nvidia.com  # If using Ingress/VirtualService
-
-# Build the Dynamo base image (see operator_deployment.md for details)
-export DYNAMO_IMAGE=<your-registry>/<your-image-name>:<your-tag>
-
-# Build the service
-cd $PROJECT_ROOT/examples/llm
-DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" |  awk '{ print $NF }' | sed 's/\.$//')
-
-# Deploy to Kubernetes
-export DEPLOYMENT_NAME=llm-agg
-# TODO: Deploy your service using a DynamoGraphDeployment CR.
-```
-
-**Note**: To avoid rate limiting from unauthenticated requests to HuggingFace (HF), you can provide your `HF_TOKEN` as a secret in your deployment. See the [operator deployment guide](../../docs/guides/dynamo_deploy/operator_deployment.md#referencing-secrets-in-your-deployment) for instructions on referencing secrets like `HF_TOKEN` in your deployment configuration.
-
-**Note**: Optionally add `--Planner.no-operation=false` at the end of the deployment command to enable the planner component to take scaling actions on your deployment.
-
-### Testing the Deployment
-
-Once the deployment is complete, you can test it. If you have ingress available for your deployment, you can directly call the url returned
-in `dynamo deployment get ${DEPLOYMENT_NAME}` and skip the steps to find and forward the frontend pod.
-
-```bash
-# Find your frontend pod
-export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} | grep "${DEPLOYMENT_NAME}-frontend" | sort -k1 | tail -n1 | awk '{print $1}')
-
-# Forward the pod's port to localhost
-kubectl port-forward pod/$FRONTEND_POD 3000:3000 -n ${KUBE_NS}
-
-# Test the API endpoint
-curl localhost:3000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-    "messages": [
-    {
-        "role": "user",
-        "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
-    }
-    ],
-    "stream":false,
-    "max_tokens": 30
-  }'
-```
-
-For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
--- a/examples/llm/__init__.py
+++ b/examples/llm/__init__.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/examples/llm/benchmarks/README.md
+++ b/examples/llm/benchmarks/README.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# LLM Deployment Benchmarking Guide
-
-This guide provides detailed steps on benchmarking Large Language Models (LLMs) in single and multi-node configurations.
-
-> [!NOTE]
-> We recommend trying out the [LLM Deployment Examples](../README.md) before benchmarking.
-
-
-## Prerequisites
-
-> [!Important]
-> At least one 8xH100-80GB node is required for the following instructions.
-
- 1. Build benchmarking image
-
-    ```bash
-    ./container/build.sh
-    ```
-
- 2. Download model
-
-    ```bash
-    huggingface-cli download neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-    ```
-
- 3. Start NATS and ETCD
-
-    ```bash
-    docker compose -f deploy/metrics/docker-compose.yml up -d
-    ```
-
-> [!NOTE]
-> This guide was tested on node(s) with the following hardware configuration:
->
-> * **GPUs**:
->   8xH100-80GB-HBM3 (GPU Memory Bandwidth 3.2 TB/s)
->
-> * **CPU**:
->   2 x Intel Sapphire Rapids, Intel(R) Xeon(R) Platinum 8480CL E5, 112 cores (56 cores per CPU), 2.00 GHz (Base), 3.8 GHz (Max boost), PCIe Gen5
->
-> * **NVLink**:
->   NVLink 4th Generation, 900 GB/s (GPU to GPU NVLink bidirectional bandwidth), 18 Links per GPU
->
-> * **InfiniBand**:
->   8x400Gbit/s (Compute Links), 2x400Gbit/s (Storage Links)
->
-> Benchmarking with a different hardware configuration may yield suboptimal results.
-
-
-## Disaggregated Single Node Benchmarking
-
-> [!Important]
-> One 8xH100-80GB node is required for the following instructions.
-
-In the following setup we compare Dynamo disaggregated vLLM performance to
-[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on a single node. These were chosen to optimize
-for Output Token Throughput (per sec) when both are performing under similar Inter Token Latency (ms).
-For more details on your use case please see the [Performance Tuning Guide](/docs/guides/disagg_perf_tuning.md).
-
-In this setup, we will be using 4 prefill workers and 1 decode worker.
-Each prefill worker will use tensor parallel 1 and the decode worker will use tensor parallel 4.
-
-With the Dynamo repository, benchmarking image and model available, and **NATS and ETCD started**, perform the following steps:
-
- 1. Run benchmarking container
-
-    ```bash
-    ./container/run.sh --mount-workspace
-    ```
-
-> [!Tip]
-> The huggingface home source mount can be changed by setting `--hf-cache ~/.cache/huggingface`.
-
- 2. Start disaggregated services
-
-    ```bash
-    cd /workspace/examples/llm
-    dynamo serve benchmarks.disagg:Frontend -f benchmarks/disagg.yaml 1> disagg.log 2>&1 &
-    ```
-
-> [!Tip]
-> Check the `disagg.log` to make sure the service is fully started before collecting performance numbers.
-
- 3. Collect the performance numbers:
-
- ```bash
- bash -x /workspace/benchmarks/llm/perf.sh --mode disaggregated --deployment-kind dynamo_vllm --prefill-tensor-parallelism 1 --prefill-data-parallelism 4 --decode-tensor-parallelism 4 --decode-data-parallelism 1
- ```
-
- > [!Important]
- > Take care when specifying these options to the `perf.sh` script: they should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. The command above declares that the deployment uses disaggregated serving in Dynamo with the vLLM backend, with 4 prefill workers at TP=1 and 1 decode worker at TP=4.
-
-For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
-
-## Disaggregated Multinode Benchmarking
-
-> [!Important]
-> Two 8xH100-80GB nodes are required for the following instructions.
-
-In the following steps we compare Dynamo disaggregated vLLM performance to
-[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on two nodes. These were chosen to optimize
-for Output Token Throughput (per sec) when both are performing under similar Inter Token Latency (ms).
-For more details on your use case please see the [Performance Tuning Guide](/docs/guides/disagg_perf_tuning.md).
-
-In this setup, we will be using 8 prefill workers and 1 decode worker.
-Each prefill worker will use tensor parallel 1 and the decode worker will use tensor parallel 8.
-
-With the Dynamo repository, benchmarking image and model available, and **NATS and ETCD started on node 0**, perform the following steps:
-
- 1. Run benchmarking container (nodes 0 & 1)
-
-    ```bash
-    ./container/run.sh --mount-workspace
-    ```
-
-> [!Tip]
-> The huggingface home source mount can be changed by setting `--hf-cache ~/.cache/huggingface`.
-
- 2. Config NATS and ETCD (node 1)
-
-    ```bash
-    export NATS_SERVER="nats://<node_0_ip_addr>"
-    export ETCD_ENDPOINTS="<node_0_ip_addr>:2379"
-    ```
-
-> [!Important]
-> Node 1 must be able to reach Node 0 over the network for the above services.
-
- 3. Start workers (node 0)
-
-    ```bash
-    cd /workspace/examples/llm
-    dynamo serve benchmarks.disagg_multinode:Frontend -f benchmarks/disagg_multinode.yaml 1> disagg_multinode.log 2>&1 &
-    ```
-
-> [!Tip]
-> Check the `disagg_multinode.log` to make sure the service is fully started before collecting performance numbers.
-
- 4. Start workers (node 1)
-
-    ```bash
-    cd /workspace/examples/llm
-    dynamo serve components.prefill_worker:PrefillWorker -f benchmarks/disagg_multinode.yaml 1> prefill_multinode.log 2>&1 &
-    ```
-
-> [!Tip]
-> Check the `prefill_multinode.log` to make sure the service is fully started before collecting performance numbers.
-
- 5. Collect the performance numbers:
-
- ```bash
- bash -x /workspace/benchmarks/llm/perf.sh --mode disaggregated --deployment-kind dynamo_vllm --prefill-tensor-parallelism 1 --prefill-data-parallelism 8 --decode-tensor-parallelism 8 --decode-data-parallelism 1
- ```
-
- > [!Important]
- > Take care when specifying these options to the `perf.sh` script: they should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. The command above declares that the deployment uses disaggregated serving in Dynamo with the vLLM backend, with 8 prefill workers at TP=1 and 1 decode worker at TP=8.
-
-For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
-
-
-## vLLM Aggregated Baseline Benchmarking
-
-> [!Important]
-> One (or two) 8xH100-80GB nodes are required for the following instructions.
-
-With the Dynamo repository and the benchmarking image available, perform the following steps:
-
- 1. Run benchmarking container
-
-    ```bash
-    ./container/run.sh --mount-workspace
-    ```
-
-> [!Tip]
-> The Hugging Face home source mount can be changed by setting `--hf-cache ~/.cache/huggingface`.
-
- 2. Start vLLM serve
-
-    ```bash
-    CUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
-      --block-size 128 \
-      --max-model-len 3500 \
-      --max-num-batched-tokens 3500 \
-      --tensor-parallel-size 4 \
-      --gpu-memory-utilization 0.95 \
-      --disable-log-requests \
-      --port 8001 1> vllm_0.log 2>&1 &
-    CUDA_VISIBLE_DEVICES=4,5,6,7 vllm serve neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
-      --block-size 128 \
-      --max-model-len 3500 \
-      --max-num-batched-tokens 3500 \
-      --tensor-parallel-size 4 \
-      --gpu-memory-utilization 0.95 \
-      --disable-log-requests \
-      --port 8002 1> vllm_1.log 2>&1 &
-    ```
-
-> [!Tip]
-> Check the `vllm_0.log` and `vllm_1.log` to make sure the service is fully started before collecting performance numbers.
->
-> If benchmarking with two or more nodes, `--tensor-parallel-size 8` should be used and only run one `vllm serve` instance per node.
-
- 3. Use NGINX as load balancer
-
-    ```bash
-    apt update && apt install -y nginx
-    cp /workspace/benchmarks/llm/nginx.conf /etc/nginx/nginx.conf
-    service nginx restart
-    ```
-
-> [!Note]
-> If benchmarking over 2 nodes, the `upstream` configuration will need to be updated to link to the `vllm serve` on the second node.
-
- 4. Collect the performance numbers:
-
-Single-Node
-
- ```bash
- bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 4 --data-parallelism 2
- ```
-
- Two Nodes
-
- ```bash
- bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 8 --data-parallelism 2
- ```
-
- We could also run the benchmarking script and specify the model, input sequence length, output sequence length, and concurrency levels to target for benchmarking:
-
- ```bash
- bash -x /workspace/benchmarks/llm/perf.sh \
-  --mode aggregated \
-  --deployment-kind vllm_serve \
-  --tensor-parallelism 1 \
-  --data-parallelism 1 \
-  --model neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
-  --input-sequence-length 3000 \
-  --output-sequence-length 150 \
-  --url http://localhost:8000 \
-  --concurrency 1,2,4,8,16,32,64,128,256
-
-  # The `--concurrency` option accepts either a single value (e.g., 64) or a comma-separated list (e.g., 1,2,4,8) to specify multiple concurrency levels for benchmarking.
- ```
-
- > [!Important]
- > Take care when specifying these options to the `perf.sh` script: they should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. The single-node and two-node commands above declare that the deployment uses aggregated serving with `vllm serve`, with 2 workers at TP=4 (or TP=8 for two nodes).
-
-For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
-
-## Collecting Performance Numbers
-
-Currently, there is no consistent way to obtain the configuration of a deployed service, so this information must be provided to the script as command-line arguments. The benchmarking script `/workspace/benchmarks/llm/perf.sh` uses the GenAI-Perf tool to collect performance numbers at a range of request concurrencies. The `perf.sh` script can be run multiple times to collect numbers for different deployments. Each execution creates a new artifacts directory in `artifacts_root` and writes its results there. See [Plotting Pareto Graphs](#plotting-pareto-graphs) to learn how to turn the data in `artifacts_root` into Pareto graphs.
-
-Note: Because every `perf.sh` run adds a new artifacts directory to `artifacts_root`, start each experiment with a clean `artifacts_root` so that it contains only the runs you want to compare.
-
-> [!Tip]
-> See [GenAI-Perf tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md)
-> @ [GitHub](https://github.com/triton-inference-server/perf_analyzer) for additional information about how to run GenAI-Perf
-> and how to interpret results.
-
-
-## Interpreting Results
-
-### Plotting Pareto Graphs
-
-The `artifacts` directory generated by GenAI-Perf contains the raw performance numbers from the benchmarking.
-
-Using the benchmarking image, install the dependencies for plotting Pareto graph
-```bash
-pip3 install matplotlib seaborn
-```
-At the directory where the artifacts are located, plot the Pareto graph
-
-Single-Node:
-
-```bash
-python3 /workspace/benchmarks/llm/plot_pareto.py --artifacts-root-dir artifacts_root
-```
-
-Two Nodes:
-
-```bash
-python3 /workspace/benchmarks/llm/plot_pareto.py --artifacts-root-dir artifacts_root --title "Two Nodes"
-```
-The graph will be saved to the current directory and named `pareto_plot.png`.
-
-### Interpreting Pareto Graphs
-
-The question we want to answer in this comparison is how much Output Token Throughput can be improved by switching from
-aggregated to disaggregated serving when both are performing under similar Inter Token Latency.
-
-Each benchmarked concurrency produces a latency and throughput value pair. The x-axis on the Pareto graph is
-latency (tokens/s/user), where a higher value means lower latency. The y-axis on the Pareto graph is throughput
-(tokens/s/gpu). The latency and throughput value pair forms a dot on the Pareto graph. A line (Pareto Frontier) is
-formed when the dots from different concurrency values are plotted on the graph.
-
-With the Pareto Frontiers of the baseline and the disaggregated results plotted on the graph, we can look for the
-greatest increase in throughput (along the y-axis) between the baseline and the disaggregated result Pareto Frontier,
-over different latencies (along the x-axis).
-
-For example, at 45 tokens/s/user, the increase in tokens/s/gpu is `145 - 80 = 65`, from the orange baseline to the
-blue disaggregated line, so the improvement is around 1.44x speed up:
-![Example Pareto Plot](./example_plots/single_node_pareto_plot.png)
-Note: The above example was collected over a single benchmarking run; the actual numbers may vary between runs, configurations, and hardware.
-
-## Supporting Additional Models
-
-The instructions above can be used for nearly any model desired.
-More complex setup instructions might be required for certain models.
-The above instructions regarding ETCD, NATS, nginx, dynamo-serve, and GenAI-Perf still apply and can be reused.
-The specifics of deploying with different hardware, in a unique environment, or using another model framework can be adapted using the links below.
-
-Regardless of the deployment mechanism, the GenAI-Perf tool will report the same metrics and measurements so long as an accessible endpoint is available for it to interact with. Use the provided [perf.sh](../../../benchmarks/llm/perf.sh) script to automate the measurement of model throughput and latency across multiple request concurrency levels.
-
-### Deployment Examples
-
-- [Dynamo Multinode Deployments](../../../docs/examples/multinode.md)
-- [Dynamo TensorRT LLM Deployments](../../../docs/examples/trtllm.md)
-    - [Aggregated Deployment of Very Large Models](../../../docs/examples/multinode.md#aggregated-deployment)
-- [Dynamo vLLM Deployments](../../../docs/examples/llm_deployment.md)
-
-
-## Monitor Benchmark Startup Status
-
-When running a dynamo deployment, you may have multiple instances of the same worker kind for a particular benchmark run.
-The deployment can process the workflow as long as at least one worker is ready. If the benchmark is started
-as soon as dynamo responds to inference requests, the results at the beginning of the benchmark may therefore be
-inaccurate. In such a case, you may additionally deploy the benchmark watcher to signal whether the full deployment
-is ready. For instance, if you expect the total number of prefill and decode workers to be 10, you can run the command below to start
-the watcher, which will exit if the total number is still less than 10 after the timeout. In addition, the watcher starts
-an HTTP server on port 7001 by default, to which you can send GET requests for readiness when building an external benchmarking workflow.
-
-```bash
-# start your benchmark deployment
-...
-
-# start monitor separately, or it can be part of the deployment above
-dynamo serve --service-name Watcher benchmark_watcher:Watcher --Watcher.total-workers=10 --Watcher.timeout=10
-
-# Send curl request to check liveness
-curl localhost:7001
-127.0.0.1 - - [12/Jun/2025 23:31:52] "GET / HTTP/1.1" 400 -
-...
-curl localhost:7001
-127.0.0.1 - - [12/Jun/2025 23:32:46] "GET / HTTP/1.1" 200 -
-```
-
-## Utility for Setting Up Environment
-
-### vLLM
-- `vllm_multinode_setup.sh` is a helper script to configure the node for dynamo deployment for
-vLLM. Depending on whether the environment variables `HEAD_NODE_IP` and `RAY_LEADER_NODE_IP` are set
-when the script is invoked, it will:
-  - start nats server and etcd on the current node if `HEAD_NODE_IP` is not set; otherwise
-  set the environment variables as expected by dynamo.
-  - connect to the Ray cluster whose head node is `RAY_LEADER_NODE_IP` if it is set; otherwise start
-  the Ray cluster with the current node as the head node.
-  - print the command with `HEAD_NODE_IP` and `RAY_LEADER_NODE_IP` set, which can be used on
-  another node to set up connectivity with the current node.
-
-  ```bash
-  # On node 0
-  source vllm_multinode_setup.sh
-  ... # starting nats server, etcd and ray cluster
-
-  # script print command
-  HEAD_NODE_IP=NODE_0_IP RAY_LEADER_NODE_IP=NODE_0_IP source vllm_multinode_setup.sh
-
-  # On node 1
-  HEAD_NODE_IP=NODE_0_IP RAY_LEADER_NODE_IP=NODE_0_IP source vllm_multinode_setup.sh
-  ... # connecting to Ray cluster
-  ```
-
-## Metrics and Visualization
-
-For instructions on how to acquire per worker metrics and visualize them using Grafana,
-please see the provided [Visualization with Prometheus and Grafana](../../../deploy/metrics/README.md).
-
-## Troubleshooting
-
-When benchmarking disaggregation performance, there can be cases where the latency and
-throughput numbers don't match the expectation within some margin. Below is a list of scenarios
-that have been encountered, with details on observations and resolutions.
-
-### Interconnect Configuration
-
-Even if the nodes have faster interconnect hardware available, there can be misconfiguration such that
-the fastest route may not be selected by NIXL ([example regression](https://github.com/ai-dynamo/dynamo/pull/1314)). NIXL simplifies the interconnect but also hides
-selection detail. Therefore this can be the cause if you observe abnormal TTFT increase when
-splitting prefill workers and decode workers to different nodes. For example, we have seen instances of ~2 second overhead added to TTFT when TCP is selected over RDMA for KV Cache transfer due to a misconfigured environment.
-
-Currently NIXL doesn't provide a utility for reporting which transport is selected. Therefore
-you will need to verify whether that is the cause by using backend-specific debug options.
-In the case of UCX backend, you can use `ucx_info -d` to check if the desired interconnect
-devices are being recognized. At runtime, `UCX_LOG_LEVEL=debug` and `UCX_PROTO_INFO=y`
-can be set as environment variables to provide detailed logs on UCX activities. This will
-reveal whether the desired transport is being used.
-
-### The Full Deployment is Configured Correctly
-
-As benchmarking often focuses on configurations where multiple workers are being used,
-one may mistakenly consider a deployment ready for benchmarking while there are only a
-subset of workers taking requests. For example, in the aggregated baseline benchmarking,
-a user can miss updating the IP address of the other node in the `upstream` section of `nginx.conf`.
-This could lead to only one of the nodes serving requests. In such a case,
-the benchmark can still run to completion, but the result will not reflect the deployment
-capacity, because not all the compute resources are being utilized.
-
-Therefore, it is important to verify that the requests can be routed to all workers before
-performing the benchmark:
-- **Framework-only benchmark** The simplest way is to send sample requests and check
-the logs of all workers. Each framework may provide utilities for readiness checks, so please
-refer to the framework's documentation for those details.
-- **Dynamo based benchmark** Once you start the deployment, you can follow
-the instructions in [monitor benchmark startup status](#monitor-benchmark-startup-status),
-which will periodically poll the workers exposed to specific endpoints
-and return HTTP 200 code when the expected number of workers is met.
--- a/examples/llm/benchmarks/__init__.py
+++ b/examples/llm/benchmarks/__init__.py
--- a/examples/llm/benchmarks/benchmark_watcher.py
+++ b/examples/llm/benchmarks/benchmark_watcher.py
-# type: ignore  # Ignore all mypy errors in this file
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import asyncio
-import logging
-import threading
-import time
-from argparse import Namespace
-from http.server import BaseHTTPRequestHandler, HTTPServer
-
-from dynamo.sdk import async_on_start, dynamo_context, service
-from dynamo.sdk.lib.config import ServiceConfig
-
-logger = logging.getLogger(__name__)
-
-
-def start_server(server):
-    """Run ``server``'s request loop; blocks until the server is shut down."""
-    # Used as a thread target so the blocking loop does not stall the caller.
-    server.serve_forever()
-
-
-class HealthServer(HTTPServer):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.ready = False
-
-    def set_ready(self, ready: bool):
-        self.ready = ready
-
-
-class RequestHandler(BaseHTTPRequestHandler):
-    def do_GET(self):
-        if self.server.ready:
-            self.send_response(200)
-            self.end_headers()
-            self.wfile.write(b"Ready.")
-        else:
-            self.send_response(400)
-            self.end_headers()
-            self.wfile.write(b"Not Ready")
-            return
-
-
-def parse_args(service_name, prefix) -> Namespace:
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--total-workers",
-        type=int,
-        default=1,
-        help="Total number of workers to be registered",
-    )
-    parser.add_argument(
-        "--worker-components",
-        nargs="+",
-        default=["VllmWorker", "PrefillWorker"],
-        help="Components that we are tracking worker readiness",
-    )
-    parser.add_argument(
-        "--component-endpoints",
-        nargs="+",
-        default=["generate", "mock"],
-        help="Components that we are tracking worker readiness",
-    )
-    parser.add_argument(
-        "--timeout",
-        type=int,
-        default=600,
-        help="Timeout (seconds) for waiting for workers to be ready",
-    )
-    parser.add_argument(
-        "--port",
-        type=int,
-        default=7001,
-        help="port for readiness check",
-    )
-    config = ServiceConfig.get_instance()
-    config_args = config.as_args(service_name, prefix=prefix)
-    args = parser.parse_args(config_args)
-    if len(args.worker_components) != len(args.component_endpoints):
-        parser.error(
-            "--worker-components and --component-endpoints must have the same number "
-            f"of items, but got {args.worker_components} and {args.component_endpoints}"
-        )
-    return args
-
-
-# Use dynamo style to have access to clients
-@service(
-    dynamo={
-        "namespace": "dynamo",
-    },
-    resources={"cpu": "1", "memory": "1Gi"},
-    workers=1,
-)
-class Watcher:
-    def __init__(self):
-        self.args = parse_args(self.__class__.__name__, "")
-
-    @async_on_start
-    async def async_init(self):
-        self.runtime = dynamo_context["runtime"]
-        self.workers_clients = []
-        for component, endpoint in zip(
-            self.args.worker_components, self.args.component_endpoints
-        ):
-            self.workers_clients.append(
-                await self.runtime.namespace("dynamo")
-                .component(component)
-                .endpoint(endpoint)
-                .client()
-            )
-            logger.info(f"Component {component}/{endpoint} is registered")
-        logger.info(f"Total number of workers to be waited: {self.args.total_workers}")
-        logger.info(f"Timeout for waiting for workers to be ready: {self.args.timeout}")
-        self.server = HealthServer(("0.0.0.0", self.args.port), RequestHandler)
-        print(f"Serving on 0.0.0.0:{self.args.port}, listening to readiness check...")
-        self._server_thread = threading.Thread(target=start_server, args=(self.server,))
-        self._server_thread.start()
-        await check_required_workers(
-            self.workers_clients, self.args.total_workers, self.args.timeout
-        )
-        self.server.set_ready(True)
-        logger.info("All workers are ready.")
-
-
-async def check_required_workers(
-    workers_clients, required_workers: int, timeout: int, poll_interval=1
-):
-    """Wait until the minimum number of workers are ready."""
-    start_time = time.time()
-    num_workers = 0
-    while num_workers < required_workers and time.time() - start_time < timeout:
-        num_workers = sum(map(lambda wc: len(wc.instance_ids()), workers_clients))
-        if num_workers < required_workers:
-            logger.info(
-                f"Waiting for more workers to be ready.\n"
-                f" Current: {num_workers},"
-                f" Required: {required_workers}"
-            )
-            await asyncio.sleep(poll_interval)
-    if num_workers < required_workers:
-        raise TimeoutError(
-            f"Timed out waiting for {required_workers} workers to be ready."
-        )
--- a/examples/llm/benchmarks/disagg.py
+++ b/examples/llm/benchmarks/disagg.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.prefill_worker import PrefillWorker
-from components.processor import Processor
-from components.worker import VllmWorker
-
-Frontend.link(Processor).link(VllmWorker).link(PrefillWorker)
--- a/examples/llm/benchmarks/disagg.yaml
+++ b/examples/llm/benchmarks/disagg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-Common:
-  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  router: round-robin
-  # Number of tokens in a batch for more efficient chunked transfers to GPUs.
-  block-size: 128
-  max-model-len: 3500
-  max-num-batched-tokens: 3500
-  disable-log-requests: true
-
-Frontend:
-  # This model was chosen for its 70B size and FP8 precision: the TP and DP
-  # configurations were tuned for its size, and its precision reduces model
-  # and KV cache memory usage, easing remote cache transfer.
-  served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-
-Processor:
-  common-configs: [model, router]
-
-# x1 process with 4 GPUs generating output tokens (the "decode" phase).
-VllmWorker:
-  common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
-  # Enable prefill at different workers.
-  remote-prefill: true
-  # Disable local prefill so only disaggregated prefill is used.
-  conditional-disagg: false
-  gpu-memory-utilization: 0.95
-  tensor-parallel-size: 4
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: 4
-
-# x4 processes each with 1 GPU handling the initial prefill (context embedding) phase.
-PrefillWorker:
-  common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, gpu-memory-utilization, disable-log-requests]
-  tensor-parallel-size: 1
-  ServiceArgs:
-    workers: 4
-    resources:
-      gpu: 1
-
-# Automatic prefix caching is disabled by default, since all requests are expected to be unique.
--- a/examples/llm/benchmarks/disagg_multinode.py
+++ b/examples/llm/benchmarks/disagg_multinode.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.kv_router import Router
-from components.processor import Processor
-from components.worker import VllmWorker
-
-Frontend.link(Processor).link(Router).link(VllmWorker)
--- a/examples/llm/benchmarks/disagg_multinode.yaml
+++ b/examples/llm/benchmarks/disagg_multinode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-Common:
-  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  # Routing policy determines how remote workers are selected for processing
-  # prefill requests
-  # 1. random: randomly select workers for prefill requests
-  # 2. round-robin: different prefill requests take similar time to complete so
-  #                 selecting workers in round-robin maximizes the chance of
-  #                 selecting the least busy worker for a request
-  # 3. kv: finding prefill workers by KV cache is not beneficial when caching is
-  #        disabled on this setup
-  router: round-robin
-  # Number of tokens in a batch for more efficient chunked transfers to GPUs.
-  block-size: 128
-  max-model-len: 3500
-  max-num-batched-tokens: 3500
-  disable-log-requests: true
-
-Frontend:
-  served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-
-Processor:
-  common-configs: [model, block-size, max-model-len, router]
-
-Router:
-  common-configs: [model]
-  min-workers: 1
-
-VllmWorker:
-  common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
-  # Enable prefill at different workers.
-  remote-prefill: true
-  # Disable local prefill so only disaggregated prefill is used.
-  conditional-disagg: false
-  # The GPU memory utilization does not have to match between VllmWorker and PrefillWorker.
-  gpu-memory-utilization: 0.95
-  # TP size is doubled from single node setup
-  tensor-parallel-size: 8
-  ServiceArgs:
-    workers: 1
-    resources:
-      gpu: 8
-
-PrefillWorker:
-  common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, disable-log-requests]
-  gpu-memory-utilization: 0.95
-  tensor-parallel-size: 1
-  ServiceArgs:
-    # DP size is doubled from single node setup
-    workers: 8
-    resources:
-      gpu: 1
-
-# Automatic prefix caching is disabled by default, since all requests are expected to be unique.
--- a/examples/llm/benchmarks/example_plots/single_node_pareto_plot.png
+++ b/examples/llm/benchmarks/example_plots/single_node_pareto_plot.png
--- a/examples/llm/benchmarks/example_plots/two_node_pareto_plot.png
+++ b/examples/llm/benchmarks/example_plots/two_node_pareto_plot.png
--- a/examples/llm/benchmarks/vllm_multinode_setup.sh
+++ b/examples/llm/benchmarks/vllm_multinode_setup.sh
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script is meant to be sourced so the exported variables reach the caller's shell.
-
-# Start nats and etcd on this node when no head node is given; otherwise
-# point the dynamo environment variables at the existing head node.
-if [[ -z "${HEAD_NODE_IP}" ]]; then
-    nats-server -js &
-    etcd --advertise-client-urls http://0.0.0.0:2379 --listen-client-urls http://0.0.0.0:2379 &
-    HEAD_NODE_IP="$(hostname -i)"
-else
-    export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
-    export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
-fi
-
-# Start a Ray head on this node when no leader is given; otherwise join the leader's cluster.
-if [[ -z "${RAY_LEADER_NODE_IP}" ]]; then
-    ray start --head --port=6379 --disable-usage-stats
-    RAY_LEADER_NODE_IP="$(hostname -i)"
-else
-    ray start --address="${RAY_LEADER_NODE_IP}:6379"
-fi
-
-# Print the command to run on another node so it connects to this one.
-echo "HEAD_NODE_IP=${HEAD_NODE_IP} RAY_LEADER_NODE_IP=${RAY_LEADER_NODE_IP} source ${BASH_SOURCE[0]}"
--- a/examples/llm/components/__init__.py
+++ b/examples/llm/components/__init__.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/examples/llm/components/disagg_router.py
+++ b/examples/llm/components/disagg_router.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-
-from dynamo.runtime import EtcdKvCache
-from dynamo.sdk import dynamo_context
-
-logger = logging.getLogger(__name__)
-
-
-class PyDisaggregatedRouter:
-    def __init__(
-        self,
-        runtime,
-        namespace,
-        max_local_prefill_length=1000,
-        max_prefill_queue_size=2,
-    ):
-        self.runtime = runtime
-        self.namespace = namespace
-        self.max_local_prefill_length = max_local_prefill_length
-        self.max_prefill_queue_size = max_prefill_queue_size
-
-    async def async_init(self):
-        runtime = dynamo_context["runtime"]
-        self.etcd_kv_cache = await EtcdKvCache.create(
-            runtime.etcd_client(),
-            f"/{self.namespace}/disagg_router/",
-            {
-                "max_local_prefill_length": str(self.max_local_prefill_length),
-                "max_prefill_queue_size": str(self.max_prefill_queue_size),
-            },
-        )
-
-    async def prefill_remote(
-        self, prompt_length: int, prefix_hit_rate: float, queue_size: int
-    ):
-        max_local_prefill_length = int(
-            await self.etcd_kv_cache.get("max_local_prefill_length")
-        )
-        max_prefill_queue_size = int(
-            await self.etcd_kv_cache.get("max_prefill_queue_size")
-        )
-        absolute_prefill_length = int(prompt_length * (1 - prefix_hit_rate))
-        # TODO: consider size of each request in the queue when making the decision
-        decision = (
-            absolute_prefill_length > max_local_prefill_length
-            and queue_size < max_prefill_queue_size
-        )
-        logger.info(
-            f"Remote prefill: {decision} (prefill length: {absolute_prefill_length}/{max_local_prefill_length}, prefill queue size: {queue_size}/{max_prefill_queue_size})"
-        )
-        return decision
--- a/examples/llm/components/frontend.py
+++ b/examples/llm/components/frontend.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import subprocess
-from pathlib import Path
-
-from components.planner_service import Planner
-from components.processor import Processor
-from components.worker import VllmWorker
-from pydantic import BaseModel
-
-from dynamo import sdk
-from dynamo.sdk import api, depends, on_shutdown, service
-from dynamo.sdk.lib.config import ServiceConfig
-from dynamo.sdk.lib.image import DYNAMO_IMAGE
-
-logger = logging.getLogger(__name__)
-
-# TODO: temp workaround to avoid port conflict with subprocess HTTP server; remove this once ingress is fixed
-os.environ["DYNAMO_PORT"] = "3999"
-
-
-def get_http_binary_path():
-    """Find the HTTP binary path in SDK or fallback to 'http' command."""
-    sdk_path = Path(sdk.__file__)
-    binary_path = sdk_path.parent / "cli/bin/http"
-    if not binary_path.exists():
-        return "http"
-    else:
-        return str(binary_path)
-
-
-class FrontendConfig(BaseModel):
-    """Configuration for the Frontend service including model and HTTP server settings."""
-
-    # Model name passed to `llmctl http add/remove chat-models`.
-    served_model_name: str
-    # Endpoint passed to `llmctl http add chat-models` alongside the model name.
-    endpoint: str
-    # Port handed to the HTTP server binary via `-p`.
-    port: int = 8080
-
-
-# todo this should be called ApiServer
-@service(
-    dynamo={
-        "namespace": "dynamo",
-    },
-    resources={"cpu": "10", "memory": "20Gi"},
-    workers=1,
-    image=DYNAMO_IMAGE,
-)
-class Frontend:
-    planner = depends(Planner)
-    worker = depends(VllmWorker)
-    processor = depends(Processor)
-
-    def __init__(self):
-        """Initialize Frontend service with HTTP server and model configuration."""
-        frontend_config = FrontendConfig(**ServiceConfig.get_parsed_config("Frontend"))
-        self.frontend_config = frontend_config
-        self.process = None
-        self.setup_model()
-        self.start_http_server()
-
-    def setup_model(self):
-        """Configure the model for HTTP service using llmctl."""
-        subprocess.run(
-            [
-                "llmctl",
-                "http",
-                "remove",
-                "chat-models",
-                self.frontend_config.served_model_name,
-            ],
-            check=False,
-        )
-        subprocess.run(
-            [
-                "llmctl",
-                "http",
-                "add",
-                "chat-models",
-                self.frontend_config.served_model_name,
-                self.frontend_config.endpoint,
-            ],
-            check=False,
-        )
-
-    def start_http_server(self):
-        """Start the HTTP server on the configured port."""
-        logger.info("Starting HTTP server")
-        http_binary = get_http_binary_path()
-
-        self.process = subprocess.Popen(
-            [http_binary, "-p", str(self.frontend_config.port)],
-            stdout=None,
-            stderr=None,
-        )
-
-    @api()
-    def dummy_api(self) -> None:
-        """
-        Dummy API to enable the HTTP server for the Dynamo operator.
-        This API is not used by the model.
-
-        NOTE: this is a temporary solution to expose ingress
-        for the LLM examples. Will be fixed and removed in the future.
-        The resulting api_endpoints in dynamo.yaml will be incorrect.
-        """
-
-    @on_shutdown
-    def cleanup(self):
-        """Clean up resources before shutdown."""
-
-        # circusd manages shutdown of http server process, we just need to remove the model using the on_shutdown hook
-        subprocess.run(
-            [
-                "llmctl",
-                "http",
-                "remove",
-                "chat-models",
-                self.frontend_config.served_model_name,
-            ],
-            check=False,
-        )
--- a/examples/llm/components/kv_router.py
+++ b/examples/llm/components/kv_router.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import logging
-import random
-from argparse import Namespace
-from typing import AsyncIterator, Tuple
-
-import numpy as np  # Add numpy import
-from components.worker import VllmWorker
-from utils.check_worker import check_required_workers
-from utils.protocol import LocalBlockHashes
-from utils.vllm import RouterType
-
-from dynamo.llm import (
-    AggregatedMetrics,
-    ApproxKvIndexer,
-    KvIndexer,
-    KvMetricsAggregator,
-    OverlapScores,
-)
-from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
-from dynamo.sdk.lib.config import ServiceConfig
-
-WorkerId = str
-fallback_msg = "Will fallback to random routing."
-
-logger = logging.getLogger(__name__)
-
-
-def softmax_sample_from_logits(
-    logits: dict[str, float], temperature: float = 1.0, lower_is_better: bool = True
-) -> str:
-    if not logits:
-        raise ValueError("Empty logits dictionary")
-
-    keys = list(logits.keys())
-    values = np.array(list(logits.values()))
-
-    min_val = np.min(values)
-    max_val = np.max(values)
-
-    if min_val == max_val:
-        # All values are the same, uniform probability
-        probabilities = np.ones(len(keys)) / len(keys)
-    else:
-        normalized = values / (max_val - min_val)
-        if lower_is_better:
-            normalized = -1 * normalized
-
-        scaled = normalized / temperature
-
-        exp_values = np.exp(scaled - np.max(scaled))
-        probabilities = exp_values / np.sum(exp_values)
-
-    # Sample from the probability distribution
-    return np.random.choice(keys, p=probabilities)
-
-
-def parse_args(service_name, prefix) -> Namespace:
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-        help="Model that is being served",
-    )
-    parser.add_argument(
-        "--min-workers",
-        type=int,
-        default=1,
-        help="Minimum number of workers required before proceeding",
-    )
-    # TODO: Read block size
-    parser.add_argument(
-        "--block-size",
-        type=int,
-        default=64,
-        help="KV block size",
-    )
-    parser.add_argument(
-        "--custom-router",
-        type=bool,
-        default=False,
-        help="Whether to use custom router or not",
-    )
-    parser.add_argument(
-        "--router",
-        type=str,
-        default="kv",
-        help="The router type",
-    )
-    parser.add_argument(
-        "--softmax-sample",
-        action="store_true",
-        help="Whether to do softmax sampling based on worker logits (default is to pick smallest)",
-    )
-    config = ServiceConfig.get_instance()
-    config_args = config.as_args(service_name, prefix=prefix)
-    args = parser.parse_args(config_args)
-    return args
-
-
-@service(
-    dynamo={
-        "namespace": "dynamo",
-    },
-    resources={"cpu": "10", "memory": "20Gi"},
-    workers=1,
-)
-class Router:
-    """
-    Request handler for the generate endpoint
-    """
-
-    worker = depends(VllmWorker)
-
-    def __init__(self):
-        logger.info("Initializing Custom Router")
-        self.args = parse_args(self.__class__.__name__, "")
-
-        self.default_metrics = {
-            "kv_active_blocks": 0,
-            "kv_total_blocks": 1,
-            "num_requests_waiting": 0.0,
-            "gpu_cache_usage_perc": 0.0,
-            "gpu_prefix_cache_hit_rate": 0.0,
-        }
-
-    @async_on_start
-    async def async_init(self):
-        self.runtime = dynamo_context["runtime"]
-        self.workers_client = (
-            await self.runtime.namespace("dynamo")
-            .component("VllmWorker")
-            .endpoint("generate")
-            .client()
-        )
-
-        self.router_type = self.args.router
-
-        await check_required_workers(self.workers_client, self.args.min_workers)
-
-        kv_listener = self.runtime.namespace("dynamo").component("VllmWorker")
-        await kv_listener.create_service()
-        if self.router_type == RouterType.KV:
-            self.indexer = KvIndexer(kv_listener, self.args.block_size)
-        elif self.router_type == RouterType.APPROX_KV:
-            # For now, hardcode the TTL to 2 minutes.
-            self.indexer = ApproxKvIndexer(kv_listener, self.args.block_size, 120.0)
-
-        self.metrics_aggregator = KvMetricsAggregator(kv_listener)
-
-        self.active_blocks_dict = {}
-        worker_ids = self.workers_client.instance_ids()
-        for worker_id in worker_ids:
-            # [old_value, predictive_value]
-            self.active_blocks_dict[worker_id] = [0, 0]
-
-        logger.info("KV Router initialized")
-
-    def _update_and_get_active_blocks(self, worker_id: str, polled_value: int) -> int:
-        """Helper routine to update waiting dict and return the desired waiting value.
-
-        This method implements a predictive mechanism for tracking waiting requests:
-        - If a new polled value is detected (different from the stored old value),
-          it updates both the old and predictive values to this new measurement and returns it
-        - If no change is detected (polled value equals old value), it returns the
-          predictive value which has been incremented based on previous routing decisions
-
-        This allows the router to account for requests that have been dispatched but
-        not yet reflected in the polled metrics.
-        """
-        # Initialize if worker_id is not present
-        if worker_id not in self.active_blocks_dict:
-            logger.warning(f"New Worker added: {worker_id}")
-            self.active_blocks_dict[worker_id] = [polled_value, polled_value]
-            return polled_value
-
-        old_value, predictive_value = self.active_blocks_dict[worker_id]
-
-        # Check if polled value is different from old value
-        if polled_value != old_value:
-            self.active_blocks_dict[worker_id] = [polled_value, polled_value]
-            return polled_value
-        else:
-            return predictive_value
-
-    def _cost_function(
-        self,
-        scores: OverlapScores | None,
-        metrics: AggregatedMetrics | None,
-        token_length: int,
-    ):
-        """The cost function for deciding the best worker to route a request to.
-        If there are multiple workers sharing the same optimal cost, then
-        one of them is randomly selected.
-
-        Args:
-            scores (OverlapScores | None): The number of matching blocks between
-                the request and the prefix cache of each worker.
-            metrics (AggregatedMetrics | None): Several worker metrics polled
-                by the `KvMetricsAggregator`, currently including the
-                GPU cache usage, number of waiting requests, and the
-                GPU prefix cache hit rate.
-            token_length (int): The number of tokens in the request.
-
-        Returns:
-            (str, float): The best worker id and the corresponding score.
-        """
-
-        # Get all worker IDs from the client. This is needed because scores / metrics may not have values for all workers
-        # and we want all workers to be considered in the logit calculation
-        worker_ids = self.workers_client.instance_ids()
-        request_blocks = (
-            token_length + self.args.block_size - 1
-        ) // self.args.block_size
-
-        overlap_blocks_dict = {worker_id: 0 for worker_id in worker_ids}
-        new_blocks_dict = {worker_id: request_blocks for worker_id in worker_ids}
-
-        if scores:
-            for worker_id, score in scores.scores.items():
-                # score is number of matching blocks we multiply by block_size to get tokens
-                # and compare to token_length. The larger the cache hit the better
-                overlap_blocks_dict[worker_id] = score
-                new_blocks_dict[worker_id] = request_blocks - score
-        else:
-            logger.warning("Cannot get KV scores")
-
-        worker_metrics = {}
-        if metrics:
-            for endpoint in metrics.endpoints:
-                worker_id = endpoint.worker_id
-                worker_metrics[worker_id] = {
-                    key: getattr(endpoint, key, self.default_metrics[key])
-                    for key in self.default_metrics.keys()
-                }
-
-                # Update waiting value using helper routine
-                polled_active_blocks = int(
-                    worker_metrics[worker_id]["kv_active_blocks"]
-                )
-                worker_metrics[worker_id][
-                    "kv_active_blocks"
-                ] = self._update_and_get_active_blocks(worker_id, polled_active_blocks)
-        else:
-            logger.warning("Cannot get metrics")
-
-        worker_logits = {}
-        for worker_id in worker_ids:
-            # Use default values if worker not in scores or metrics
-            metrics_dict = worker_metrics.get(worker_id, self.default_metrics)
-            kv_total_blocks = metrics_dict["kv_total_blocks"]
-
-            new_blocks = new_blocks_dict[worker_id]
-            normalized_new_blocks = new_blocks / kv_total_blocks
-            gpu_cache_usage = metrics_dict["kv_active_blocks"] / kv_total_blocks
-
-            # Use raw waiting value without normalization
-            num_requests_waiting = metrics_dict["num_requests_waiting"]
-
-            # Have 1 metric that weights towards cache hit
-            # 2 metrics that penalize overloaded worker and queuing
-            worker_logits[worker_id] = (
-                normalized_new_blocks + gpu_cache_usage + num_requests_waiting
-            )
-            logger.info(
-                f"Formula for {worker_id}: {worker_logits[worker_id]:.3f} = {normalized_new_blocks:.3f} + {gpu_cache_usage:.3f} + {num_requests_waiting:.3f}"
-            )
-
-        if not worker_logits or not any(worker_logits.values()):
-            logger.warning(f"All worker logits are zero. {fallback_msg}.")
-            return "", 0.0
-
-        # Select the worker with the highest logit
-        if self.args.softmax_sample:
-            best_worker_id = int(softmax_sample_from_logits(worker_logits))
-        else:
-            min_logit = min(worker_logits.values())
-            best_workers = [
-                wid for wid, logit in worker_logits.items() if logit == min_logit
-            ]
-            best_worker_id = random.choice(best_workers)
-
-        # Log the metrics for the selected worker
-        if best_worker_id:
-            metrics_dict = worker_metrics.get(best_worker_id, self.default_metrics)
-
-            # Create log messages
-            log_messages = [
-                f"Selected worker: {best_worker_id}, logit: {worker_logits[best_worker_id]:.3f}",
-                f"Score: {scores.scores.get(best_worker_id, 0.0) if scores else 0.0:.3f}",
-                f"GPU Cache Hit Rate: {metrics_dict['gpu_prefix_cache_hit_rate']:.3f}",
-                f"GPU Cache Usage: {metrics_dict['kv_active_blocks'] / metrics_dict['kv_total_blocks']:.3f}",
-                f"Requests Waiting: {metrics_dict['num_requests_waiting']}",
-            ]
-
-            # Log to vllm_logger
-            for message in log_messages:
-                logger.info(message)
-
-            # Increment predictive waiting for the selected worker before returning
-            self.active_blocks_dict[best_worker_id][1] += new_blocks_dict[
-                best_worker_id
-            ]
-
-        return (
-            best_worker_id,
-            overlap_blocks_dict[best_worker_id] * self.args.block_size / token_length,
-        )
-
-    def _get_underloaded_worker(self, metrics: AggregatedMetrics | None):
-        if not metrics:
-            logger.warning(f"Cannot get metrics. {fallback_msg}")
-            return "", 0.0
-
-        kv_load = {
-            endpoint.worker_id: getattr(endpoint, "gpu_cache_usage_perc", 0.0)
-            for endpoint in metrics.endpoints
-        }
-
-        if not kv_load or not any(kv_load.values()):
-            logger.warning(f"All KV loads are zero. {fallback_msg}")
-            return "", 0.0
-
-        min_load = min(kv_load.values())
-        min_load_workers = [
-            worker_id for worker_id, load in kv_load.items() if load == min_load
-        ]
-        best_worker_id = random.choice(min_load_workers)
-
-        logger.info(
-            f"Selected worker: {best_worker_id}, KV load: {kv_load[best_worker_id]:.3f}"
-        )
-        return best_worker_id, kv_load[best_worker_id]
-
-    @endpoint()
-    async def generate(
-        self, request: LocalBlockHashes
-    ) -> AsyncIterator[Tuple[WorkerId, float]]:
-        metrics = await self.metrics_aggregator.get_metrics()
-
-        # Quick return for KV_LOAD mode
-        if self.router_type == RouterType.KV_LOAD:
-            try:
-                yield self._get_underloaded_worker(metrics)
-            except Exception as e:
-                logger.exception(
-                    f"Error finding underloaded worker: {e}. {fallback_msg}"
-                )
-                yield "", 0.0
-            return
-
-        # Existing KV routing logic
-        try:
-            if self.router_type == RouterType.APPROX_KV:
-                scores = await self.indexer.find_matches_for_request(request.tokens)
-            else:
-                scores = await self.indexer.find_matches(request.hashes)
-        except Exception as e:
-            scores = {}
-            logger.exception(f"Error finding matches: {e}. {fallback_msg}")
-            yield "", 0.0
-            return
-
-        worker_id, prefix_hit_rate = self._cost_function(
-            scores, metrics, request.num_tokens
-        )
-
-        if self.router_type == RouterType.APPROX_KV:
-            # For the approx kv router, we need to know what worker we route to.
-            # We can't defer to the engine client to select a random worker.
-            # Because of this, we need to select a worker here.
-            if not worker_id:
-                all_workers = self.workers_client.instance_ids()
-                worker_id = random.choice(all_workers)
-
-            await self.log_router_decision(request.tokens, worker_id)
-
-        if worker_id:
-            logger.info(
-                f"Scheduling to worker_id: {worker_id} with estimated prefix hit rate: {prefix_hit_rate}"
-            )
-
-        yield worker_id, prefix_hit_rate
-
-    async def log_router_decision(self, tokens: list[int], worker_id: str):
-        if self.router_type == RouterType.APPROX_KV:
-            try:
-                await self.indexer.process_routing_decision_for_request(
-                    tokens, worker_id
-                )
-            except Exception as e:
-                logger.exception(
-                    f"Error processing routing decision: {e}. {fallback_msg}"
-                )