Unverified Commit f00d700e authored by Alec's avatar Alec Committed by GitHub
Browse files

refactor: remove old examples with old UX (#1899)

parent c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
from pydantic import BaseModel
from dynamo._core import Client
logger = logging.getLogger(__name__)
# Request payload model: a prompt string plus a request identifier.
# Both fields are optional at construction time because each has a
# placeholder string default.
class GeneralRequest(BaseModel):
    # Prompt text supplied by the caller.
    prompt: str = "user input"
    # Identifier for this request.
    request_id: str = "id_string"
# Response payload model: the worker's output string plus a request
# identifier. Both fields have placeholder string defaults.
class GeneralResponse(BaseModel):
    # Text produced by the worker.
    worker_output: str = "generated output"
    # Identifier of the request this response belongs to.
    request_id: str = "id_string"
async def check_required_workers(
    workers_client: Client,
    required_workers: int,
    on_change=True,
    poll_interval=5,
    tag="",
):
    """Wait until at least ``required_workers`` workers are registered.

    Polls ``workers_client.endpoint_ids()`` every ``poll_interval`` seconds
    and returns as soon as a poll reports enough workers. The returned list
    is always the poll that satisfied the requirement, so it never contains
    fewer than ``required_workers`` entries.

    Args:
        workers_client: Client whose ``endpoint_ids()`` returns the
            currently known worker ids (assumed to be a sized collection;
            it is called synchronously, as in the original code).
        required_workers: Minimum number of workers to wait for.
        on_change: When true, log the "waiting" message only the first time
            and whenever the observed worker count changes; when false, log
            on every poll.
        poll_interval: Seconds to sleep between polls.
        tag: Prefix inserted into the log message.

    Returns:
        The worker ids from the first poll that met the requirement.
    """
    worker_ids = workers_client.endpoint_ids()
    # Count reported by the most recent log line; None forces the first
    # "waiting" message to be emitted.
    last_logged = None
    # Test the freshly polled list directly so the loop exits on the very
    # poll that satisfies the requirement, without an extra sleep or a
    # misleading "waiting" log.
    while len(worker_ids) < required_workers:
        num_workers = len(worker_ids)
        if (not on_change) or num_workers != last_logged:
            logger.info(
                f" {tag} Waiting for more workers to be ready.\n"
                f" Current: {num_workers},"
                f" Required: {required_workers}"
            )
            last_logged = num_workers
        await asyncio.sleep(poll_interval)
        worker_ids = workers_client.endpoint_ids()
    return worker_ids
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import socket
from components.utils import GeneralRequest, GeneralResponse
from dynamo.sdk import DYNAMO_IMAGE, dynamo_endpoint, service
logger = logging.getLogger(__name__)
@service(
    dynamo={"enabled": True, "namespace": "dynamo-demo"},
    image=DYNAMO_IMAGE,
    resources={"cpu": "10", "memory": "20Gi"},
    workers=1,
)
class DummyWorker:
    """Demo worker that tags each prompt with the host it ran on."""

    def __init__(self):
        # Resolve the hostname once; it is reused in logs and responses.
        self.hostname = socket.gethostname()

    @dynamo_endpoint()
    async def generate(self, request: GeneralRequest):
        """Yield one JSON-serialized GeneralResponse for ``request``.

        The response carries the incoming request id and the prompt with
        ``_GeneratedBy_<hostname>`` appended.
        """
        logger.info(f"{self.hostname}: Worker invoked")
        req_id = request.request_id
        output = request.prompt + "_GeneratedBy_" + self.hostname
        reply = GeneralResponse(request_id=req_id, worker_output=output)
        yield reply.model_dump_json()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Processor:
min_worker: 2
router: round-robin
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Processor:
min_worker: 1
router: random
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# LLM Deployment Examples
This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations.
## Use the Latest Release
We recommend using the latest stable release of dynamo to avoid breaking changes:
[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest)
You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with:
```bash
git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
```
## Components
- workers: Prefill and decode workers that handle the actual LLM inference
- router: Handles API requests and routes them to appropriate workers based on specified strategy
- frontend: OpenAI-compatible HTTP server that handles incoming requests
## Deployment Architectures
### Aggregated
Single-instance deployment where both prefill and decode are done by the same worker.
### Disaggregated
Distributed deployment where prefill and decode are done by separate workers that can scale independently.
```mermaid
sequenceDiagram
participant D as VllmWorker
participant Q as PrefillQueue
participant P as PrefillWorker
Note over D: Request is routed to decode
D->>D: Decide if prefill should be done locally or remotely
D->>D: Allocate KV blocks
D->>Q: Put RemotePrefillRequest on the queue
P->>Q: Pull request from the queue
P-->>D: Read cached KVs from Decode
D->>D: Decode other requests
P->>P: Run prefill
P-->>D: Write prefilled KVs into allocated blocks
P->>D: Send completion notification
Note over D: Notification received when prefill is done
D->>D: Schedule decoding
```
## Getting Started
1. Choose a deployment architecture based on your requirements
2. Configure the components as needed
3. Deploy using the provided scripts
### Prerequisites
Start required services (etcd and NATS) using [Docker Compose](../../deploy/metrics/docker-compose.yml)
```bash
docker compose -f deploy/metrics/docker-compose.yml up -d
```
### Build docker
```bash
# On an x86 machine
./container/build.sh --framework vllm
# On an ARM machine (ex: GB200)
./container/build.sh --framework vllm --platform linux/arm64
```
> [!NOTE]
> Building a vLLM docker image for ARM machines currently involves building vLLM from source,
> which has known issues with being slow and requiring a lot of system RAM:
> https://github.com/vllm-project/vllm/issues/8878
>
> You can tune the number of parallel build jobs for building VLLM from source
> on ARM based on your available cores and system RAM with `VLLM_MAX_JOBS`.
>
> For example, on an ARM machine with low system resources:
> `./container/build.sh --framework vllm --platform linux/arm64 --build-arg VLLM_MAX_JOBS=2`
>
> For example, on a GB200, which has a very high CPU core count and a large amount of memory:
> `./container/build.sh --framework vllm --platform linux/arm64 --build-arg VLLM_MAX_JOBS=64`
>
> When vLLM has pre-built ARM wheels published, this process can be improved.
### Run container
```
./container/run.sh -it --framework vllm
```
## Run Deployment
This figure shows an overview of the major components to deploy:
```
+----------------+
+------| prefill worker |-------+
notify | | | |
finished | +----------------+ | pull
v v
+------+ +-----------+ +------------------+ push +---------------+
| HTTP |----->| processor |----->| decode/monolith |------------>| prefill queue |
| |<-----| |<-----| worker | | |
+------+ +-----------+ +------------------+ +---------------+
| ^ |
query best | | return | publish kv events
worker | | worker_id v
| | +------------------+
| +---------| kv-router |
+------------->| |
+------------------+
```
> [!NOTE]
> The planner component is enabled by default for all deployment architectures but is set to no-op mode. This means the planner observes metrics but doesn't take scaling actions. To enable active scaling, you can add `--Planner.no-operation=false` to your `dynamo serve` command. For more details, see the [Planner documentation](../../components/planner/README.md).
### Example architectures
_Note_: For a non-dockerized deployment, first export `DYNAMO_HOME` to point to the dynamo repository root, e.g. `export DYNAMO_HOME=$(pwd)`
#### Aggregated serving
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml
```
#### Aggregated serving with KV Routing
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.agg_router:Frontend -f ./configs/agg_router.yaml
```
#### Disaggregated serving
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
```
#### Disaggregated serving with KV Routing
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
```
### Client
In another terminal:
```bash
# this test request has around 200 tokens isl
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream":false,
"max_tokens": 30
}'
```
### Multi-node deployment
See [multinode.md](../../docs/examples/multinode.md) for more details.
### Close deployment
See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section to learn about how to close the deployment.
## Deploy to Kubernetes
These examples can be deployed to a Kubernetes cluster using [Dynamo Cloud](../../docs/guides/dynamo_deploy/dynamo_cloud.md) and the Dynamo CLI.
### Prerequisites
You must have first followed the instructions in [deploy/cloud/helm/README.md](../../deploy/cloud/helm/README.md) to install Dynamo Cloud on your Kubernetes cluster.
**Note**: The `KUBE_NS` variable in the following steps must match the Kubernetes namespace where you installed Dynamo Cloud. You must also expose the `dynamo-store` service externally. This will be the endpoint the CLI uses to interface with Dynamo Cloud.
### Deployment Steps
For detailed deployment instructions, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). The following are the specific commands for the LLM examples:
```bash
# Set your project root directory
export PROJECT_ROOT=$(pwd)
# Configure environment variables (see operator_deployment.md for details)
export KUBE_NS=dynamo-cloud
export DYNAMO_CLOUD=http://localhost:8080 # If using port-forward
# OR
# export DYNAMO_CLOUD=https://dynamo-cloud.nvidia.com # If using Ingress/VirtualService
# Build the Dynamo base image (see operator_deployment.md for details)
export DYNAMO_IMAGE=<your-registry>/<your-image-name>:<your-tag>
# Build the service
cd $PROJECT_ROOT/examples/llm
DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" | awk '{ print $NF }' | sed 's/\.$//')
# Deploy to Kubernetes
export DEPLOYMENT_NAME=llm-agg
# TODO: Deploy your service using a DynamoGraphDeployment CR.
```
**Note**: To avoid rate limiting from unauthenticated requests to HuggingFace (HF), you can provide your `HF_TOKEN` as a secret in your deployment. See the [operator deployment guide](../../docs/guides/dynamo_deploy/operator_deployment.md#referencing-secrets-in-your-deployment) for instructions on referencing secrets like `HF_TOKEN` in your deployment configuration.
**Note**: Optionally add `--Planner.no-operation=false` at the end of the deployment command to enable the planner component to take scaling actions on your deployment.
### Testing the Deployment
Once the deployment is complete, you can test it. If you have ingress available for your deployment, you can directly call the url returned
in `dynamo deployment get ${DEPLOYMENT_NAME}` and skip the steps to find and forward the frontend pod.
```bash
# Find your frontend pod
export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} | grep "${DEPLOYMENT_NAME}-frontend" | sort -k1 | tail -n1 | awk '{print $1}')
# Forward the pod's port to localhost
kubectl port-forward pod/$FRONTEND_POD 3000:3000 -n ${KUBE_NS}
# Test the API endpoint
curl localhost:3000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream":false,
"max_tokens": 30
}'
```
For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# LLM Deployment Benchmarking Guide
This guide provides detailed steps on benchmarking Large Language Models (LLMs) in single and multi-node configurations.
> [!NOTE]
> We recommend trying out the [LLM Deployment Examples](./README.md) before benchmarking.
## Prerequisites
> [!Important]
> At least one 8xH100-80GB node is required for the following instructions.
1. Build benchmarking image
```bash
./container/build.sh
```
2. Download model
```bash
huggingface-cli download neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
```
3. Start NATS and ETCD
```bash
docker compose -f deploy/metrics/docker-compose.yml up -d
```
> [!NOTE]
> This guide was tested on node(s) with the following hardware configuration:
>
> * **GPUs**:
> 8xH100-80GB-HBM3 (GPU Memory Bandwidth 3.2 TB/s)
>
> * **CPU**:
> 2 x Intel Sapphire Rapids, Intel(R) Xeon(R) Platinum 8480CL E5, 112 cores (56 cores per CPU), 2.00 GHz (Base), 3.8 GHz (Max boost), PCIe Gen5
>
> * **NVLink**:
> NVLink 4th Generation, 900 GB/s (GPU to GPU NVLink bidirectional bandwidth), 18 Links per GPU
>
> * **InfiniBand**:
> 8x400Gbit/s (Compute Links), 2x400Gbit/s (Storage Links)
>
> Benchmarking with a different hardware configuration may yield suboptimal results.
## Disaggregated Single Node Benchmarking
> [!Important]
> One 8xH100-80GB node is required for the following instructions.
In the following setup we compare Dynamo disaggregated vLLM performance to
[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on a single node. These were chosen to optimize
for Output Token Throughput (per sec) when both are performing under similar Inter Token Latency (ms).
For more details on your use case please see the [Performance Tuning Guide](/docs/guides/disagg_perf_tuning.md).
In this setup, we will be using 4 prefill workers and 1 decode worker.
Each prefill worker will use tensor parallel 1 and the decode worker will use tensor parallel 4.
With the Dynamo repository, benchmarking image and model available, and **NATS and ETCD started**, perform the following steps:
1. Run benchmarking container
```bash
./container/run.sh --mount-workspace
```
> [!Tip]
> The huggingface home source mount can be changed by setting `--hf-cache ~/.cache/huggingface`.
2. Start disaggregated services
```bash
cd /workspace/examples/llm
dynamo serve benchmarks.disagg:Frontend -f benchmarks/disagg.yaml 1> disagg.log 2>&1 &
```
> [!Tip]
> Check the `disagg.log` to make sure the service is fully started before collecting performance numbers.
3. Collect the performance numbers:
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode disaggregated --deployment-kind dynamo_vllm --prefill-tensor-parallelism 1 --prefill-data-parallelism 4 --decode-tensor-parallelism 4 --decode-data-parallelism 1
```
> [!Important]
> We should be careful when specifying these options to the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using disaggregated serving in Dynamo with the vLLM backend. We have also accurately described that we have 4 prefill workers with TP=1 and 1 decode worker with TP=4.
For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
## Disaggregated Multinode Benchmarking
> [!Important]
> Two 8xH100-80GB nodes are required for the following instructions.
In the following steps we compare Dynamo disaggregated vLLM performance to
[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on two nodes. These were chosen to optimize
for Output Token Throughput (per sec) when both are performing under similar Inter Token Latency (ms).
For more details on your use case please see the [Performance Tuning Guide](/docs/guides/disagg_perf_tuning.md).
In this setup, we will be using 8 prefill workers and 1 decode worker.
Each prefill worker will use tensor parallel 1 and the decode worker will use tensor parallel 8.
With the Dynamo repository, benchmarking image and model available, and **NATS and ETCD started on node 0**, perform the following steps:
1. Run benchmarking container (nodes 0 & 1)
```bash
./container/run.sh --mount-workspace
```
> [!Tip]
> The huggingface home source mount can be changed by setting `--hf-cache ~/.cache/huggingface`.
2. Config NATS and ETCD (node 1)
```bash
export NATS_SERVER="nats://<node_0_ip_addr>"
export ETCD_ENDPOINTS="<node_0_ip_addr>:2379"
```
> [!Important]
> Node 1 must be able to reach Node 0 over the network for the above services.
3. Start workers (node 0)
```bash
cd /workspace/examples/llm
dynamo serve benchmarks.disagg_multinode:Frontend -f benchmarks/disagg_multinode.yaml 1> disagg_multinode.log 2>&1 &
```
> [!Tip]
> Check the `disagg_multinode.log` to make sure the service is fully started before collecting performance numbers.
4. Start workers (node 1)
```bash
cd /workspace/examples/llm
dynamo serve components.prefill_worker:PrefillWorker -f benchmarks/disagg_multinode.yaml 1> prefill_multinode.log 2>&1 &
```
> [!Tip]
> Check the `prefill_multinode.log` to make sure the service is fully started before collecting performance numbers.
5. Collect the performance numbers:
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode disaggregated --deployment-kind dynamo_vllm --prefill-tensor-parallelism 1 --prefill-data-parallelism 8 --decode-tensor-parallelism 8 --decode-data-parallelism 1
```
> [!Important]
> We should be careful when specifying these options to the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using disaggregated serving in Dynamo with the vLLM backend. We have also accurately described that we have 8 prefill workers with TP=1 and 1 decode worker with TP=8.
For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
## vLLM Aggregated Baseline Benchmarking
> [!Important]
> One (or two) 8xH100-80GB nodes are required for the following instructions.
With the Dynamo repository and the benchmarking image available, perform the following steps:
1. Run benchmarking container
```bash
./container/run.sh --mount-workspace
```
> [!Tip]
> The Hugging Face home source mount can be changed by setting `--hf-cache ~/.cache/huggingface`.
2. Start vLLM serve
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
--block-size 128 \
--max-model-len 3500 \
--max-num-batched-tokens 3500 \
--tensor-parallel-size 4 \
--gpu-memory-utilization 0.95 \
--disable-log-requests \
--port 8001 1> vllm_0.log 2>&1 &
CUDA_VISIBLE_DEVICES=4,5,6,7 vllm serve neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
--block-size 128 \
--max-model-len 3500 \
--max-num-batched-tokens 3500 \
--tensor-parallel-size 4 \
--gpu-memory-utilization 0.95 \
--disable-log-requests \
--port 8002 1> vllm_1.log 2>&1 &
```
> [!Tip]
> Check the `vllm_0.log` and `vllm_1.log` to make sure the service is fully started before collecting performance numbers.
>
> If benchmarking with two or more nodes, `--tensor-parallel-size 8` should be used and only run one `vllm serve` instance per node.
3. Use NGINX as load balancer
```bash
apt update && apt install -y nginx
cp /workspace/benchmarks/llm/nginx.conf /etc/nginx/nginx.conf
service nginx restart
```
> [!Note]
> If benchmarking over 2 nodes, the `upstream` configuration will need to be updated to link to the `vllm serve` on the second node.
4. Collect the performance numbers:
Single-Node
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 4 --data-parallelism 2
```
Two Nodes
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 8 --data-parallelism 2
```
We could also run the benchmarking script and specify the model, input sequence length, output sequence length, and concurrency levels to target for benchmarking:
```bash
bash -x /workspace/benchmarks/llm/perf.sh \
--mode aggregated \
--deployment-kind vllm_serve \
--tensor-parallelism 1 \
--data-parallelism 1 \
--model neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
--input-sequence-length 3000 \
--output-sequence-length 150 \
--url http://localhost:8000 \
--concurrency 1,2,4,8,16,32,64,128,256
# The `--concurrency` option accepts either a single value (e.g., 64) or a comma-separated list (e.g., 1,2,4,8) to specify multiple concurrency levels for benchmarking.
```
> [!Important]
> We should be careful when specifying these options to the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using aggregated serving with `vllm serve`. We have also accurately described that we have 2 workers with TP=4 (or TP=8 for two nodes).
For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
## Collecting Performance Numbers
Currently, there is no consistent way of obtaining the configuration of deployment service. Hence, we need to provide this information to the script in form of command line arguments. The benchmarking script `/workspace/benchmarks/llm/perf.sh` uses GenAI-Perf tool to collect the performance numbers at various different request concurrencies. The perf.sh script can be run multiple times to collect numbers for various different deployments. Each script execution will create a new artifacts directory in `artifacts_root` and dump these numbers in it. See [Plotting Pareto Graphs](#plotting-pareto-graphs) to learn how to convert the data from this `artifacts_root` to generate pareto graphs for the performance.
Note: Because each `perf.sh` run always adds a new artifacts directory under `artifacts_root`, make sure to start an experiment with a clean `artifacts_root` so that it contains only the results from the runs you want to compare.
> [!Tip]
> See [GenAI-Perf tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md)
> @ [GitHub](https://github.com/triton-inference-server/perf_analyzer) for additional information about how to run GenAI-Perf
> and how to interpret results.
## Interpreting Results
### Plotting Pareto Graphs
The `artifacts` directory generated by GenAI-Perf contains the raw performance numbers from the benchmarking.
Using the benchmarking image, install the dependencies for plotting the Pareto graph:
```bash
pip3 install matplotlib seaborn
```
At the directory where the artifacts are located, plot the Pareto graph
Single-Node:
```bash
python3 /workspace/benchmarks/llm/plot_pareto.py --artifacts-root-dir artifacts_root
```
Two Nodes:
```bash
python3 /workspace/benchmarks/llm/plot_pareto.py --artifacts-root-dir artifacts_root --title "Two Nodes"
```
The graph will be saved to the current directory and named `pareto_plot.png`.
### Interpreting Pareto Graphs
The question we want to answer in this comparison is how much Output Token Throughput can be improved by switching from
aggregated to disaggregated serving when both are performing under similar Inter Token Latency.
Each benchmarked concurrency produces a latency and throughput value pair. The x-axis of the Pareto graph is
latency expressed in tokens/s/user, so a higher value means lower latency. The y-axis of the Pareto graph is throughput
(tokens/s/gpu). Each latency and throughput value pair forms a dot on the Pareto graph. A line (the Pareto Frontier) is
formed when the dots from different concurrency values are plotted on the graph.
With the Pareto Frontiers of the baseline and the disaggregated results plotted on the graph, we can look for the
greatest increase in throughput (along the y-axis) between the baseline and the disaggregated result Pareto Frontier,
over different latencies (along the x-axis).
For example, at 45 tokens/s/user, the increase in tokens/s/gpu is `145 - 80 = 65`, from the orange baseline to the
blue disaggregated line, so the improvement is around 1.44x speed up:
![Example Pareto Plot](./example_plots/single_node_pareto_plot.png)
Note: The above example was collected over a single benchmarking run, the actual number may vary between runs, configurations and hardware.
## Supporting Additional Models
The instructions above can be used for nearly any model desired.
More complex setup instructions might be required for certain models.
The above instructions regarding ETCD, NATS, nginx, dynamo-serve, and GenAI-Perf still apply and can be reused.
The specifics of deploying with different hardware, in a unique environment, or using another model framework can be adapted using the links below.
Regardless of the deployment mechanism, the GenAI-Perf tool will report the same metrics and measurements so long as an accessible endpoint is available for it to interact with. Use the provided [perf.sh](../../../benchmarks/llm/perf.sh) script to automate the measurement of model throughput and latency against multiple request concurrencies.
### Deployment Examples
- [Dynamo Multinode Deployments](../../../docs/examples/multinode.md)
- [Dynamo TensorRT LLM Deployments](../../../docs/examples/trtllm.md)
- [Aggregated Deployment of Very Large Models](../../../docs/examples/multinode.md#aggregated-deployment)
- [Dynamo vLLM Deployments](../../../docs/examples/llm_deployment.md)
## Monitor Benchmark Startup Status
When running a Dynamo deployment, you may have multiple instances of the same worker kind for a particular benchmark run.
The deployment can process requests as soon as at least one worker is ready. If the benchmark is started
as soon as Dynamo responds to inference requests, the results at the beginning of the benchmark may be inaccurate
because not all workers are up yet. In such a case, you can additionally deploy the benchmark watcher to signal whether the full deployment
is ready. For instance, if you expect the total number of prefill and decode workers to be 10, you can run the commands below to start
the watcher, which will exit if the total number is still less than 10 after the timeout. In addition, the watcher creates
an HTTP server on port 7001 by default, which you can query with GET requests for readiness when building an external benchmarking workflow.
```bash
# start your benchmark deployment
...
# start monitor separately, or it can be part of the deployment above
dynamo serve --service-name Watcher benchmark_watcher:Watcher --Watcher.total-workers=10 --Watcher.timeout=10
# Send curl request to check liveness
curl localhost:7001
127.0.0.1 - - [12/Jun/2025 23:31:52] "GET / HTTP/1.1" 400 -
...
curl localhost:7001
127.0.0.1 - - [12/Jun/2025 23:32:46] "GET / HTTP/1.1" 200 -
```
## Utility for Setting Up Environment
### vLLM
- `vllm_multinode_setup.sh` is a helper script that configures the node for a Dynamo deployment with
vLLM. Depending on whether the environment variables `HEAD_NODE_IP` and `RAY_LEADER_NODE_IP` are set
when the script is invoked, it will:
- start the NATS server and etcd on the current node if `HEAD_NODE_IP` is not set, otherwise
set the environment variables as expected by Dynamo.
- run Ray and connect to the Ray cluster started by `RAY_LEADER_NODE_IP` if it is set, otherwise start
the Ray cluster with the current node as the head node.
- print the command with `HEAD_NODE_IP` and `RAY_LEADER_NODE_IP` set, which can be used on
another node to set up connectivity with the current node.
```bash
# On node 0
source vllm_multinode_setup.sh
... # starting nats server, etcd and ray cluster
# script print command
HEAD_NODE_IP=NODE_0_IP RAY_LEADER_NODE_IP=NODE_0_IP source vllm_multinode_setup.sh
# On node 1
HEAD_NODE_IP=NODE_0_IP RAY_LEADER_NODE_IP=NODE_0_IP source vllm_multinode_setup.sh
... # connecting to Ray cluster
```
## Metrics and Visualization
For instructions on how to acquire per worker metrics and visualize them using Grafana,
please see the provided [Visualization with Prometheus and Grafana](../../../deploy/metrics/README.md).
## Troubleshooting
When benchmarking disaggregation performance, there can be cases where the latency and
throughput numbers don't match expectations within some margin. Below is a list of scenarios
that have been encountered, along with observations and resolutions.
### Interconnect Configuration
Even if the nodes have faster interconnect hardware available, there can be misconfiguration such that
the fastest route may not be selected by NIXL ([example regression](https://github.com/ai-dynamo/dynamo/pull/1314)). NIXL simplifies the interconnect but also hides
selection detail. Therefore this can be the cause if you observe abnormal TTFT increase when
splitting prefill workers and decode workers to different nodes. For example, we have seen instances of ~2 second overhead added to TTFT when TCP is selected over RDMA for KV Cache transfer due to a misconfigured environment.
Currently NIXL doesn't provide a utility for reporting which transport is selected. Therefore
you will need to verify whether that is the cause by using backend-specific debug options.
In the case of UCX backend, you can use `ucx_info -d` to check if the desired interconnect
devices are being recognized. At runtime, `UCX_LOG_LEVEL=debug` and `UCX_PROTO_INFO=y`
can be set as environment variables to provide detailed logs on UCX activities. This will
reveal whether the desired transport is being used.
### The Full Deployment is Configured Correctly
As benchmarking often focuses on configurations where multiple workers are being used,
one may mistakenly consider a deployment ready for benchmarking while there are only a
subset of workers taking requests. For example, in the aggregated baseline benchmarking,
a user may forget to update the IP address of the other node in the `upstream` section of `nginx.conf`.
This could lead to only one of the nodes serving requests. In such a case,
the benchmark can still run to completion, but the result will not reflect the deployment
capacity, because not all of the compute resources are being utilized.
Therefore, it is important to verify that the requests can be routed to all workers before
performing the benchmark:
- **Framework-only benchmark** The simplest way is to send sample requests and check
the logs of all workers. Each framework may provide utilities for readiness checks, so please
refer to the framework's documentation for those details.
- **Dynamo-based benchmark** Once you start the deployment, you can follow
the instructions in [monitor benchmark startup status](#monitor-benchmark-startup-status),
which will periodically poll the workers exposed at specific endpoints
and return an HTTP 200 code once the expected number of workers is met.
# type: ignore # Ignore all mypy errors in this file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import logging
import threading
import time
from argparse import Namespace
from http.server import BaseHTTPRequestHandler, HTTPServer
from dynamo.sdk import async_on_start, dynamo_context, service
from dynamo.sdk.lib.config import ServiceConfig
logger = logging.getLogger(__name__)
def start_server(server):
    """Serve requests on ``server`` until it is shut down.

    The call blocks for as long as ``server.serve_forever()`` runs, so it is
    suitable as the target of a background thread.
    """
    # Setup stuff here...
    run = server.serve_forever
    run()
class HealthServer(HTTPServer):
    """HTTP server that carries a readiness flag for its request handlers."""

    def __init__(self, *args, **kwargs):
        # Every new server starts out "not ready"; the owner flips the flag
        # later through set_ready().
        self.ready = False
        super().__init__(*args, **kwargs)

    def set_ready(self, ready: bool):
        """Record whether the deployment should be reported as ready."""
        self.ready = ready
class RequestHandler(BaseHTTPRequestHandler):
    """Answer GET requests with the readiness state held by the server."""

    def do_GET(self):
        """Reply 200 with ``Ready.`` when the server is ready, else 400 with ``Not Ready``."""
        if self.server.ready:
            status, body = 200, b"Ready."
        else:
            status, body = 400, b"Not Ready"
        self.send_response(status)
        self.end_headers()
        self.wfile.write(body)
def parse_args(service_name, prefix) -> Namespace:
    """Parse the watcher's options from the Dynamo service configuration.

    The argument list is not read from ``sys.argv``; it is produced by
    ``ServiceConfig.get_instance().as_args(service_name, prefix=prefix)``.

    Args:
        service_name: Name of the service whose configuration is read.
        prefix: Prefix forwarded to ``ServiceConfig.as_args``.

    Returns:
        The parsed namespace with ``total_workers``, ``worker_components``,
        ``component_endpoints``, ``timeout`` and ``port``.

    Note:
        ``parser.error`` is called (which exits the process) when the number
        of components differs from the number of endpoints, because the two
        lists are paired element by element by the caller.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--total-workers",
        type=int,
        default=1,
        help="Total number of workers to be registered",
    )
    parser.add_argument(
        "--worker-components",
        nargs="+",
        default=["VllmWorker", "PrefillWorker"],
        help="Components that we are tracking worker readiness",
    )
    parser.add_argument(
        "--component-endpoints",
        nargs="+",
        default=["generate", "mock"],
        # Previously a copy of the --worker-components help text.
        help=(
            "Endpoint to watch on each tracked component, listed in the same "
            "order as --worker-components"
        ),
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=600,
        help="Timeout (seconds) for waiting for workers to be ready",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=7001,
        help="port for readiness check",
    )
    config = ServiceConfig.get_instance()
    config_args = config.as_args(service_name, prefix=prefix)
    args = parser.parse_args(config_args)

    if len(args.worker_components) != len(args.component_endpoints):
        parser.error(
            "--worker-components and --component-endpoints must have the same number "
            f"of items, but got {args.worker_components} and {args.component_endpoints}"
        )

    return args
# Use dynamo style to have access to clients
@service(
    dynamo={
        "namespace": "dynamo",
    },
    resources={"cpu": "1", "memory": "1Gi"},
    workers=1,
)
class Watcher:
    """Service that reports over HTTP whether enough workers have registered.

    On start it creates one client per configured component/endpoint pair,
    starts a ``HealthServer`` on the configured port, and waits until the
    total number of worker instances reaches ``--total-workers``. The HTTP
    server answers 400 until then and 200 afterwards.
    """

    def __init__(self):
        self.args = parse_args(self.__class__.__name__, "")

    @async_on_start
    async def async_init(self):
        """Start the readiness server and wait for the required workers.

        Raises:
            TimeoutError: Propagated from ``check_required_workers`` when the
                workers do not show up in time. The HTTP server is stopped
                before the error is re-raised.
        """
        self.runtime = dynamo_context["runtime"]
        self.workers_clients = []
        for component, endpoint in zip(
            self.args.worker_components, self.args.component_endpoints
        ):
            self.workers_clients.append(
                await self.runtime.namespace("dynamo")
                .component(component)
                .endpoint(endpoint)
                .client()
            )
            logger.info(f"Component {component}/{endpoint} is registered")
        logger.info(f"Total number of workers to be waited: {self.args.total_workers}")
        logger.info(f"Timeout for waiting for workers to be ready: {self.args.timeout}")

        self.server = HealthServer(("0.0.0.0", self.args.port), RequestHandler)
        print(f"Serving on 0.0.0.0:{self.args.port}, listening to readiness check...")
        # Daemon thread: a serving thread must never be the only thing that
        # keeps the process alive after the watcher itself has failed.
        self._server_thread = threading.Thread(
            target=start_server, args=(self.server,), daemon=True
        )
        self._server_thread.start()

        try:
            await check_required_workers(
                self.workers_clients, self.args.total_workers, self.args.timeout
            )
        except BaseException:
            # Covers timeout and cancellation alike: stop serving and release
            # the port, then let the original error propagate unchanged.
            self.server.shutdown()
            self.server.server_close()
            raise
        self.server.set_ready(True)
        logger.info("All workers are ready.")
async def check_required_workers(
    workers_clients, required_workers: int, timeout: int, poll_interval=1
):
    """Wait until the minimum number of workers are ready.

    The worker count is the sum of ``len(client.instance_ids())`` over all
    ``workers_clients``. The count is polled at least once, even when
    ``timeout`` is zero, so an already-complete deployment is never reported
    as timed out.

    Args:
        workers_clients: Iterable of clients exposing ``instance_ids()``.
        required_workers: Number of worker instances to wait for.
        timeout: Maximum time to wait, in seconds.
        poll_interval: Seconds to sleep between polls.

    Raises:
        TimeoutError: If fewer than ``required_workers`` instances are
            registered once ``timeout`` seconds have elapsed.
    """
    # Monotonic clock: wall-clock adjustments (NTP, manual changes) must not
    # shorten or extend the wait.
    deadline = time.monotonic() + timeout
    while True:
        num_workers = sum(len(client.instance_ids()) for client in workers_clients)
        if num_workers >= required_workers:
            return

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Timed out waiting for {required_workers} workers to be ready."
            )

        logger.info(
            f"Waiting for more workers to be ready.\n"
            f" Current: {num_workers},"
            f" Required: {required_workers}"
        )
        # Never sleep past the deadline.
        await asyncio.sleep(min(poll_interval, remaining))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.prefill_worker import PrefillWorker
from components.processor import Processor
from components.worker import VllmWorker
# Declare the deployment graph by chaining link() calls in the order
# Frontend, Processor, VllmWorker, PrefillWorker. The exact dependency
# semantics are defined by the SDK's link().
Frontend.link(Processor).link(VllmWorker).link(PrefillWorker)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
router: round-robin
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
disable-log-requests: true
Frontend:
# This model was chosen for its 70B size and FP8 precision: the TP and DP
# configurations are tuned for its size, and FP8 reduces model and KV cache
# memory usage, which also eases remote cache transfer.
served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
common-configs: [model, router]
# x1 process with 4 GPUs generating output tokens (the "decode" phase).
VllmWorker:
common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
# Enable prefill at different workers.
remote-prefill: true
# Disable local prefill so only disaggregated prefill is used.
conditional-disagg: false
gpu-memory-utilization: 0.95
tensor-parallel-size: 4
ServiceArgs:
workers: 1
resources:
gpu: 4
# x4 processes each with 1 GPU handling the initial prefill (context embedding) phase.
PrefillWorker:
common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, gpu-memory-utilization, disable-log-requests]
tensor-parallel-size: 1
ServiceArgs:
workers: 4
resources:
gpu: 1
# Automatic prefix caching is disabled by default, since all requests are expected to be unique.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.kv_router import Router
from components.processor import Processor
from components.worker import VllmWorker
# Declare the deployment graph by chaining link() calls in the order
# Frontend, Processor, Router, VllmWorker. The exact dependency semantics
# are defined by the SDK's link().
Frontend.link(Processor).link(Router).link(VllmWorker)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
# Routing policy determines how remote workers are selected for processing
# prefill requests
# 1. random: randomly select workers for prefill requests
# 2. round-robin: different prefill requests take similar time to complete so
# selecting workers in round-robin maximizes the chance of
# selecting the least busy worker for a request
# 3. kv: finding prefill workers by KV cache is not beneficial when caching is
# disabled on this setup
router: round-robin
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
disable-log-requests: true
Frontend:
served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
common-configs: [model, block-size, max-model-len, router]
Router:
common-configs: [model]
min-workers: 1
VllmWorker:
common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
# Enable prefill at different workers.
remote-prefill: true
# Disable local prefill so only disaggregated prefill is used.
conditional-disagg: false
# The GPU memory utilization does not have to match between VllmWorker and PrefillWorker.
gpu-memory-utilization: 0.95
# TP size is doubled from single node setup
tensor-parallel-size: 8
ServiceArgs:
workers: 1
resources:
gpu: 8
PrefillWorker:
common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, disable-log-requests]
gpu-memory-utilization: 0.95
tensor-parallel-size: 1
ServiceArgs:
# DP size is doubled from single node setup
workers: 8
resources:
gpu: 1
# Automatic prefix caching is disabled by default, since all requests are expected to be unique.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: the command printed at the end uses `source`, so this script runs in
# the caller's shell. It must therefore not call `exit` or enable `set -e`.

# Start NATS and etcd on this node when no head node is given; otherwise point
# the NATS_SERVER / ETCD_ENDPOINTS variables at the existing head node.
if [[ -z "${HEAD_NODE_IP}" ]]; then
    nats-server -js &
    etcd --advertise-client-urls http://0.0.0.0:2379 --listen-client-urls http://0.0.0.0:2379 &
    HEAD_NODE_IP=$(hostname -i)
else
    export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
    export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
fi

# Start a Ray head on this node when no leader is given; otherwise join the
# cluster at RAY_LEADER_NODE_IP.
if [[ -z "${RAY_LEADER_NODE_IP}" ]]; then
    ray start --head --port=6379 --disable-usage-stats
    RAY_LEADER_NODE_IP=$(hostname -i)
else
    ray start --address="${RAY_LEADER_NODE_IP}:6379"
fi

# Print the command to run on another node so that it connects to this one.
# (The original expansion `${RAY_LEADER_NODE_IP=}` contained a stray `=`.)
echo "HEAD_NODE_IP=${HEAD_NODE_IP} RAY_LEADER_NODE_IP=${RAY_LEADER_NODE_IP} source ${BASH_SOURCE[0]}"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from dynamo.runtime import EtcdKvCache
from dynamo.sdk import dynamo_context
logger = logging.getLogger(__name__)
class PyDisaggregatedRouter:
    """Decide per request whether prefill should run remotely.

    The two thresholds are read from an etcd-backed cache on every decision;
    the cache is created by ``async_init`` and must exist before
    ``prefill_remote`` is called.
    """

    def __init__(
        self,
        runtime,
        namespace,
        max_local_prefill_length=1000,
        max_prefill_queue_size=2,
    ):
        self.runtime = runtime
        self.namespace = namespace
        self.max_local_prefill_length = max_local_prefill_length
        self.max_prefill_queue_size = max_prefill_queue_size

    async def async_init(self):
        """Create the etcd cache under ``/<namespace>/disagg_router/``.

        The constructor thresholds are passed along as strings, presumably as
        the cache's initial values (confirm against ``EtcdKvCache.create``).
        """
        etcd_client = dynamo_context["runtime"].etcd_client()
        key_prefix = f"/{self.namespace}/disagg_router/"
        thresholds = {
            "max_local_prefill_length": str(self.max_local_prefill_length),
            "max_prefill_queue_size": str(self.max_prefill_queue_size),
        }
        self.etcd_kv_cache = await EtcdKvCache.create(
            etcd_client, key_prefix, thresholds
        )

    async def prefill_remote(
        self, prompt_length: int, prefix_hit_rate: float, queue_size: int
    ):
        """Return True when the request should be prefilled remotely.

        Remote prefill is chosen when the part of the prompt not covered by
        the prefix cache exceeds ``max_local_prefill_length`` and the prefill
        queue holds fewer than ``max_prefill_queue_size`` entries.
        """
        length_limit = int(await self.etcd_kv_cache.get("max_local_prefill_length"))
        queue_limit = int(await self.etcd_kv_cache.get("max_prefill_queue_size"))

        uncached_tokens = int(prompt_length * (1 - prefix_hit_rate))

        # TODO: consider size of each request in the queue when making the decision
        too_long_for_local = uncached_tokens > length_limit
        queue_has_room = queue_size < queue_limit
        decision = too_long_for_local and queue_has_room

        logger.info(
            f"Remote prefill: {decision} (prefill length: {uncached_tokens}/{length_limit}, prefill queue size: {queue_size}/{queue_limit})"
        )
        return decision
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import subprocess
from pathlib import Path
from components.planner_service import Planner
from components.processor import Processor
from components.worker import VllmWorker
from pydantic import BaseModel
from dynamo import sdk
from dynamo.sdk import api, depends, on_shutdown, service
from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.image import DYNAMO_IMAGE
logger = logging.getLogger(__name__)
# TODO: temp workaround to avoid port conflict with subprocess HTTP server; remove this once ingress is fixed
# NOTE: this runs at import time and overwrites any DYNAMO_PORT value that is
# already present in the process environment.
os.environ["DYNAMO_PORT"] = "3999"
def get_http_binary_path():
    """Return the path of the SDK's bundled ``http`` binary.

    The binary is looked up at ``cli/bin/http`` next to the ``dynamo.sdk``
    package file. When it does not exist, the bare command name ``"http"`` is
    returned instead.
    """
    bundled_binary = Path(sdk.__file__).parent / "cli/bin/http"
    return str(bundled_binary) if bundled_binary.exists() else "http"
class FrontendConfig(BaseModel):
    """Configuration for the Frontend service including model and HTTP server settings."""

    # Model name passed to `llmctl http add/remove chat-models`.
    served_model_name: str
    # Endpoint string registered for the model by `llmctl http add chat-models`.
    endpoint: str
    # Port given to the HTTP server binary through its `-p` option.
    port: int = 8080
def _run_llmctl_chat_models(action, model_name, *extra):
    """Run ``llmctl http <action> chat-models <model_name> [extra...]``.

    Returns the ``subprocess.CompletedProcess``; a non-zero exit status does
    not raise, so the caller decides how to treat it.
    """
    command = ["llmctl", "http", action, "chat-models", model_name, *extra]
    return subprocess.run(command, check=False)


# todo this should be called ApiServer
@service(
    dynamo={
        "namespace": "dynamo",
    },
    resources={"cpu": "10", "memory": "20Gi"},
    workers=1,
    image=DYNAMO_IMAGE,
)
class Frontend:
    """Service that registers the served model with llmctl and runs the HTTP server.

    Construction has side effects: it invokes ``llmctl`` twice and spawns the
    HTTP server binary as a child process.
    """

    planner = depends(Planner)
    worker = depends(VllmWorker)
    processor = depends(Processor)

    def __init__(self):
        """Initialize Frontend service with HTTP server and model configuration."""
        frontend_config = FrontendConfig(**ServiceConfig.get_parsed_config("Frontend"))
        self.frontend_config = frontend_config
        self.process = None

        self.setup_model()
        self.start_http_server()

    def setup_model(self):
        """Configure the model for HTTP service using llmctl.

        Any previous registration of the model name is removed first, then
        the model is added with the configured endpoint.
        """
        model_name = self.frontend_config.served_model_name

        # Best effort: the exit status of the removal is deliberately ignored.
        _run_llmctl_chat_models("remove", model_name)

        result = _run_llmctl_chat_models(
            "add", model_name, self.frontend_config.endpoint
        )
        if result.returncode != 0:
            # Startup is not aborted (same as before), but a failed
            # registration is no longer silent.
            logger.error(
                f"llmctl failed to add chat model {model_name} "
                f"(exit code {result.returncode})"
            )

    def start_http_server(self):
        """Start the HTTP server on the configured port."""
        logger.info("Starting HTTP server")
        http_binary = get_http_binary_path()

        # stdout/stderr are left as None so the child inherits this process's
        # streams.
        self.process = subprocess.Popen(
            [http_binary, "-p", str(self.frontend_config.port)],
            stdout=None,
            stderr=None,
        )

    @api()
    def dummy_api(self) -> None:
        """
        Dummy API to enable the HTTP server for the Dynamo operator.
        This API is not used by the model.
        NOTE: this is a temporary solution to expose ingress
        for the LLM examples. Will be fixed and removed in the future.
        The resulting api_endpoints in dynamo.yaml will be incorrect.
        """

    @on_shutdown
    def cleanup(self):
        """Clean up resources before shutdown."""
        # circusd manages shutdown of http server process, we just need to remove the model using the on_shutdown hook
        _run_llmctl_chat_models("remove", self.frontend_config.served_model_name)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import random
from argparse import Namespace
from typing import AsyncIterator, Tuple
import numpy as np # Add numpy import
from components.worker import VllmWorker
from utils.check_worker import check_required_workers
from utils.protocol import LocalBlockHashes
from utils.vllm import RouterType
from dynamo.llm import (
AggregatedMetrics,
ApproxKvIndexer,
KvIndexer,
KvMetricsAggregator,
OverlapScores,
)
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
from dynamo.sdk.lib.config import ServiceConfig
WorkerId = str
fallback_msg = "Will fallback to random routing."
logger = logging.getLogger(__name__)
def softmax_sample_from_logits(
    logits: dict[str, float], temperature: float = 1.0, lower_is_better: bool = True
) -> str:
    """Sample one key of ``logits`` with softmax probabilities.

    Values are divided by their range (max - min), negated when
    ``lower_is_better`` is set, divided by ``temperature`` and passed through
    a softmax. When all values are equal, every key is equally likely.

    Args:
        logits: Mapping from key to score. Must not be empty.
        temperature: Softmax temperature; smaller values concentrate the
            probability on the best key. Must be non-zero.
        lower_is_better: When True, smaller scores get higher probability.

    Returns:
        The selected key, as the very object stored in ``logits``. Earlier
        versions returned a NumPy scalar such as ``np.int64`` or ``np.str_``.

    Raises:
        ValueError: If ``logits`` is empty.
    """
    if not logits:
        raise ValueError("Empty logits dictionary")

    keys = list(logits)
    values = np.array(list(logits.values()))

    min_val = np.min(values)
    max_val = np.max(values)

    if min_val == max_val:
        # All values are the same, uniform probability
        probabilities = np.ones(len(keys)) / len(keys)
    else:
        normalized = values / (max_val - min_val)
        if lower_is_better:
            normalized = -1 * normalized

        scaled = normalized / temperature

        # Subtract the maximum before exponentiating for numerical stability.
        exp_values = np.exp(scaled - np.max(scaled))
        probabilities = exp_values / np.sum(exp_values)

    # Sample an index rather than a key: np.random.choice(keys, ...) would
    # first convert the keys to a NumPy array and return a NumPy scalar.
    index = np.random.choice(len(keys), p=probabilities)
    return keys[index]
def _str2bool(value) -> bool:
    """Convert a command-line value to a bool.

    ``type=bool`` treats every non-empty string as True, so ``false`` would
    enable the option. Here false-like words and the empty string map to
    False; any other string maps to True, as it did before.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() not in {"", "0", "false", "f", "no", "n", "off"}


def parse_args(service_name, prefix) -> Namespace:
    """Parse the router's options from the Dynamo service configuration.

    The argument list is not read from ``sys.argv``; it is produced by
    ``ServiceConfig.get_instance().as_args(service_name, prefix=prefix)``.

    Args:
        service_name: Name of the service whose configuration is read.
        prefix: Prefix forwarded to ``ServiceConfig.as_args``.

    Returns:
        The parsed namespace with ``model``, ``min_workers``, ``block_size``,
        ``custom_router``, ``router`` and ``softmax_sample``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        help="Model that is being served",
    )
    parser.add_argument(
        "--min-workers",
        type=int,
        default=1,
        help="Minimum number of workers required before proceeding",
    )
    # TODO: Read block size
    parser.add_argument(
        "--block-size",
        type=int,
        default=64,
        help="KV block size",
    )
    parser.add_argument(
        "--custom-router",
        # Not type=bool: bool("false") is True.
        type=_str2bool,
        default=False,
        help="Whether to use custom router or not",
    )
    parser.add_argument(
        "--router",
        type=str,
        default="kv",
        help="The router type",
    )
    parser.add_argument(
        "--softmax-sample",
        action="store_true",
        help="Whether to do softmax sampling based on worker logits (default is to pick smallest)",
    )
    config = ServiceConfig.get_instance()
    config_args = config.as_args(service_name, prefix=prefix)
    args = parser.parse_args(config_args)
    return args
@service(
dynamo={
"namespace": "dynamo",
},
resources={"cpu": "10", "memory": "20Gi"},
workers=1,
)
class Router:
"""
Request handler for the generate endpoint
"""
worker = depends(VllmWorker)
def __init__(self):
    """Parse the router options and define the per-worker metric defaults."""
    logger.info("Initializing Custom Router")
    self.args = parse_args(type(self).__name__, "")

    # Values used for a worker, or for a single metric, that the polled
    # metrics do not provide.
    self.default_metrics = dict(
        kv_active_blocks=0,
        kv_total_blocks=1,
        num_requests_waiting=0.0,
        gpu_cache_usage_perc=0.0,
        gpu_prefix_cache_hit_rate=0.0,
    )
@async_on_start
async def async_init(self):
    """Connect to the VllmWorker component and build the routing helpers.

    Creates the ``generate`` endpoint client, waits for ``min_workers``
    workers, creates the indexer for the ``kv`` / approximate-kv router
    types, the metrics aggregator, and one ``[old_value, predictive_value]``
    entry per currently known worker.
    """
    self.runtime = dynamo_context["runtime"]

    generate_endpoint = (
        self.runtime.namespace("dynamo").component("VllmWorker").endpoint("generate")
    )
    self.workers_client = await generate_endpoint.client()

    self.router_type = self.args.router

    await check_required_workers(self.workers_client, self.args.min_workers)

    kv_listener = self.runtime.namespace("dynamo").component("VllmWorker")
    await kv_listener.create_service()

    block_size = self.args.block_size
    if self.router_type == RouterType.KV:
        self.indexer = KvIndexer(kv_listener, block_size)
    elif self.router_type == RouterType.APPROX_KV:
        # For now, hardcode the TTL to 2 minutes.
        self.indexer = ApproxKvIndexer(kv_listener, block_size, 120.0)
    self.metrics_aggregator = KvMetricsAggregator(kv_listener)

    self.active_blocks_dict = {}
    # [old_value, predictive_value] for every worker known at start-up.
    self.active_blocks_dict.update(
        (worker_id, [0, 0]) for worker_id in self.workers_client.instance_ids()
    )
    logger.info("KV Router initialized")
def _update_and_get_active_blocks(self, worker_id: str, polled_value: int) -> int:
"""Helper routine to update waiting dict and return the desired waiting value.
This method implements a predictive mechanism for tracking waiting requests:
- If a new polled value is detected (different from the stored old value),
it updates both the old and predictive values to this new measurement and returns it
- If no change is detected (polled value equals old value), it returns the
predictive value which has been incremented based on previous routing decisions
This allows the router to account for requests that have been dispatched but
not yet reflected in the polled metrics.
"""
# Initialize if worker_id is not present
if worker_id not in self.active_blocks_dict:
logger.warning(f"New Worker added: {worker_id}")
self.active_blocks_dict[worker_id] = [polled_value, polled_value]
return polled_value
old_value, predictive_value = self.active_blocks_dict[worker_id]
# Check if polled value is different from old value
if polled_value != old_value:
self.active_blocks_dict[worker_id] = [polled_value, polled_value]
return polled_value
else:
return predictive_value
def _cost_function(
self,
scores: OverlapScores | None,
metrics: AggregatedMetrics | None,
token_length: int,
):
"""The cost function for deciding the best worker to route a request to.
If there are multiple workers sharing the same optimal cost, then
one of them is randomly selected.
Args:
scores (OverlapScores | None): The number of matching blocks between
the request and the prefix cache of each worker.
metrics (AggregatedMetrics | None): Several worker metrics polled
by the `KvMetricsAggregator`, currently including the
GPU cache usage, number of waiting requests, and the
GPU prefix cache hit rate.
token_length (int): The number of tokens in the request.
Returns:
(str, float): The best worker id and the corresponding score.
"""
# Get all worker IDs from the client. This is needed because scores / metrics may not have values for all workers
# and we want all workers to be considered in the logit calculation
worker_ids = self.workers_client.instance_ids()
request_blocks = (
token_length + self.args.block_size - 1
) // self.args.block_size
overlap_blocks_dict = {worker_id: 0 for worker_id in worker_ids}
new_blocks_dict = {worker_id: request_blocks for worker_id in worker_ids}
if scores:
for worker_id, score in scores.scores.items():
# score is number of matching blocks we multiply by block_size to get tokens
# and compare to token_length. The larger the cache hit the better
overlap_blocks_dict[worker_id] = score
new_blocks_dict[worker_id] = request_blocks - score
else:
logger.warning("Cannot get KV scores")
worker_metrics = {}
if metrics:
for endpoint in metrics.endpoints:
worker_id = endpoint.worker_id
worker_metrics[worker_id] = {
key: getattr(endpoint, key, self.default_metrics[key])
for key in self.default_metrics.keys()
}
# Update waiting value using helper routine
polled_active_blocks = int(
worker_metrics[worker_id]["kv_active_blocks"]
)
worker_metrics[worker_id][
"kv_active_blocks"
] = self._update_and_get_active_blocks(worker_id, polled_active_blocks)
else:
logger.warning("Cannot get metrics")
worker_logits = {}
for worker_id in worker_ids:
# Use default values if worker not in scores or metrics
metrics_dict = worker_metrics.get(worker_id, self.default_metrics)
kv_total_blocks = metrics_dict["kv_total_blocks"]
new_blocks = new_blocks_dict[worker_id]
normalized_new_blocks = new_blocks / kv_total_blocks
gpu_cache_usage = metrics_dict["kv_active_blocks"] / kv_total_blocks
# Use raw waiting value without normalization
num_requests_waiting = metrics_dict["num_requests_waiting"]
# Have 1 metric that weights towards cache hit
# 2 metrics that penalize overloaded worker and queuing
worker_logits[worker_id] = (
normalized_new_blocks + gpu_cache_usage + num_requests_waiting
)
logger.info(
f"Formula for {worker_id}: {worker_logits[worker_id]:.3f} = {normalized_new_blocks:.3f} + {gpu_cache_usage:.3f} + {num_requests_waiting:.3f}"
)
if not worker_logits or not any(worker_logits.values()):
logger.warning(f"All worker logits are zero. {fallback_msg}.")
return "", 0.0
# Select the worker with the highest logit
if self.args.softmax_sample:
best_worker_id = int(softmax_sample_from_logits(worker_logits))
else:
min_logit = min(worker_logits.values())
best_workers = [
wid for wid, logit in worker_logits.items() if logit == min_logit
]
best_worker_id = random.choice(best_workers)
# Log the metrics for the selected worker
if best_worker_id:
metrics_dict = worker_metrics.get(best_worker_id, self.default_metrics)
# Create log messages
log_messages = [
f"Selected worker: {best_worker_id}, logit: {worker_logits[best_worker_id]:.3f}",
f"Score: {scores.scores.get(best_worker_id, 0.0) if scores else 0.0:.3f}",
f"GPU Cache Hit Rate: {metrics_dict['gpu_prefix_cache_hit_rate']:.3f}",
f"GPU Cache Usage: {metrics_dict['kv_active_blocks'] / metrics_dict['kv_total_blocks']:.3f}",
f"Requests Waiting: {metrics_dict['num_requests_waiting']}",
]
# Log to vllm_logger
for message in log_messages:
logger.info(message)
# Increment predictive waiting for the selected worker before returning
self.active_blocks_dict[best_worker_id][1] += new_blocks_dict[
best_worker_id
]
return (
best_worker_id,
overlap_blocks_dict[best_worker_id] * self.args.block_size / token_length,
)
def _get_underloaded_worker(self, metrics: AggregatedMetrics | None):
    """Pick, at random, one of the workers reporting the lowest KV cache usage.

    Args:
        metrics: Aggregated endpoint metrics, or ``None`` when unavailable.

    Returns:
        A ``(worker_id, kv_load)`` tuple for the chosen worker. When
        ``metrics`` is falsy, or when no endpoint reports a non-zero load,
        a warning is logged and ``("", 0.0)`` is returned instead.
    """
    if not metrics:
        logger.warning(f"Cannot get metrics. {fallback_msg}")
        return "", 0.0

    # Map each worker to its reported cache usage; endpoints that do not
    # expose the attribute are counted as 0.0.
    load_by_worker = {}
    for ep in metrics.endpoints:
        wid = ep.worker_id
        load_by_worker[wid] = getattr(ep, "gpu_cache_usage_perc", 0.0)

    # An empty map and an all-zero map are both treated as "no signal".
    if not (load_by_worker and any(load_by_worker.values())):
        logger.warning(f"All KV loads are zero. {fallback_msg}")
        return "", 0.0

    lowest = min(load_by_worker.values())
    candidates = []
    for wid, load in load_by_worker.items():
        if load == lowest:
            candidates.append(wid)

    # Ties on the minimum load are broken randomly.
    chosen = random.choice(candidates)
    chosen_load = load_by_worker[chosen]
    logger.info(f"Selected worker: {chosen}, KV load: {chosen_load:.3f}")
    return chosen, chosen_load
@endpoint()
async def generate(
    self, request: LocalBlockHashes
) -> AsyncIterator[Tuple[WorkerId, float]]:
    """Yield exactly one routing decision for ``request``.

    The yielded value is a ``(worker_id, estimate)`` tuple. In ``KV_LOAD``
    mode ``estimate`` is whatever ``_get_underloaded_worker`` returns as its
    second element; otherwise it is the prefix hit rate estimated by
    ``_cost_function``. An empty ``worker_id`` (``""``) with ``0.0`` is
    yielded whenever no decision could be made (selection error, indexer
    error, or no workers available for the APPROX_KV fallback).

    Args:
        request: Carries ``tokens``, ``hashes`` and ``num_tokens`` for the
            incoming request.
    """
    metrics = await self.metrics_aggregator.get_metrics()

    # Quick return for KV_LOAD mode
    if self.router_type == RouterType.KV_LOAD:
        # Compute the decision inside the try but yield outside of it, so an
        # exception raised into the generator at the yield point is not
        # mistaken for a selection failure and answered with a second yield.
        try:
            decision = self._get_underloaded_worker(metrics)
        except Exception as e:
            logger.exception(
                f"Error finding underloaded worker: {e}. {fallback_msg}"
            )
            decision = "", 0.0
        yield decision
        return

    # Existing KV routing logic
    try:
        if self.router_type == RouterType.APPROX_KV:
            scores = await self.indexer.find_matches_for_request(request.tokens)
        else:
            scores = await self.indexer.find_matches(request.hashes)
    except Exception as e:
        logger.exception(f"Error finding matches: {e}. {fallback_msg}")
        yield "", 0.0
        return

    worker_id, prefix_hit_rate = self._cost_function(
        scores, metrics, request.num_tokens
    )

    if self.router_type == RouterType.APPROX_KV:
        # For the approx kv router, we need to know what worker we route to.
        # We can't defer to the engine client to select a random worker.
        # Because of this, we need to select a worker here.
        if not worker_id:
            all_workers = self.workers_client.instance_ids()
            if not all_workers:
                # random.choice raises IndexError on an empty sequence;
                # report the same "no decision" result as the other
                # failure paths instead of crashing the generator.
                logger.warning(f"No workers available. {fallback_msg}")
                yield "", 0.0
                return
            worker_id = random.choice(all_workers)
        await self.log_router_decision(request.tokens, worker_id)

    if worker_id:
        logger.info(
            f"Scheduling to worker_id: {worker_id} with estimated prefix hit rate: {prefix_hit_rate}"
        )
    yield worker_id, prefix_hit_rate
async def log_router_decision(self, tokens: list[int], worker_id: str):
    """Report a routing decision to the indexer when in APPROX_KV mode.

    For any other router type this is a no-op. Exceptions raised by the
    indexer call are logged (with traceback) and not re-raised.

    Args:
        tokens: Token ids of the request that was routed.
        worker_id: Identifier of the worker the request was routed to.
    """
    uses_approx_kv = self.router_type == RouterType.APPROX_KV
    if not uses_approx_kv:
        return

    try:
        await self.indexer.process_routing_decision_for_request(tokens, worker_id)
    except Exception as err:
        logger.exception(f"Error processing routing decision: {err}. {fallback_msg}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment