Unverified commit 8bd37c96, authored by Anant Sharma, committed by GitHub
Browse files

refactor: move backend deploy, launch and slurm files from components to examples (#3849)


Signed-off-by: Anant Sharma <anants@nvidia.com>
parent 78359046
...@@ -28,7 +28,7 @@ vllm: &vllm ...@@ -28,7 +28,7 @@ vllm: &vllm
- 'container/Dockerfile.vllm' - 'container/Dockerfile.vllm'
- 'container/deps/requirements.vllm.txt' - 'container/deps/requirements.vllm.txt'
- 'container/deps/vllm/**' - 'container/deps/vllm/**'
- 'components/backends/vllm/**' - 'examples/backends/vllm/**'
- 'components/src/dynamo/vllm/**' - 'components/src/dynamo/vllm/**'
- 'container/build.sh' - 'container/build.sh'
- 'tests/serve/test_vllm.py' - 'tests/serve/test_vllm.py'
...@@ -36,14 +36,14 @@ vllm: &vllm ...@@ -36,14 +36,14 @@ vllm: &vllm
sglang: &sglang sglang: &sglang
- 'container/Dockerfile.sglang' - 'container/Dockerfile.sglang'
- 'container/Dockerfile.sglang-wideep' - 'container/Dockerfile.sglang-wideep'
- 'components/backends/sglang/**' - 'examples/backends/sglang/**'
- 'components/src/dynamo/sglang/**' - 'components/src/dynamo/sglang/**'
- 'container/build.sh' - 'container/build.sh'
- 'tests/serve/test_sglang.py' - 'tests/serve/test_sglang.py'
trtllm: &trtllm trtllm: &trtllm
- 'container/Dockerfile.trtllm' - 'container/Dockerfile.trtllm'
- 'components/backends/trtllm/**' - 'examples/backends/trtllm/**'
- 'components/src/dynamo/trtllm/**' - 'components/src/dynamo/trtllm/**'
- 'container/build.sh' - 'container/build.sh'
- 'container/build_trtllm_wheel.sh' - 'container/build_trtllm_wheel.sh'
......
...@@ -429,7 +429,7 @@ jobs: ...@@ -429,7 +429,7 @@ jobs:
export KUBECONFIG=$(pwd)/.kubeconfig export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE kubectl config set-context --current --namespace=$NAMESPACE
cd components/backends/$FRAMEWORK cd examples/backends/$FRAMEWORK
export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64" export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
export KUBE_NS=$NAMESPACE export KUBE_NS=$NAMESPACE
export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE) export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE)
......
...@@ -171,7 +171,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res ...@@ -171,7 +171,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res
### Deploying Dynamo ### Deploying Dynamo
- Follow the [Quickstart Guide](docs/kubernetes/README.md) to deploy on Kubernetes. - Follow the [Quickstart Guide](docs/kubernetes/README.md) to deploy on Kubernetes.
- Check out [Backends](components/backends) to deploy various workflow configurations (e.g. SGLang with router, vLLM with disaggregated serving, etc.) - Check out [Backends](examples/backends) to deploy various workflow configurations (e.g. SGLang with router, vLLM with disaggregated serving, etc.)
- Run some [Examples](examples) to learn about building components in Dynamo and exploring various integrations. - Run some [Examples](examples) to learn about building components in Dynamo and exploring various integrations.
### Benchmarking Dynamo ### Benchmarking Dynamo
......
...@@ -20,7 +20,7 @@ This directory contains benchmarking scripts and tools for performance evaluatio ...@@ -20,7 +20,7 @@ This directory contains benchmarking scripts and tools for performance evaluatio
## Quick Start ## Quick Start
### Benchmark a Dynamo Deployment ### Benchmark a Dynamo Deployment
First, deploy your DynamoGraphDeployment using the [deployment documentation](../components/backends/), then: First, deploy your DynamoGraphDeployment using the [deployment documentation](../docs/kubernetes/), then:
```bash ```bash
# Port-forward your deployment to http://localhost:8000 # Port-forward your deployment to http://localhost:8000
......
...@@ -36,7 +36,7 @@ console_handler.setFormatter(formatter) ...@@ -36,7 +36,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
DEFAULT_SGLANG_CONFIG_PATH = "components/backends/sglang/deploy/disagg.yaml" DEFAULT_SGLANG_CONFIG_PATH = "examples/backends/sglang/deploy/disagg.yaml"
class SGLangConfigModifier: class SGLangConfigModifier:
......
...@@ -38,7 +38,7 @@ console_handler.setFormatter(formatter) ...@@ -38,7 +38,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
DEFAULT_TRTLLM_CONFIG_PATH = "components/backends/trtllm/deploy/disagg.yaml" DEFAULT_TRTLLM_CONFIG_PATH = "examples/backends/trtllm/deploy/disagg.yaml"
class TrtllmConfigModifier: class TrtllmConfigModifier:
......
...@@ -34,7 +34,7 @@ console_handler.setFormatter(formatter) ...@@ -34,7 +34,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
DEFAULT_VLLM_CONFIG_PATH = "components/backends/vllm/deploy/disagg.yaml" DEFAULT_VLLM_CONFIG_PATH = "examples/backends/vllm/deploy/disagg.yaml"
class VllmV1ConfigModifier: class VllmV1ConfigModifier:
......
...@@ -19,25 +19,17 @@ limitations under the License. ...@@ -19,25 +19,17 @@ limitations under the License.
This directory contains the core components that make up the Dynamo inference framework. Each component serves a specific role in the distributed LLM serving architecture, enabling high-throughput, low-latency inference across multiple nodes and GPUs. This directory contains the core components that make up the Dynamo inference framework. Each component serves a specific role in the distributed LLM serving architecture, enabling high-throughput, low-latency inference across multiple nodes and GPUs.
## Supported Inference Engines
Dynamo supports multiple inference engines (with a focus on SGLang, vLLM, and TensorRT-LLM), each with their own deployment configurations and capabilities:
- **[vLLM](/docs/backends/vllm/README.md)** - High-performance LLM inference with native KV cache events and NIXL-based transfer mechanisms
- **[SGLang](/docs/backends/sglang/README.md)** - Structured generation language framework with ZMQ-based communication
- **[TensorRT-LLM](/docs/backends/trtllm/README.md)** - NVIDIA's optimized LLM inference engine with TensorRT acceleration
Each engine provides launch scripts for different deployment patterns in their respective `/launch` & `/deploy` directories.
## Core Components ## Core Components
### [Backends](backends/) ### Backends
Dynamo supports multiple inference engines, each with their own deployment configurations and capabilities:
The backends directory contains inference engine integrations and implementations, with a key focus on: - **[vLLM](/docs/backends/vllm/README.md)** - Full-featured vLLM integration with disaggregated serving, KV-aware routing, SLA-based planning, native KV cache events, and NIXL-based transfer mechanisms
- **[SGLang](/docs/backends/sglang/README.md)** - SGLang engine integration with ZMQ-based communication, supporting disaggregated serving and KV-aware routing
- **[TensorRT-LLM](/docs/backends/trtllm/README.md)** - TensorRT-LLM integration with disaggregated serving capabilities and TensorRT acceleration
- **vLLM** - Full-featured vLLM integration with disaggregated serving, KV-aware routing, and SLA-based planning Each engine provides launch and deploy scripts for different deployment patterns in the [examples](../examples/backends/) folder.
- **SGLang** - SGLang engine integration supporting disaggregated serving and KV-aware routing
- **TensorRT-LLM** - TensorRT-LLM integration with disaggregated serving capabilities
### [Frontend](src/dynamo/frontend/) ### [Frontend](src/dynamo/frontend/)
......
...@@ -47,7 +47,7 @@ Clients query the `find_best_worker` endpoint to determine which worker should p ...@@ -47,7 +47,7 @@ Clients query the `find_best_worker` endpoint to determine which worker should p
> >
> Use this manual setup if you need explicit control over prefill routing configuration or want to manage prefill and decode routers separately. > Use this manual setup if you need explicit control over prefill routing configuration or want to manage prefill and decode routers separately.
See [`components/backends/vllm/launch/disagg_router.sh`](/components/backends/vllm/launch/disagg_router.sh) for a complete example. See [`examples/backends/vllm/launch/disagg_router.sh`](/examples/backends/vllm/launch/disagg_router.sh) for a complete example.
```bash ```bash
# Start frontend router for decode workers # Start frontend router for decode workers
......
...@@ -87,4 +87,4 @@ ENV PATH=/usr/local/bin/etcd:$PATH ...@@ -87,4 +87,4 @@ ENV PATH=/usr/local/bin/etcd:$PATH
# Enable forceful shutdown of inflight requests # Enable forceful shutdown of inflight requests
ENV SGL_FORCE_SHUTDOWN=1 ENV SGL_FORCE_SHUTDOWN=1
WORKDIR /sgl-workspace/dynamo/components/backends/sglang WORKDIR /sgl-workspace/dynamo/examples/backends/sglang
...@@ -33,7 +33,7 @@ This approach allows you to install Dynamo directly using a DynamoGraphDeploymen ...@@ -33,7 +33,7 @@ This approach allows you to install Dynamo directly using a DynamoGraphDeploymen
Here is how you would install a vLLM inference backend example. Here is how you would install a vLLM inference backend example.
```bash ```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./components/backends/vllm/deploy/agg.yaml helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./examples/backends/vllm/deploy/agg.yaml
``` ```
### Installation using Grove ### Installation using Grove
...@@ -41,7 +41,7 @@ helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./com ...@@ -41,7 +41,7 @@ helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./com
Same example as above, but using Grove PodCliqueSet resources. Same example as above, but using Grove PodCliqueSet resources.
```bash ```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./components/backends/vllm/deploy/agg.yaml --set deploymentType=grove helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./examples/backends/vllm/deploy/agg.yaml --set deploymentType=grove
``` ```
### Customizable Properties ### Customizable Properties
...@@ -50,7 +50,7 @@ You can override the default configuration by setting the following properties: ...@@ -50,7 +50,7 @@ You can override the default configuration by setting the following properties:
```bash ```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud \ helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud \
-f ./components/backends/vllm/deploy/agg.yaml \ -f ./examples/backends/vllm/deploy/agg.yaml \
--set "imagePullSecrets[0].name=docker-secret-1" \ --set "imagePullSecrets[0].name=docker-secret-1" \
--set etcdAddr="my-etcd-service:2379" \ --set etcdAddr="my-etcd-service:2379" \
--set natsAddr="nats://my-nats-service:4222" --set natsAddr="nats://my-nats-service:4222"
......
...@@ -66,12 +66,12 @@ kubectl get gateway inference-gateway -n my-model ...@@ -66,12 +66,12 @@ kubectl get gateway inference-gateway -n my-model
### 3. Deploy Your Model ### ### 3. Deploy Your Model ###
Follow the steps in [model deployment](../../components/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../components/backends/vllm/deploy/agg.yaml) in `my-model` Kubernetes namespace. Follow the steps in [model deployment](../../examples/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml) in `my-model` Kubernetes namespace.
Sample commands to deploy model: Sample commands to deploy model:
```bash ```bash
cd <dynamo-source-root>/components/backends/vllm/deploy cd <dynamo-source-root>/examples/backends/vllm/deploy
kubectl apply -f agg.yaml -n my-model kubectl apply -f agg.yaml -n my-model
``` ```
...@@ -97,7 +97,7 @@ kubectl create secret generic hf-token-secret \ ...@@ -97,7 +97,7 @@ kubectl create secret generic hf-token-secret \
``` ```
Create a model configuration file similar to the vllm_agg_qwen.yaml for your model. Create a model configuration file similar to the vllm_agg_qwen.yaml for your model.
This file demonstrates the values needed for the vLLM Agg setup in [agg.yaml](../../components/backends/vllm/deploy/agg.yaml) This file demonstrates the values needed for the vLLM Agg setup in [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml)
Take a note of the model's block size provided in the model card. Take a note of the model's block size provided in the model card.
### 4. Install Dynamo GAIE helm chart ### ### 4. Install Dynamo GAIE helm chart ###
......
...@@ -91,7 +91,7 @@ Run the vLLM disaggregated script with tracing enabled: ...@@ -91,7 +91,7 @@ Run the vLLM disaggregated script with tracing enabled:
```bash ```bash
# Navigate to vLLM launch directory # Navigate to vLLM launch directory
cd components/backends/vllm/launch cd examples/backends/vllm/launch
# Run disaggregated deployment (modify the script to export env vars first) # Run disaggregated deployment (modify the script to export env vars first)
./disagg.sh ./disagg.sh
...@@ -179,7 +179,7 @@ For Kubernetes deployments, ensure you have a Tempo instance deployed and access ...@@ -179,7 +179,7 @@ For Kubernetes deployments, ensure you have a Tempo instance deployed and access
### Modify DynamoGraphDeployment for Tracing ### Modify DynamoGraphDeployment for Tracing
Add common tracing environment variables at the top level and service-specific names in each component in your `DynamoGraphDeployment` (e.g., `components/backends/vllm/deploy/disagg.yaml`): Add common tracing environment variables at the top level and service-specific names in each component in your `DynamoGraphDeployment` (e.g., `examples/backends/vllm/deploy/disagg.yaml`):
```yaml ```yaml
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
...@@ -228,7 +228,7 @@ spec: ...@@ -228,7 +228,7 @@ spec:
Apply the updated DynamoGraphDeployment: Apply the updated DynamoGraphDeployment:
```bash ```bash
kubectl apply -f components/backends/vllm/deploy/disagg.yaml kubectl apply -f examples/backends/vllm/deploy/disagg.yaml
``` ```
Traces will now be exported to Tempo and can be viewed in Grafana. Traces will now be exported to Tempo and can be viewed in Grafana.
......
...@@ -182,14 +182,14 @@ docker compose -f deploy/docker-compose.yml up -d ...@@ -182,14 +182,14 @@ docker compose -f deploy/docker-compose.yml up -d
### Aggregated Serving ### Aggregated Serving
```bash ```bash
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/agg.sh ./launch/agg.sh
``` ```
### Aggregated Serving with KV Routing ### Aggregated Serving with KV Routing
```bash ```bash
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/agg_router.sh ./launch/agg_router.sh
``` ```
...@@ -198,7 +198,7 @@ cd $DYNAMO_HOME/components/backends/sglang ...@@ -198,7 +198,7 @@ cd $DYNAMO_HOME/components/backends/sglang
Here's an example that uses the [Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B) model. Here's an example that uses the [Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B) model.
```bash ```bash
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/agg_embed.sh ./launch/agg_embed.sh
``` ```
...@@ -222,14 +222,14 @@ See [SGLang Disaggregation](sglang-disaggregation.md) to learn more about how sg ...@@ -222,14 +222,14 @@ See [SGLang Disaggregation](sglang-disaggregation.md) to learn more about how sg
```bash ```bash
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/disagg.sh ./launch/disagg.sh
``` ```
### Disaggregated Serving with KV Aware Prefill Routing ### Disaggregated Serving with KV Aware Prefill Routing
```bash ```bash
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/disagg_router.sh ./launch/disagg_router.sh
``` ```
...@@ -239,7 +239,7 @@ You can use this configuration to test out disaggregated serving with dp attenti ...@@ -239,7 +239,7 @@ You can use this configuration to test out disaggregated serving with dp attenti
```bash ```bash
# note this will require 4 GPUs # note this will require 4 GPUs
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/disagg_dp_attn.sh ./launch/disagg_dp_attn.sh
``` ```
...@@ -285,7 +285,7 @@ Below we provide a selected list of advanced examples. Please open up an issue i ...@@ -285,7 +285,7 @@ Below we provide a selected list of advanced examples. Please open up an issue i
We currently provide deployment examples for Kubernetes and SLURM. We currently provide deployment examples for Kubernetes and SLURM.
## Kubernetes ## Kubernetes
- **[Deploying Dynamo with SGLang on Kubernetes](../../../components/backends/sglang/deploy/README.md)** - **[Deploying Dynamo with SGLang on Kubernetes](../../../examples/backends/sglang/deploy/README.md)**
## SLURM ## SLURM
- **[Deploying Dynamo with SGLang on SLURM](../../../components/backends/sglang/slurm_jobs/README.md)** - **[Deploying Dynamo with SGLang on SLURM](../../../examples/backends/sglang/slurm_jobs/README.md)**
...@@ -44,7 +44,7 @@ docker run \ ...@@ -44,7 +44,7 @@ docker run \
dynamo-wideep:latest dynamo-wideep:latest
``` ```
In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. In each container, you should be in the `/sgl-workspace/dynamo/examples/backends/sglang` directory.
3. Run the ingress and prefill worker 3. Run the ingress and prefill worker
......
...@@ -47,7 +47,7 @@ flowchart LR ...@@ -47,7 +47,7 @@ flowchart LR
``` ```
```bash ```bash
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/multimodal_agg.sh ./launch/multimodal_agg.sh
``` ```
...@@ -133,7 +133,7 @@ flowchart LR ...@@ -133,7 +133,7 @@ flowchart LR
```bash ```bash
cd $DYNAMO_HOME/components/backends/sglang cd $DYNAMO_HOME/examples/backends/sglang
./launch/multimodal_disagg.sh ./launch/multimodal_disagg.sh
``` ```
......
...@@ -128,13 +128,13 @@ This figure shows an overview of the major components to deploy: ...@@ -128,13 +128,13 @@ This figure shows an overview of the major components to deploy:
### Aggregated ### Aggregated
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
./launch/agg.sh ./launch/agg.sh
``` ```
### Aggregated with KV Routing ### Aggregated with KV Routing
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
./launch/agg_router.sh ./launch/agg_router.sh
``` ```
...@@ -144,7 +144,7 @@ cd $DYNAMO_HOME/components/backends/trtllm ...@@ -144,7 +144,7 @@ cd $DYNAMO_HOME/components/backends/trtllm
> Disaggregated serving supports two strategies for request flow: `"prefill_first"` and `"decode_first"`. By default, the script below uses the `"decode_first"` strategy, which can reduce response latency by minimizing extra hops in the return path. You can switch strategies by setting the `DISAGGREGATION_STRATEGY` environment variable. > Disaggregated serving supports two strategies for request flow: `"prefill_first"` and `"decode_first"`. By default, the script below uses the `"decode_first"` strategy, which can reduce response latency by minimizing extra hops in the return path. You can switch strategies by setting the `DISAGGREGATION_STRATEGY` environment variable.
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
./launch/disagg.sh ./launch/disagg.sh
``` ```
...@@ -154,13 +154,13 @@ cd $DYNAMO_HOME/components/backends/trtllm ...@@ -154,13 +154,13 @@ cd $DYNAMO_HOME/components/backends/trtllm
> Disaggregated serving with KV routing uses a "prefill first" workflow by default. Currently, Dynamo supports KV routing to only one endpoint per model. In disaggregated workflow, it is generally more effective to route requests to the prefill worker. If you wish to use a "decode first" workflow instead, you can simply set the `DISAGGREGATION_STRATEGY` environment variable accordingly. > Disaggregated serving with KV routing uses a "prefill first" workflow by default. Currently, Dynamo supports KV routing to only one endpoint per model. In disaggregated workflow, it is generally more effective to route requests to the prefill worker. If you wish to use a "decode first" workflow instead, you can simply set the `DISAGGREGATION_STRATEGY` environment variable accordingly.
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
./launch/disagg_router.sh ./launch/disagg_router.sh
``` ```
### Aggregated with Multi-Token Prediction (MTP) and DeepSeek R1 ### Aggregated with Multi-Token Prediction (MTP) and DeepSeek R1
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
...@@ -186,7 +186,7 @@ For comprehensive instructions on multinode serving, see the [multinode-examples ...@@ -186,7 +186,7 @@ For comprehensive instructions on multinode serving, see the [multinode-examples
### Kubernetes Deployment ### Kubernetes Deployment
For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [TensorRT-LLM Kubernetes Deployment Guide](../../../components/backends/trtllm/deploy/README.md). For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [TensorRT-LLM Kubernetes Deployment Guide](../../../examples/backends/trtllm/deploy/README.md).
### Client ### Client
...@@ -270,7 +270,7 @@ Logits processors let you modify the next-token logits at every decoding step (e ...@@ -270,7 +270,7 @@ Logits processors let you modify the next-token logits at every decoding step (e
You can enable a test-only processor that forces the model to respond with "Hello world!". This is useful to verify the wiring without modifying your model or engine code. You can enable a test-only processor that forces the model to respond with "Hello world!". This is useful to verify the wiring without modifying your model or engine code.
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR=1 export DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR=1
./launch/agg.sh ./launch/agg.sh
``` ```
...@@ -316,7 +316,7 @@ sampling_params.logits_processor = create_trtllm_adapters(processors) ...@@ -316,7 +316,7 @@ sampling_params.logits_processor = create_trtllm_adapters(processors)
## Performance Sweep ## Performance Sweep
For detailed instructions on running comprehensive performance sweeps across both aggregated and disaggregated serving configurations, see the [TensorRT-LLM Benchmark Scripts for DeepSeek R1 model](../../../components/backends/trtllm/performance_sweeps/README.md). This guide covers recommended benchmarking setups, usage of provided scripts, and best practices for evaluating system performance. For detailed instructions on running comprehensive performance sweeps across both aggregated and disaggregated serving configurations, see the [TensorRT-LLM Benchmark Scripts for DeepSeek R1 model](../../../examples/backends/trtllm/performance_sweeps/README.md). This guide covers recommended benchmarking setups, usage of provided scripts, and best practices for evaluating system performance.
## Dynamo KV Block Manager Integration ## Dynamo KV Block Manager Integration
......
...@@ -27,7 +27,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi ...@@ -27,7 +27,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi
## Aggregated Serving ## Aggregated Serving
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
...@@ -36,7 +36,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml ...@@ -36,7 +36,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
## Aggregated Serving with KV Routing ## Aggregated Serving with KV Routing
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
...@@ -45,7 +45,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml ...@@ -45,7 +45,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
## Disaggregated Serving ## Disaggregated Serving
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
...@@ -55,7 +55,7 @@ export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml ...@@ -55,7 +55,7 @@ export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
## Disaggregated Serving with KV Routing ## Disaggregated Serving with KV Routing
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
......
...@@ -128,7 +128,7 @@ You can use the provided launch script or run the components manually: ...@@ -128,7 +128,7 @@ You can use the provided launch script or run the components manually:
#### Option A: Using the Launch Script #### Option A: Using the Launch Script
```bash ```bash
cd /workspace/components/backends/trtllm cd /workspace/examples/backends/trtllm
./launch/gpt_oss_disagg.sh ./launch/gpt_oss_disagg.sh
``` ```
...@@ -136,8 +136,6 @@ cd /workspace/components/backends/trtllm ...@@ -136,8 +136,6 @@ cd /workspace/components/backends/trtllm
1. **Start frontend**: 1. **Start frontend**:
```bash ```bash
cd /workspace/dynamo/components/backends/trtllm
# Start frontend with round-robin routing # Start frontend with round-robin routing
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
``` ```
......
...@@ -39,7 +39,7 @@ inside an interactive shell on one of the allocated nodes, set the ...@@ -39,7 +39,7 @@ inside an interactive shell on one of the allocated nodes, set the
following environment variables: following environment variables:
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export IMAGE="<dynamo_trtllm_image>" export IMAGE="<dynamo_trtllm_image>"
# export MOUNTS="${PWD}/:/mnt,/lustre:/lustre" # export MOUNTS="${PWD}/:/mnt,/lustre:/lustre"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment