Unverified Commit 88dfd1b3 authored by Ben Hamm's avatar Ben Hamm Committed by GitHub
Browse files

docs: Clean up incomplete recipes and clarify Kubernetes-only focus (#4159)


Signed-off-by: Ben Hamm <ben.hamm@gmail.com>
Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Signed-off-by: atchernych <atchernych@nvidia.com>
Co-authored-by: Biswa Panda <biswa.panda@gmail.com>
Co-authored-by: tanmayv25 <tanmay2592@gmail.com>
Co-authored-by: Tanmay Verma <tanmayv@nvidia.com>
Co-authored-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: atchernych <atchernych@nvidia.com>
parent 09bb1c68
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
NUM_WORKERS=8 NUM_WORKERS=8
MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B" MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
RECIPE_PATH="$DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/trtllm" ENGINE_CONFIG_PATH="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b"
TENSOR_PARALLEL_SIZE=1 TENSOR_PARALLEL_SIZE=1
DATA_PARALLEL_SIZE=1 DATA_PARALLEL_SIZE=1
USE_MOCKERS=false USE_MOCKERS=false
...@@ -86,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then ...@@ -86,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then
) )
elif [ "$USE_TRTLLM" = true ]; then elif [ "$USE_TRTLLM" = true ]; then
# Default args for TensorRT-LLM engine using predefined YAML configs # Default args for TensorRT-LLM engine using predefined YAML configs
# Config files located at: $RECIPE_PATH/{agg,decode,prefill}.yaml # Config files located at: $ENGINE_CONFIG_PATH/{agg,decode,prefill}.yaml
if [ "$MODE" = "prefill" ]; then if [ "$MODE" = "prefill" ]; then
ENGINE_CONFIG="$RECIPE_PATH/prefill.yaml" ENGINE_CONFIG="$ENGINE_CONFIG_PATH/prefill.yaml"
elif [ "$MODE" = "decode" ]; then elif [ "$MODE" = "decode" ]; then
ENGINE_CONFIG="$RECIPE_PATH/decode.yaml" ENGINE_CONFIG="$ENGINE_CONFIG_PATH/decode.yaml"
else else
ENGINE_CONFIG="$RECIPE_PATH/agg.yaml" ENGINE_CONFIG="$ENGINE_CONFIG_PATH/agg.yaml"
fi fi
EXTRA_ARGS=( EXTRA_ARGS=(
......
...@@ -158,7 +158,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm ...@@ -158,7 +158,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
```bash ```bash
cd $DYNAMO_HOME/examples/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml export AGG_ENGINE_ARGS=./engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# nvidia/DeepSeek-R1-FP4 is a large model # nvidia/DeepSeek-R1-FP4 is a large model
export MODEL_PATH="nvidia/DeepSeek-R1-FP4" export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
......
...@@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi ...@@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi
cd $DYNAMO_HOME/examples/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
./launch/agg.sh ./launch/agg.sh
``` ```
...@@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml ...@@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
cd $DYNAMO_HOME/examples/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
./launch/agg_router.sh ./launch/agg_router.sh
``` ```
...@@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml ...@@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
cd $DYNAMO_HOME/examples/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
./launch/disagg.sh ./launch/disagg.sh
``` ```
...@@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml ...@@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
cd $DYNAMO_HOME/examples/backends/trtllm cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
./launch/disagg_router.sh ./launch/disagg_router.sh
``` ```
...@@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be ...@@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be
#### Configuration Files #### Configuration Files
**Prefill Configuration (`recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml`)**: **Prefill Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml`)**:
- `enable_attention_dp: false` - Attention data parallelism disabled for prefill - `enable_attention_dp: false` - Attention data parallelism disabled for prefill
- `enable_chunked_prefill: true` - Enables efficient chunked prefill processing - `enable_chunked_prefill: true` - Enables efficient chunked prefill processing
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer - `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer
- `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs - `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs
**Decode Configuration (`recipes/gpt-oss-120b/trtllm/disagg/decode.yaml`)**: **Decode Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml`)**:
- `enable_attention_dp: true` - Attention data parallelism enabled for decode - `enable_attention_dp: true` - Attention data parallelism enabled for decode
- `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency - `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
...@@ -145,7 +145,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & ...@@ -145,7 +145,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
--model-path /model \ --model-path /model \
--served-model-name openai/gpt-oss-120b \ --served-model-name openai/gpt-oss-120b \
--extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml \ --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml \
--dyn-reasoning-parser gpt_oss \ --dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \ --dyn-tool-call-parser harmony \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
...@@ -161,7 +161,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ ...@@ -161,7 +161,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
--model-path /model \ --model-path /model \
--served-model-name openai/gpt-oss-120b \ --served-model-name openai/gpt-oss-120b \
--extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/decode.yaml \ --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml \
--dyn-reasoning-parser gpt_oss \ --dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \ --dyn-tool-call-parser harmony \
--disaggregation-mode decode \ --disaggregation-mode decode \
......
...@@ -28,7 +28,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu ...@@ -28,7 +28,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu
- The other node runs the prefill worker. - The other node runs the prefill worker.
## Notes ## Notes
* Make sure `eagle3_one_model: true` is set in the LLM API config inside the `recipes/llama4/trtllm/eagle` folder. * Make sure `eagle3_one_model: true` is set in the LLM API config inside the `examples/backends/trtllm/engine_configs/llama4/eagle` folder.
## Setup ## Setup
...@@ -52,7 +52,7 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide ...@@ -52,7 +52,7 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide
## Aggregated Serving ## Aggregated Serving
```bash ```bash
export NUM_NODES=1 export NUM_NODES=1
export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml" export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml"
./multinode/srun_aggregated.sh ./multinode/srun_aggregated.sh
``` ```
...@@ -60,9 +60,9 @@ export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml" ...@@ -60,9 +60,9 @@ export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml"
```bash ```bash
export NUM_PREFILL_NODES=1 export NUM_PREFILL_NODES=1
export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_prefill.yaml" export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yml"
export NUM_DECODE_NODES=1 export NUM_DECODE_NODES=1
export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_decode.yaml" export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yml"
./multinode/srun_disaggregated.sh ./multinode/srun_disaggregated.sh
``` ```
......
...@@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode ...@@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
```bash ```bash
cd $DYNAMO_HOME cd $DYNAMO_HOME
export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml export AGG_ENGINE_ARGS=./examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml
export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct" export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct" export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
./launch/agg.sh ./launch/agg.sh
...@@ -79,8 +79,8 @@ cd $DYNAMO_HOME ...@@ -79,8 +79,8 @@ cd $DYNAMO_HOME
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"}
export MODALITY=${MODALITY:-"multimodal"} export MODALITY=${MODALITY:-"multimodal"}
./launch/disagg.sh ./launch/disagg.sh
......
...@@ -17,6 +17,8 @@ limitations under the License. ...@@ -17,6 +17,8 @@ limitations under the License.
# Example: Multi-node TRTLLM Workers with Dynamo on Slurm # Example: Multi-node TRTLLM Workers with Dynamo on Slurm
> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/).
To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16), To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16),
the set of nodes needs to be launched together in the same MPI world, such as the set of nodes needs to be launched together in the same MPI world, such as
via `mpirun` or `srun`. This is true regardless of whether the worker is via `mpirun` or `srun`. This is true regardless of whether the worker is
...@@ -106,8 +108,8 @@ export IMAGE="<dynamo_trtllm_image>" ...@@ -106,8 +108,8 @@ export IMAGE="<dynamo_trtllm_image>"
# For example, assuming your cluster had a `/lustre` directory on the host, you # For example, assuming your cluster had a `/lustre` directory on the host, you
# could add that as a mount like so: # could add that as a mount like so:
# #
# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre" # export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre"
export MOUNTS="${PWD}/../:/mnt" export MOUNTS="${PWD}/../../../../:/mnt"
# NOTE: In general, Deepseek R1 is very large, so it is recommended to # NOTE: In general, Deepseek R1 is very large, so it is recommended to
# pre-download the model weights and save them in some shared location, # pre-download the model weights and save them in some shared location,
...@@ -136,7 +138,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes: ...@@ -136,7 +138,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:
```bash ```bash
# Default set in srun_aggregated.sh, but can customize here. # Default set in srun_aggregated.sh, but can customize here.
# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml" # export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml"
# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
...@@ -165,8 +167,8 @@ deployment across 8 nodes: ...@@ -165,8 +167,8 @@ deployment across 8 nodes:
```bash ```bash
# Defaults set in srun_disaggregated.sh, but can customize here. # Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml" # export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml" # export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml"
# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
......
...@@ -17,6 +17,8 @@ limitations under the License. ...@@ -17,6 +17,8 @@ limitations under the License.
# Example: Multi-node TRTLLM Workers with Dynamo on Slurm for multimodal models # Example: Multi-node TRTLLM Workers with Dynamo on Slurm for multimodal models
> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/).
> [!IMPORTANT] > [!IMPORTANT]
> There are some known issues in tensorrt_llm==1.1.0rc5 with multinode multimodal support. To use the multimodal feature, you must rebuild the dynamo container against a specific tensorrt_llm commit.
> >
...@@ -34,7 +36,7 @@ limitations under the License. ...@@ -34,7 +36,7 @@ limitations under the License.
> >
> Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command: > Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command:
> ```bash > ```bash
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml > sed -i 's/backend: DEFAULT/backend: default/g' /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml
> ``` > ```
...@@ -71,8 +73,8 @@ export IMAGE="<dynamo_trtllm_image>" ...@@ -71,8 +73,8 @@ export IMAGE="<dynamo_trtllm_image>"
# For example, assuming your cluster had a `/lustre` directory on the host, you # For example, assuming your cluster had a `/lustre` directory on the host, you
# could add that as a mount like so: # could add that as a mount like so:
# #
# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre" # export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre"
export MOUNTS="${PWD}/../:/mnt" export MOUNTS="${PWD}/../../../../:/mnt"
# Can point to local FS as well # Can point to local FS as well
# export MODEL_PATH="/location/to/model" # export MODEL_PATH="/location/to/model"
...@@ -100,8 +102,8 @@ deployment across 4 nodes: ...@@ -100,8 +102,8 @@ deployment across 4 nodes:
```bash ```bash
# Defaults set in srun_disaggregated.sh, but can customize here. # Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml" # export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml" # export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml"
# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
......
...@@ -203,7 +203,7 @@ args: ...@@ -203,7 +203,7 @@ args:
- python3 -m dynamo.trtllm - python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml --extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml
``` ```
Key customization points include: Key customization points include:
......
...@@ -67,4 +67,4 @@ spec: ...@@ -67,4 +67,4 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
...@@ -36,4 +36,4 @@ spec: ...@@ -36,4 +36,4 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
...@@ -39,5 +39,5 @@ spec: ...@@ -39,5 +39,5 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
- --publish-events-and-metrics - --publish-events-and-metrics
...@@ -37,7 +37,7 @@ spec: ...@@ -37,7 +37,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
TRTLLMDecodeWorker: TRTLLMDecodeWorker:
...@@ -63,6 +63,6 @@ spec: ...@@ -63,6 +63,6 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode - --disaggregation-mode
- decode - decode
...@@ -101,7 +101,7 @@ spec: ...@@ -101,7 +101,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode - --disaggregation-mode
- decode - decode
TRTLLMPrefillWorker: TRTLLMPrefillWorker:
...@@ -128,6 +128,6 @@ spec: ...@@ -128,6 +128,6 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
...@@ -39,7 +39,7 @@ spec: ...@@ -39,7 +39,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --publish-events-and-metrics - --publish-events-and-metrics
...@@ -65,6 +65,6 @@ spec: ...@@ -65,6 +65,6 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode - --disaggregation-mode
- decode - decode
# TensorRT-LLM Engine Configurations
This directory contains TensorRT-LLM engine configuration files for various model deployments.
## Usage
These YAML configuration files can be passed to TensorRT-LLM workers using the `--extra-engine-args` parameter:
```bash
python3 -m dynamo.trtllm \
--extra-engine-args "${ENGINE_ARGS}" \
...
```
Where `ENGINE_ARGS` points to one of the configuration files in this directory.
## Configuration Types
### Aggregated (agg/)
Single-node configurations that combine prefill and decode operations:
- **simple/**: Basic aggregated setup
- **mtp/**: Multi-token prediction configurations
- **wide_ep/**: Wide expert parallel configurations
### Disaggregated (disagg/)
Separate configurations for prefill and decode workers:
- **simple/**: Basic prefill/decode split
- **mtp/**: Multi-token prediction with separate prefill/decode
- **wide_ep/**: Wide expert parallel with expert load balancer
## Key Configuration Parameters
- **Parallelism**: `tensor_parallel_size`, `moe_expert_parallel_size`, `pipeline_parallel_size`
- **Memory**: `kv_cache_config.free_gpu_memory_fraction`, `kv_cache_config.dtype`
- **Batching**: `max_batch_size`, `max_num_tokens`, `max_seq_len`
- **Scheduling**: `disable_overlap_scheduler`, `cuda_graph_config`
## Notes
- For disaggregated setups, ensure `kv_cache_config.dtype` matches between prefill and decode configs
- WideEP configurations require an expert load balancer config (`eplb.yaml`)
- Adjust `free_gpu_memory_fraction` based on your workload and attention DP settings
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment