refactor: move engine configs out of components directory (#3772)

Signed-off-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: tanmayv25 <tanmay2592@gmail.com>

refactor: move engine configs out of components directory (#3772)
Signed-off-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: tanmayv25 <tanmay2592@gmail.com>
8354d325 · Anant Sharma · GitHub · 90caf3ea · 8354d325 · 8354d325
Unverified Commit 8354d325 authored Oct 24, 2025 by Anant Sharma Committed by GitHub Oct 24, 2025
20 changed files
--- a/docs/backends/trtllm/multimodal_support.md
+++ b/docs/backends/trtllm/multimodal_support.md
@@ -25,9 +25,9 @@ Please note that you should provide **either image URLs or embedding file paths*

 Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
 ```bash
-cd $DYNAMO_HOME/components/backends/trtllm
+cd $DYNAMO_HOME

-export AGG_ENGINE_ARGS=./engine_configs/multinode/agg.yaml
+export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
 export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 ./launch/agg.sh
@@ -75,13 +75,13 @@ Here are quick steps to launch in disaggregated mode.

 The following is an example of launching a model in disaggregated mode. While this example uses `Qwen/Qwen2-VL-7B-Instruct`, you can adapt it for other models by modifying the environment variables for the model path and engine configurations.
 ```bash
-cd $DYNAMO_HOME/components/backends/trtllm
+cd $DYNAMO_HOME

 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/multimodal/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/multimodal/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
 export MODALITY=${MODALITY:-"multimodal"}

 ./launch/disagg.sh

--- a/docs/backends/trtllm/multinode/multinode-examples.md
+++ b/docs/backends/trtllm/multinode/multinode-examples.md
@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:

 ```bash
 # Default set in srun_aggregated.sh, but can customize here.
-# export ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml"
+# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"

 # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
 # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
@@ -165,8 +165,8 @@ deployment across 8 nodes:

 ```bash
 # Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"

 # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
 # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG

--- a/docs/backends/trtllm/multinode/multinode-multimodal-example.md
+++ b/docs/backends/trtllm/multinode/multinode-multimodal-example.md
@@ -34,7 +34,7 @@ limitations under the License.
 >
 > Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command:
 > ```bash
-> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/engine_configs/multimodal/llama4/prefill.yaml /mnt/engine_configs/multimodal/llama4/decode.yaml
+> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml
 > ```


@@ -100,8 +100,8 @@ deployment across 4 nodes:

 ```bash
 # Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml"

 # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
 # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG

--- a/docs/kubernetes/README.md
+++ b/docs/kubernetes/README.md
@@ -203,7 +203,7 @@ args:
  - python3 -m dynamo.trtllm
    --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
    --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-    --extra-engine-args engine_configs/agg.yaml
+    --extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml
 ```

 Key customization points include:

--- a/components/backends/trtllm/multinode/srun_aggregated.sh
+++ b/components/backends/trtllm/multinode/srun_aggregated.sh
@@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}"
 # but you may freely customize the mounts based on your cluster. A common practice
 # is to mount paths to NFS storage for common scripts, model weights, etc.
 # NOTE: This can be a comma separated list of multiple mounts as well.
-DEFAULT_MOUNT="${PWD}/../:/mnt"
+DEFAULT_MOUNT="${PWD}/../../../../:/mnt"
 MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"

 # Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
 NUM_NODES=${NUM_NODES:-4}
 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

-export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml}"
+export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"

 # Automate settings of certain variables for convenience, but you are free
 # to manually set these for more control as well.
@@ -51,7 +51,7 @@ srun \
  --nodelist "${HEAD_NODE}" \
  --nodes 1 \
  --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_frontend_services.sh &
+  /mnt/examples/basics/multinode/trtllm/start_frontend_services.sh &

 # NOTE: Output streamed to stdout for ease of understanding the example, but
 # in practice you would probably set `srun --output ... --error ...` to pipe
@@ -71,4 +71,4 @@ srun \
  --nodes "${NUM_NODES}" \
  --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
  --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_trtllm_worker.sh &
\ No newline at end of file
+  /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
\ No newline at end of file
--- a/components/backends/trtllm/multinode/srun_disaggregated.sh
+++ b/components/backends/trtllm/multinode/srun_disaggregated.sh
@@ -10,18 +10,18 @@ IMAGE="${IMAGE:-""}"
 # but you may freely customize the mounts based on your cluster. A common practice
 # is to mount paths to NFS storage for common scripts, model weights, etc.
 # NOTE: This can be a comma separated list of multiple mounts as well.
-DEFAULT_MOUNT="${PWD}/../:/mnt"
+DEFAULT_MOUNT="${PWD}/../../../../:/mnt"
 MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"

 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

 NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
 NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
-PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml}"
+PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"

 NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
 NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
-DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml}"
+DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"

 DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}

@@ -56,7 +56,7 @@ srun \
  --nodelist "${HEAD_NODE}" \
  --nodes 1 \
  --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_frontend_services.sh &
+  /mnt/examples/basics/multinode/trtllm/start_frontend_services.sh &

 # NOTE: Output streamed to stdout for ease of understanding the example, but
 # in practice you would probably set `srun --output ... --error ...` to pipe
@@ -78,7 +78,7 @@ for ((i=1; i<=${NUM_PREFILL_WORKERS}; i++)); do
    --nodes "${NUM_PREFILL_NODES}" \
    --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
    --jobid "${SLURM_JOB_ID}" \
-    /mnt/multinode/start_trtllm_worker.sh &
+    /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
 done

 for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
@@ -98,5 +98,5 @@ for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
    --nodes "${NUM_DECODE_NODES}" \
    --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
    --jobid "${SLURM_JOB_ID}" \
-    /mnt/multinode/start_trtllm_worker.sh &
+    /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
 done
\ No newline at end of file
--- a/components/backends/trtllm/multinode/start_frontend_services.sh
+++ b/components/backends/trtllm/multinode/start_frontend_services.sh
--- a/components/backends/trtllm/multinode/start_trtllm_worker.sh
+++ b/components/backends/trtllm/multinode/start_trtllm_worker.sh
--- a/components/backends/trtllm/engine_configs/agg.yaml
+++ b/components/backends/trtllm/engine_configs/agg.yaml
--- a/components/backends/trtllm/engine_configs/decode.yaml
+++ b/components/backends/trtllm/engine_configs/decode.yaml
--- a/components/backends/trtllm/engine_configs/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/prefill.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/eplb.yaml
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml
@@ -11,7 +11,7 @@ moe_config:
  #   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
  #   4096 = 256 * 16
  # moe_max_num_tokens: 4096
-  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml

 tensor_parallel_size: 16
 moe_expert_parallel_size: 16