Unverified commit 8354d325, authored by Anant Sharma, committed by GitHub
Browse files

refactor: move engine configs out of components directory (#3772)


Signed-off-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: tanmayv25 <tanmay2592@gmail.com>
parent 90caf3ea
...@@ -25,9 +25,9 @@ Please note that you should provide **either image URLs or embedding file paths* ...@@ -25,9 +25,9 @@ Please note that you should provide **either image URLs or embedding file paths*
Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME
export AGG_ENGINE_ARGS=./engine_configs/multinode/agg.yaml export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct" export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct" export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
./launch/agg.sh ./launch/agg.sh
...@@ -75,13 +75,13 @@ Here are quick steps to launch in disaggregated mode. ...@@ -75,13 +75,13 @@ Here are quick steps to launch in disaggregated mode.
The following is an example of launching a model in disaggregated mode. While this example uses `Qwen/Qwen2-VL-7B-Instruct`, you can adapt it for other models by modifying the environment variables for the model path and engine configurations. The following is an example of launching a model in disaggregated mode. While this example uses `Qwen/Qwen2-VL-7B-Instruct`, you can adapt it for other models by modifying the environment variables for the model path and engine configurations.
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/multimodal/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/multimodal/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
export MODALITY=${MODALITY:-"multimodal"} export MODALITY=${MODALITY:-"multimodal"}
./launch/disagg.sh ./launch/disagg.sh
......
...@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes: ...@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:
```bash ```bash
# Default set in srun_aggregated.sh, but can customize here. # Default set in srun_aggregated.sh, but can customize here.
# export ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml" # export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
...@@ -165,8 +165,8 @@ deployment across 8 nodes: ...@@ -165,8 +165,8 @@ deployment across 8 nodes:
```bash ```bash
# Defaults set in srun_disaggregated.sh, but can customize here. # Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml" # export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml" # export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
......
...@@ -34,7 +34,7 @@ limitations under the License. ...@@ -34,7 +34,7 @@ limitations under the License.
> >
> Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command: > Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command:
> ```bash > ```bash
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/engine_configs/multimodal/llama4/prefill.yaml /mnt/engine_configs/multimodal/llama4/decode.yaml > sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml
> ``` > ```
...@@ -100,8 +100,8 @@ deployment across 4 nodes: ...@@ -100,8 +100,8 @@ deployment across 4 nodes:
```bash ```bash
# Defaults set in srun_disaggregated.sh, but can customize here. # Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/prefill.yaml" # export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/decode.yaml" # export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml"
# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
......
...@@ -203,7 +203,7 @@ args: ...@@ -203,7 +203,7 @@ args:
- python3 -m dynamo.trtllm - python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--extra-engine-args engine_configs/agg.yaml --extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml
``` ```
Key customization points include: Key customization points include:
......
...@@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}" ...@@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}"
# but you may freely customize the mounts based on your cluster. A common practice # but you may freely customize the mounts based on your cluster. A common practice
# is to mount paths to NFS storage for common scripts, model weights, etc. # is to mount paths to NFS storage for common scripts, model weights, etc.
# NOTE: This can be a comma separated list of multiple mounts as well. # NOTE: This can be a comma separated list of multiple mounts as well.
DEFAULT_MOUNT="${PWD}/../:/mnt" DEFAULT_MOUNT="${PWD}/../../../../:/mnt"
MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
# Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes. # Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
...@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" ...@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
NUM_NODES=${NUM_NODES:-4} NUM_NODES=${NUM_NODES:-4}
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml}" export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
# Automate settings of certain variables for convenience, but you are free # Automate settings of certain variables for convenience, but you are free
# to manually set these for more control as well. # to manually set these for more control as well.
...@@ -51,7 +51,7 @@ srun \ ...@@ -51,7 +51,7 @@ srun \
--nodelist "${HEAD_NODE}" \ --nodelist "${HEAD_NODE}" \
--nodes 1 \ --nodes 1 \
--jobid "${SLURM_JOB_ID}" \ --jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_frontend_services.sh & /mnt/examples/basics/multinode/trtllm/start_frontend_services.sh &
# NOTE: Output streamed to stdout for ease of understanding the example, but # NOTE: Output streamed to stdout for ease of understanding the example, but
# in practice you would probably set `srun --output ... --error ...` to pipe # in practice you would probably set `srun --output ... --error ...` to pipe
...@@ -71,4 +71,4 @@ srun \ ...@@ -71,4 +71,4 @@ srun \
--nodes "${NUM_NODES}" \ --nodes "${NUM_NODES}" \
--ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
--jobid "${SLURM_JOB_ID}" \ --jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_trtllm_worker.sh & /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
\ No newline at end of file \ No newline at end of file
...@@ -10,18 +10,18 @@ IMAGE="${IMAGE:-""}" ...@@ -10,18 +10,18 @@ IMAGE="${IMAGE:-""}"
# but you may freely customize the mounts based on your cluster. A common practice # but you may freely customize the mounts based on your cluster. A common practice
# is to mount paths to NFS storage for common scripts, model weights, etc. # is to mount paths to NFS storage for common scripts, model weights, etc.
# NOTE: This can be a comma separated list of multiple mounts as well. # NOTE: This can be a comma separated list of multiple mounts as well.
DEFAULT_MOUNT="${PWD}/../:/mnt" DEFAULT_MOUNT="${PWD}/../../../../:/mnt"
MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4} NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1} NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml}" PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
NUM_DECODE_NODES=${NUM_DECODE_NODES:-4} NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1} NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml}" DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
...@@ -56,7 +56,7 @@ srun \ ...@@ -56,7 +56,7 @@ srun \
--nodelist "${HEAD_NODE}" \ --nodelist "${HEAD_NODE}" \
--nodes 1 \ --nodes 1 \
--jobid "${SLURM_JOB_ID}" \ --jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_frontend_services.sh & /mnt/examples/basics/multinode/trtllm/start_frontend_services.sh &
# NOTE: Output streamed to stdout for ease of understanding the example, but # NOTE: Output streamed to stdout for ease of understanding the example, but
# in practice you would probably set `srun --output ... --error ...` to pipe # in practice you would probably set `srun --output ... --error ...` to pipe
...@@ -78,7 +78,7 @@ for ((i=1; i<=${NUM_PREFILL_WORKERS}; i++)); do ...@@ -78,7 +78,7 @@ for ((i=1; i<=${NUM_PREFILL_WORKERS}; i++)); do
--nodes "${NUM_PREFILL_NODES}" \ --nodes "${NUM_PREFILL_NODES}" \
--ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
--jobid "${SLURM_JOB_ID}" \ --jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_trtllm_worker.sh & /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
done done
for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
...@@ -98,5 +98,5 @@ for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do ...@@ -98,5 +98,5 @@ for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
--nodes "${NUM_DECODE_NODES}" \ --nodes "${NUM_DECODE_NODES}" \
--ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
--jobid "${SLURM_JOB_ID}" \ --jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_trtllm_worker.sh & /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
done done
\ No newline at end of file
...@@ -11,7 +11,7 @@ moe_config: ...@@ -11,7 +11,7 @@ moe_config:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16 # 4096 = 256 * 16
# moe_max_num_tokens: 4096 # moe_max_num_tokens: 4096
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
tensor_parallel_size: 16 tensor_parallel_size: 16
moe_expert_parallel_size: 16 moe_expert_parallel_size: 16
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment