Unverified commit 8354d325, authored by Anant Sharma and committed by GitHub
Browse files

refactor: move engine configs out of components directory (#3772)


Signed-off-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: tanmayv25 <tanmay2592@gmail.com>
parent 90caf3ea
......@@ -25,9 +25,9 @@ Please note that you should provide **either image URLs or embedding file paths*
Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
```bash
cd $DYNAMO_HOME/components/backends/trtllm
cd $DYNAMO_HOME
export AGG_ENGINE_ARGS=./engine_configs/multinode/agg.yaml
export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
./launch/agg.sh
......@@ -75,13 +75,13 @@ Here are quick steps to launch in disaggregated mode.
The following is an example of launching a model in disaggregated mode. While this example uses `Qwen/Qwen2-VL-7B-Instruct`, you can adapt it for other models by modifying the environment variables for the model path and engine configurations.
```bash
cd $DYNAMO_HOME/components/backends/trtllm
cd $DYNAMO_HOME
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/multimodal/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/multimodal/decode.yaml"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
export MODALITY=${MODALITY:-"multimodal"}
./launch/disagg.sh
......
......@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:
```bash
# Default set in srun_aggregated.sh, but can customize here.
# export ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml"
# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
......@@ -165,8 +165,8 @@ deployment across 8 nodes:
```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml"
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
......
......@@ -34,7 +34,7 @@ limitations under the License.
>
> Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command:
> ```bash
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/engine_configs/multimodal/llama4/prefill.yaml /mnt/engine_configs/multimodal/llama4/decode.yaml
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml
> ```
......@@ -100,8 +100,8 @@ deployment across 4 nodes:
```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/decode.yaml"
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml"
# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
......
......@@ -203,7 +203,7 @@ args:
- python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--extra-engine-args engine_configs/agg.yaml
--extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml
```
Key customization points include:
......
......@@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}"
# but you may freely customize the mounts based on your cluster. A common practice
# is to mount paths to NFS storage for common scripts, model weights, etc.
# NOTE: This can be a comma separated list of multiple mounts as well.
DEFAULT_MOUNT="${PWD}/../:/mnt"
DEFAULT_MOUNT="${PWD}/../../../../:/mnt"
MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
# Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
......@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
NUM_NODES=${NUM_NODES:-4}
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml}"
export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
# Automate settings of certain variables for convenience, but you are free
# to manually set these for more control as well.
......@@ -51,7 +51,7 @@ srun \
--nodelist "${HEAD_NODE}" \
--nodes 1 \
--jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_frontend_services.sh &
/mnt/examples/basics/multinode/trtllm/start_frontend_services.sh &
# NOTE: Output streamed to stdout for ease of understanding the example, but
# in practice you would probably set `srun --output ... --error ...` to pipe
......@@ -71,4 +71,4 @@ srun \
--nodes "${NUM_NODES}" \
--ntasks-per-node "${NUM_GPUS_PER_NODE}" \
--jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_trtllm_worker.sh &
\ No newline at end of file
/mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
\ No newline at end of file
......@@ -10,18 +10,18 @@ IMAGE="${IMAGE:-""}"
# but you may freely customize the mounts based on your cluster. A common practice
# is to mount paths to NFS storage for common scripts, model weights, etc.
# NOTE: This can be a comma separated list of multiple mounts as well.
DEFAULT_MOUNT="${PWD}/../:/mnt"
DEFAULT_MOUNT="${PWD}/../../../../:/mnt"
MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml}"
PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml}"
DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
......@@ -56,7 +56,7 @@ srun \
--nodelist "${HEAD_NODE}" \
--nodes 1 \
--jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_frontend_services.sh &
/mnt/examples/basics/multinode/trtllm/start_frontend_services.sh &
# NOTE: Output streamed to stdout for ease of understanding the example, but
# in practice you would probably set `srun --output ... --error ...` to pipe
......@@ -78,7 +78,7 @@ for ((i=1; i<=${NUM_PREFILL_WORKERS}; i++)); do
--nodes "${NUM_PREFILL_NODES}" \
--ntasks-per-node "${NUM_GPUS_PER_NODE}" \
--jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_trtllm_worker.sh &
/mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
done
for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
......@@ -98,5 +98,5 @@ for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
--nodes "${NUM_DECODE_NODES}" \
--ntasks-per-node "${NUM_GPUS_PER_NODE}" \
--jobid "${SLURM_JOB_ID}" \
/mnt/multinode/start_trtllm_worker.sh &
/mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh &
done
\ No newline at end of file
......@@ -11,7 +11,7 @@ moe_config:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
tensor_parallel_size: 16
moe_expert_parallel_size: 16
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment