Unverified commit 8354d325, authored by Anant Sharma, committed by GitHub
Browse files

refactor: move engine configs out of components directory (#3772)


Signed-off-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: tanmayv25 <tanmay2592@gmail.com>
parent 90caf3ea
...@@ -4,8 +4,10 @@ ...@@ -4,8 +4,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Parse command-line arguments # Parse command-line arguments
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
NUM_WORKERS=8 NUM_WORKERS=8
MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B" MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
RECIPE_PATH="$DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/trtllm"
TENSOR_PARALLEL_SIZE=1 TENSOR_PARALLEL_SIZE=1
DATA_PARALLEL_SIZE=1 DATA_PARALLEL_SIZE=1
USE_MOCKERS=false USE_MOCKERS=false
...@@ -84,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then ...@@ -84,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then
) )
elif [ "$USE_TRTLLM" = true ]; then elif [ "$USE_TRTLLM" = true ]; then
# Default args for TensorRT-LLM engine using predefined YAML configs # Default args for TensorRT-LLM engine using predefined YAML configs
# Config files located at: ../../components/backends/trtllm/engine_configs/{agg,decode,prefill}.yaml # Config files located at: $RECIPE_PATH/{agg,decode,prefill}.yaml
if [ "$MODE" = "prefill" ]; then if [ "$MODE" = "prefill" ]; then
ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/prefill.yaml" ENGINE_CONFIG="$RECIPE_PATH/prefill.yaml"
elif [ "$MODE" = "decode" ]; then elif [ "$MODE" = "decode" ]; then
ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/decode.yaml" ENGINE_CONFIG="$RECIPE_PATH/decode.yaml"
else else
ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/agg.yaml" ENGINE_CONFIG="$RECIPE_PATH/agg.yaml"
fi fi
EXTRA_ARGS=( EXTRA_ARGS=(
......
...@@ -55,7 +55,7 @@ spec: ...@@ -55,7 +55,7 @@ spec:
# mount the configmap as a volume # mount the configmap as a volume
volumeMounts: volumeMounts:
- name: nvidia-config - name: nvidia-config
mountPath: /workspace/components/backends/trtllm/engine_configs mountPath: /workspace/
readOnly: true readOnly: true
command: command:
- python3 - python3
...@@ -67,4 +67,4 @@ spec: ...@@ -67,4 +67,4 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/agg.yaml - ./recipes/qwen3/trtllm/agg.yaml
...@@ -25,7 +25,7 @@ spec: ...@@ -25,7 +25,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -36,4 +36,4 @@ spec: ...@@ -36,4 +36,4 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/agg.yaml - ./recipes/qwen3/trtllm/agg.yaml
...@@ -28,7 +28,7 @@ spec: ...@@ -28,7 +28,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -39,5 +39,5 @@ spec: ...@@ -39,5 +39,5 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/agg.yaml - ./recipes/qwen3/trtllm/agg.yaml
- --publish-events-and-metrics - --publish-events-and-metrics
...@@ -125,10 +125,10 @@ spec: ...@@ -125,10 +125,10 @@ spec:
mainContainer: mainContainer:
volumeMounts: volumeMounts:
- name: nvidia-config - name: nvidia-config
mountPath: /workspace/components/backends/trtllm/engine_configs mountPath: /workspace/
readOnly: true readOnly: true
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -139,7 +139,7 @@ spec: ...@@ -139,7 +139,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/prefill.yaml - ./recipes/qwen3/trtllm/prefill.yaml
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --disaggregation-strategy - --disaggregation-strategy
...@@ -165,10 +165,10 @@ spec: ...@@ -165,10 +165,10 @@ spec:
mainContainer: mainContainer:
volumeMounts: volumeMounts:
- name: nvidia-config - name: nvidia-config
mountPath: /workspace/components/backends/trtllm/engine_configs mountPath: /workspace/
readOnly: true readOnly: true
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -179,7 +179,7 @@ spec: ...@@ -179,7 +179,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/decode.yaml - ./recipes/qwen3/trtllm/decode.yaml
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --disaggregation-strategy - --disaggregation-strategy
......
...@@ -26,7 +26,7 @@ spec: ...@@ -26,7 +26,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -37,7 +37,7 @@ spec: ...@@ -37,7 +37,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/prefill.yaml - ./recipes/qwen3/trtllm/prefill.yaml
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --disaggregation-strategy - --disaggregation-strategy
...@@ -54,7 +54,7 @@ spec: ...@@ -54,7 +54,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -65,7 +65,7 @@ spec: ...@@ -65,7 +65,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/decode.yaml - ./recipes/qwen3/trtllm/decode.yaml
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --disaggregation-strategy - --disaggregation-strategy
......
...@@ -86,7 +86,7 @@ spec: ...@@ -86,7 +86,7 @@ spec:
terminationGracePeriodSeconds: 600 terminationGracePeriodSeconds: 600
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
args: args:
...@@ -97,7 +97,7 @@ spec: ...@@ -97,7 +97,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/decode.yaml - ./recipes/qwen3/trtllm/decode.yaml
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --disaggregation-strategy - --disaggregation-strategy
...@@ -115,7 +115,7 @@ spec: ...@@ -115,7 +115,7 @@ spec:
terminationGracePeriodSeconds: 600 terminationGracePeriodSeconds: 600
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
args: args:
...@@ -126,7 +126,7 @@ spec: ...@@ -126,7 +126,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/prefill.yaml - ./recipes/qwen3/trtllm/prefill.yaml
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --disaggregation-strategy - --disaggregation-strategy
......
...@@ -28,7 +28,7 @@ spec: ...@@ -28,7 +28,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -39,7 +39,7 @@ spec: ...@@ -39,7 +39,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/prefill.yaml - ./recipes/qwen3/trtllm/prefill.yaml
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --disaggregation-strategy - --disaggregation-strategy
...@@ -56,7 +56,7 @@ spec: ...@@ -56,7 +56,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/
command: command:
- python3 - python3
- -m - -m
...@@ -67,7 +67,7 @@ spec: ...@@ -67,7 +67,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --extra-engine-args - --extra-engine-args
- engine_configs/decode.yaml - ./recipes/qwen3/trtllm/decode.yaml
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --disaggregation-strategy - --disaggregation-strategy
......
...@@ -3,9 +3,10 @@ ...@@ -3,9 +3,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
export MODALITY=${MODALITY:-"text"} export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal" # If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"} #export MODALITY=${MODALITY:-"multimodal"}
......
...@@ -3,9 +3,10 @@ ...@@ -3,9 +3,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
export MODALITY=${MODALITY:-"text"} export MODALITY=${MODALITY:-"text"}
# Setup cleanup trap # Setup cleanup trap
......
...@@ -3,9 +3,10 @@ ...@@ -3,9 +3,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
# Setup cleanup trap # Setup cleanup trap
cleanup() { cleanup() {
......
...@@ -3,11 +3,12 @@ ...@@ -3,11 +3,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"}
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export MODALITY=${MODALITY:-"text"} export MODALITY=${MODALITY:-"text"}
......
...@@ -3,11 +3,12 @@ ...@@ -3,11 +3,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"}
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
......
...@@ -3,12 +3,13 @@ ...@@ -3,12 +3,13 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"engine_configs/encode.yaml"} export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml"}
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"2"} export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"2"}
......
...@@ -3,11 +3,12 @@ ...@@ -3,11 +3,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"/model"} export MODEL_PATH=${MODEL_PATH:-"/model"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/gpt_oss/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/gpt_oss/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/decode.yaml"}
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
......
...@@ -272,6 +272,7 @@ COPY examples /workspace/examples ...@@ -272,6 +272,7 @@ COPY examples /workspace/examples
COPY benchmarks /workspace/benchmarks COPY benchmarks /workspace/benchmarks
COPY deploy /workspace/deploy COPY deploy /workspace/deploy
COPY components/ /workspace/components/ COPY components/ /workspace/components/
COPY recipes/ /workspace/recipes/
# Copy attribution files # Copy attribution files
COPY ATTRIBUTION* LICENSE /workspace/ COPY ATTRIBUTION* LICENSE /workspace/
......
...@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/components/backends/trtllm ...@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/components/backends/trtllm
```bash ```bash
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/components/backends/trtllm
export AGG_ENGINE_ARGS=./engine_configs/deepseek_r1/mtp/mtp_agg.yaml export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# nvidia/DeepSeek-R1-FP4 is a large model # nvidia/DeepSeek-R1-FP4 is a large model
export MODEL_PATH="nvidia/DeepSeek-R1-FP4" export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
......
...@@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi ...@@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/components/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
./launch/agg.sh ./launch/agg.sh
``` ```
...@@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml ...@@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/components/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
./launch/agg_router.sh ./launch/agg_router.sh
``` ```
...@@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml ...@@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/components/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=engine_configs/gemma3/vswa_prefill.yaml export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
./launch/disagg.sh ./launch/disagg.sh
``` ```
...@@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml ...@@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml
cd $DYNAMO_HOME/components/backends/trtllm cd $DYNAMO_HOME/components/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=engine_configs/gemma3/vswa_prefill.yaml export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
./launch/disagg_router.sh ./launch/disagg_router.sh
``` ```
...@@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be ...@@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be
#### Configuration Files #### Configuration Files
**Prefill Configuration (`engine_configs/gpt_oss/prefill.yaml`)**: **Prefill Configuration (`recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml`)**:
- `enable_attention_dp: false` - Attention data parallelism disabled for prefill - `enable_attention_dp: false` - Attention data parallelism disabled for prefill
- `enable_chunked_prefill: true` - Enables efficient chunked prefill processing - `enable_chunked_prefill: true` - Enables efficient chunked prefill processing
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer - `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer
- `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs - `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs
**Decode Configuration (`engine_configs/gpt_oss/decode.yaml`)**: **Decode Configuration (`recipes/gpt-oss-120b/trtllm/disagg/decode.yaml`)**:
- `enable_attention_dp: true` - Attention data parallelism enabled for decode - `enable_attention_dp: true` - Attention data parallelism enabled for decode
- `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency - `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
...@@ -147,7 +147,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & ...@@ -147,7 +147,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
--model-path /model \ --model-path /model \
--served-model-name openai/gpt-oss-120b \ --served-model-name openai/gpt-oss-120b \
--extra-engine-args engine_configs/gpt_oss/prefill.yaml \ --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml \
--dyn-reasoning-parser gpt_oss \ --dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \ --dyn-tool-call-parser harmony \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
...@@ -164,7 +164,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ ...@@ -164,7 +164,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
--model-path /model \ --model-path /model \
--served-model-name openai/gpt-oss-120b \ --served-model-name openai/gpt-oss-120b \
--extra-engine-args engine_configs/gpt_oss/decode.yaml \ --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/decode.yaml \
--dyn-reasoning-parser gpt_oss \ --dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \ --dyn-tool-call-parser harmony \
--disaggregation-mode decode \ --disaggregation-mode decode \
......
...@@ -30,7 +30,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu ...@@ -30,7 +30,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu
For advanced control over how requests are routed between prefill and decode workers in disaggregated mode, refer to the [Disaggregation Strategy](./README.md#disaggregation-strategy) section. For advanced control over how requests are routed between prefill and decode workers in disaggregated mode, refer to the [Disaggregation Strategy](./README.md#disaggregation-strategy) section.
## Notes ## Notes
* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `engine_configs/llama4/eagle` folder. * Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `recipes/llama4/trtllm/eagle` folder.
## Setup ## Setup
...@@ -54,7 +54,7 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide ...@@ -54,7 +54,7 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide
## Aggregated Serving ## Aggregated Serving
```bash ```bash
export NUM_NODES=1 export NUM_NODES=1
export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml" export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml"
./multinode/srun_aggregated.sh ./multinode/srun_aggregated.sh
``` ```
...@@ -62,9 +62,9 @@ export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml" ...@@ -62,9 +62,9 @@ export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml"
```bash ```bash
export NUM_PREFILL_NODES=1 export NUM_PREFILL_NODES=1
export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_prefill.yaml" export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_prefill.yaml"
export NUM_DECODE_NODES=1 export NUM_DECODE_NODES=1
export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml" export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_decode.yaml"
./multinode/srun_disaggregated.sh ./multinode/srun_disaggregated.sh
``` ```
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment