#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launches a disaggregated TensorRT-LLM deployment on one 8-GPU node:
#   * the frontend (round-robin router, HTTP port 8000), in the background
#   * a prefill worker on GPUs 0-3, in the background
#   * a decode worker on GPUs 4-7, in the foreground
#
# Environment variables (all optional, defaults shown below):
#   DYNAMO_HOME          root used to build the default engine-args paths
#   MODEL_PATH           value passed to --model-path
#   SERVED_MODEL_NAME    value passed to --served-model-name
#   PREFILL_ENGINE_ARGS  YAML passed to the prefill worker via --extra-engine-args
#   DECODE_ENGINE_ARGS   YAML passed to the decode worker via --extra-engine-args

# Abort as soon as any foreground command fails.
set -e

# Environment variables with defaults
export DYNAMO_HOME="${DYNAMO_HOME:-/workspace}"
export MODEL_PATH="${MODEL_PATH:-/model}"
export SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-openai/gpt-oss-120b}"
export PREFILL_ENGINE_ARGS="${PREFILL_ENGINE_ARGS:-$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml}"
export DECODE_ENGINE_ARGS="${DECODE_ENGINE_ARGS:-$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/decode.yaml}"

# Terminate everything in this script's process group when the script exits.
cleanup() {
  echo "Cleaning up..."
  # `kill 0` also signals this shell. Clear the EXIT trap so the handler
  # cannot re-enter, and ignore TERM here so the shell is not killed by its
  # own signal -- otherwise the script's real exit status would be replaced
  # by 143 (128 + SIGTERM).
  trap - EXIT
  trap '' TERM
  kill 0
}
trap cleanup EXIT

# run frontend
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &

# With tensor_parallel_size=4, each worker needs 4 GPUs

# run prefill worker
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --extra-engine-args "$PREFILL_ENGINE_ARGS" \
  --dyn-reasoning-parser gpt_oss \
  --dyn-tool-call-parser harmony \
  --disaggregation-mode prefill \
  --max-num-tokens 20000 \
  --max-batch-size 32 \
  --free-gpu-memory-fraction 0.9 \
  --tensor-parallel-size 4 \
  --expert-parallel-size 4 &

# run decode worker (foreground: the script lives as long as this process,
# and its exit triggers the cleanup trap above)
CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --extra-engine-args "$DECODE_ENGINE_ARGS" \
  --dyn-reasoning-parser gpt_oss \
  --dyn-tool-call-parser harmony \
  --disaggregation-mode decode \
  --max-num-tokens 16384 \
  --free-gpu-memory-fraction 0.9 \
  --tensor-parallel-size 4 \
  --expert-parallel-size 4