#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launches a disaggregated serving setup: a frontend, one prefill worker and
# one decode worker, each worker pinned to its own set of four GPUs.
#
# Environment variables (caller-supplied values take precedence over defaults):
#   MODEL_PATH               passed as --model-path            (default: /model)
#   SERVED_MODEL_NAME        passed as --served-model-name     (default: openai/gpt-oss-120b)
#   DISAGGREGATION_STRATEGY  passed as --disaggregation-strategy (default: prefill_first)
#   PREFILL_ENGINE_ARGS      --extra-engine-args for the prefill worker
#                            (default: engine_configs/gpt_oss/prefill.yaml)
#   DECODE_ENGINE_ARGS       --extra-engine-args for the decode worker
#                            (default: engine_configs/gpt_oss/decode.yaml)
#
# NOTE: utils/clear_namespace.py and the default engine config paths are
# relative, so they are resolved against the current working directory.

# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"/model"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/gpt_oss/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/gpt_oss/decode.yaml"}

# Abort on the first failing foreground command (background jobs are not
# covered by -e).
set -e

# On any exit path, signal the whole process group so the background frontend
# and prefill worker do not outlive this script. `kill 0` targets every process
# in the caller's process group, which includes this shell itself.
trap 'echo "Cleaning up..."; kill 0' EXIT

# run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo

# run frontend
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &

# With tensor_parallel_size=4, each worker needs 4 GPUs

# run prefill worker (background, GPUs 0-3)
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --extra-engine-args "$PREFILL_ENGINE_ARGS" \
  --disaggregation-mode prefill \
  --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
  --max-num-tokens 20000 \
  --max-batch-size 32 \
  --free-gpu-memory-fraction 0.9 \
  --tensor-parallel-size 4 \
  --expert-parallel-size 4 &

# run decode worker (GPUs 4-7). It runs in the foreground, so the script blocks
# here; when it exits, the EXIT trap above tears down the background jobs.
CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --extra-engine-args "$DECODE_ENGINE_ARGS" \
  --disaggregation-mode decode \
  --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
  --max-num-tokens 16384 \
  --free-gpu-memory-fraction 0.9 \
  --tensor-parallel-size 4 \
  --expert-parallel-size 4