Unverified commit 34ccc0b1, authored by Keiven C, committed by GitHub
Browse files

feat: refactor launch scripts with shared launch_utils.sh for consistent failure handling (#7008)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent c95bfc2e
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Aggregated serving: single worker handles both prefill and decode. # Aggregated serving: single worker handles both prefill and decode.
# GPUs: 1 # GPUs: 1
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Default values # Default values
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
...@@ -58,30 +55,12 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -58,30 +55,12 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner "Launching Aggregated LLM Worker" "$MODEL" "$HTTP_PORT"
echo "Launching Aggregated LLM Worker"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker with metrics enabled # run worker with metrics enabled
OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
...@@ -94,4 +73,7 @@ python3 -m dynamo.sglang \ ...@@ -94,4 +73,7 @@ python3 -m dynamo.sglang \
--skip-tokenizer-init \ --skip-tokenizer-init \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" \ "${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Aggregated embedding model serving. # Aggregated embedding model serving.
# GPUs: 1 # GPUs: 1
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Parse command line arguments # Parse command line arguments
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
...@@ -36,28 +33,20 @@ done ...@@ -36,28 +33,20 @@ done
MODEL="Qwen/Qwen3-Embedding-4B" MODEL="Qwen/Qwen3-Embedding-4B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT"
echo "Launching Embedding Worker"
echo "==========================================" print_curl_footer <<CURL
echo "Model: $MODEL" curl http://localhost:${HTTP_PORT}/v1/embeddings \\
echo "Frontend: http://localhost:$HTTP_PORT" -H 'Content-Type: application/json' \\
echo "==========================================" -d '{
echo "" "model": "${MODEL}",
echo "Example test command:" "input": "${EXAMPLE_PROMPT}"
echo "" }'
echo " curl http://localhost:${HTTP_PORT}/v1/embeddings \\" CURL
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"input\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\""
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker # run worker
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
...@@ -69,4 +58,7 @@ python3 -m dynamo.sglang \ ...@@ -69,4 +58,7 @@ python3 -m dynamo.sglang \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
--use-sglang-tokenizer \ --use-sglang-tokenizer \
--enable-metrics --enable-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Two aggregated workers behind a KV-aware router. # Two aggregated workers behind a KV-aware router.
# GPUs: 2 # GPUs: 2
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID $WORKER_PID 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID $WORKER_PID 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Parse command line arguments # Parse command line arguments
ENABLE_OTEL=false ENABLE_OTEL=false
...@@ -56,24 +53,7 @@ fi ...@@ -56,24 +53,7 @@ fi
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner "Launching Aggregated Router (2 workers)" "$MODEL" "$HTTP_PORT"
echo "Launching Aggregated Router (2 workers)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
...@@ -83,7 +63,6 @@ if [ "$APPROX_MODE" = true ]; then ...@@ -83,7 +63,6 @@ if [ "$APPROX_MODE" = true ]; then
fi fi
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend "${FRONTEND_ARGS[@]}" & python3 -m dynamo.frontend "${FRONTEND_ARGS[@]}" &
DYNAMO_PID=$!
# run worker # run worker
# Build KV events args conditionally (only when not in approx mode) # Build KV events args conditionally (only when not in approx mode)
...@@ -104,7 +83,6 @@ python3 -m dynamo.sglang \ ...@@ -104,7 +83,6 @@ python3 -m dynamo.sglang \
"${KV_EVENTS_ARGS_1[@]}" \ "${KV_EVENTS_ARGS_1[@]}" \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
WORKER_PID=$!
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \ OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
...@@ -115,4 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ ...@@ -115,4 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--trust-remote-code \ --trust-remote-code \
"${KV_EVENTS_ARGS_2[@]}" \ "${KV_EVENTS_ARGS_2[@]}" \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" "${TRACE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Aggregated multimodal (vision + LLM) serving. # Aggregated multimodal (vision + LLM) serving.
# GPUs: 1 # GPUs: 1
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Default values # Default values
MODEL="Qwen/Qwen3-VL-8B-Instruct" MODEL="Qwen/Qwen3-VL-8B-Instruct"
...@@ -64,33 +61,12 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -64,33 +61,12 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner --multimodal "Launching Aggregated Vision Worker" "$MODEL" "$HTTP_PORT"
echo "Launching Aggregated Vision Worker"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# Build chat template args (only if explicitly set) # Build chat template args (only if explicitly set)
TEMPLATE_ARGS=() TEMPLATE_ARGS=()
...@@ -110,4 +86,7 @@ python3 -m dynamo.sglang \ ...@@ -110,4 +86,7 @@ python3 -m dynamo.sglang \
--skip-tokenizer-init \ --skip-tokenizer-init \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" \ "${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -6,15 +6,10 @@ ...@@ -6,15 +6,10 @@
# GPUs: 1 # GPUs: 1
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
cleanup() { source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleaning up background processes..."
kill $FRONTEND_PID 2>/dev/null || true
wait $FRONTEND_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Model configuration # Model configuration
MODEL_PATH="inclusionAI/LLaDA2.0-mini-preview" MODEL_PATH="inclusionAI/LLaDA2.0-mini-preview"
...@@ -30,38 +25,29 @@ ENDPOINT="${ENDPOINT:-generate}" ...@@ -30,38 +25,29 @@ ENDPOINT="${ENDPOINT:-generate}"
HTTP_PORT="${HTTP_PORT:-8001}" HTTP_PORT="${HTTP_PORT:-8001}"
TP_SIZE="${TP_SIZE:-1}" TP_SIZE="${TP_SIZE:-1}"
echo "==========================================" print_launch_banner --no-curl "Launching Diffusion LM Worker (LLaDA2.0)" "$MODEL_PATH" "$HTTP_PORT" \
echo "Launching Diffusion LM Worker (LLaDA2.0)" "Namespace: $NAMESPACE" \
echo "==========================================" "Component: $COMPONENT" \
echo "Model: $MODEL_PATH" "TP Size: $TP_SIZE" \
echo "Namespace: $NAMESPACE" "Diffusion Algorithm: ${DLLM_ALGORITHM:-LowConfidence}" \
echo "Component: $COMPONENT" "Algorithm Config: ${DLLM_ALGORITHM_CONFIG:-default}"
echo "Frontend Port: $HTTP_PORT"
echo "TP Size: $TP_SIZE" print_curl_footer <<CURL
echo "Diffusion Algorithm: ${DLLM_ALGORITHM:-LowConfidence}" curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
echo "Algorithm Config: ${DLLM_ALGORITHM_CONFIG:-default}" -H 'Content-Type: application/json' \\
echo "==========================================" -d '{
echo "" "model": "${MODEL_PATH}",
echo "Example test command:" "messages": [{"role": "user", "content": "${EXAMPLE_PROMPT}"}],
echo "" "temperature": 0.7,
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" "max_tokens": 512
echo " -H 'Content-Type: application/json' \\" }'
echo " -d '{" CURL
echo " \"model\": \"${MODEL_PATH}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"temperature\": 0.7,"
echo " \"max_tokens\": 512"
echo " }'"
echo ""
echo "=========================================="
# Launch frontend (OpenAI-compatible API server) # Launch frontend (OpenAI-compatible API server)
echo "Starting Dynamo Frontend on port $HTTP_PORT..." echo "Starting Dynamo Frontend on port $HTTP_PORT..."
python -m dynamo.frontend \ python -m dynamo.frontend \
--http-port "$HTTP_PORT" & --http-port "$HTTP_PORT" &
FRONTEND_PID=$!
# Wait for frontend to start # Wait for frontend to start
sleep 2 sleep 2
...@@ -88,4 +74,7 @@ if [ -n "$DLLM_ALGORITHM_CONFIG" ]; then ...@@ -88,4 +74,7 @@ if [ -n "$DLLM_ALGORITHM_CONFIG" ]; then
fi fi
# Execute the command # Execute the command
eval $CMD eval $CMD &
\ No newline at end of file
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Disaggregated serving: prefill on GPU 0, decode on GPU 1. # Disaggregated serving: prefill on GPU 0, decode on GPU 1.
# GPUs: 2 # GPUs: 2
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Parse command line arguments # Parse command line arguments
ENABLE_OTEL=false ENABLE_OTEL=false
...@@ -50,30 +47,12 @@ fi ...@@ -50,30 +47,12 @@ fi
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner "Launching Disaggregated Workers (P/D)" "$MODEL" "$HTTP_PORT"
echo "Launching Disaggregated Workers (P/D)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
#AssertionError: Prefill round robin balance is required when dp size > 1. Please make sure that the prefill instance is launched with `--load-balance-method round_robin` and `--prefill-round-robin-balance` is set for decode server. #AssertionError: Prefill round robin balance is required when dp size > 1. Please make sure that the prefill instance is launched with `--load-balance-method round_robin` and `--prefill-round-robin-balance` is set for decode server.
...@@ -94,7 +73,6 @@ python3 -m dynamo.sglang \ ...@@ -94,7 +73,6 @@ python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
PREFILL_PID=$!
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
...@@ -109,4 +87,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ ...@@ -109,4 +87,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--host 0.0.0.0 \ --host 0.0.0.0 \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" "${TRACE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Disaggregated serving with KV-aware routing: 2 prefill + 2 decode workers. # Disaggregated serving with KV-aware routing: 2 prefill + 2 decode workers.
# GPUs: 4 # GPUs: 4
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID1 $PREFILL_PID2 $DECODE_PID1 $DECODE_PID2 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID $PREFILL_PID1 $PREFILL_PID2 $DECODE_PID1 $DECODE_PID2 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Parse command line arguments # Parse command line arguments
ENABLE_OTEL=false ENABLE_OTEL=false
...@@ -51,24 +48,7 @@ fi ...@@ -51,24 +48,7 @@ fi
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner "Launching Disaggregated Router (2P + 2D)" "$MODEL" "$HTTP_PORT"
echo "Launching Disaggregated Router (2P + 2D)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# Start frontend with KV routing # Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router # The frontend will automatically detect prefill workers and activate an internal prefill router
...@@ -78,7 +58,6 @@ OTEL_SERVICE_NAME=dynamo-frontend \ ...@@ -78,7 +58,6 @@ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend \ python3 -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
--router-reset-states & --router-reset-states &
DYNAMO_PID=$!
# run prefill worker # run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
...@@ -94,7 +73,6 @@ python3 -m dynamo.sglang \ ...@@ -94,7 +73,6 @@ python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
PREFILL_PID1=$!
# run prefill worker # run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
...@@ -110,7 +88,6 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ ...@@ -110,7 +88,6 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
PREFILL_PID2=$!
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \ OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \
...@@ -126,7 +103,6 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \ ...@@ -126,7 +103,6 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
DECODE_PID1=$!
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \ OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \
...@@ -142,7 +118,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \ ...@@ -142,7 +118,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
DECODE_PID2=$!
# Wait for any worker to exit (keeps script running) # Wait for any worker to exit (keeps script running)
wait # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
# KV cache : 25,536-29,712 tokens per worker # KV cache : 25,536-29,712 tokens per worker
# Handles full 4096-token context with --max-running-requests 2. # Handles full 4096-token context with --max-running-requests 2.
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
...@@ -35,43 +38,17 @@ else ...@@ -35,43 +38,17 @@ else
GPU_MEM_FRACTION=$(gpu_worker_fraction sglang) GPU_MEM_FRACTION=$(gpu_worker_fraction sglang)
fi fi
# Setup cleanup trap source "$SCRIPT_DIR/../../../common/launch_utils.sh"
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner "Launching Disaggregated (same GPU)" "$MODEL" "$HTTP_PORT" \
echo "Launching Disaggregated (same GPU)" "Context len: $CONTEXT_LENGTH" \
echo "==========================================" "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)" \
echo "Model: $MODEL" " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "Context len: $CONTEXT_LENGTH"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
echo " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress with KV router mode for disaggregated setup # run ingress with KV router mode for disaggregated setup
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv & python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run prefill worker with metrics on port 8081 # run prefill worker with metrics on port 8081
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
...@@ -93,7 +70,6 @@ python3 -m dynamo.sglang \ ...@@ -93,7 +70,6 @@ python3 -m dynamo.sglang \
--delete-ckpt-after-loading \ --delete-ckpt-after-loading \
--max-running-requests "$MAX_RUNNING_REQUESTS" \ --max-running-requests "$MAX_RUNNING_REQUESTS" \
--enable-metrics & --enable-metrics &
PREFILL_PID=$!
# Wait for prefill worker to initialize before starting decode worker # Wait for prefill worker to initialize before starting decode worker
# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM. # This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
...@@ -104,7 +80,7 @@ PREFILL_PID=$! ...@@ -104,7 +80,7 @@ PREFILL_PID=$!
echo "Waiting for prefill worker to initialize..." echo "Waiting for prefill worker to initialize..."
sleep 5 sleep 5
# run decode worker with metrics on port 8082 (foreground) # run decode worker with metrics on port 8082
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path "$MODEL" \ --model-path "$MODEL" \
...@@ -123,4 +99,7 @@ python3 -m dynamo.sglang \ ...@@ -123,4 +99,7 @@ python3 -m dynamo.sglang \
--enable-memory-saver \ --enable-memory-saver \
--delete-ckpt-after-loading \ --delete-ckpt-after-loading \
--max-running-requests "$MAX_RUNNING_REQUESTS" \ --max-running-requests "$MAX_RUNNING_REQUESTS" \
--enable-metrics --enable-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -6,15 +6,10 @@ ...@@ -6,15 +6,10 @@
# GPUs: 1 # GPUs: 1
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
cleanup() { source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleaning up background processes..."
kill $FRONTEND_PID 2>/dev/null || true
wait $FRONTEND_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Defaults # Defaults
MODEL_PATH="black-forest-labs/FLUX.1-dev" MODEL_PATH="black-forest-labs/FLUX.1-dev"
...@@ -71,30 +66,24 @@ while [[ $# -gt 0 ]]; do ...@@ -71,30 +66,24 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
echo "==========================================" EXTRA_INFO=("FS URL: $FS_URL")
echo "Launching Image Diffusion Worker" [ -n "$HTTP_URL" ] && EXTRA_INFO+=("HTTP URL: $HTTP_URL")
echo "==========================================" print_launch_banner --no-curl "Launching Image Diffusion Worker" "$MODEL_PATH" "$HTTP_PORT" \
echo "Model: $MODEL_PATH" "${EXTRA_INFO[@]}"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "FS URL: $FS_URL" print_curl_footer <<CURL
[ -n "$HTTP_URL" ] && echo "HTTP URL: $HTTP_URL" curl http://localhost:${HTTP_PORT}/v1/images/generations \\
echo "==========================================" -H 'Content-Type: application/json' \\
echo "" -d '{
echo "Example test command:" "prompt": "${EXAMPLE_PROMPT_VISUAL}",
echo "" "model": "${MODEL_PATH}",
echo " curl http://localhost:${HTTP_PORT}/v1/images/generations \\" "size": "1024x1024",
echo " -H 'Content-Type: application/json' \\" "response_format": "url",
echo " -d '{" "nvext": {
echo " \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"," "num_inference_steps": 15
echo " \"model\": \"${MODEL_PATH}\"," }
echo " \"size\": \"1024x1024\"," }'
echo " \"response_format\": \"url\"," CURL
echo " \"nvext\": {"
echo " \"num_inference_steps\": 15"
echo " }"
echo " }'"
echo ""
echo "=========================================="
# Build optional HTTP URL arg # Build optional HTTP URL arg
HTTP_URL_ARGS=() HTTP_URL_ARGS=()
...@@ -106,7 +95,6 @@ fi ...@@ -106,7 +95,6 @@ fi
echo "Starting Dynamo Frontend on port $HTTP_PORT..." echo "Starting Dynamo Frontend on port $HTTP_PORT..."
python3 -m dynamo.frontend \ python3 -m dynamo.frontend \
--http-port "$HTTP_PORT" & --http-port "$HTTP_PORT" &
FRONTEND_PID=$!
sleep 2 sleep 2
...@@ -121,4 +109,7 @@ python3 -m dynamo.sglang \ ...@@ -121,4 +109,7 @@ python3 -m dynamo.sglang \
--trust-remote-code \ --trust-remote-code \
--skip-tokenizer-init \ --skip-tokenizer-init \
--enable-metrics \ --enable-metrics \
"${EXTRA_ARGS[@]}" "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Multimodal E/P/D: encoder (GPU 0), prefill (GPU 1), decode (GPU 2). # Multimodal E/P/D: encoder (GPU 0), prefill (GPU 1), decode (GPU 2).
# GPUs: 3 # GPUs: 3
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Default values # Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct" MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
...@@ -63,32 +60,11 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then ...@@ -63,32 +60,11 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner --multimodal "Launching Multimodal E/P/D Workers" "$MODEL_NAME" "$HTTP_PORT"
echo "Launching Multimodal E/P/D Workers"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run SGLang multimodal processor # run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" & python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
...@@ -127,5 +103,5 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \ ...@@ -127,5 +103,5 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--host 0.0.0.0 \ --host 0.0.0.0 \
--disaggregation-transfer-backend nixl & --disaggregation-transfer-backend nixl &
# Wait for all background processes to complete # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait wait_any_exit
...@@ -5,14 +5,11 @@ ...@@ -5,14 +5,11 @@
# Multimodal E/PD: separate vision encoder (GPU 0) + combined PD worker (GPU 1). # Multimodal E/PD: separate vision encoder (GPU 0) + combined PD worker (GPU 1).
# GPUs: 2 # GPUs: 2
# Setup cleanup trap set -e
cleanup() { trap 'echo Cleaning up...; kill 0' EXIT
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Default values # Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct" MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
...@@ -63,32 +60,11 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then ...@@ -63,32 +60,11 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" print_launch_banner --multimodal "Launching Multimodal E/PD Workers" "$MODEL_NAME" "$HTTP_PORT"
echo "Launching Multimodal E/PD Workers"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run SGLang multimodal processor # run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" & python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
...@@ -110,5 +86,5 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ ...@@ -110,5 +86,5 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disable-radix-cache \ --disable-radix-cache \
--disaggregation-transfer-backend nixl & --disaggregation-transfer-backend nixl &
# Wait for all background processes to complete # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait wait_any_exit
...@@ -6,15 +6,10 @@ ...@@ -6,15 +6,10 @@
# GPUs: 1 (--wan-size 1b) or 2 (--wan-size 14b) # GPUs: 1 (--wan-size 1b) or 2 (--wan-size 14b)
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
cleanup() { source "$SCRIPT_DIR/../../../common/launch_utils.sh"
echo "Cleaning up background processes..."
kill $FRONTEND_PID 2>/dev/null || true
wait $FRONTEND_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Defaults # Defaults
WAN_SIZE="1b" WAN_SIZE="1b"
...@@ -98,40 +93,32 @@ case "$WAN_SIZE" in ...@@ -98,40 +93,32 @@ case "$WAN_SIZE" in
;; ;;
esac esac
echo "==========================================" print_launch_banner --no-curl "Launching T2V Video Generation Worker" "$MODEL_PATH" "$HTTP_PORT" \
echo "Launching T2V Video Generation Worker" "TP Size: $TP_SIZE" \
echo "==========================================" "FS URL: $FS_URL" \
echo "Model: $MODEL_PATH" "Resolution: ${WIDTH}x${HEIGHT}"
echo "TP Size: $TP_SIZE"
echo "Frontend: http://localhost:$HTTP_PORT" print_curl_footer <<CURL
echo "FS URL: $FS_URL" curl http://localhost:${HTTP_PORT}/v1/videos \\
echo "Resolution: ${WIDTH}x${HEIGHT}" -H 'Content-Type: application/json' \\
echo "==========================================" -d '{
echo "" "prompt": "${EXAMPLE_PROMPT_VISUAL}",
echo "Example test command:" "model": "${MODEL_PATH}",
echo "" "seconds": 2,
echo " curl http://localhost:${HTTP_PORT}/v1/videos \\" "size": "${WIDTH}x${HEIGHT}",
echo " -H 'Content-Type: application/json' \\" "response_format": "url",
echo " -d '{" "nvext": {
echo " \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"," "fps": 8,
echo " \"model\": \"${MODEL_PATH}\"," "num_frames": ${NUM_FRAMES},
echo " \"seconds\": 2," "num_inference_steps": ${NUM_INFERENCE_STEPS}
echo " \"size\": \"${WIDTH}x${HEIGHT}\"," }
echo " \"response_format\": \"url\"," }'
echo " \"nvext\": {" CURL
echo " \"fps\": 8,"
echo " \"num_frames\": ${NUM_FRAMES},"
echo " \"num_inference_steps\": ${NUM_INFERENCE_STEPS}"
echo " }"
echo " }'"
echo ""
echo "=========================================="
# Launch frontend # Launch frontend
echo "Starting Dynamo Frontend on port $HTTP_PORT..." echo "Starting Dynamo Frontend on port $HTTP_PORT..."
python3 -m dynamo.frontend \ python3 -m dynamo.frontend \
--http-port "$HTTP_PORT" & --http-port "$HTTP_PORT" &
FRONTEND_PID=$!
sleep 2 sleep 2
...@@ -146,4 +133,7 @@ python3 -m dynamo.sglang \ ...@@ -146,4 +133,7 @@ python3 -m dynamo.sglang \
--trust-remote-code \ --trust-remote-code \
--skip-tokenizer-init \ --skip-tokenizer-init \
--enable-metrics \ --enable-metrics \
"${EXTRA_ARGS[@]}" "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -2,6 +2,12 @@ ...@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
...@@ -11,14 +17,6 @@ export MODALITY=${MODALITY:-"text"} ...@@ -11,14 +17,6 @@ export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal" # If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"} #export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
ENABLE_OTEL=false ENABLE_OTEL=false
EXTRA_ARGS=() EXTRA_ARGS=()
...@@ -57,7 +55,6 @@ fi ...@@ -57,7 +55,6 @@ fi
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker # run worker
# Additional command line args can be passed # Additional command line args can be passed
...@@ -68,4 +65,7 @@ python3 -m dynamo.trtllm \ ...@@ -68,4 +65,7 @@ python3 -m dynamo.trtllm \
--modality "$MODALITY" \ --modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \ --extra-engine-args "$AGG_ENGINE_ARGS" \
"${TRACE_ARGS[@]}" \ "${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -2,6 +2,12 @@ ...@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
...@@ -9,19 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} ...@@ -9,19 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
export MODALITY=${MODALITY:-"text"} export MODALITY=${MODALITY:-"text"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Run frontend # Run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# Run worker # Run worker
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
...@@ -30,4 +26,7 @@ python3 -m dynamo.trtllm \ ...@@ -30,4 +26,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \ --extra-engine-args "$AGG_ENGINE_ARGS" \
--publish-events-and-metrics --publish-events-and-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -2,6 +2,12 @@ ...@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
...@@ -9,20 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} ...@@ -9,20 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml"}
export MODALITY=${MODALITY:-"multimodal"} export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend # run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv & python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run worker # run worker
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
...@@ -30,5 +25,7 @@ python3 -m dynamo.trtllm \ ...@@ -30,5 +25,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" \ --extra-engine-args "$AGG_ENGINE_ARGS" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--publish-events-and-metrics --publish-events-and-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -2,30 +2,28 @@ ...@@ -2,30 +2,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend # run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv & python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run worker # run worker
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" \ --extra-engine-args "$AGG_ENGINE_ARGS" \
--publish-events-and-metrics --publish-events-and-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -2,28 +2,26 @@ ...@@ -2,28 +2,26 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend with KV router in approximate mode (i.e. no KV events) # run frontend with KV router in approximate mode (i.e. no KV events)
python3 -m dynamo.frontend --router-mode kv --no-kv-events & python3 -m dynamo.frontend --router-mode kv --no-kv-events &
DYNAMO_PID=$!
# run worker (no event publishing needed - frontend handles routing with predictive approx kv mode) # run worker (no event publishing needed - frontend handles routing with predictive approx kv mode)
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" --extra-engine-args "$AGG_ENGINE_ARGS" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -2,6 +2,12 @@ ...@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
...@@ -14,15 +20,6 @@ export MODALITY=${MODALITY:-"text"} ...@@ -14,15 +20,6 @@ export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal" # If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"} #export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
ENABLE_OTEL=false ENABLE_OTEL=false
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
...@@ -59,7 +56,6 @@ fi ...@@ -59,7 +56,6 @@ fi
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker # run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
...@@ -69,7 +65,6 @@ OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIB ...@@ -69,7 +65,6 @@ OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIB
--modality "$MODALITY" \ --modality "$MODALITY" \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
PREFILL_PID=$!
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
...@@ -78,4 +73,7 @@ OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE ...@@ -78,4 +73,7 @@ OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE
--extra-engine-args "$DECODE_ENGINE_ARGS" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--disaggregation-mode decode \ --disaggregation-mode decode \
"${TRACE_ARGS[@]}" "${TRACE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -6,6 +6,12 @@ ...@@ -6,6 +6,12 @@
# GPU 0: Encode (vision encoder) # GPU 0: Encode (vision encoder)
# GPU 0: PD worker (prefill + decode, TP=1) # GPU 0: PD worker (prefill + decode, TP=1)
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
...@@ -21,19 +27,9 @@ export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50} ...@@ -21,19 +27,9 @@ export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
# Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10) # Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10)
EXTRA_PD_ARGS=("$@") EXTRA_PD_ARGS=("$@")
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $ENCODE_PID $PD_PID_1 2>/dev/null || true
wait $DYNAMO_PID $ENCODE_PID $PD_PID_1 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend # run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run encode worker (vision encoder on GPU 0) # run encode worker (vision encoder on GPU 0)
CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
...@@ -44,7 +40,6 @@ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ ...@@ -44,7 +40,6 @@ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \ --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
--max-file-size-mb "$MAX_FILE_SIZE_MB" \ --max-file-size-mb "$MAX_FILE_SIZE_MB" \
--disaggregation-mode encode & --disaggregation-mode encode &
ENCODE_PID=$!
# run PD worker 1 (GPU 0) # run PD worker 1 (GPU 0)
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.trtllm \
...@@ -55,6 +50,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.trtllm \ ...@@ -55,6 +50,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.trtllm \
--encode-endpoint "$ENCODE_ENDPOINT" \ --encode-endpoint "$ENCODE_ENDPOINT" \
--disaggregation-mode prefill_and_decode \ --disaggregation-mode prefill_and_decode \
"${EXTRA_PD_ARGS[@]}" & "${EXTRA_PD_ARGS[@]}" &
PD_PID_1=$!
wait $DYNAMO_PID # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -2,6 +2,12 @@ ...@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
...@@ -12,20 +18,9 @@ export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} ...@@ -12,20 +18,9 @@ export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export MODALITY=${MODALITY:-"multimodal"} export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend # run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker # run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
...@@ -34,7 +29,6 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ ...@@ -34,7 +29,6 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--disaggregation-mode prefill & --disaggregation-mode prefill &
PREFILL_PID=$!
# run decode worker # run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
...@@ -42,4 +36,7 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ ...@@ -42,4 +36,7 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--disaggregation-mode decode --disaggregation-mode decode &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment