Unverified commit 34ccc0b1, authored by Keiven C and committed by GitHub
Browse files

feat: refactor launch scripts with shared launch_utils.sh for consistent failure handling (#7008)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent c95bfc2e
......@@ -5,14 +5,11 @@
# Aggregated serving: single worker handles both prefill and decode.
# GPUs: 1
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL="Qwen/Qwen3-0.6B"
......@@ -58,30 +55,12 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated LLM Worker"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated LLM Worker" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker with metrics enabled
OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
......@@ -94,4 +73,7 @@ python3 -m dynamo.sglang \
--skip-tokenizer-init \
--enable-metrics \
"${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}"
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,14 +5,11 @@
# Aggregated embedding model serving.
# GPUs: 1
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Parse command line arguments
while [[ $# -gt 0 ]]; do
......@@ -36,28 +33,20 @@ done
MODEL="Qwen/Qwen3-Embedding-4B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Embedding Worker"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/embeddings \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"input\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\""
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT"
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/embeddings \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${MODEL}",
"input": "${EXAMPLE_PROMPT}"
}'
CURL
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
......@@ -69,4 +58,7 @@ python3 -m dynamo.sglang \
--tp 1 \
--trust-remote-code \
--use-sglang-tokenizer \
--enable-metrics
--enable-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,14 +5,11 @@
# Two aggregated workers behind a KV-aware router.
# GPUs: 2
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $WORKER_PID 2>/dev/null || true
wait $DYNAMO_PID $WORKER_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Parse command line arguments
ENABLE_OTEL=false
......@@ -56,24 +53,7 @@ fi
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Router (2 workers)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated Router (2 workers)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -83,7 +63,6 @@ if [ "$APPROX_MODE" = true ]; then
fi
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend "${FRONTEND_ARGS[@]}" &
DYNAMO_PID=$!
# run worker
# Build KV events args conditionally (only when not in approx mode)
......@@ -104,7 +83,6 @@ python3 -m dynamo.sglang \
"${KV_EVENTS_ARGS_1[@]}" \
--enable-metrics \
"${TRACE_ARGS[@]}" &
WORKER_PID=$!
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
......@@ -115,4 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--trust-remote-code \
"${KV_EVENTS_ARGS_2[@]}" \
--enable-metrics \
"${TRACE_ARGS[@]}"
"${TRACE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,14 +5,11 @@
# Aggregated multimodal (vision + LLM) serving.
# GPUs: 1
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL="Qwen/Qwen3-VL-8B-Instruct"
......@@ -64,33 +61,12 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Vision Worker"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --multimodal "Launching Aggregated Vision Worker" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# Build chat template args (only if explicitly set)
TEMPLATE_ARGS=()
......@@ -110,4 +86,7 @@ python3 -m dynamo.sglang \
--skip-tokenizer-init \
--enable-metrics \
"${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}"
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,15 +6,10 @@
# GPUs: 1
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $FRONTEND_PID 2>/dev/null || true
wait $FRONTEND_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Model configuration
MODEL_PATH="inclusionAI/LLaDA2.0-mini-preview"
......@@ -30,38 +25,29 @@ ENDPOINT="${ENDPOINT:-generate}"
HTTP_PORT="${HTTP_PORT:-8001}"
TP_SIZE="${TP_SIZE:-1}"
echo "=========================================="
echo "Launching Diffusion LM Worker (LLaDA2.0)"
echo "=========================================="
echo "Model: $MODEL_PATH"
echo "Namespace: $NAMESPACE"
echo "Component: $COMPONENT"
echo "Frontend Port: $HTTP_PORT"
echo "TP Size: $TP_SIZE"
echo "Diffusion Algorithm: ${DLLM_ALGORITHM:-LowConfidence}"
echo "Algorithm Config: ${DLLM_ALGORITHM_CONFIG:-default}"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_PATH}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"temperature\": 0.7,"
echo " \"max_tokens\": 512"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --no-curl "Launching Diffusion LM Worker (LLaDA2.0)" "$MODEL_PATH" "$HTTP_PORT" \
"Namespace: $NAMESPACE" \
"Component: $COMPONENT" \
"TP Size: $TP_SIZE" \
"Diffusion Algorithm: ${DLLM_ALGORITHM:-LowConfidence}" \
"Algorithm Config: ${DLLM_ALGORITHM_CONFIG:-default}"
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${MODEL_PATH}",
"messages": [{"role": "user", "content": "${EXAMPLE_PROMPT}"}],
"temperature": 0.7,
"max_tokens": 512
}'
CURL
# Launch frontend (OpenAI-compatible API server)
echo "Starting Dynamo Frontend on port $HTTP_PORT..."
python -m dynamo.frontend \
--http-port "$HTTP_PORT" &
FRONTEND_PID=$!
# Wait for frontend to start
sleep 2
......@@ -88,4 +74,7 @@ if [ -n "$DLLM_ALGORITHM_CONFIG" ]; then
fi
# Execute the command
eval $CMD
\ No newline at end of file
eval $CMD &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,14 +5,11 @@
# Disaggregated serving: prefill on GPU 0, decode on GPU 1.
# GPUs: 2
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Parse command line arguments
ENABLE_OTEL=false
......@@ -50,30 +47,12 @@ fi
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated Workers (P/D)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Disaggregated Workers (P/D)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
#AssertionError: Prefill round robin balance is required when dp size > 1. Please make sure that the prefill instance is launched with `--load-balance-method round_robin` and `--prefill-round-robin-balance` is set for decode server.
......@@ -94,7 +73,6 @@ python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \
--enable-metrics \
"${TRACE_ARGS[@]}" &
PREFILL_PID=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
......@@ -109,4 +87,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--host 0.0.0.0 \
--disaggregation-transfer-backend nixl \
--enable-metrics \
"${TRACE_ARGS[@]}"
"${TRACE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,14 +5,11 @@
# Disaggregated serving with KV-aware routing: 2 prefill + 2 decode workers.
# GPUs: 4
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID1 $PREFILL_PID2 $DECODE_PID1 $DECODE_PID2 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID1 $PREFILL_PID2 $DECODE_PID1 $DECODE_PID2 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Parse command line arguments
ENABLE_OTEL=false
......@@ -51,24 +48,7 @@ fi
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated Router (2P + 2D)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Disaggregated Router (2P + 2D)" "$MODEL" "$HTTP_PORT"
# Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router
......@@ -78,7 +58,6 @@ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend \
--router-mode kv \
--router-reset-states &
DYNAMO_PID=$!
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
......@@ -94,7 +73,6 @@ python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \
--enable-metrics \
"${TRACE_ARGS[@]}" &
PREFILL_PID1=$!
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
......@@ -110,7 +88,6 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \
--enable-metrics \
"${TRACE_ARGS[@]}" &
PREFILL_PID2=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \
......@@ -126,7 +103,6 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \
--enable-metrics \
"${TRACE_ARGS[@]}" &
DECODE_PID1=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \
......@@ -142,7 +118,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--disaggregation-transfer-backend nixl \
--enable-metrics \
"${TRACE_ARGS[@]}" &
DECODE_PID2=$!
# Wait for any worker to exit (keeps script running)
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -14,6 +14,9 @@
# KV cache : 25,536-29,712 tokens per worker
# Handles full 4096-token context with --max-running-requests 2.
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
......@@ -35,43 +38,17 @@ else
GPU_MEM_FRACTION=$(gpu_worker_fraction sglang)
fi
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated (same GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "Context len: $CONTEXT_LENGTH"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
echo " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Disaggregated (same GPU)" "$MODEL" "$HTTP_PORT" \
"Context len: $CONTEXT_LENGTH" \
"GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)" \
" estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
# run ingress with KV router mode for disaggregated setup
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run prefill worker with metrics on port 8081
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
......@@ -93,7 +70,6 @@ python3 -m dynamo.sglang \
--delete-ckpt-after-loading \
--max-running-requests "$MAX_RUNNING_REQUESTS" \
--enable-metrics &
PREFILL_PID=$!
# Wait for prefill worker to initialize before starting decode worker
# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
......@@ -104,7 +80,7 @@ PREFILL_PID=$!
echo "Waiting for prefill worker to initialize..."
sleep 5
# run decode worker with metrics on port 8082 (foreground)
# run decode worker with metrics on port 8082
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.sglang \
--model-path "$MODEL" \
......@@ -123,4 +99,7 @@ python3 -m dynamo.sglang \
--enable-memory-saver \
--delete-ckpt-after-loading \
--max-running-requests "$MAX_RUNNING_REQUESTS" \
--enable-metrics
--enable-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,15 +6,10 @@
# GPUs: 1
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $FRONTEND_PID 2>/dev/null || true
wait $FRONTEND_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Defaults
MODEL_PATH="black-forest-labs/FLUX.1-dev"
......@@ -71,30 +66,24 @@ while [[ $# -gt 0 ]]; do
esac
done
echo "=========================================="
echo "Launching Image Diffusion Worker"
echo "=========================================="
echo "Model: $MODEL_PATH"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "FS URL: $FS_URL"
[ -n "$HTTP_URL" ] && echo "HTTP URL: $HTTP_URL"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/images/generations \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\","
echo " \"model\": \"${MODEL_PATH}\","
echo " \"size\": \"1024x1024\","
echo " \"response_format\": \"url\","
echo " \"nvext\": {"
echo " \"num_inference_steps\": 15"
echo " }"
echo " }'"
echo ""
echo "=========================================="
EXTRA_INFO=("FS URL: $FS_URL")
[ -n "$HTTP_URL" ] && EXTRA_INFO+=("HTTP URL: $HTTP_URL")
print_launch_banner --no-curl "Launching Image Diffusion Worker" "$MODEL_PATH" "$HTTP_PORT" \
"${EXTRA_INFO[@]}"
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/images/generations \\
-H 'Content-Type: application/json' \\
-d '{
"prompt": "${EXAMPLE_PROMPT_VISUAL}",
"model": "${MODEL_PATH}",
"size": "1024x1024",
"response_format": "url",
"nvext": {
"num_inference_steps": 15
}
}'
CURL
# Build optional HTTP URL arg
HTTP_URL_ARGS=()
......@@ -106,7 +95,6 @@ fi
echo "Starting Dynamo Frontend on port $HTTP_PORT..."
python3 -m dynamo.frontend \
--http-port "$HTTP_PORT" &
FRONTEND_PID=$!
sleep 2
......@@ -121,4 +109,7 @@ python3 -m dynamo.sglang \
--trust-remote-code \
--skip-tokenizer-init \
--enable-metrics \
"${EXTRA_ARGS[@]}"
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,14 +5,11 @@
# Multimodal E/P/D: encoder (GPU 0), prefill (GPU 1), decode (GPU 2).
# GPUs: 3
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
......@@ -63,32 +60,11 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Multimodal E/P/D Workers"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --multimodal "Launching Multimodal E/P/D Workers" "$MODEL_NAME" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
......@@ -127,5 +103,5 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--host 0.0.0.0 \
--disaggregation-transfer-backend nixl &
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,14 +5,11 @@
# Multimodal E/PD: separate vision encoder (GPU 0) + combined PD worker (GPU 1).
# GPUs: 2
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
......@@ -63,32 +60,11 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Multimodal E/PD Workers"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --multimodal "Launching Multimodal E/PD Workers" "$MODEL_NAME" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run SGLang multimodal processor
python3 -m dynamo.sglang --multimodal-processor --model-path "$MODEL_NAME" $SERVED_MODEL_ARG --chat-template "$CHAT_TEMPLATE" &
......@@ -110,5 +86,5 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disable-radix-cache \
--disaggregation-transfer-backend nixl &
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,15 +6,10 @@
# GPUs: 1 (--wan-size 1b) or 2 (--wan-size 14b)
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $FRONTEND_PID 2>/dev/null || true
wait $FRONTEND_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Defaults
WAN_SIZE="1b"
......@@ -98,40 +93,32 @@ case "$WAN_SIZE" in
;;
esac
echo "=========================================="
echo "Launching T2V Video Generation Worker"
echo "=========================================="
echo "Model: $MODEL_PATH"
echo "TP Size: $TP_SIZE"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "FS URL: $FS_URL"
echo "Resolution: ${WIDTH}x${HEIGHT}"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/videos \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\","
echo " \"model\": \"${MODEL_PATH}\","
echo " \"seconds\": 2,"
echo " \"size\": \"${WIDTH}x${HEIGHT}\","
echo " \"response_format\": \"url\","
echo " \"nvext\": {"
echo " \"fps\": 8,"
echo " \"num_frames\": ${NUM_FRAMES},"
echo " \"num_inference_steps\": ${NUM_INFERENCE_STEPS}"
echo " }"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --no-curl "Launching T2V Video Generation Worker" "$MODEL_PATH" "$HTTP_PORT" \
"TP Size: $TP_SIZE" \
"FS URL: $FS_URL" \
"Resolution: ${WIDTH}x${HEIGHT}"
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/videos \\
-H 'Content-Type: application/json' \\
-d '{
"prompt": "${EXAMPLE_PROMPT_VISUAL}",
"model": "${MODEL_PATH}",
"seconds": 2,
"size": "${WIDTH}x${HEIGHT}",
"response_format": "url",
"nvext": {
"fps": 8,
"num_frames": ${NUM_FRAMES},
"num_inference_steps": ${NUM_INFERENCE_STEPS}
}
}'
CURL
# Launch frontend
echo "Starting Dynamo Frontend on port $HTTP_PORT..."
python3 -m dynamo.frontend \
--http-port "$HTTP_PORT" &
FRONTEND_PID=$!
sleep 2
......@@ -146,4 +133,7 @@ python3 -m dynamo.sglang \
--trust-remote-code \
--skip-tokenizer-init \
--enable-metrics \
"${EXTRA_ARGS[@]}"
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
......@@ -11,14 +17,6 @@ export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
ENABLE_OTEL=false
EXTRA_ARGS=()
......@@ -57,7 +55,6 @@ fi
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker
# Additional command line args can be passed
......@@ -68,4 +65,7 @@ python3 -m dynamo.trtllm \
--modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
"${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}"
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
......@@ -9,19 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID 2>/dev/null || true
wait $DYNAMO_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# Run worker
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
......@@ -30,4 +26,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
--publish-events-and-metrics
--publish-events-and-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
......@@ -9,20 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml"}
export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
#######################################
# Best-effort shutdown of the background frontend started by this script.
# Globals:   DYNAMO_PID (read) - PID recorded right after the
#            `python3 -m dynamo.frontend --router-mode kv &` launch; may
#            be unset if the trap fires before that point.
# Outputs:   progress messages on stdout.
# Returns:   status of the final echo; kill/wait failures are ignored.
#######################################
cleanup() {
  echo "Cleaning up background processes..."
  # An empty expansion would reduce `wait` to its no-operand form, which
  # waits for all background jobs; skip the teardown in that case.
  if [ -n "${DYNAMO_PID:-}" ]; then
    # Ignore errors on purpose: the frontend may have exited already.
    kill "$DYNAMO_PID" 2>/dev/null || true
    wait "$DYNAMO_PID" 2>/dev/null || true
  fi
  echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run worker
python3 -m dynamo.trtllm \
......@@ -30,5 +25,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
--modality "$MODALITY" \
--publish-events-and-metrics
--publish-events-and-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,30 +2,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
# Setup cleanup trap
#######################################
# Best-effort shutdown of the background frontend started by this script.
# Globals:   DYNAMO_PID (read) - PID captured via `$!` after the frontend
#            is backgrounded; may be unset if the trap fires earlier.
# Outputs:   progress messages on stdout.
# Returns:   status of the final echo; kill/wait failures are ignored.
#######################################
cleanup() {
  echo "Cleaning up background processes..."
  # Without a recorded PID there is nothing to stop, and an operand-less
  # `wait` would block on every background job instead.
  if [ -n "${DYNAMO_PID:-}" ]; then
    # Failures are swallowed deliberately (process may already be dead).
    kill "$DYNAMO_PID" 2>/dev/null || true
    wait "$DYNAMO_PID" 2>/dev/null || true
  fi
  echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run worker
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
--publish-events-and-metrics
--publish-events-and-metrics &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,28 +2,26 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
# Setup cleanup trap
#######################################
# Best-effort shutdown of the background frontend started by this script.
# Globals:   DYNAMO_PID (read) - PID captured via `$!` after the frontend
#            is backgrounded; may be unset if the trap fires earlier.
# Outputs:   progress messages on stdout.
# Returns:   status of the final echo; kill/wait failures are ignored.
#######################################
cleanup() {
  echo "Cleaning up background processes..."
  # Only act on a recorded PID; `wait` with no operand waits for all
  # background jobs, which is not what this teardown intends.
  if [ -n "${DYNAMO_PID:-}" ]; then
    # Best-effort: ignore "no such process" and similar errors.
    kill "$DYNAMO_PID" 2>/dev/null || true
    wait "$DYNAMO_PID" 2>/dev/null || true
  fi
  echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend with KV router in approximate mode (i.e. no KV events)
python3 -m dynamo.frontend --router-mode kv --no-kv-events &
DYNAMO_PID=$!
# run worker (no event publishing needed - frontend handles routing with predictive approx kv mode)
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS"
--extra-engine-args "$AGG_ENGINE_ARGS" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
......@@ -14,15 +20,6 @@ export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
#######################################
# Best-effort shutdown of the background processes this script recorded.
# Globals:   DYNAMO_PID (read)  - PID saved after the frontend launch.
#            PREFILL_PID (read) - PID saved after the prefill worker launch.
#            Either may be unset if the trap fires before that launch.
# Outputs:   progress messages on stdout.
# Returns:   status of the final echo; kill/wait failures are ignored.
#######################################
cleanup() {
  local pid
  echo "Cleaning up background processes..."
  # Signal every recorded process first, then reap them. Empty entries
  # are skipped so that `wait` is never called without an operand (which
  # would block on all background jobs).
  for pid in "${DYNAMO_PID:-}" "${PREFILL_PID:-}"; do
    [ -n "$pid" ] || continue
    kill "$pid" 2>/dev/null || true
  done
  for pid in "${DYNAMO_PID:-}" "${PREFILL_PID:-}"; do
    [ -n "$pid" ] || continue
    wait "$pid" 2>/dev/null || true
  done
  echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
ENABLE_OTEL=false
while [[ $# -gt 0 ]]; do
case $1 in
......@@ -59,7 +56,6 @@ fi
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -69,7 +65,6 @@ OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIB
--modality "$MODALITY" \
--disaggregation-mode prefill \
"${TRACE_ARGS[@]}" &
PREFILL_PID=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -78,4 +73,7 @@ OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \
--disaggregation-mode decode \
"${TRACE_ARGS[@]}"
"${TRACE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,6 +6,12 @@
# GPU 0: Encode (vision encoder)
# GPU 0: PD worker (prefill + decode, TP=1)
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
......@@ -21,19 +27,9 @@ export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
# Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10)
EXTRA_PD_ARGS=("$@")
# Setup cleanup trap
#######################################
# Best-effort shutdown of the background processes this script recorded.
# Globals:   DYNAMO_PID (read) - PID saved after the frontend launch.
#            ENCODE_PID (read) - PID saved after the encode worker launch.
#            PD_PID_1 (read)   - PID saved after the PD worker launch.
#            Any of them may be unset if the trap fires before that launch.
# Outputs:   progress messages on stdout.
# Returns:   status of the final echo; kill/wait failures are ignored.
#######################################
cleanup() {
  local pid
  echo "Cleaning up background processes..."
  # Signal every recorded process first, then reap them. Empty entries
  # are skipped so that `wait` is never called without an operand (which
  # would block on all background jobs).
  for pid in "${DYNAMO_PID:-}" "${ENCODE_PID:-}" "${PD_PID_1:-}"; do
    [ -n "$pid" ] || continue
    kill "$pid" 2>/dev/null || true
  done
  for pid in "${DYNAMO_PID:-}" "${ENCODE_PID:-}" "${PD_PID_1:-}"; do
    [ -n "$pid" ] || continue
    wait "$pid" 2>/dev/null || true
  done
  echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run encode worker (vision encoder on GPU 0)
CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -44,7 +40,6 @@ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
--max-file-size-mb "$MAX_FILE_SIZE_MB" \
--disaggregation-mode encode &
ENCODE_PID=$!
# run PD worker 1 (GPU 0)
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.trtllm \
......@@ -55,6 +50,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.trtllm \
--encode-endpoint "$ENCODE_ENDPOINT" \
--disaggregation-mode prefill_and_decode \
"${EXTRA_PD_ARGS[@]}" &
PD_PID_1=$!
wait $DYNAMO_PID
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
......@@ -12,20 +18,9 @@ export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export MODALITY=${MODALITY:-"multimodal"}
# Setup cleanup trap
#######################################
# Best-effort shutdown of the background processes this script recorded.
# Globals:   DYNAMO_PID (read)  - PID saved after the frontend launch.
#            PREFILL_PID (read) - PID saved after the prefill worker launch.
#            Either may be unset if the trap fires before that launch.
# Outputs:   progress messages on stdout.
# Returns:   status of the final echo; kill/wait failures are ignored.
#######################################
cleanup() {
  local pid
  echo "Cleaning up background processes..."
  # Terminate first, reap second. Unrecorded (empty) PIDs are skipped so
  # an operand-less `wait`, which waits for every background job, can
  # never be issued from here.
  for pid in "${DYNAMO_PID:-}" "${PREFILL_PID:-}"; do
    [ -n "$pid" ] || continue
    kill "$pid" 2>/dev/null || true
  done
  for pid in "${DYNAMO_PID:-}" "${PREFILL_PID:-}"; do
    [ -n "$pid" ] || continue
    wait "$pid" 2>/dev/null || true
  done
  echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -34,7 +29,6 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
--modality "$MODALITY" \
--disaggregation-mode prefill &
PREFILL_PID=$!
# run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -42,4 +36,7 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \
--disaggregation-mode decode
--disaggregation-mode decode &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment