feat: refactor launch scripts with shared launch_utils.sh for consistent failure handling (#7008)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat: refactor launch scripts with shared launch_utils.sh for consistent failure handling (#7008)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
34ccc0b1 · Keiven C · GitHub · c95bfc2e · 34ccc0b1 · 34ccc0b1
Unverified commit 34ccc0b1, authored Mar 11, 2026 by Keiven C and committed by GitHub on Mar 11, 2026
18 changed files
--- a/examples/backends/vllm/launch/agg_spec_decoding.sh
+++ b/examples/backends/vllm/launch/agg_spec_decoding.sh
@@ -4,26 +4,12 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 MODEL="meta-llama/Meta-Llama-3.1-8B-Instruct"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Speculative Decoding (1 GPU)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+print_launch_banner "Launching Speculative Decoding (1 GPU)" "$MODEL" "$HTTP_PORT"

 # ---------------------------
 # 1. Frontend (Ingress)
@@ -45,4 +31,7 @@ CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \
        "num_speculative_tokens": 2,
        "method": "eagle3"
    }' \
-    --gpu-memory-utilization 0.8
\ No newline at end of file
+    --gpu-memory-utilization 0.8 &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/dep.sh
+++ b/examples/backends/vllm/launch/dep.sh
@@ -8,24 +8,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
 MODEL="Qwen/Qwen3-30B-A3B"

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Data Parallel / Expert Parallelism (4 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+print_launch_banner "Launching Data Parallel / Expert Parallelism (4 GPUs)" "$MODEL" "$HTTP_PORT"
+

 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
@@ -47,4 +34,5 @@ python3 -m dynamo.vllm \
 --kv-events-config "{\"publisher\":\"zmq\",\"topic\":\"kv-events\",\"endpoint\":\"tcp://*:20080\",\"enable_kv_cache_events\":true}" &

 echo "All workers starting. (press Ctrl+C to stop)..."
-wait
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg.sh
+++ b/examples/backends/vllm/launch/disagg.sh
@@ -7,25 +7,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
 # Common configuration
 MODEL="Qwen/Qwen3-0.6B"

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated Serving (2 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_PORT"

 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
@@ -46,4 +32,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --enforce-eager \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_flexkv.sh
+++ b/examples/backends/vllm/launch/disagg_flexkv.sh
@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT

 MODEL="Qwen/Qwen3-0.6B"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated Serving + FlexKV (2 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+print_launch_banner "Launching Disaggregated Serving + FlexKV (2 GPUs)" "$MODEL" "$HTTP_PORT"
+

 # Run frontend
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
@@ -42,4 +29,7 @@ CUDA_VISIBLE_DEVICES=1 \
  --model $MODEL \
  --is-prefill-worker \
  --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
-  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
\ No newline at end of file
+  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_kvbm.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm.sh
@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT

 MODEL="Qwen/Qwen3-0.6B"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated Serving + KVBM (2 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+print_launch_banner "Launching Disaggregated Serving + KVBM (2 GPUs)" "$MODEL" "$HTTP_PORT"
+

 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
@@ -43,4 +30,7 @@ CUDA_VISIBLE_DEVICES=1 \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
    --enforce-eager \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT

 MODEL="Qwen/Qwen3-0.6B"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated + KVBM 2P+2D (4 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+print_launch_banner "Launching Disaggregated + KVBM 2P+2D (4 GPUs)" "$MODEL" "$HTTP_PORT"
+

 # run ingress with KV router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
@@ -58,4 +45,7 @@ CUDA_VISIBLE_DEVICES=3 \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
    --enforce-eager \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_kvbm_router.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm_router.sh
@@ -11,24 +11,11 @@ export PYTHONHASHSEED=0
 MODEL="Qwen/Qwen3-0.6B"

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated + KVBM + KV Routing (4 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+print_launch_banner "Launching Disaggregated + KVBM + KV Routing (4 GPUs)" "$MODEL" "$HTTP_PORT"
+

 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend \
@@ -72,4 +59,7 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
    --enforce-eager \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_lmcache.sh
+++ b/examples/backends/vllm/launch/disagg_lmcache.sh
@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT

 MODEL="Qwen/Qwen3-0.6B"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated Serving + LMCache (2 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+print_launch_banner "Launching Disaggregated Serving + LMCache (2 GPUs)" "$MODEL" "$HTTP_PORT"
+

 # run ingress with KV router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
@@ -42,4 +29,7 @@ CUDA_VISIBLE_DEVICES=1 \
    --model "$MODEL" \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
@@ -4,6 +4,9 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 # Default values
 MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
 SINGLE_GPU=false
@@ -58,35 +61,12 @@ PD_MAX_MODEL_LEN="16384"


 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
 if [[ "$SINGLE_GPU" == "true" ]]; then
    GPU_LABEL="1 GPU"
 else
    GPU_LABEL="2 GPUs"
 fi
-echo "Launching Disaggregated Multimodal E+PD ($GPU_LABEL)"
-echo "=========================================="
-echo "Model:       $MODEL_NAME"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL_NAME}\","
-echo "      \"messages\": [{"
-echo "        \"role\": \"user\","
-echo "        \"content\": ["
-echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
-echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
-echo "        ]"
-echo "      }],"
-echo "      \"max_tokens\": 50"
-echo "    }'"
-echo ""
-echo "=========================================="
+print_launch_banner --multimodal "Launching Disaggregated Multimodal E+PD ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"


 # Start frontend (no router mode)
@@ -102,6 +82,7 @@ EXTRA_ARGS=""
 export DYN_VLLM_EMBEDDING_TRANSFER_MODE=${DYN_VLLM_EMBEDDING_TRANSFER_MODE:-"local"}

 # GPU assignments (override via environment variables)
+# In single-GPU mode both workers share the same GPU.
 if [[ "$SINGLE_GPU" == "true" ]]; then
    DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
    DYN_PD_WORKER_GPU=${DYN_PD_WORKER_GPU:-0}
@@ -143,5 +124,5 @@ echo "=================================================="
 echo "All components started. Waiting for initialization..."
 echo "=================================================="

-# Wait for all background processes to complete
-wait
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_epd.sh
@@ -4,6 +4,9 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 # Default values
 MODEL_NAME="llava-hf/llava-1.5-7b-hf"

@@ -57,35 +60,12 @@ done


 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
 if [[ "$SINGLE_GPU" == "true" ]]; then
    GPU_LABEL="1 GPU"
 else
    GPU_LABEL="3 GPUs"
 fi
-echo "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)"
-echo "=========================================="
-echo "Model:       $MODEL_NAME"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL_NAME}\","
-echo "      \"messages\": [{"
-echo "        \"role\": \"user\","
-echo "        \"content\": ["
-echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
-echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
-echo "        ]"
-echo "      }],"
-echo "      \"max_tokens\": 50"
-echo "    }'"
-echo ""
-echo "=========================================="
+print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"


 # Start frontend (no router mode)
@@ -135,5 +115,5 @@ echo "=================================================="
 echo "All components started. Waiting for initialization..."
 echo "=================================================="

-# Wait for all background processes to complete
-wait
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_multimodal_llama.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_llama.sh
@@ -3,6 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 set -ex

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 # Default values
 HEAD_NODE=0
 MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
@@ -48,34 +51,11 @@ done
 trap 'echo Cleaning up...; kill 0' EXIT

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated Multimodal Llama 4 (Multi-Node)"
-echo "=========================================="
-echo "Model:       $MODEL_NAME"
-if [[ $HEAD_NODE -eq 1 ]]; then
-echo "Frontend:    http://localhost:$HTTP_PORT"
-fi
-echo "=========================================="
 if [[ $HEAD_NODE -eq 1 ]]; then
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL_NAME}\","
-echo "      \"messages\": [{"
-echo "        \"role\": \"user\","
-echo "        \"content\": ["
-echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
-echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
-echo "        ]"
-echo "      }],"
-echo "      \"max_tokens\": 50"
-echo "    }'"
-echo ""
+    print_launch_banner --multimodal "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT"
+else
+    print_launch_banner --no-curl "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT"
 fi
-echo "=========================================="

 # Use TCP transport to avoid NATS payload limits for multimodal
 export DYN_REQUEST_PLANE=tcp
@@ -121,5 +101,5 @@ else
        "${EXTRA_ARGS[@]}" &
 fi

-# Wait for all background processes to complete
-wait
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_router.sh
+++ b/examples/backends/vllm/launch/disagg_router.sh
@@ -4,6 +4,9 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 # Set deterministic hash for KV event IDs
 export PYTHONHASHSEED=0

@@ -12,24 +15,9 @@ MODEL="Qwen/Qwen3-0.6B"
 BLOCK_SIZE=64

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated + KV Routing (4 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+
+print_launch_banner "Launching Disaggregated + KV Routing (4 GPUs)" "$MODEL" "$HTTP_PORT"
+

 # Start frontend with KV routing
 # The frontend will automatically detect prefill workers and activate an internal prefill router
@@ -74,4 +62,7 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
    --enforce-eager \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_router_gaudi.sh
+++ b/examples/backends/vllm/launch/disagg_router_gaudi.sh
@@ -4,6 +4,9 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 # Set deterministic hash for KV event IDs
 export PYTHONHASHSEED=0

@@ -17,24 +20,9 @@ NIXL_BUFFER_DEVICE=cpu
 VLLM_NIXL_BACKEND=UCX

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated + KV Routing on Gaudi (4 HPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+
+print_launch_banner "Launching Disaggregated + KV Routing on Gaudi (4 HPUs)" "$MODEL" "$HTTP_PORT"
+

 # Start frontend with KV routing
 # The frontend will automatically detect prefill workers and activate an internal prefill router
@@ -77,4 +65,7 @@ HABANA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
    --block-size $BLOCK_SIZE \
    --kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
    --disaggregation-mode prefill \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/vllm/launch/disagg_same_gpu.sh
@@ -14,6 +14,9 @@
 #   The ~1.3 GiB pad comes from the overhead term (CUDA ctx + activations).
 #   Overestimating is intentional -- better to pad than OOM.

+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source "$SCRIPT_DIR/../../../common/gpu_utils.sh"

@@ -35,42 +38,17 @@ else
    GPU_MEM_FRACTION=$(gpu_worker_fraction vllm)
 fi

-# Setup cleanup trap
-cleanup() {
-    echo "Cleaning up background processes..."
-    kill $DYNAMO_PID $DECODE_PID 2>/dev/null || true
-    wait $DYNAMO_PID $DECODE_PID 2>/dev/null || true
-    echo "Cleanup complete."
-}
-trap cleanup EXIT INT TERM
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Disaggregated on Same GPU (1 GPU)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "Max seq len: $MAX_MODEL_LEN"
-echo "GPU Mem:     ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
-echo "  estimate:  weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
-echo "=========================================="
-echo ""
-echo "Example test command:"
-echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
-echo ""
-echo "=========================================="
+print_launch_banner "Launching Disaggregated on Same GPU (1 GPU)" "$MODEL" "$HTTP_PORT" \
+    "Max seq len: $MAX_MODEL_LEN" \
+    "GPU Mem:     ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)" \
+    "  estimate:  weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"

 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python3 -m dynamo.frontend &
-DYNAMO_PID=$!

 # run decode worker with metrics on port 8081
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
@@ -85,7 +63,6 @@ python3 -m dynamo.vllm \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
  --gpu-memory-utilization "${GPU_MEM_FRACTION}" \
  --max-model-len "$MAX_MODEL_LEN" &
-DECODE_PID=$!

 # Wait for decode worker to initialize before starting prefill worker
 # This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
@@ -96,7 +73,7 @@ DECODE_PID=$!
 echo "Waiting for decode worker to initialize..."
 sleep 10

-# run prefill worker with metrics on port 8082 (foreground)
+# run prefill worker with metrics on port 8082
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 CUDA_VISIBLE_DEVICES=0 \
@@ -107,4 +84,7 @@ python3 -m dynamo.vllm \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
  --gpu-memory-utilization "${GPU_MEM_FRACTION}" \
  --max-model-len "$MAX_MODEL_LEN" \
-  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
+  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/dsr1_dep.sh
+++ b/examples/backends/vllm/launch/dsr1_dep.sh
@@ -4,6 +4,9 @@

 set -ex

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
 # Default values
 NUM_NODES=""
 NODE_RANK=""
@@ -71,31 +74,26 @@ fi
 DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE))

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching DeepSeek-R1 Data Parallel (Multi-Node)"
-echo "=========================================="
-echo "Model:       $MODEL"
-if [ "$NODE_RANK" -eq 0 ]; then
-echo "Frontend:    http://localhost:$HTTP_PORT"
-fi
-echo "Number of nodes: $NUM_NODES"
-echo "Node rank:       $NODE_RANK"
-echo "GPUs per node:   $GPUS_PER_NODE"
-echo "Data parallel:   $DATA_PARALLEL_SIZE"
-echo "Master address:  $MASTER_ADDR"
-echo "Log directory:   $LOG_DIR"
-echo "=========================================="
+print_launch_banner --no-curl "Launching DeepSeek-R1 Data Parallel (Multi-Node)" "$MODEL" "$HTTP_PORT" \
+    "Number of nodes: $NUM_NODES" \
+    "Node rank:       $NODE_RANK" \
+    "GPUs per node:   $GPUS_PER_NODE" \
+    "Data parallel:   $DATA_PARALLEL_SIZE" \
+    "Master address:  $MASTER_ADDR" \
+    "Log directory:   $LOG_DIR"
 if [ "$NODE_RANK" -eq 0 ]; then
 echo ""
 echo "Example test command:"
 echo ""
-echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{"
-echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
-echo "      \"max_tokens\": 32"
-echo "    }'"
+cat <<CURL_EOF
+  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
+    -H 'Content-Type: application/json' \\
+    -d '{
+      "model": "${MODEL}",
+      "messages": [{"role": "user", "content": "${EXAMPLE_PROMPT}"}],
+      "max_tokens": 32
+    }'
+CURL_EOF
 echo ""
 echo "=========================================="
 fi
@@ -136,4 +134,5 @@ python3 -m dynamo.vllm \
 --kv-events-config "{\"publisher\":\"zmq\",\"topic\":\"kv-events\",\"endpoint\":\"tcp://*:20080\",\"enable_kv_cache_events\":true}" 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_start_rank}.log &

 echo "All workers starting. (press Ctrl+C to stop)..."
-wait
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/lora/agg_lora.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora.sh
@@ -4,6 +4,9 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
 export AWS_ENDPOINT=http://localhost:9000
 export AWS_ACCESS_KEY_ID=minioadmin
 export AWS_SECRET_ACCESS_KEY=minioadmin
@@ -19,12 +22,7 @@ mkdir -p $DYN_LORA_PATH
 MODEL="Qwen/Qwen3-0.6B"
 SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Aggregated Serving + LoRA (1 GPU)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
+print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
 echo ""
 echo "Once running, test with:"
 echo ""
@@ -65,4 +63,7 @@ python -m dynamo.frontend &
 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
    --enable-lora \
-    --max-lora-rank 64
+    --max-lora-rank 64 &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/lora/agg_lora_router.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora_router.sh
@@ -4,6 +4,9 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
 export AWS_ENDPOINT=http://localhost:9000
 export AWS_ACCESS_KEY_ID=minioadmin
 export AWS_SECRET_ACCESS_KEY=minioadmin
@@ -26,12 +29,7 @@ BLOCK_SIZE=64
 SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
 SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "=========================================="
-echo "Launching Aggregated + LoRA + KV Routing (2 GPUs)"
-echo "=========================================="
-echo "Model:       $MODEL"
-echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "=========================================="
+print_launch_banner --no-curl "Launching Aggregated + LoRA + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
 echo ""
 echo "Once running, test with:"
 echo ""
@@ -83,11 +81,11 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --enforce-eager \
    --enable-lora \
    --max-lora-rank 64 \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &

 # Sample output after running LoRA inference curl request twice.
- # usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
- : <<'SAMPLE_OUTPUT'
+# usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
+: <<'SAMPLE_OUTPUT'
 {
  "id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
  "choices": [
@@ -121,3 +119,6 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
  }
 }
 SAMPLE_OUTPUT
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/common/launch_utils.sh
+++ b/examples/common/launch_utils.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Shared launch utilities for example scripts.
+#
+# Goal:
+#   Unify behavior and reduce duplication across vLLM, SGLang, and TensorRT-LLM
+#   example launch scripts so they share one pattern for banners, process
+#   management, and example curl output.
+#
+# Benefits:
+#   - Single place to change banner format, example prompts, and wait/cleanup logic
+#   - Consistent UX: same startup output and exit behavior across all backends
+#   - Less per-script boilerplate (no manual PID tracking or custom cleanup traps)
+#   - wait_any_exit propagates the exit code of the first child to exit (zero
+#     or non-zero) and lets the EXIT trap tear down the rest, so failures and
+#     Ctrl+C behave predictably
+#
+# Usage:
+#   source "$(dirname "$(readlink -f "$0")")/../common/launch_utils.sh"
+#   # or with SCRIPT_DIR already set:
+#   source "$SCRIPT_DIR/../common/launch_utils.sh"
+#
+# Constants:
+#   EXAMPLE_PROMPT         Default example prompt for curl commands (LLM / embedding)
+#   EXAMPLE_PROMPT_VISUAL  Default example prompt for image / video generation
+#
+# Requires: bash 4.3+ (wait -n)
+#
+# Functions:
+#   print_launch_banner    Print startup banner with model info and example curl
+#   print_curl_footer      Print a custom curl example with standard framing (heredoc)
+#   wait_any_exit          Wait for any background process to exit, propagate its code
+
+if [[ "${BASH_VERSINFO[0]}" -lt 4 || ( "${BASH_VERSINFO[0]}" -eq 4 && "${BASH_VERSINFO[1]}" -lt 3 ) ]]; then
+    echo "launch_utils.sh requires bash 4.3+ (for wait -n), found ${BASH_VERSION}" >&2
+    exit 1
+fi
+
+EXAMPLE_PROMPT="Who is the tennis GOAT: Federer, Djokovic, or Nadal?"
+EXAMPLE_PROMPT_VISUAL="A golden retriever riding a skateboard through a neon-lit city"
+
+# wait_any_exit
+#
+# Waits for ANY backgrounded process to exit and propagates its exit code.
+# Call this as the LAST line of every launch script, after backgrounding
+# all processes (including the one that would otherwise run in the foreground).
+#
+# Why this is better than tracking PIDs manually or running in the foreground:
+#   Foreground pattern:  if the frontend crashes, the script blocks on the
+#   foreground worker and never notices until that worker also exits.
+#   Manual PIDs:  requires bookkeeping ($DYNAMO_PID, $PREFILL_PID, ...),
+#   a custom cleanup() function, and `wait $PID` only watches one process.
+#   wait -n watches ALL children and returns as soon as ANY child dies, so
+#   failures are detected immediately regardless of which process it was.
+#
+# Signal handling:
+#   SIGTERM/SIGINT are trapped to exit 0 (clean shutdown).  Without this,
+#   external cleanup (e.g. a test harness sending SIGTERM to the process
+#   group) interrupts wait -n, which returns 143 (128+15).  Combined with
+#   set -e, that non-zero code looks like a test failure.  Trapping TERM/INT
+#   makes external teardown exit cleanly while still propagating real errors
+#   (OOM, Python exceptions, etc.) from child processes.
+#
+# The EXIT trap (set at the top of each script) still fires when this function
+# calls exit, tearing down the remaining processes via kill 0.
+#
+# Usage:
+#   python -m dynamo.frontend &
+#   python -m dynamo.vllm --model "$MODEL" &
+#   wait_any_exit
+wait_any_exit() {
+    # Treat external teardown (SIGTERM / Ctrl+C) as a clean shutdown.
+    trap 'exit 0' TERM INT
+
+    # Nothing was backgrounded, so there is nothing to wait for; report the
+    # likely script bug instead of exiting silently.
+    if ! jobs -p | grep -q .; then
+        echo "wait_any_exit: no background processes found (script bug: did you forget '&'?)" >&2
+        exit 1
+    fi
+
+    # Capture the status through `||`: the launch scripts run under `set -e`,
+    # and a bare `wait -n` returning non-zero would abort the shell right
+    # here, before the status is recorded and the message below is printed.
+    local _rc=0
+    wait -n || _rc=$?
+    echo "A background process exited with code $_rc"
+
+    # Exit (rather than return) so the caller's EXIT trap runs with this code.
+    exit "$_rc"
+}
+
+# print_launch_banner [flags] <title> <model> <port> [extra_info_lines...]
+#
+# Prints a startup banner with model/frontend info and an example curl command.
+#
+# Flags (must come before positional args):
+#   --multimodal       Use a multimodal (image_url) curl example (max_tokens=50)
+#   --max-tokens N     Override max_tokens in the curl example (default: 32)
+#   --no-curl          Print only the banner, skip the example curl section
+#
+# Positional args:
+#   title              Banner title, e.g. "Launching Aggregated Serving (1 GPU)"
+#   model              Model name, e.g. "$MODEL"
+#   port               HTTP port, e.g. "$HTTP_PORT"
+#   extra_info_lines   Optional extra lines printed below "Frontend:" (one per arg)
+#
+# Examples:
+#   # Standard text serving
+#   print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
+#
+#   # With extra info
+#   print_launch_banner "Launching Disagg on Same GPU" "$MODEL" "$HTTP_PORT" \
+#       "GPU Mem:     0.09 per worker (4 GiB each)"
+#
+#   # Multimodal
+#   print_launch_banner --multimodal "Launching Multimodal" "$MODEL" "$HTTP_PORT"
+#
+#   # Banner only (script prints its own curl or conditionally skips)
+#   print_launch_banner --no-curl "Launching DSR1 (Multi-Node)" "$MODEL" "$HTTP_PORT" \
+#       "Nodes:       $NUM_NODES" \
+#       "Node rank:   $NODE_RANK"
+print_launch_banner() {
+    local _curl_type="text"
+    local _max_tokens=32
+    local _no_curl=false
+    # Loop variable for the extra info lines; declared local so it does not
+    # leak into the script that sourced this file.
+    local _line
+
+    # Consume leading flags; stop at the first argument that is not a
+    # recognized flag.
+    while [[ "${1:-}" == --* ]]; do
+        case "$1" in
+            --multimodal) _curl_type="multimodal"; _max_tokens=50; shift ;;
+            --max-tokens)
+                # Without a value, `shift 2` would fail with no explanation.
+                if [[ $# -lt 2 ]]; then
+                    echo "print_launch_banner: --max-tokens requires a value" >&2
+                    return 1
+                fi
+                _max_tokens="$2"; shift 2 ;;
+            --no-curl)    _no_curl=true; shift ;;
+            *) break ;;
+        esac
+    done
+
+    # Returns 1 with a message on stderr when title/model/port are missing,
+    # instead of failing silently on `shift 3`.
+    if [[ $# -lt 3 ]]; then
+        echo "print_launch_banner: expected <title> <model> <port>, got $# positional argument(s)" >&2
+        return 1
+    fi
+
+    local _title="$1"
+    local _model="$2"
+    local _port="$3"
+    shift 3
+
+    echo "=========================================="
+    echo "$_title"
+    echo "=========================================="
+    echo "Model:       $_model"
+    echo "Frontend:    http://localhost:$_port"
+    # Remaining positional args are caller-formatted extra info lines.
+    for _line in "$@"; do
+        echo "$_line"
+    done
+    echo "=========================================="
+
+    # --no-curl: the caller prints its own example (or none at all).
+    if [[ "$_no_curl" == true ]]; then
+        return
+    fi
+
+    echo ""
+    echo "Example test command:"
+    echo ""
+
+    # Unquoted heredoc delimiters: ${...} is expanded and each `\\` is
+    # emitted as a single backslash (a line continuation in the printed curl).
+    if [[ "$_curl_type" == "multimodal" ]]; then
+        cat <<CURL_EOF
+  curl http://localhost:${_port}/v1/chat/completions \\
+    -H 'Content-Type: application/json' \\
+    -d '{
+      "model": "${_model}",
+      "messages": [{"role": "user", "content": [
+        {"type": "text", "text": "Describe this image"},
+        {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png"}}
+      ]}],
+      "max_tokens": ${_max_tokens}
+    }'
+CURL_EOF
+    else
+        cat <<CURL_EOF
+  curl http://localhost:${_port}/v1/chat/completions \\
+    -H 'Content-Type: application/json' \\
+    -d '{
+      "model": "${_model}",
+      "messages": [{"role": "user", "content": "Hello!"}],
+      "max_tokens": ${_max_tokens}
+    }'
+CURL_EOF
+    fi
+
+    echo ""
+    echo "=========================================="
+}
+
+# print_curl_footer
+#
+# Prints a custom curl example wrapped in the standard framing (matching
+# print_launch_banner's built-in curl output). Reads the curl command from
+# stdin so callers can use a heredoc -- no quoting issues with embedded
+# double quotes, variable interpolation, etc.
+#
+# Pair with print_launch_banner --no-curl for non-standard endpoints
+# (images, video, embeddings, etc.) that need a custom request body.
+#
+# Usage:
+#   print_launch_banner --no-curl "Launching Image Diffusion" "$MODEL" "$PORT"
+#   print_curl_footer <<CURL
+#   curl http://localhost:${PORT}/v1/images/generations \\
+#     -H 'Content-Type: application/json' \\
+#     -d '{
+#       "model": "${MODEL}",
+#       "prompt": "A cat on a skateboard"
+#     }'
+#   CURL
+print_curl_footer() {
+    # Heading: blank line, title, blank line.
+    printf '\n%s\n\n' "Example test command:"
+    # Copy the caller-supplied curl example from stdin to stdout unchanged.
+    cat
+    # Closing frame: blank line followed by the rule.
+    printf '\n%s\n' "=========================================="
+}