Unverified commit 34ccc0b1, authored by Keiven C, committed by GitHub
Browse files

feat: refactor launch scripts with shared launch_utils.sh for consistent failure handling (#7008)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent c95bfc2e
......@@ -4,26 +4,12 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
MODEL="meta-llama/Meta-Llama-3.1-8B-Instruct"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Speculative Decoding (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Speculative Decoding (1 GPU)" "$MODEL" "$HTTP_PORT"
# ---------------------------
# 1. Frontend (Ingress)
......@@ -45,4 +31,7 @@ CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \
"num_speculative_tokens": 2,
"method": "eagle3"
}' \
--gpu-memory-utilization 0.8
\ No newline at end of file
--gpu-memory-utilization 0.8 &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -8,24 +8,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-30B-A3B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Data Parallel / Expert Parallelism (4 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Data Parallel / Expert Parallelism (4 GPUs)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -47,4 +34,5 @@ python3 -m dynamo.vllm \
--kv-events-config "{\"publisher\":\"zmq\",\"topic\":\"kv-events\",\"endpoint\":\"tcp://*:20080\",\"enable_kv_cache_events\":true}" &
echo "All workers starting. (press Ctrl+C to stop)..."
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -7,25 +7,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated Serving (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -46,4 +32,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--enforce-eager \
--disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated Serving + FlexKV (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Disaggregated Serving + FlexKV (2 GPUs)" "$MODEL" "$HTTP_PORT"
# Run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -42,4 +29,7 @@ CUDA_VISIBLE_DEVICES=1 \
--model $MODEL \
--is-prefill-worker \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
\ No newline at end of file
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated Serving + KVBM (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Disaggregated Serving + KVBM (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -43,4 +30,7 @@ CUDA_VISIBLE_DEVICES=1 \
--disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated + KVBM 2P+2D (4 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Disaggregated + KVBM 2P+2D (4 GPUs)" "$MODEL" "$HTTP_PORT"
# run ingress with KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -58,4 +45,7 @@ CUDA_VISIBLE_DEVICES=3 \
--disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -11,24 +11,11 @@ export PYTHONHASHSEED=0
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated + KVBM + KV Routing (4 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Disaggregated + KVBM + KV Routing (4 GPUs)" "$MODEL" "$HTTP_PORT"
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
......@@ -72,4 +59,7 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
--enforce-eager \
--disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated Serving + LMCache (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Disaggregated Serving + LMCache (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run ingress with KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -42,4 +29,7 @@ CUDA_VISIBLE_DEVICES=1 \
--model "$MODEL" \
--disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
SINGLE_GPU=false
......@@ -58,35 +61,12 @@ PD_MAX_MODEL_LEN="16384"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
if [[ "$SINGLE_GPU" == "true" ]]; then
GPU_LABEL="1 GPU"
else
GPU_LABEL="2 GPUs"
fi
echo "Launching Disaggregated Multimodal E+PD ($GPU_LABEL)"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{"
echo " \"role\": \"user\","
echo " \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
echo " ]"
echo " }],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --multimodal "Launching Disaggregated Multimodal E+PD ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"
# Start frontend (no router mode)
......@@ -102,6 +82,7 @@ EXTRA_ARGS=""
export DYN_VLLM_EMBEDDING_TRANSFER_MODE=${DYN_VLLM_EMBEDDING_TRANSFER_MODE:-"local"}
# GPU assignments (override via environment variables)
# In single-GPU mode both workers share the same GPU.
if [[ "$SINGLE_GPU" == "true" ]]; then
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
DYN_PD_WORKER_GPU=${DYN_PD_WORKER_GPU:-0}
......@@ -143,5 +124,5 @@ echo "=================================================="
echo "All components started. Waiting for initialization..."
echo "=================================================="
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
......@@ -57,35 +60,12 @@ done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
if [[ "$SINGLE_GPU" == "true" ]]; then
GPU_LABEL="1 GPU"
else
GPU_LABEL="3 GPUs"
fi
echo "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{"
echo " \"role\": \"user\","
echo " \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
echo " ]"
echo " }],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"
# Start frontend (no router mode)
......@@ -135,5 +115,5 @@ echo "=================================================="
echo "All components started. Waiting for initialization..."
echo "=================================================="
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -3,6 +3,9 @@
# SPDX-License-Identifier: Apache-2.0
set -ex
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
HEAD_NODE=0
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
......@@ -48,34 +51,11 @@ done
trap 'echo Cleaning up...; kill 0' EXIT
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated Multimodal Llama 4 (Multi-Node)"
echo "=========================================="
echo "Model: $MODEL_NAME"
if [[ $HEAD_NODE -eq 1 ]]; then
echo "Frontend: http://localhost:$HTTP_PORT"
fi
echo "=========================================="
if [[ $HEAD_NODE -eq 1 ]]; then
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{"
echo " \"role\": \"user\","
echo " \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
echo " ]"
echo " }],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
print_launch_banner --multimodal "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT"
else
print_launch_banner --no-curl "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT"
fi
echo "=========================================="
# Use TCP transport to avoid NATS payload limits for multimodal
export DYN_REQUEST_PLANE=tcp
......@@ -121,5 +101,5 @@ else
"${EXTRA_ARGS[@]}" &
fi
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
......@@ -12,24 +15,9 @@ MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated + KV Routing (4 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Disaggregated + KV Routing (4 GPUs)" "$MODEL" "$HTTP_PORT"
# Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router
......@@ -74,4 +62,7 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--enforce-eager \
--disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
......@@ -17,24 +20,9 @@ NIXL_BUFFER_DEVICE=cpu
VLLM_NIXL_BACKEND=UCX
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated + KV Routing on Gaudi (4 HPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Disaggregated + KV Routing on Gaudi (4 HPUs)" "$MODEL" "$HTTP_PORT"
# Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router
......@@ -77,4 +65,7 @@ HABANA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -14,6 +14,9 @@
# The ~1.3 GiB pad comes from the overhead term (CUDA ctx + activations).
# Overestimating is intentional -- better to pad than OOM.
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
......@@ -35,42 +38,17 @@ else
GPU_MEM_FRACTION=$(gpu_worker_fraction vllm)
fi
# Setup cleanup trap
cleanup() {
echo "Cleaning up background processes..."
kill $DYNAMO_PID $DECODE_PID 2>/dev/null || true
wait $DYNAMO_PID $DECODE_PID 2>/dev/null || true
echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Disaggregated on Same GPU (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "Max seq len: $MAX_MODEL_LEN"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
echo " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Disaggregated on Same GPU (1 GPU)" "$MODEL" "$HTTP_PORT" \
"Max seq len: $MAX_MODEL_LEN" \
"GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)" \
" estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run decode worker with metrics on port 8081
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
......@@ -85,7 +63,6 @@ python3 -m dynamo.vllm \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization "${GPU_MEM_FRACTION}" \
--max-model-len "$MAX_MODEL_LEN" &
DECODE_PID=$!
# Wait for decode worker to initialize before starting prefill worker
# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
......@@ -96,7 +73,7 @@ DECODE_PID=$!
echo "Waiting for decode worker to initialize..."
sleep 10
# run prefill worker with metrics on port 8082 (foreground)
# run prefill worker with metrics on port 8082
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=0 \
......@@ -107,4 +84,7 @@ python3 -m dynamo.vllm \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization "${GPU_MEM_FRACTION}" \
--max-model-len "$MAX_MODEL_LEN" \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -ex
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
NUM_NODES=""
NODE_RANK=""
......@@ -71,31 +74,26 @@ fi
DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE))
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching DeepSeek-R1 Data Parallel (Multi-Node)"
echo "=========================================="
echo "Model: $MODEL"
if [ "$NODE_RANK" -eq 0 ]; then
echo "Frontend: http://localhost:$HTTP_PORT"
fi
echo "Number of nodes: $NUM_NODES"
echo "Node rank: $NODE_RANK"
echo "GPUs per node: $GPUS_PER_NODE"
echo "Data parallel: $DATA_PARALLEL_SIZE"
echo "Master address: $MASTER_ADDR"
echo "Log directory: $LOG_DIR"
echo "=========================================="
print_launch_banner --no-curl "Launching DeepSeek-R1 Data Parallel (Multi-Node)" "$MODEL" "$HTTP_PORT" \
"Number of nodes: $NUM_NODES" \
"Node rank: $NODE_RANK" \
"GPUs per node: $GPUS_PER_NODE" \
"Data parallel: $DATA_PARALLEL_SIZE" \
"Master address: $MASTER_ADDR" \
"Log directory: $LOG_DIR"
if [ "$NODE_RANK" -eq 0 ]; then
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
cat <<CURL_EOF
curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${MODEL}",
"messages": [{"role": "user", "content": "${EXAMPLE_PROMPT}"}],
"max_tokens": 32
}'
CURL_EOF
echo ""
echo "=========================================="
fi
......@@ -136,4 +134,5 @@ python3 -m dynamo.vllm \
--kv-events-config "{\"publisher\":\"zmq\",\"topic\":\"kv-events\",\"endpoint\":\"tcp://*:20080\",\"enable_kv_cache_events\":true}" 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_start_rank}.log &
echo "All workers starting. (press Ctrl+C to stop)..."
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
......@@ -19,12 +22,7 @@ mkdir -p $DYN_LORA_PATH
MODEL="Qwen/Qwen3-0.6B"
SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + LoRA (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
echo ""
echo "Once running, test with:"
echo ""
......@@ -65,4 +63,7 @@ python -m dynamo.frontend &
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--enable-lora \
--max-lora-rank 64
--max-lora-rank 64 &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
......@@ -26,12 +29,7 @@ BLOCK_SIZE=64
SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + LoRA + KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
print_launch_banner --no-curl "Launching Aggregated + LoRA + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
echo ""
echo "Once running, test with:"
echo ""
......@@ -83,11 +81,11 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--enforce-eager \
--enable-lora \
--max-lora-rank 64 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Sample output after running LoRA inference curl request twice.
# usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
: <<'SAMPLE_OUTPUT'
# usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
: <<'SAMPLE_OUTPUT'
{
"id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
"choices": [
......@@ -121,3 +119,6 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
}
}
SAMPLE_OUTPUT
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Shared launch utilities for example scripts.
#
# Goal:
# Unify behavior and reduce duplication across vLLM, SGLang, and TensorRT-LLM
# example launch scripts so they share one pattern for banners, process
# management, and example curl output.
#
# Benefits:
# - Single place to change banner format, example prompts, and wait/cleanup logic
# - Consistent UX: same startup output and exit behavior across all backends
# - Less per-script boilerplate (no manual PID tracking or custom cleanup traps)
# - wait_any_exit propagates the first failing child's exit code and lets the
# EXIT trap tear down the rest, so failures and Ctrl+C behave predictably
#
# Usage:
# source "$(dirname "$(readlink -f "$0")")/../common/launch_utils.sh"
# # or with SCRIPT_DIR already set:
# source "$SCRIPT_DIR/../common/launch_utils.sh"
#
# Constants:
# EXAMPLE_PROMPT Default example prompt for curl commands (LLM / embedding)
# EXAMPLE_PROMPT_VISUAL Default example prompt for image / video generation
#
# Requires: bash 4.3+ (wait -n)
#
# Functions:
# print_launch_banner Print startup banner with model info and example curl
# print_curl_footer Print a custom curl example with standard framing (heredoc)
# wait_any_exit Wait for any background process to exit, propagate its code
# Refuse to load on shells older than bash 4.3: wait_any_exit depends on
# `wait -n`, which first appeared in that release. Because this file is meant
# to be sourced, the `exit` below terminates the sourcing launch script.
if (( BASH_VERSINFO[0] < 4 || ( BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] < 3 ) )); then
    printf '%s\n' "launch_utils.sh requires bash 4.3+ (for wait -n), found ${BASH_VERSION}" >&2
    exit 1
fi

# Default prompts embedded in the example curl commands.
EXAMPLE_PROMPT="Who is the tennis GOAT: Federer, Djokovic, or Nadal?"
EXAMPLE_PROMPT_VISUAL="A golden retriever riding a skateboard through a neon-lit city"
# wait_any_exit
#
# Waits for ANY backgrounded process to exit and propagates its exit code.
# Call this as the LAST line of every launch script, after backgrounding
# all processes (including the one that would otherwise run in the foreground).
#
# Why this is better than tracking PIDs manually or running in the foreground:
#   Foreground pattern: if the frontend crashes, the script blocks on the
#     foreground worker and never notices until that worker also exits.
#   Manual PIDs: requires bookkeeping ($DYNAMO_PID, $PREFILL_PID, ...),
#     a custom cleanup() function, and `wait $PID` only watches one process.
#   wait -n watches ALL children and returns as soon as ANY child dies, so
#     failures are detected immediately regardless of which process it was.
#
# Signal handling:
#   SIGTERM/SIGINT are trapped to exit 0 (clean shutdown). Without this,
#   external cleanup (e.g. a test harness sending SIGTERM to the process
#   group) interrupts wait -n, which returns 143 (128+15). Combined with
#   set -e, that non-zero code looks like a test failure. Trapping TERM/INT
#   makes external teardown exit cleanly while still propagating real errors
#   (OOM, Python exceptions, etc.) from child processes.
#
# errexit (set -e):
#   Launch scripts run with `set -e`. The status of `wait -n` is therefore
#   captured through an `||` list; a bare `wait -n` returning non-zero would
#   terminate the shell on the spot and the "exited with code" report below
#   would never be printed for the failures it is meant to describe.
#
# The EXIT trap (set at the top of each script) still fires when this function
# calls exit, tearing down the remaining processes via kill 0.
#
# Exits (never returns):
#   1        if no background jobs exist when called
#   <status> the exit status reported by `wait -n` for the first child to finish
#   0        on SIGTERM / SIGINT
#
# Usage:
#   python -m dynamo.frontend &
#   python -m dynamo.vllm --model "$MODEL" &
#   wait_any_exit
wait_any_exit() {
    trap 'exit 0' TERM INT

    # With nothing in the background `wait -n` would return right away and
    # hide the real mistake, so fail loudly instead.
    if [[ -z "$(jobs -p)" ]]; then
        echo "wait_any_exit: no background processes found (script bug: did you forget '&'?)" >&2
        exit 1
    fi

    # `|| _rc=$?` keeps errexit from aborting before the status is reported.
    local _rc=0
    wait -n || _rc=$?

    echo "A background process exited with code $_rc"
    exit "$_rc"
}
# print_launch_banner [flags] <title> <model> <port> [extra_info_lines...]
#
# Prints a startup banner with model/frontend info and an example curl command.
#
# Flags (must come before positional args):
#   --multimodal       Use a multimodal (image_url) curl example (max_tokens=50)
#   --max-tokens N     Override max_tokens in the curl example (default: 32)
#   --no-curl          Print only the banner, skip the example curl section
#
# Positional args:
#   title              Banner title, e.g. "Launching Aggregated Serving (1 GPU)"
#   model              Model name, e.g. "$MODEL"
#   port               HTTP port, e.g. "$HTTP_PORT"
#   extra_info_lines   Optional extra lines printed below "Frontend:" (one per arg)
#
# Globals:
#   EXAMPLE_PROMPT     (read) prompt placed in the text curl example; falls
#                      back to "Hello!" when unset or empty
#
# Returns:
#   0 on success; 1 if --max-tokens is given without a value
#
# Examples:
#   # Standard text serving
#   print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
#
#   # With extra info
#   print_launch_banner "Launching Disagg on Same GPU" "$MODEL" "$HTTP_PORT" \
#       "GPU Mem: 0.09 per worker (4 GiB each)"
#
#   # Multimodal
#   print_launch_banner --multimodal "Launching Multimodal" "$MODEL" "$HTTP_PORT"
#
#   # Banner only (script prints its own curl or conditionally skips)
#   print_launch_banner --no-curl "Launching DSR1 (Multi-Node)" "$MODEL" "$HTTP_PORT" \
#       "Nodes: $NUM_NODES" \
#       "Node rank: $NODE_RANK"
print_launch_banner() {
    local _curl_type="text"
    local _max_tokens=32
    local _no_curl=false
    local _line   # loop variable; kept local so it does not leak to the caller

    while [[ "${1:-}" == --* ]]; do
        case "$1" in
            --multimodal)
                _curl_type="multimodal"
                _max_tokens=50
                shift
                ;;
            --max-tokens)
                # Without a value `shift 2` fails and leaves "$1" in place,
                # which would spin this loop forever when errexit is off.
                if (( $# < 2 )); then
                    echo "print_launch_banner: --max-tokens requires a value" >&2
                    return 1
                fi
                _max_tokens="$2"
                shift 2
                ;;
            --no-curl)
                _no_curl=true
                shift
                ;;
            *)
                break
                ;;
        esac
    done

    local _title="${1:-}"
    local _model="${2:-}"
    local _port="${3:-}"
    # Shift at most what is present: `shift 3` with fewer arguments fails and
    # would leave the title/model behind to be echoed as "extra" lines.
    shift "$(( $# < 3 ? $# : 3 ))"

    echo "=========================================="
    echo "$_title"
    echo "=========================================="
    echo "Model: $_model"
    echo "Frontend: http://localhost:$_port"
    for _line in "$@"; do
        echo "$_line"
    done
    echo "=========================================="

    if [[ "$_no_curl" == true ]]; then
        return
    fi

    echo ""
    echo "Example test command:"
    echo ""
    if [[ "$_curl_type" == "multimodal" ]]; then
        cat <<CURL_EOF
curl http://localhost:${_port}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${_model}",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe this image"},
{"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png"}}
]}],
"max_tokens": ${_max_tokens}
}'
CURL_EOF
    else
        # Use the shared EXAMPLE_PROMPT so the built-in example matches the
        # custom curl footers printed by other launch scripts.
        cat <<CURL_EOF
curl http://localhost:${_port}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${_model}",
"messages": [{"role": "user", "content": "${EXAMPLE_PROMPT:-Hello!}"}],
"max_tokens": ${_max_tokens}
}'
CURL_EOF
    fi
    echo ""
    echo "=========================================="
}
# print_curl_footer
#
# Emits a caller-supplied curl example inside the same frame that
# print_launch_banner uses for its built-in example: a blank line, the
# "Example test command:" heading, a blank line, the command itself, a blank
# line, and a closing rule.
#
# The command text is copied verbatim from stdin, so callers can feed it with
# a heredoc and avoid escaping embedded double quotes or worrying about
# variable interpolation.
#
# Intended to follow `print_launch_banner --no-curl` for endpoints whose
# request body is non-standard (images, video, embeddings, etc.).
#
# Usage:
#   print_launch_banner --no-curl "Launching Image Diffusion" "$MODEL" "$PORT"
#   print_curl_footer <<CURL
#     curl http://localhost:${PORT}/v1/images/generations \\
#       -H 'Content-Type: application/json' \\
#       -d '{
#         "model": "${MODEL}",
#         "prompt": "A cat on a skateboard"
#       }'
#   CURL
print_curl_footer() {
    # Heading, surrounded by blank lines.
    printf '\n%s\n\n' "Example test command:"
    # Pass the caller's command through untouched.
    cat
    # Blank line plus closing rule.
    printf '\n%s\n' "=========================================="
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment