Unverified commit 34ccc0b1, authored by Keiven C and committed by GitHub
Browse files

feat: refactor launch scripts with shared launch_utils.sh for consistent failure handling (#7008)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent c95bfc2e
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
......@@ -11,20 +17,9 @@ export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
# Setup cleanup trap
# Stop the background processes recorded in DYNAMO_PID and PREFILL_PID and
# wait for them to exit, printing a progress line before and after.
# Globals:   DYNAMO_PID, PREFILL_PID (read)
# Outputs:   two status lines on stdout
# Returns:   0 -- failures from kill/wait are deliberately ignored so the
#            handler never aborts under `set -e`.
cleanup() {
  # Left unquoted on purpose: an unset or empty PID contributes no word.
  local -a bg_pids=($DYNAMO_PID $PREFILL_PID)

  printf '%s\n' "Cleaning up background processes..."
  # Errors (e.g. a process that has already exited) are silenced and ignored.
  kill "${bg_pids[@]}" 2>/dev/null || true
  wait "${bg_pids[@]}" 2>/dev/null || true
  printf '%s\n' "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend with KV routing for cache-aware optimization
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run prefill worker
# Publishes KV events for router's cache-aware routing
......@@ -35,7 +30,6 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
--disaggregation-mode prefill \
--publish-events-and-metrics &
PREFILL_PID=$!
# run decode worker
# No event publishing needed - prefill handles it
......@@ -43,4 +37,7 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--disaggregation-mode decode
--disaggregation-mode decode &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -18,6 +18,9 @@
# fraction per worker (free) : 0.05
# Overestimating is intentional -- better to pad than OOM.
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
......@@ -46,14 +49,7 @@ export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
export MODALITY=${MODALITY:-"text"}
# Setup cleanup trap
# Exit/signal handler body: terminate the processes whose PIDs are held in
# DYNAMO_PID and PREFILL_PID, then reap them.
# Globals:   DYNAMO_PID, PREFILL_PID (read)
# Outputs:   a start and a completion message on stdout
# Returns:   0 -- kill/wait errors are swallowed intentionally (best effort).
cleanup() {
  local -a targets
  # Unquoted expansion is intentional so that empty PIDs drop out of the list.
  targets=($DYNAMO_PID $PREFILL_PID)

  printf '%s\n' "Cleaning up background processes..."
  kill "${targets[@]}" 2>/dev/null || :
  wait "${targets[@]}" 2>/dev/null || :
  printf '%s\n' "Cleanup complete."
}
trap cleanup EXIT INT TERM
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
ENABLE_OTEL=false
while [[ $# -gt 0 ]]; do
......@@ -91,20 +87,16 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
OVERRIDE_ARGS=(--override-engine-args "{${OVERRIDE_PAIRS}}")
echo "=========================================="
echo "Launching Disaggregated on Same GPU (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Max seq len: $MAX_SEQ_LEN"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
echo " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "=========================================="
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated on Same GPU (1 GPU)" "$MODEL" "$HTTP_PORT" \
"Max seq len: $MAX_SEQ_LEN" \
"GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)" \
" estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker (shares GPU with decode)
OTEL_SERVICE_NAME=dynamo-worker-prefill \
......@@ -118,9 +110,8 @@ python3 -m dynamo.trtllm \
--publish-events-and-metrics \
--disaggregation-mode prefill \
"${OVERRIDE_ARGS[@]}" &
PREFILL_PID=$!
# run decode worker (shares GPU with prefill) - foreground
# run decode worker (shares GPU with prefill)
OTEL_SERVICE_NAME=dynamo-worker-decode \
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
......@@ -131,4 +122,7 @@ python3 -m dynamo.trtllm \
--modality "$MODALITY" \
--publish-events-and-metrics \
--disaggregation-mode decode \
"${OVERRIDE_ARGS[@]}"
"${OVERRIDE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"meta-llama/Llama-4-Scout-17B-16E-Instruct"}
......@@ -17,20 +23,9 @@ export MODALITY=${MODALITY:-"multimodal"}
export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
# Setup cleanup trap
# Stop and reap the four background processes tracked by this script
# (frontend, prefill, decode and encode workers).
# Globals:   DYNAMO_PID, PREFILL_PID, DECODE_PID, ENCODE_PID (read)
# Outputs:   two status lines on stdout
# Returns:   0 -- kill/wait failures are ignored so cleanup is best effort.
cleanup() {
  # Left unquoted on purpose: PIDs that were never assigned simply vanish.
  local -a bg_pids=($DYNAMO_PID $PREFILL_PID $DECODE_PID $ENCODE_PID)

  printf '%s\n' "Cleaning up background processes..."
  # Diagnostics for already-dead processes are discarded.
  kill "${bg_pids[@]}" 2>/dev/null || true
  wait "${bg_pids[@]}" 2>/dev/null || true
  printf '%s\n' "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run encode worker
CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -41,7 +36,6 @@ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
--max-file-size-mb "$MAX_FILE_SIZE_MB" \
--disaggregation-mode encode &
ENCODE_PID=$!
# run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -51,7 +45,6 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--modality "$MODALITY" \
--disaggregation-mode prefill \
--encode-endpoint "$ENCODE_ENDPOINT" &
PREFILL_PID=$!
# run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -60,6 +53,6 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \
--disaggregation-mode decode &
DECODE_PID=$!
wait $DYNAMO_PID
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
......@@ -17,21 +23,9 @@ export MODALITY=${MODALITY:-"multimodal"}
export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
# Setup cleanup trap
# Handler body that tears down every background process this script tracks
# and waits for each of them to finish.
# Globals:   DYNAMO_PID, PREFILL_PID, DECODE_PID, ENCODE_PID (read)
# Outputs:   a start and a completion message on stdout
# Returns:   0 -- errors from kill/wait are suppressed intentionally.
cleanup() {
  local -a targets
  # Intentionally unquoted so unset/empty PID variables add no arguments.
  targets=($DYNAMO_PID $PREFILL_PID $DECODE_PID $ENCODE_PID)

  printf '%s\n' "Cleaning up background processes..."
  kill "${targets[@]}" 2>/dev/null || :
  wait "${targets[@]}" 2>/dev/null || :
  printf '%s\n' "Cleanup complete."
}
trap cleanup EXIT INT TERM
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run encode worker
CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -42,7 +36,6 @@ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
--max-file-size-mb "$MAX_FILE_SIZE_MB" \
--disaggregation-mode encode &
ENCODE_PID=$!
# run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -52,7 +45,6 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--modality "$MODALITY" \
--disaggregation-mode prefill \
--encode-endpoint "$ENCODE_ENDPOINT" &
PREFILL_PID=$!
# run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
......@@ -63,6 +55,6 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
--max-file-size-mb "$MAX_FILE_SIZE_MB" \
--disaggregation-mode decode &
DECODE_PID=$!
wait $DYNAMO_PID
\ No newline at end of file
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -2,6 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"/model"}
......@@ -9,10 +15,6 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml"}
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode round-robin &
......@@ -43,4 +45,7 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
--max-num-tokens 16384 \
--free-gpu-memory-fraction 0.9 \
--tensor-parallel-size 4 \
--expert-parallel-size 4
--expert-parallel-size 4 &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default model
MODEL="Qwen/Qwen3-0.6B"
......@@ -23,24 +26,7 @@ while [[ $# -gt 0 ]]; do
done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -49,4 +35,7 @@ python -m dynamo.frontend &
# run worker
# --enforce-eager is added for quick deployment; remove this flag for production use.
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager "${EXTRA_ARGS[@]}"
python -m dynamo.vllm --model "$MODEL" --enforce-eager "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + FlexKV (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Aggregated Serving + FlexKV (1 GPU)" "$MODEL" "$HTTP_PORT"
# Run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -32,4 +19,7 @@ python -m dynamo.frontend &
# Run worker with FlexKV
DYNAMO_USE_FLEXKV=1 \
FLEXKV_CPU_CACHE_GB=32 \
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}'
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + FlexKV + KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Aggregated Serving + FlexKV + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# Run frontend and KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -50,4 +37,7 @@ python -m dynamo.vllm \
--model $MODEL \
--kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.2 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -6,24 +6,11 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + KVBM (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
print_launch_banner "Launching Aggregated Serving + KVBM (1 GPU)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -32,4 +19,7 @@ python -m dynamo.frontend &
# run worker with KVBM enabled
# NOTE: remove --enforce-eager for production use
DYN_KVBM_CPU_CACHE_GB=20 \
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' --enforce-eager
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' --enforce-eager &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -7,27 +7,13 @@ trap 'echo Cleaning up...; kill 0' EXIT
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + KVBM + KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated + KVBM + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -57,4 +43,7 @@ CUDA_VISIBLE_DEVICES=1 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager \
--kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -7,26 +7,12 @@ trap 'echo Cleaning up...; kill 0' EXIT
# Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
unset PROMETHEUS_MULTIPROC_DIR
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + LMCache (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -34,4 +20,7 @@ python -m dynamo.frontend &
# run worker with LMCache enabled (without PROMETHEUS_MULTIPROC_DIR set externally)
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -17,26 +17,12 @@ cleanup() {
}
trap cleanup EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + LMCache + Multiproc (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -45,5 +31,7 @@ python -m dynamo.frontend &
# run worker with LMCache enabled and PROMETHEUS_MULTIPROC_DIR explicitly set
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -14,6 +14,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
......@@ -44,30 +47,7 @@ while [[ $# -gt 0 ]]; do
done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Multimodal Serving"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{"
echo " \"role\": \"user\","
echo " \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
echo " ]"
echo " }],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT"
# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
......@@ -96,7 +76,5 @@ CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -12,6 +12,7 @@ MODEL="${MODEL:-Qwen/Qwen2.5-Omni-7B}"
# Stage config path - use single-stage LLM config for text-to-text
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
STAGE_CONFIG="${STAGE_CONFIG:-$SCRIPT_DIR/stage_configs/single_stage_llm.yaml}"
# Parse command line arguments
......@@ -30,24 +31,8 @@ while [[ $# -gt 0 ]]; do
done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching vLLM-Omni Text-to-Text (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching vLLM-Omni Text-to-Text (1 GPU)" "$MODEL" "$HTTP_PORT"
# Run ingress (frontend)
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -63,4 +48,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--model "$MODEL" \
--omni \
--stage-configs-path "$STAGE_CONFIG" \
"${EXTRA_ARGS[@]}"
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,6 +5,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
MODEL="Qwen/Qwen-Image"
# Parse command line arguments
......@@ -23,24 +26,8 @@ while [[ $# -gt 0 ]]; do
done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching vLLM-Omni Image Generation (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Generate an image of a sunset over mountains.\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching vLLM-Omni Image Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
python -m dynamo.frontend &
FRONTEND_PID=$!
......@@ -54,4 +41,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--omni \
--output-modalities image \
--media-output-fs-url file:///tmp/dynamo_media \
"${EXTRA_ARGS[@]}"
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,6 +5,8 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
MODEL="Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
......@@ -24,24 +26,8 @@ while [[ $# -gt 0 ]]; do
done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching vLLM-Omni Video Generation (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Generate a short video of ocean waves.\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching vLLM-Omni Video Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
python -m dynamo.frontend &
FRONTEND_PID=$!
......@@ -55,4 +41,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--omni \
--output-modalities video \
--media-output-fs-url file:///tmp/dynamo_media \
"${EXTRA_ARGS[@]}"
\ No newline at end of file
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,6 +4,9 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Parse command-line arguments for request plane mode
REQUEST_PLANE="tcp" # Default to TCP
......@@ -43,24 +46,7 @@ export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + Request Planes (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
# Frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -68,4 +54,7 @@ python -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
DYN_HEALTH_CHECK_ENABLED=true \
python -m dynamo.vllm --model "$MODEL" --enforce-eager
python -m dynamo.vllm --model "$MODEL" --enforce-eager &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# TODO: rename to agg_router_2gpu.sh (uses 2 GPUs) and update all references
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
......@@ -12,24 +16,7 @@ MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......@@ -56,4 +43,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -4,28 +4,14 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + Approximate KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner "Launching Aggregated + Approximate KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run frontend with KV router (--router-mode kv) in approximate mode (--no-kv-events)
python -m dynamo.frontend \
......@@ -49,4 +35,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--kv-events-config '{"enable_kv_cache_events": false}'
--kv-events-config '{"enable_kv_cache_events": false}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -10,27 +10,13 @@ export PYTHONHASHSEED=0
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
HTTP_PORT_R1="${DYN_HTTP_PORT_R1:-8000}"
HTTP_PORT_R2="${DYN_HTTP_PORT_R2:-8001}"
echo "=========================================="
echo "Launching Aggregated + KV Routing + Replicas (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend R1: http://localhost:$HTTP_PORT_R1"
echo "Frontend R2: http://localhost:$HTTP_PORT_R2"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT_R1}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
print_launch_banner --no-curl "Launching Aggregated + KV Routing + Replicas (2 GPUs)" "$MODEL" "$HTTP_PORT_R1" \
"Frontend R2: http://localhost:$HTTP_PORT_R2"
# run two routers (different HTTP + system ports)
# Note: use --router-reset-states only on one router to avoid wiping shared state twice.
......@@ -59,4 +45,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
\ No newline at end of file
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment