Unverified commit a77558d4, authored by Keiven C, committed by GitHub
Browse files

feat: allow dynamic port in scripts in preparation for parallel testing (part 1) (#4546)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 0a8b917e
......@@ -11,9 +11,9 @@ export PYTHONHASHSEED=0
MODEL="Qwen/Qwen3-0.6B"
# run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
--router-mode kv \
--http-port 8000 \
--router-reset-states &
# run workers with KVBM enabled
......
......@@ -5,8 +5,9 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run worker with LMCache enabled
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
......@@ -45,7 +45,8 @@ done
export DYN_REQUEST_PLANE=tcp
# Start frontend with Rust OpenAIPreprocessor
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# Configure GPU memory optimization for specific models
EXTRA_ARGS=""
......@@ -59,7 +60,7 @@ fi
# Multimodal data (images) are decoded in the backend worker using ImageLoader
# --enforce-eager: Quick deployment (remove for production)
# --connector none: No KV transfer needed for aggregated serving
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
# Wait for all background processes to complete
......
......@@ -64,7 +64,8 @@ else
fi
# Start frontend (HTTP endpoint)
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
EXTRA_ARGS=""
......
......@@ -8,7 +8,8 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run processor
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
......
......@@ -41,8 +41,9 @@ export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE"
# Frontend
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
DYN_HEALTH_CHECK_ENABLED=true \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
......@@ -12,9 +12,9 @@ MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
# run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
--router-mode kv \
--http-port 8000 \
--router-reset-states &
# run workers
......
......@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress
python -m dynamo.frontend --router-mode kv --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv &
# Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo
......
......@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# --enforce-eager is added for quick deployment; remove this flag for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
......
......@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run decode worker on GPU 0, without enabling KVBM
# NOTE: remove --enforce-eager for production use
......
......@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress with KV router
python -m dynamo.frontend --router-mode kv --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv &
# run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use
......
......@@ -10,9 +10,9 @@ export PYTHONHASHSEED=0
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
--router-mode kv \
--http-port 8000 \
--router-reset-states &
# two decode workers (without KVBM)
......
......@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress with KV router
python -m dynamo.frontend --router-mode kv --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv &
# run decode worker on GPU 0, without enabling LMCache
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B &
......
......@@ -72,7 +72,8 @@ echo "=================================================="
# Start frontend (no router mode)
echo "Starting frontend..."
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# Start processor
echo "Starting processor..."
......
......@@ -45,7 +45,8 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
if [[ $HEAD_NODE -eq 1 ]]; then
# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run processor
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
......
......@@ -13,9 +13,9 @@ BLOCK_SIZE=64
# Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
--router-mode kv \
--http-port 8000 \
--router-reset-states &
# two decode workers
......
......@@ -42,12 +42,13 @@ cleanup() {
trap cleanup EXIT INT TERM
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run decode worker with metrics on port 8081
# --enforce-eager is added for quick deployment; remove this flag for production use
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
......@@ -65,7 +66,7 @@ echo "Waiting for decode worker to initialize..."
sleep 10
# run prefill worker with metrics on port 8082 (foreground)
DYN_SYSTEM_PORT=8082 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8082} \
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=0 \
......
......@@ -82,8 +82,9 @@ echo " Model name: $MODEL"
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress if it's node 0
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
if [ $NODE_RANK -eq 0 ]; then
DYN_LOG=debug python -m dynamo.frontend --router-mode kv --http-port=8000 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
DYN_LOG=debug python -m dynamo.frontend --router-mode kv 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
fi
mkdir -p $LOG_DIR
......
......@@ -12,5 +12,6 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0
sleep 3
# Start OpenAI Frontend which will dynamically discover workers when they startup
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# NOTE: This is a blocking call.
python3 -m dynamo.frontend --http-port 8000
python3 -m dynamo.frontend
......@@ -17,7 +17,8 @@ cleanup() {
trap cleanup EXIT INT TERM
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
FRONTEND_PID=$!
# run the mock worker + template validation generate()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment