Unverified commit a77558d4, authored by Keiven C and committed by GitHub
Browse files

feat: allow dynamic port in scripts in preparation for parallel testing (part 1) (#4546)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 0a8b917e
...@@ -11,9 +11,9 @@ export PYTHONHASHSEED=0 ...@@ -11,9 +11,9 @@ export PYTHONHASHSEED=0
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
# run frontend + KV router # run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \ python -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
--http-port 8000 \
--router-reset-states & --router-reset-states &
# run workers with KVBM enabled # run workers with KVBM enabled
......
...@@ -5,8 +5,9 @@ set -e ...@@ -5,8 +5,9 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress # run ingress
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run worker with LMCache enabled # run worker with LMCache enabled
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
...@@ -45,7 +45,8 @@ done ...@@ -45,7 +45,8 @@ done
export DYN_REQUEST_PLANE=tcp export DYN_REQUEST_PLANE=tcp
# Start frontend with Rust OpenAIPreprocessor # Start frontend with Rust OpenAIPreprocessor
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# Configure GPU memory optimization for specific models # Configure GPU memory optimization for specific models
EXTRA_ARGS="" EXTRA_ARGS=""
...@@ -59,7 +60,7 @@ fi ...@@ -59,7 +60,7 @@ fi
# Multimodal data (images) are decoded in the backend worker using ImageLoader # Multimodal data (images) are decoded in the backend worker using ImageLoader
# --enforce-eager: Quick deployment (remove for production) # --enforce-eager: Quick deployment (remove for production)
# --connector none: No KV transfer needed for aggregated serving # --connector none: No KV transfer needed for aggregated serving
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
# Wait for all background processes to complete # Wait for all background processes to complete
......
...@@ -64,7 +64,8 @@ else ...@@ -64,7 +64,8 @@ else
fi fi
# Start frontend (HTTP endpoint) # Start frontend (HTTP endpoint)
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments # To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
EXTRA_ARGS="" EXTRA_ARGS=""
......
...@@ -8,7 +8,8 @@ trap 'echo Cleaning up...; kill 0' EXIT ...@@ -8,7 +8,8 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
# run ingress # run ingress
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run processor # run processor
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
......
...@@ -41,8 +41,9 @@ export DYN_REQUEST_PLANE=$REQUEST_PLANE ...@@ -41,8 +41,9 @@ export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE" echo "Using request plane mode: $REQUEST_PLANE"
# Frontend # Frontend
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
DYN_HEALTH_CHECK_ENABLED=true \ DYN_HEALTH_CHECK_ENABLED=true \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
...@@ -12,9 +12,9 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -12,9 +12,9 @@ MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64 BLOCK_SIZE=64
# run frontend + KV router # run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \ python -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
--http-port 8000 \
--router-reset-states & --router-reset-states &
# run workers # run workers
......
...@@ -5,7 +5,8 @@ set -e ...@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress # run ingress
python -m dynamo.frontend --router-mode kv --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv &
# Data Parallel Attention / Expert Parallelism # Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo # Routing to DP workers managed by Dynamo
......
...@@ -5,7 +5,8 @@ set -e ...@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress # run ingress
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
......
...@@ -5,7 +5,8 @@ set -e ...@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress # run ingress
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run decode worker on GPU 0, without enabling KVBM # run decode worker on GPU 0, without enabling KVBM
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
......
...@@ -5,7 +5,8 @@ set -e ...@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress with KV router # run ingress with KV router
python -m dynamo.frontend --router-mode kv --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv &
# run decode workers on GPU 0 and 1, without enabling KVBM # run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
......
...@@ -10,9 +10,9 @@ export PYTHONHASHSEED=0 ...@@ -10,9 +10,9 @@ export PYTHONHASHSEED=0
# Common configuration # Common configuration
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \ python -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
--http-port 8000 \
--router-reset-states & --router-reset-states &
# two decode workers (without KVBM) # two decode workers (without KVBM)
......
...@@ -5,7 +5,8 @@ set -e ...@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress with KV router # run ingress with KV router
python -m dynamo.frontend --router-mode kv --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv &
# run decode worker on GPU 0, without enabling LMCache # run decode worker on GPU 0, without enabling LMCache
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B &
......
...@@ -72,7 +72,8 @@ echo "==================================================" ...@@ -72,7 +72,8 @@ echo "=================================================="
# Start frontend (no router mode) # Start frontend (no router mode)
echo "Starting frontend..." echo "Starting frontend..."
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# Start processor # Start processor
echo "Starting processor..." echo "Starting processor..."
......
...@@ -45,7 +45,8 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" ...@@ -45,7 +45,8 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
if [[ $HEAD_NODE -eq 1 ]]; then if [[ $HEAD_NODE -eq 1 ]]; then
# run ingress # run ingress
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run processor # run processor
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
......
...@@ -13,9 +13,9 @@ BLOCK_SIZE=64 ...@@ -13,9 +13,9 @@ BLOCK_SIZE=64
# Start frontend with KV routing # Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router # The frontend will automatically detect prefill workers and activate an internal prefill router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \ python -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
--http-port 8000 \
--router-reset-states & --router-reset-states &
# two decode workers # two decode workers
......
...@@ -42,12 +42,13 @@ cleanup() { ...@@ -42,12 +42,13 @@ cleanup() {
trap cleanup EXIT INT TERM trap cleanup EXIT INT TERM
# run ingress # run ingress
python3 -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run decode worker with metrics on port 8081 # run decode worker with metrics on port 8081
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
...@@ -65,7 +66,7 @@ echo "Waiting for decode worker to initialize..." ...@@ -65,7 +66,7 @@ echo "Waiting for decode worker to initialize..."
sleep 10 sleep 10
# run prefill worker with metrics on port 8082 (foreground) # run prefill worker with metrics on port 8082 (foreground)
DYN_SYSTEM_PORT=8082 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8082} \
DYN_VLLM_KV_EVENT_PORT=20081 \ DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
......
...@@ -82,8 +82,9 @@ echo " Model name: $MODEL" ...@@ -82,8 +82,9 @@ echo " Model name: $MODEL"
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress if it's node 0 # run ingress if it's node 0
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
if [ $NODE_RANK -eq 0 ]; then if [ $NODE_RANK -eq 0 ]; then
DYN_LOG=debug python -m dynamo.frontend --router-mode kv --http-port=8000 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log & DYN_LOG=debug python -m dynamo.frontend --router-mode kv 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
fi fi
mkdir -p $LOG_DIR mkdir -p $LOG_DIR
......
...@@ -12,5 +12,6 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0 ...@@ -12,5 +12,6 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0
sleep 3 sleep 3
# Start OpenAI Frontend which will dynamically discover workers when they startup # Start OpenAI Frontend which will dynamically discover workers when they startup
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# NOTE: This is a blocking call. # NOTE: This is a blocking call.
python3 -m dynamo.frontend --http-port 8000 python3 -m dynamo.frontend
...@@ -17,7 +17,8 @@ cleanup() { ...@@ -17,7 +17,8 @@ cleanup() {
trap cleanup EXIT INT TERM trap cleanup EXIT INT TERM
# run ingress # run ingress
python3 -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
FRONTEND_PID=$! FRONTEND_PID=$!
# run the mock worker + template validation generate() # run the mock worker + template validation generate()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment