Unverified Commit a77558d4, authored by Keiven C and committed by GitHub
Browse files

feat: allow dynamic port in scripts in preparation for parallel testing (part 1) (#4546)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 0a8b917e
......@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker
OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=8081 \
# run worker with metrics enabled
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
......
......@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker
OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \
--embedding-worker \
--model-path Qwen/Qwen3-Embedding-4B \
......
......@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --router-mode kv --http-port=8000 &
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run worker
OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
......@@ -60,7 +61,7 @@ python3 -m dynamo.sglang \
--enable-metrics &
WORKER_PID=$!
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=8082 \
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
......
......@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
......@@ -64,7 +65,7 @@ python3 -m dynamo.sglang \
PREFILL_PID=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=8082 \
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
......
......@@ -45,16 +45,16 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend \
--http-port=8000 \
--router-mode kv \
--kv-overlap-score-weight 0 \
--router-reset-states &
DYNAMO_PID=$!
# run prefill router
OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_ROUTER:-8081} \
python3 -m dynamo.router \
--endpoint dynamo.prefill.generate \
--block-size 64 \
......@@ -63,7 +63,7 @@ python3 -m dynamo.router \
PREFILL_ROUTER_PID=$!
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=8082 \
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER1:-8082} \
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
......@@ -78,7 +78,7 @@ python3 -m dynamo.sglang \
PREFILL_PID=$!
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=8083 \
OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER2:-8083} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
......@@ -93,7 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
PREFILL_PID=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=8084 \
OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER1:-8084} \
CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
......@@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
PREFILL_PID=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=8085 \
OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER2:-8085} \
CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
......
......@@ -37,11 +37,12 @@ trap cleanup EXIT INT TERM
# run ingress with KV router mode for disaggregated setup
python3 -m dynamo.frontend --router-mode kv --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run prefill worker with metrics on port 8081
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
......@@ -71,7 +72,7 @@ echo "Waiting for prefill worker to initialize..."
sleep 5
# run decode worker with metrics on port 8082 (foreground)
DYN_SYSTEM_PORT=8082 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
......
......@@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run SGLang multimodal processor
......
......@@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run SGLang multimodal processor
......
......@@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM
# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker
......
......@@ -19,11 +19,12 @@ cleanup() {
trap cleanup EXIT INT TERM
# Run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# Run worker
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
......
......@@ -19,7 +19,8 @@ trap cleanup EXIT INT TERM
# run frontend
python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run worker
......
......@@ -25,7 +25,8 @@ trap cleanup EXIT INT TERM
# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker
......
......@@ -23,7 +23,8 @@ trap cleanup EXIT INT TERM
# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker
......
......@@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM
# run frontend with KV routing for cache-aware optimization
python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!
# run prefill worker
......
......@@ -48,12 +48,13 @@ trap cleanup EXIT INT TERM
# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker (shares GPU with decode)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
......@@ -65,7 +66,7 @@ PREFILL_PID=$!
# run decode worker (shares GPU with prefill)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=8082 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
......
......@@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM
# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run encode worker
......
......@@ -14,7 +14,8 @@ trap 'echo Cleaning up...; kill 0' EXIT
# run frontend
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode round-robin &
# With tensor_parallel_size=4, each worker needs 4 GPUs
# run prefill worker
......
......@@ -20,6 +20,7 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0
sleep 2
# Start OpenAI Frontend which will dynamically discover workers when they startup
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# NOTE: This is a blocking call.
python3 -m dynamo.frontend --http-port 8000
python3 -m dynamo.frontend
......@@ -5,9 +5,10 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
......@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run worker with KVBM enabled
# NOTE: remove --enforce-eager for production use
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment