Unverified commit a77558d4, authored by Keiven C, committed by GitHub
Browse files

feat: allow dynamic port in scripts in preparation for parallel testing (part 1) (#4546)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 0a8b917e
...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 & python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker with metrics enabled
OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
......
...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 & python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=8081 \ OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--embedding-worker \ --embedding-worker \
--model-path Qwen/Qwen3-Embedding-4B \ --model-path Qwen/Qwen3-Embedding-4B \
......
...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --router-mode kv --http-port=8000 & python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=8081 \ OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
...@@ -60,7 +61,7 @@ python3 -m dynamo.sglang \ ...@@ -60,7 +61,7 @@ python3 -m dynamo.sglang \
--enable-metrics & --enable-metrics &
WORKER_PID=$! WORKER_PID=$!
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=8082 \ OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
......
...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 & python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker # run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=8081 \ OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
...@@ -64,7 +65,7 @@ python3 -m dynamo.sglang \ ...@@ -64,7 +65,7 @@ python3 -m dynamo.sglang \
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=8082 \ OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
......
...@@ -45,16 +45,16 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -45,16 +45,16 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend \ python3 -m dynamo.frontend \
--http-port=8000 \
--router-mode kv \ --router-mode kv \
--kv-overlap-score-weight 0 \ --kv-overlap-score-weight 0 \
--router-reset-states & --router-reset-states &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill router # run prefill router
OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=8081 \ OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_ROUTER:-8081} \
python3 -m dynamo.router \ python3 -m dynamo.router \
--endpoint dynamo.prefill.generate \ --endpoint dynamo.prefill.generate \
--block-size 64 \ --block-size 64 \
...@@ -63,7 +63,7 @@ python3 -m dynamo.router \ ...@@ -63,7 +63,7 @@ python3 -m dynamo.router \
PREFILL_ROUTER_PID=$! PREFILL_ROUTER_PID=$!
# run prefill worker # run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=8082 \ OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER1:-8082} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
...@@ -78,7 +78,7 @@ python3 -m dynamo.sglang \ ...@@ -78,7 +78,7 @@ python3 -m dynamo.sglang \
PREFILL_PID=$! PREFILL_PID=$!
# run prefill worker # run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=8083 \ OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER2:-8083} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
...@@ -93,7 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ ...@@ -93,7 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=8084 \ OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER1:-8084} \
CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
...@@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \ ...@@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=8085 \ OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER2:-8085} \
CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
......
...@@ -37,11 +37,12 @@ trap cleanup EXIT INT TERM ...@@ -37,11 +37,12 @@ trap cleanup EXIT INT TERM
# run ingress with KV router mode for disaggregated setup # run ingress with KV router mode for disaggregated setup
python3 -m dynamo.frontend --router-mode kv --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker with metrics on port 8081 # run prefill worker with metrics on port 8081
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
...@@ -71,7 +72,7 @@ echo "Waiting for prefill worker to initialize..." ...@@ -71,7 +72,7 @@ echo "Waiting for prefill worker to initialize..."
sleep 5 sleep 5
# run decode worker with metrics on port 8082 (foreground) # run decode worker with metrics on port 8082 (foreground)
DYN_SYSTEM_PORT=8082 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
......
...@@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then ...@@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi fi
# run ingress # run ingress
python3 -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run SGLang multimodal processor # run SGLang multimodal processor
......
...@@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then ...@@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi fi
# run ingress # run ingress
python3 -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run SGLang multimodal processor # run SGLang multimodal processor
......
...@@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM ...@@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM
# run frontend # run frontend
python3 -m dynamo.frontend --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
......
...@@ -19,11 +19,12 @@ cleanup() { ...@@ -19,11 +19,12 @@ cleanup() {
trap cleanup EXIT INT TERM trap cleanup EXIT INT TERM
# Run frontend # Run frontend
python3 -m dynamo.frontend --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# Run worker # Run worker
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
......
...@@ -19,7 +19,8 @@ trap cleanup EXIT INT TERM ...@@ -19,7 +19,8 @@ trap cleanup EXIT INT TERM
# run frontend # run frontend
python3 -m dynamo.frontend --router-mode kv --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
......
...@@ -25,7 +25,8 @@ trap cleanup EXIT INT TERM ...@@ -25,7 +25,8 @@ trap cleanup EXIT INT TERM
# run frontend # run frontend
python3 -m dynamo.frontend --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker # run prefill worker
......
...@@ -23,7 +23,8 @@ trap cleanup EXIT INT TERM ...@@ -23,7 +23,8 @@ trap cleanup EXIT INT TERM
# run frontend # run frontend
python3 -m dynamo.frontend --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker # run prefill worker
......
...@@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM ...@@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM
# run frontend with KV routing for cache-aware optimization # run frontend with KV routing for cache-aware optimization
python3 -m dynamo.frontend --router-mode kv --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker # run prefill worker
......
...@@ -48,12 +48,13 @@ trap cleanup EXIT INT TERM ...@@ -48,12 +48,13 @@ trap cleanup EXIT INT TERM
# run frontend # run frontend
python3 -m dynamo.frontend --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker (shares GPU with decode) # run prefill worker (shares GPU with decode)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
...@@ -65,7 +66,7 @@ PREFILL_PID=$! ...@@ -65,7 +66,7 @@ PREFILL_PID=$!
# run decode worker (shares GPU with prefill) # run decode worker (shares GPU with prefill)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=8082 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
......
...@@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM ...@@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM
# run frontend # run frontend
python3 -m dynamo.frontend --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run encode worker # run encode worker
......
...@@ -14,7 +14,8 @@ trap 'echo Cleaning up...; kill 0' EXIT ...@@ -14,7 +14,8 @@ trap 'echo Cleaning up...; kill 0' EXIT
# run frontend # run frontend
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode round-robin &
# With tensor_parallel_size=4, each worker needs 4 GPUs # With tensor_parallel_size=4, each worker needs 4 GPUs
# run prefill worker # run prefill worker
......
...@@ -20,6 +20,7 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0 ...@@ -20,6 +20,7 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0
sleep 2 sleep 2
# Start OpenAI Frontend which will dynamically discover workers when they startup # Start OpenAI Frontend which will dynamically discover workers when they startup
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# NOTE: This is a blocking call. # NOTE: This is a blocking call.
python3 -m dynamo.frontend --http-port 8000 python3 -m dynamo.frontend
...@@ -5,9 +5,10 @@ set -e ...@@ -5,9 +5,10 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress # run ingress
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run worker # run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
...@@ -5,7 +5,8 @@ set -e ...@@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# run ingress # run ingress
python -m dynamo.frontend --http-port=8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run worker with KVBM enabled # run worker with KVBM enabled
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment