Unverified commit 8d524d03, authored by Keiven C and committed by GitHub
Browse files

docs: add consistent startup banners to SGLang and TRT-LLM launch scripts (#7252)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 64f02ffa
......@@ -55,7 +55,7 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated LLM Worker" "$MODEL" "$HTTP_PORT"
print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
......@@ -53,7 +53,7 @@ fi
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Router (2 workers)" "$MODEL" "$HTTP_PORT"
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
......@@ -61,7 +61,7 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Vision Worker" "$MODEL" "$HTTP_PORT"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
......@@ -47,7 +47,7 @@ fi
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated Workers (P/D)" "$MODEL" "$HTTP_PORT"
print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
......@@ -48,7 +48,7 @@ fi
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated Router (2P + 2D)" "$MODEL" "$HTTP_PORT"
print_launch_banner "Launching Disaggregated + KV Routing (4 GPUs)" "$MODEL" "$HTTP_PORT"
# Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router
......
......@@ -41,7 +41,7 @@ fi
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated (same GPU)" "$MODEL" "$HTTP_PORT" \
print_launch_banner "Launching Disaggregated on Same GPU" "$MODEL" "$HTTP_PORT" \
"Context len: $CONTEXT_LENGTH" \
"GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)" \
" estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
......
......@@ -60,7 +60,7 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Multimodal E/P/D Workers" "$MODEL_NAME" "$HTTP_PORT"
print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D (3 GPUs)" "$MODEL_NAME" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
......@@ -60,7 +60,7 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Multimodal E/PD Workers" "$MODEL_NAME" "$HTTP_PORT"
print_launch_banner --multimodal "Launching Multimodal E/PD (2 GPUs)" "$MODEL_NAME" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
......@@ -50,6 +50,8 @@ if [ "$ENABLE_OTEL" = true ]; then
TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
......@@ -15,6 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Metrics" "$MODEL_PATH" "$HTTP_PORT"
# Run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
......
......@@ -15,6 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml"}
export MODALITY=${MODALITY:-"multimodal"}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
......
......@@ -14,6 +14,9 @@ export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + KV Routing" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
......
......@@ -14,6 +14,9 @@ export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + Approximate KV Routing" "$MODEL_PATH" "$HTTP_PORT"
# run frontend with KV router in approximate mode (i.e. no KV events)
python3 -m dynamo.frontend --router-mode kv --no-kv-events &
......
......@@ -52,6 +52,9 @@ if [ "$ENABLE_OTEL" = true ]; then
TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
......
......@@ -27,6 +27,9 @@ export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
# Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10)
EXTRA_PD_ARGS=("$@")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Multimodal E/PD" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
......
......@@ -18,6 +18,9 @@ export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export MODALITY=${MODALITY:-"multimodal"}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Disaggregated Multimodal Serving (2 GPUs)" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
......
......@@ -17,6 +17,9 @@ export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated + KV Routing (2 GPUs)" "$MODEL_PATH" "$HTTP_PORT"
# run frontend with KV routing for cache-aware optimization
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
......
......@@ -23,6 +23,9 @@ export MODALITY=${MODALITY:-"multimodal"}
export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D (8 GPUs)" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
......
......@@ -23,6 +23,9 @@ export MODALITY=${MODALITY:-"multimodal"}
export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Multimodal E/P/D" "$MODEL_PATH" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
......
......@@ -15,6 +15,9 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml"}
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated Serving (8 GPUs)" "$SERVED_MODEL_NAME" "$HTTP_PORT"
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode round-robin &
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment