Unverified commit 6ed8ba0a, authored by Keiven C, committed by GitHub
Browse files

feat: GPU VRAM profiler + profiled test markers for 1 GPU SGLang (part 3) (#7508)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent a04a9401
...@@ -9,7 +9,8 @@ set -e ...@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values # Default values
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
...@@ -54,6 +55,8 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -54,6 +55,8 @@ if [ "$ENABLE_OTEL" = true ]; then
TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317) TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
fi fi
GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT"
...@@ -72,6 +75,7 @@ python3 -m dynamo.sglang \ ...@@ -72,6 +75,7 @@ python3 -m dynamo.sglang \
--trust-remote-code \ --trust-remote-code \
--skip-tokenizer-init \ --skip-tokenizer-init \
--enable-metrics \ --enable-metrics \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
"${TRACE_ARGS[@]}" \ "${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" & "${EXTRA_ARGS[@]}" &
......
...@@ -9,14 +9,24 @@ set -e ...@@ -9,14 +9,24 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
MODEL="Qwen/Qwen3-Embedding-4B"
# Parse command line arguments # Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
--model-path)
MODEL="$2"
shift 2
;;
-h|--help) -h|--help)
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "Options:" echo "Options:"
echo " --model-path <name> Specify model (default: $MODEL)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
echo "" echo ""
echo "Note: System metrics are enabled by default on port 8081 (worker)" echo "Note: System metrics are enabled by default on port 8081 (worker)"
...@@ -24,14 +34,14 @@ while [[ $# -gt 0 ]]; do ...@@ -24,14 +34,14 @@ while [[ $# -gt 0 ]]; do
exit 0 exit 0
;; ;;
*) *)
echo "Unknown option: $1" EXTRA_ARGS+=("$1")
echo "Use --help for usage information" shift
exit 1
;; ;;
esac esac
done done
MODEL="Qwen/Qwen3-Embedding-4B" GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL" 2>/dev/null || true)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT" print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT"
...@@ -52,13 +62,15 @@ python3 -m dynamo.frontend & ...@@ -52,13 +62,15 @@ python3 -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--embedding-worker \ --embedding-worker \
--model-path Qwen/Qwen3-Embedding-4B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-Embedding-4B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
--use-sglang-tokenizer \ --use-sglang-tokenizer \
--enable-metrics & ${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
--enable-metrics \
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit wait_any_exit
...@@ -9,7 +9,8 @@ set -e ...@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments # Parse command line arguments
ENABLE_OTEL=false ENABLE_OTEL=false
...@@ -52,6 +53,9 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -52,6 +53,9 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
...@@ -75,22 +79,24 @@ fi ...@@ -75,22 +79,24 @@ fi
OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \ OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
"${KV_EVENTS_ARGS_1[@]}" \ "${KV_EVENTS_ARGS_1[@]}" \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \ OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
"${KV_EVENTS_ARGS_2[@]}" \ "${KV_EVENTS_ARGS_2[@]}" \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
......
...@@ -9,7 +9,8 @@ set -e ...@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments # Parse command line arguments
ENABLE_OTEL=false ENABLE_OTEL=false
...@@ -46,6 +47,9 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -46,6 +47,9 @@ if [ "$ENABLE_OTEL" = true ]; then
fi fi
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_PORT"
...@@ -61,8 +65,8 @@ python3 -m dynamo.frontend & ...@@ -61,8 +65,8 @@ python3 -m dynamo.frontend &
# harnesses can set one simple pair for disaggregated deployments. # harnesses can set one simple pair for disaggregated deployments.
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
...@@ -71,14 +75,15 @@ python3 -m dynamo.sglang \ ...@@ -71,14 +75,15 @@ python3 -m dynamo.sglang \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 40000 \ --port 40000 \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
# run decode worker # run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
...@@ -86,6 +91,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ ...@@ -86,6 +91,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disaggregation-bootstrap-port 12345 \ --disaggregation-bootstrap-port 12345 \
--host 0.0.0.0 \ --host 0.0.0.0 \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
--enable-metrics \ --enable-metrics \
"${TRACE_ARGS[@]}" & "${TRACE_ARGS[@]}" &
......
...@@ -9,7 +9,8 @@ set -e ...@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values # Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct" MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
...@@ -86,6 +87,14 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9} ...@@ -86,6 +87,14 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9} DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9} DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
# Profiler override: scale prefill/decode fractions proportionally.
# Encode worker has no --mem-fraction-static in single-gpu mode, so it's unaffected.
if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" && "$SINGLE_GPU" == "true" ]]; then
_TOTAL_FRAC=$(awk -v p="$DYN_PREFILL_GPU_MEM" -v d="$DYN_DECODE_GPU_MEM" 'BEGIN { printf "%.4f", p + d }')
DYN_PREFILL_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v p="$DYN_PREFILL_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * p / t }')
DYN_DECODE_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v d="$DYN_DECODE_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * d / t }')
fi
ENCODE_EXTRA_ARGS="" ENCODE_EXTRA_ARGS=""
PREFILL_EXTRA_ARGS="" PREFILL_EXTRA_ARGS=""
DECODE_EXTRA_ARGS="" DECODE_EXTRA_ARGS=""
......
...@@ -9,7 +9,8 @@ set -e ...@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values # Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct" MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
...@@ -78,6 +79,14 @@ DYN_WORKER_GPU=${DYN_WORKER_GPU:-1} ...@@ -78,6 +79,14 @@ DYN_WORKER_GPU=${DYN_WORKER_GPU:-1}
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9} DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_WORKER_GPU_MEM=${DYN_WORKER_GPU_MEM:-0.9} DYN_WORKER_GPU_MEM=${DYN_WORKER_GPU_MEM:-0.9}
# Profiler override: split _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE between workers
# preserving the ratio set by the env vars.
if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" && "$SINGLE_GPU" == "true" ]]; then
_TOTAL_FRAC=$(awk -v e="$DYN_ENCODE_GPU_MEM" -v w="$DYN_WORKER_GPU_MEM" 'BEGIN { printf "%.4f", e + w }')
DYN_ENCODE_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v e="$DYN_ENCODE_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * e / t }')
DYN_WORKER_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v w="$DYN_WORKER_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * w / t }')
fi
ENCODE_EXTRA_ARGS="" ENCODE_EXTRA_ARGS=""
WORKER_EXTRA_ARGS="" WORKER_EXTRA_ARGS=""
......
...@@ -76,7 +76,7 @@ kv_cache_total = kv_cache_per_token * max_model_len * max_concurrent_seqs ...@@ -76,7 +76,7 @@ kv_cache_total = kv_cache_per_token * max_model_len * max_concurrent_seqs
overhead ≈ engine-dependent (auto-computed by estimate_worker_vram): overhead ≈ engine-dependent (auto-computed by estimate_worker_vram):
vllm: 1.2 + 1.0 * sqrt(params_b) GiB (0.6B≈2.0, 8B≈4.0) vllm: 1.2 + 1.0 * sqrt(params_b) GiB (0.6B≈2.0, 8B≈4.0)
sglang: 2.5 + 1.5 * sqrt(params_b) GiB (0.6B≈3.7, 8B≈6.7) sglang: 1.5 + 1.0 * sqrt(params_b) GiB (0.6B≈2.3, 8B≈4.3)
trtllm: 2.0 + 1.2 * sqrt(params_b) GiB (0.6B≈2.9, 8B≈5.4) trtllm: 2.0 + 1.2 * sqrt(params_b) GiB (0.6B≈2.9, 8B≈5.4)
``` ```
...@@ -104,11 +104,27 @@ This is slightly different from vLLM (which includes activations in the budget). ...@@ -104,11 +104,27 @@ This is slightly different from vLLM (which includes activations in the budget).
sglang recommends keeping 5-8 GiB free for activations and overhead. If you sglang recommends keeping 5-8 GiB free for activations and overhead. If you
see OOM errors, decrease `--mem-fraction-static` by 0.01-0.05 increments. see OOM errors, decrease `--mem-fraction-static` by 0.01-0.05 increments.
### How `--context-length` works ### How `--context-length` and `--max-running-requests` work
Equivalent to vLLM's `--max-model-len`. Defaults to the model's native context Unlike vLLM (where `--max-model-len` directly affects KV cache sizing), sglang's
window. Reducing it shrinks the per-request KV cache requirement and allows more `--context-length` and `--max-running-requests` do **not** affect KV cache
concurrent sequences. allocation. The KV cache pool is sized entirely from `--mem-fraction-static`:
```
kv_cache_pool = total_vram * mem_fraction_static - model_weights
```
Profiling confirmed this: changing `--context-length` from 512 to 40960 produced
identical `max_total_num_tokens` values (269,136 on a 48 GiB GPU at fraction 0.95).
These flags only affect **request scheduling**:
- `--context-length` caps the per-request token usage from the KV pool
- `--max-running-requests` limits concurrent request slots (allocated from
memory outside the `--mem-fraction-static` budget)
Setting `--max-running-requests` too high at high fractions can cause OOM because
the request slot pool competes for the small amount of memory left after KV cache
allocation.
### Estimating total GPU usage ### Estimating total GPU usage
...@@ -117,9 +133,9 @@ total_vram ≈ model_weights + kv_cache_pool + activations_and_overhead ...@@ -117,9 +133,9 @@ total_vram ≈ model_weights + kv_cache_pool + activations_and_overhead
kv_cache_pool = total_vram * mem_fraction_static - model_weights kv_cache_pool = total_vram * mem_fraction_static - model_weights
activations_and_overhead ≈ 1-8 GiB (depends on model size, batch size, seq len; activations_and_overhead ≈ 1-2 GiB for small models (0.6B-4B)
~1-2 GiB for small models like 0.6B, ~3-5 GiB for larger models (7B+)
~5-8 GiB for larger models like 8B+ with CUDA graphs) (CUDA context, graphs, request pools — allocated outside mem_fraction_static)
``` ```
--- ---
......
...@@ -182,6 +182,11 @@ get_model_params() { ...@@ -182,6 +182,11 @@ get_model_params() {
# MHA (not GQA): num_key_value_heads == num_attention_heads == 32 # MHA (not GQA): num_key_value_heads == num_attention_heads == 32
deepseek-ai/deepseek-llm-7b-base) deepseek-ai/deepseek-llm-7b-base)
pb=6.9; wb=2; layers=30; kvh=32; hd=128 ;; pb=6.9; wb=2; layers=30; kvh=32; hd=128 ;;
# https://huggingface.co/Qwen/Qwen3-Embedding-4B/raw/main/config.json
# params_b from model.safetensors.index.json metadata.total_size / 2 / 1e9
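# head_dim = hidden_size(2560) / num_attention_heads(32) = 80
# NOTE(review): Qwen3-family configs usually set an explicit "head_dim" (typically 128) that
# differs from hidden_size / num_attention_heads — confirm against config.json before relying on hd.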
Qwen/Qwen3-Embedding-4B)
pb=4.0; wb=2; layers=36; kvh=8; hd=80 ;;
# https://huggingface.co/llava-hf/llava-1.5-7b-hf/raw/main/config.json (text_config) # https://huggingface.co/llava-hf/llava-1.5-7b-hf/raw/main/config.json (text_config)
# MHA: num_key_value_heads == num_attention_heads == 32 # MHA: num_key_value_heads == num_attention_heads == 32
llava-hf/llava-1.5-7b-hf) llava-hf/llava-1.5-7b-hf)
...@@ -216,9 +221,13 @@ get_model_params() { ...@@ -216,9 +221,13 @@ get_model_params() {
# #
# Per-engine constants (calibrated from measurements on RTX 6000 Ada 48 GiB): # Per-engine constants (calibrated from measurements on RTX 6000 Ada 48 GiB):
# vllm: base=1.2, scale=1.0 → 0.6B≈2.0, 8B≈4.0, 30B≈6.7 # vllm: base=1.2, scale=1.0 → 0.6B≈2.0, 8B≈4.0, 30B≈6.7
# sglang: base=2.5, scale=1.5 → 0.6B≈3.7, 8B≈6.7, 30B≈10.8 # sglang: base=1.5, scale=1.0 → 0.6B≈2.3, 8B≈4.3, 30B≈7.0
# trtllm: base=2.0, scale=1.2 → 0.6B≈2.9, 8B≈5.4, 30B≈8.6 # trtllm: base=2.0, scale=1.2 → 0.6B≈2.9, 8B≈5.4, 30B≈8.6
# #
# sglang overhead was re-calibrated via profile_pytest.py bisection on
# RTX 6000 Ada 48 GiB. Observed CUDA overhead (outside --mem-fraction-static):
# Qwen3-0.6B: ~1.8 GiB. Previous coefficients (2.5, 1.5) over-estimated by ~2x.
#
# If the 4th argument is a number, it's used directly (backward compatible). # If the 4th argument is a number, it's used directly (backward compatible).
# If omitted, defaults to 2.0 (backward compatible). # If omitted, defaults to 2.0 (backward compatible).
# #
...@@ -241,7 +250,7 @@ estimate_worker_vram() { ...@@ -241,7 +250,7 @@ estimate_worker_vram() {
local overhead local overhead
case "$engine_or_overhead" in case "$engine_or_overhead" in
vllm) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 1.2 + 1.0 * sqrt(p) }') ;; vllm) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 1.2 + 1.0 * sqrt(p) }') ;;
sglang) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 2.5 + 1.5 * sqrt(p) }') ;; sglang) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 1.5 + 1.0 * sqrt(p) }') ;;
trtllm) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 2.0 + 1.2 * sqrt(p) }') ;; trtllm) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 2.0 + 1.2 * sqrt(p) }') ;;
*) overhead="$engine_or_overhead" ;; *) overhead="$engine_or_overhead" ;;
esac esac
......
...@@ -456,11 +456,13 @@ The profiler sets the `_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE` environment variable ...@@ -456,11 +456,13 @@ The profiler sets the `_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE` environment variable
| Engine | CLI flag | Launch script support | | Engine | CLI flag | Launch script support |
|---------|----------------------------------|-----------------------| |---------|----------------------------------|-----------------------|
| vLLM | `--gpu-memory-utilization` | Implemented in `agg.sh`, `disagg.sh`, etc. | | vLLM | `--gpu-memory-utilization` | Implemented in `agg.sh`, `disagg.sh`, etc. via `build_gpu_mem_args` |
| SGLang | `--mem-fraction-static` | Not yet implemented (TODO) | | SGLang | `--mem-fraction-static` | Implemented in `agg.sh`, `agg_embed.sh`, `disagg.sh`, `agg_router.sh`, `disagg_same_gpu.sh` via `build_gpu_mem_args`. Multimodal scripts (`multimodal_epd.sh`, `multimodal_disagg.sh`) split the override proportionally between workers. |
| TRT-LLM | `--free-gpu-memory-fraction` | Not yet implemented (has its own `DYN_TRTLLM_FREE_GPU_MEMORY_FRACTION`, TODO: unify) | | TRT-LLM | `--free-gpu-memory-fraction` | Not yet implemented (has its own `DYN_TRTLLM_FREE_GPU_MEMORY_FRACTION`, TODO: unify) |
Scripts that already hard-code their own memory fraction (e.g. `agg_multimodal.sh` with 0.85) have a TODO to honor `_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE` in the future. If the profiler detects constant VRAM across all probes (meaning the env var is ignored), it prints a warning and skips marker recommendations. **Note on sglang:** Unlike vLLM (where `--max-model-len` affects KV cache sizing), sglang's `--mem-fraction-static` is the sole knob for KV cache allocation. `--context-length` and `--max-running-requests` only affect request scheduling, not memory allocation. See `examples/common/gpu_utils.md` for details.
If the profiler detects constant VRAM across all probes (meaning the env var is ignored), it prints a warning and skips marker recommendations.
### Usage ### Usage
......
...@@ -45,7 +45,10 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join( ...@@ -45,7 +45,10 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
# SGLang test configurations # SGLang test configurations
# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached) # NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time # TODO: Now that these tests use dynamic ports and each config has a max_vram_gib marker,
# optimize the runtime by bin-packing multiple engine deployments in parallel on the same GPU.
# A future collector/launcher can sum max_vram_gib values to decide how many tests fit
# concurrently without exceeding available VRAM.
sglang_configs = { sglang_configs = {
"aggregated": SGLangConfig( "aggregated": SGLangConfig(
# Uses backend agg.sh (with metrics enabled) for testing standard # Uses backend agg.sh (with metrics enabled) for testing standard
...@@ -55,8 +58,9 @@ sglang_configs = { ...@@ -55,8 +58,9 @@ sglang_configs = {
script_name="agg.sh", script_name="agg.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.max_vram_gib(6.1), # observed peak 5.6 GiB (+10% safety)
pytest.mark.timeout(240), # profiled 34.4s on A6000
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.timeout(240), # 3x measured time (39s) + download time (120s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
...@@ -76,7 +80,7 @@ sglang_configs = { ...@@ -76,7 +80,7 @@ sglang_configs = {
marks=[ marks=[
pytest.mark.gpu_2, pytest.mark.gpu_2,
pytest.mark.pre_merge, pytest.mark.pre_merge,
], ], # TODO(gpu_2): profile max_vram, timeout, add markers (separate PR)
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
...@@ -96,8 +100,10 @@ sglang_configs = { ...@@ -96,8 +100,10 @@ sglang_configs = {
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.skip(reason="unstable"), pytest.mark.skip(reason="unstable"),
# TODO: profile to get max_vram and timeout (currently skipped)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
delayed_start=30,
env={}, env={},
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[ request_payloads=[
...@@ -126,7 +132,7 @@ sglang_configs = { ...@@ -126,7 +132,7 @@ sglang_configs = {
marks=[ marks=[
pytest.mark.gpu_2, pytest.mark.gpu_2,
pytest.mark.pre_merge, pytest.mark.pre_merge,
], ], # TODO(gpu_2): profile max_vram, timeout, add markers (separate PR)
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={ env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info", "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
...@@ -154,9 +160,9 @@ sglang_configs = { ...@@ -154,9 +160,9 @@ sglang_configs = {
script_name="template_verifier.sh", script_name="template_verifier.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.timeout(240), # profiled 11.7s on A6000 (no GPU model load)
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.nightly, pytest.mark.nightly,
pytest.mark.timeout(240), # 3x measured time (20s) + download time (180s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
...@@ -167,13 +173,21 @@ sglang_configs = { ...@@ -167,13 +173,21 @@ sglang_configs = {
) )
], ],
), ),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements # NOTE: Pack all workers on 1 GPU for lower CI resource requirements.
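# NOTE: multimodal_epd.sh sets --mem-fraction-static explicitly from the DYN_ENCODE_GPU_MEM
# / DYN_WORKER_GPU_MEM env vars. In --single-gpu mode the script rescales both values so
# they sum to _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE (when set), preserving their ratio.
# A combined usage of ~23.6 GiB was reported regardless of fraction overrides.
# NOTE(review): that figure disagrees with the 12.1 GiB observed peak in the marker below — confirm.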
"multimodal_e_pd_qwen": SGLangConfig( "multimodal_e_pd_qwen": SGLangConfig(
# E/P/D architecture: Encode, Prefill, Decode workers all on GPU 0 # E/P/D architecture: Encode, Prefill, Decode workers all on GPU 0
name="multimodal_e_pd_qwen", name="multimodal_e_pd_qwen",
directory=sglang_dir, directory=sglang_dir,
script_name="multimodal_epd.sh", script_name="multimodal_epd.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], marks=[
pytest.mark.gpu_1,
pytest.mark.max_vram_gib(13.3), # observed peak 12.1 GiB (+10% safety)
pytest.mark.timeout(360), # profiled 31.0s on A6000
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct", model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"], script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=360, timeout=360,
...@@ -212,8 +226,9 @@ sglang_configs = { ...@@ -212,8 +226,9 @@ sglang_configs = {
script_name="multimodal_disagg.sh", script_name="multimodal_disagg.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.max_vram_gib(17.7), # observed peak 16.1 GiB (+10% safety)
pytest.mark.timeout(360), # profiled 36.0s on A6000
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.timeout(360),
], ],
model="Qwen/Qwen3-VL-2B-Instruct", model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"], script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
...@@ -246,9 +261,10 @@ sglang_configs = { ...@@ -246,9 +261,10 @@ sglang_configs = {
script_name="agg.sh", script_name="agg.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.max_vram_gib(21.0), # observed peak 19.1 GiB (+10% safety)
pytest.mark.timeout(300), # profiled 41.3s on A6000
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.nightly, pytest.mark.nightly,
pytest.mark.timeout(300),
], ],
model="Qwen/Qwen2.5-VL-7B-Instruct", model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=[ script_args=[
...@@ -284,9 +300,10 @@ sglang_configs = { ...@@ -284,9 +300,10 @@ sglang_configs = {
script_name="agg_embed.sh", script_name="agg_embed.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.max_vram_gib(12.1), # observed peak 11.0 GiB (+10% safety)
pytest.mark.timeout(270), # profiled 25.5s on A6000
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.nightly, pytest.mark.nightly,
pytest.mark.timeout(270), # 3x measured time (29s) + download time (180s)
], ],
model="Qwen/Qwen3-Embedding-4B", model="Qwen/Qwen3-Embedding-4B",
delayed_start=0, delayed_start=0,
...@@ -321,10 +338,9 @@ sglang_configs = { ...@@ -321,10 +338,9 @@ sglang_configs = {
script_name="agg.sh", script_name="agg.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.max_vram_gib(16.2), # observed peak 14.8 GiB (+10% safety)
pytest.mark.timeout(420), # profiled 73s on A6000
pytest.mark.post_merge, pytest.mark.post_merge,
pytest.mark.timeout(
420
), # Total test timeout: 2x measured average (79.36s) + download time (240s) for 7B model
], ],
model="deepseek-ai/deepseek-llm-7b-base", model="deepseek-ai/deepseek-llm-7b-base",
script_args=[ script_args=[
...@@ -346,6 +362,7 @@ sglang_configs = { ...@@ -346,6 +362,7 @@ sglang_configs = {
pytest.mark.post_merge, pytest.mark.post_merge,
pytest.mark.timeout(240), pytest.mark.timeout(240),
pytest.mark.skip(reason="DYN-2261"), pytest.mark.skip(reason="DYN-2261"),
# TODO: profile to get max_vram (currently skipped)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={"DYN_ENABLE_ANTHROPIC_API": "1"}, env={"DYN_ENABLE_ANTHROPIC_API": "1"},
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment