Unverified Commit 423e4b45 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat: add an auto GPU VRAM estimator for disagg-same-GPU (#6868)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent f0ac86e8
...@@ -2,31 +2,37 @@ ...@@ -2,31 +2,37 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
# Disaggregated serving on a single GPU (prefill + decode share memory). # Disaggregated prefill/decode on a SINGLE GPU.
# GPUs: 1 (requires 16+ GB VRAM) # Per-worker VRAM is estimated from model parameters below. Override individual
# knobs (CONTEXT_LENGTH, MAX_RUNNING_REQUESTS) via env vars, or set
# DYN_GPU_MEMORY_FRACTION_OVERRIDE to bypass the calculation entirely.
# #
# Usage: ./disagg_same_gpu.sh [GPU_MEM_FRACTION] # Measured reference (Qwen/Qwen3-0.6B, --context-length 4096, RTX 6000 Ada 48 GiB):
# GPU_MEM_FRACTION: Fraction of GPU memory to use per worker (default: 0.45) # estimate (from gpu_utils.sh) : ~5.7 GiB per worker (w=1.1 + kv=0.9 + oh=3.7)
# Example: ./disagg_same_gpu.sh 0.45 # actual (nvidia-smi) : ~5.3 GiB per worker (~10.9 GiB total)
# fraction per worker (48 GiB) : 0.12
# KV cache : 25,536-29,712 tokens per worker
# Handles full 4096-token context with --max-running-requests 2.
# GPU memory fraction to use per worker (default: 0.45 = 45% each = 90% total for both workers) SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
GPU_MEM_FRACTION="${1:-0.45}" source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
# Check GPU memory before starting disaggregated mode on single GPU MODEL="Qwen/Qwen3-0.6B"
FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
if [ $? -ne 0 ]; then # ---- Tunable (override via env vars) ----
echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?" CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
exit 1 MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-2}"
fi
REQUIRED_GB=16 # ---- Estimate per-worker VRAM (see examples/common/gpu_utils.md) ----
# Use Python for floating-point comparison to avoid bc dependency # Sets _EW_WEIGHTS_GIB, _EW_KV_GIB, _EW_OVERHEAD_GIB, _EW_TOTAL_GIB
if python3 -c "import sys; sys.exit(0 if float('$FREE_GPU_GB') >= $REQUIRED_GB else 1)"; then estimate_worker_vram "$MODEL" "$CONTEXT_LENGTH" "$MAX_RUNNING_REQUESTS" sglang
echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"
# DYN_GPU_MEMORY_FRACTION_OVERRIDE takes precedence (profiler binary search).
# In single-GPU mode, split the override evenly between the two workers.
if [[ -n "${DYN_GPU_MEMORY_FRACTION_OVERRIDE:-}" ]]; then
GPU_MEM_FRACTION=$(awk -v f="$DYN_GPU_MEMORY_FRACTION_OVERRIDE" 'BEGIN { printf "%.2f", f / 2 }')
else else
echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB" GPU_MEM_FRACTION=$(gpu_worker_fraction sglang)
echo "Please free up GPU memory before running disaggregated mode on single GPU."
exit 1
fi fi
# Setup cleanup trap # Setup cleanup trap
...@@ -39,14 +45,15 @@ cleanup() { ...@@ -39,14 +45,15 @@ cleanup() {
trap cleanup EXIT INT TERM trap cleanup EXIT INT TERM
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "==========================================" echo "=========================================="
echo "Launching Disaggregated (same GPU)" echo "Launching Disaggregated (same GPU)"
echo "==========================================" echo "=========================================="
echo "Model: $MODEL" echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT" echo "Frontend: http://localhost:$HTTP_PORT"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker" echo "Context len: $CONTEXT_LENGTH"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
echo " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "==========================================" echo "=========================================="
echo "" echo ""
echo "Example test command:" echo "Example test command:"
...@@ -69,8 +76,8 @@ DYNAMO_PID=$! ...@@ -69,8 +76,8 @@ DYNAMO_PID=$!
# run prefill worker with metrics on port 8081 # run prefill worker with metrics on port 8081
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
...@@ -78,12 +85,13 @@ python3 -m dynamo.sglang \ ...@@ -78,12 +85,13 @@ python3 -m dynamo.sglang \
--disaggregation-bootstrap-port 12345 \ --disaggregation-bootstrap-port 12345 \
--host 0.0.0.0 \ --host 0.0.0.0 \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--mem-fraction-static ${GPU_MEM_FRACTION} \ --mem-fraction-static "${GPU_MEM_FRACTION}" \
--chunked-prefill-size 4096 \ --context-length "$CONTEXT_LENGTH" \
--max-prefill-tokens 4096 \ --chunked-prefill-size "$CONTEXT_LENGTH" \
--max-prefill-tokens "$CONTEXT_LENGTH" \
--enable-memory-saver \ --enable-memory-saver \
--delete-ckpt-after-loading \ --delete-ckpt-after-loading \
--max-running-requests 2 \ --max-running-requests "$MAX_RUNNING_REQUESTS" \
--enable-metrics & --enable-metrics &
PREFILL_PID=$! PREFILL_PID=$!
...@@ -99,8 +107,8 @@ sleep 5 ...@@ -99,8 +107,8 @@ sleep 5
# run decode worker with metrics on port 8082 (foreground) # run decode worker with metrics on port 8082 (foreground)
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
...@@ -108,11 +116,11 @@ python3 -m dynamo.sglang \ ...@@ -108,11 +116,11 @@ python3 -m dynamo.sglang \
--disaggregation-bootstrap-port 12345 \ --disaggregation-bootstrap-port 12345 \
--host 0.0.0.0 \ --host 0.0.0.0 \
--disaggregation-transfer-backend nixl \ --disaggregation-transfer-backend nixl \
--mem-fraction-static ${GPU_MEM_FRACTION} \ --mem-fraction-static "${GPU_MEM_FRACTION}" \
--chunked-prefill-size 4096 \ --context-length "$CONTEXT_LENGTH" \
--max-prefill-tokens 4096 \ --chunked-prefill-size "$CONTEXT_LENGTH" \
--max-prefill-tokens "$CONTEXT_LENGTH" \
--enable-memory-saver \ --enable-memory-saver \
--delete-ckpt-after-loading \ --delete-ckpt-after-loading \
--max-running-requests 2 \ --max-running-requests "$MAX_RUNNING_REQUESTS" \
--enable-metrics --enable-metrics
#!/bin/bash #!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is estimated from model parameters below. Override individual
# knobs (MAX_SEQ_LEN, MAX_CONCURRENT_SEQS) via env vars, or set
# DYN_GPU_MEMORY_FRACTION_OVERRIDE to bypass the calculation entirely.
#
# NOTE — trtllm fraction semantics differ from vllm/sglang:
# vllm/sglang: fraction of TOTAL VRAM (weights + KV + activations all inside)
# trtllm: fraction of FREE VRAM (KV cache only, after model load)
# gpu_worker_fraction("trtllm") handles this — see gpu_utils.sh / gpu_utils.md.
#
# Measured reference (Qwen/Qwen3-0.6B, --max-seq-len 4096, RTX 6000 Ada 48 GiB):
# estimate (from gpu_utils.sh) : ~8.0 GiB per worker (~16.0 GiB total)
# actual (nvidia-smi) : ~7.4 GiB per worker (~14.8 GiB total)
# fraction per worker (free) : 0.05
# Overestimating is intentional -- better to pad than OOM.
# Disaggregated mode on single GPU - for testing only SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Both prefill and decode workers share the same GPU with reduced memory source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
# Check GPU memory availability MODEL="Qwen/Qwen3-0.6B"
FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
if [ $? -ne 0 ]; then
echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
exit 1
fi
REQUIRED_GB=16 # ---- Tunable (override via env vars) ----
# Use bash arithmetic instead of bc to avoid external dependency MAX_SEQ_LEN="${MAX_SEQ_LEN:-4096}"
FREE_GPU_INT=$(python3 -c "print(int(float('$FREE_GPU_GB')))" 2>/dev/null) MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
if [ $? -ne 0 ]; then
echo "Error: Failed to parse GPU memory value."
exit 1
fi
if (( FREE_GPU_INT < REQUIRED_GB )); then # ---- Estimate per-worker VRAM (see examples/common/gpu_utils.md) ----
echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB" # Sets _EW_WEIGHTS_GIB, _EW_KV_GIB, _EW_OVERHEAD_GIB, _EW_TOTAL_GIB
echo "Please free up GPU memory before running disaggregated mode on single GPU." estimate_worker_vram "$MODEL" "$MAX_SEQ_LEN" "$MAX_CONCURRENT_SEQS" trtllm
exit 1
fi
echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)" # DYN_GPU_MEMORY_FRACTION_OVERRIDE takes precedence (profiler binary search).
# In single-GPU mode, split the override evenly between the two workers.
if [[ -n "${DYN_GPU_MEMORY_FRACTION_OVERRIDE:-}" ]]; then
GPU_MEM_FRACTION=$(awk -v f="$DYN_GPU_MEMORY_FRACTION_OVERRIDE" 'BEGIN { printf "%.2f", f / 2 }')
else
GPU_MEM_FRACTION=$(gpu_worker_fraction trtllm)
fi
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/decode.yaml"}
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"} export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
export MODALITY=${MODALITY:-"text"} export MODALITY=${MODALITY:-"text"}
...@@ -69,14 +78,28 @@ while [[ $# -gt 0 ]]; do ...@@ -69,14 +78,28 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
# Enable tracing if requested # Build --override-engine-args JSON.
TRACE_ARGS=() # Always override free_gpu_memory_fraction so the script controls KV cache size,
# matching how vllm (--gpu-memory-utilization) and sglang (--mem-fraction-static)
# pass memory parameters from the launch script.
OVERRIDE_PAIRS="\"kv_cache_config\": {\"free_gpu_memory_fraction\": ${GPU_MEM_FRACTION}}"
if [ "$ENABLE_OTEL" = true ]; then if [ "$ENABLE_OTEL" = true ]; then
export DYN_LOGGING_JSONL=true export DYN_LOGGING_JSONL=true
export OTEL_EXPORT_ENABLED=1 export OTEL_EXPORT_ENABLED=1
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317} export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }") OVERRIDE_PAIRS="${OVERRIDE_PAIRS}, \"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\""
fi fi
OVERRIDE_ARGS=(--override-engine-args "{${OVERRIDE_PAIRS}}")
echo "=========================================="
echo "Launching Disaggregated on Same GPU (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Max seq len: $MAX_SEQ_LEN"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
echo " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "=========================================="
# run frontend # run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \ OTEL_SERVICE_NAME=dynamo-frontend \
...@@ -88,25 +111,24 @@ OTEL_SERVICE_NAME=dynamo-worker-prefill \ ...@@ -88,25 +111,24 @@ OTEL_SERVICE_NAME=dynamo-worker-prefill \
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$MODEL" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--publish-events-and-metrics \ --publish-events-and-metrics \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
"${TRACE_ARGS[@]}" & "${OVERRIDE_ARGS[@]}" &
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker (shares GPU with prefill) # run decode worker (shares GPU with prefill) - foreground
OTEL_SERVICE_NAME=dynamo-worker-decode \ OTEL_SERVICE_NAME=dynamo-worker-decode \
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$MODEL" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--publish-events-and-metrics \ --publish-events-and-metrics \
--disaggregation-mode decode \ --disaggregation-mode decode \
"${TRACE_ARGS[@]}" "${OVERRIDE_ARGS[@]}"
...@@ -2,38 +2,39 @@ ...@@ -2,38 +2,39 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
# Usage: ./disagg_same_gpu.sh # Disaggregated prefill/decode on a SINGLE GPU.
# Automatically calculates GPU memory fraction so each worker gets 4GB # Per-worker VRAM is estimated from model parameters below. Override individual
# knobs (MAX_MODEL_LEN, MAX_CONCURRENT_SEQS) via env vars, or set
# DYN_GPU_MEMORY_FRACTION_OVERRIDE to bypass the calculation entirely.
#
# Measured reference (Qwen/Qwen3-0.6B, --max-model-len 4096, RTX 6000 Ada 48 GiB):
# estimate (from gpu_utils.sh) : ~4.0 GiB per worker (~8.0 GiB total)
# actual (nvidia-smi) : ~3.4 GiB per worker (~6.7 GiB total)
# fraction per worker (for 48 GiB) : 0.09
# The ~1.3 GiB pad comes from the overhead term (CUDA ctx + activations).
# Overestimating is intentional -- better to pad than OOM.
# Get total and free GPU memory SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
GPU_MEM_INFO=$(python3 -c "import torch; free, total = torch.cuda.mem_get_info(); print(f'{free/1024**3:.2f} {total/1024**3:.2f}')" 2>/dev/null) source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
if [ $? -ne 0 ]; then
echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
exit 1
fi
FREE_GPU_GB=$(echo $GPU_MEM_INFO | awk '{print $1}') MODEL="Qwen/Qwen3-0.6B"
TOTAL_GPU_GB=$(echo $GPU_MEM_INFO | awk '{print $2}')
# Each worker needs 4GB # ---- Tunable (override via env vars) ----
REQUIRED_GB_PER_WORKER=4 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
REQUIRED_GB_TOTAL=8 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Calculate fraction needed per worker (4GB / total GPU memory) # ---- Estimate per-worker VRAM (see examples/common/gpu_utils.md) ----
GPU_MEM_FRACTION=$(python3 -c "print(f'{$REQUIRED_GB_PER_WORKER / $TOTAL_GPU_GB:.3f}')") # Sets _EW_WEIGHTS_GIB, _EW_KV_GIB, _EW_OVERHEAD_GIB, _EW_TOTAL_GIB
estimate_worker_vram "$MODEL" "$MAX_MODEL_LEN" "$MAX_CONCURRENT_SEQS" vllm
# Check if we have enough free memory # DYN_GPU_MEMORY_FRACTION_OVERRIDE takes precedence (profiler binary search).
if python3 -c "import sys; sys.exit(0 if float('$FREE_GPU_GB') >= $REQUIRED_GB_TOTAL else 1)"; then # In single-GPU mode, split the override evenly between the two workers.
echo "GPU memory check passed: ${FREE_GPU_GB}GB free / ${TOTAL_GPU_GB}GB total (required: ${REQUIRED_GB_TOTAL}GB)" if [[ -n "${DYN_GPU_MEMORY_FRACTION_OVERRIDE:-}" ]]; then
echo "Using ${GPU_MEM_FRACTION} memory fraction per worker (${REQUIRED_GB_PER_WORKER}GB each)" GPU_MEM_FRACTION=$(awk -v f="$DYN_GPU_MEMORY_FRACTION_OVERRIDE" 'BEGIN { printf "%.2f", f / 2 }')
else else
echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB_TOTAL}GB, Available: ${FREE_GPU_GB}GB" GPU_MEM_FRACTION=$(gpu_worker_fraction vllm)
echo "Please free up GPU memory before running disaggregated mode on single GPU."
exit 1
fi fi
MODEL="Qwen/Qwen3-0.6B"
# Setup cleanup trap # Setup cleanup trap
cleanup() { cleanup() {
echo "Cleaning up background processes..." echo "Cleaning up background processes..."
...@@ -49,6 +50,9 @@ echo "Launching Disaggregated on Same GPU (1 GPU)" ...@@ -49,6 +50,9 @@ echo "Launching Disaggregated on Same GPU (1 GPU)"
echo "==========================================" echo "=========================================="
echo "Model: $MODEL" echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT" echo "Frontend: http://localhost:$HTTP_PORT"
echo "Max seq len: $MAX_MODEL_LEN"
echo "GPU Mem: ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
echo " estimate: weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
echo "==========================================" echo "=========================================="
echo "" echo ""
echo "Example test command:" echo "Example test command:"
...@@ -79,8 +83,8 @@ python3 -m dynamo.vllm \ ...@@ -79,8 +83,8 @@ python3 -m dynamo.vllm \
--enforce-eager \ --enforce-eager \
--disaggregation-mode decode \ --disaggregation-mode decode \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization ${GPU_MEM_FRACTION} \ --gpu-memory-utilization "${GPU_MEM_FRACTION}" \
--max-model-len 16384 & --max-model-len "$MAX_MODEL_LEN" &
DECODE_PID=$! DECODE_PID=$!
# Wait for decode worker to initialize before starting prefill worker # Wait for decode worker to initialize before starting prefill worker
...@@ -101,7 +105,6 @@ python3 -m dynamo.vllm \ ...@@ -101,7 +105,6 @@ python3 -m dynamo.vllm \
--enforce-eager \ --enforce-eager \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization ${GPU_MEM_FRACTION} \ --gpu-memory-utilization "${GPU_MEM_FRACTION}" \
--max-model-len 16384 \ --max-model-len "$MAX_MODEL_LEN" \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
# GPU Memory Parameters by Engine
How vLLM, sglang, and TensorRT-LLM interpret memory-related parameters, and how
to estimate total GPU VRAM usage for each.
---
## Quick Reference
| Parameter | vLLM | sglang | TensorRT-LLM |
|---|---|---|---|
| Memory fraction | `--gpu-memory-utilization` | `--mem-fraction-static` | `free_gpu_memory_fraction` (YAML/override) |
| Fraction base | Total VRAM | Total VRAM | Free VRAM (after model load) |
| Default fraction | 0.90 | 0.90 | 0.90 |
| Max sequence length | `--max-model-len` | `--context-length` | `max_seq_len` (YAML/override) |
| KV cache size override | `--kv-cache-memory-bytes` | N/A | `max_gpu_total_bytes` (broken in 1.3.0rc5) |
---
## 1. vLLM
### How `--gpu-memory-utilization` works
This is a fraction of **total** GPU VRAM. The engine budgets everything within
this limit:
```
budget = total_vram * gpu_memory_utilization
KV cache = budget - model_weights - peak_activations - framework_overhead
```
At startup, vLLM profiles actual model weight and activation memory, then
pre-allocates the remaining budget as KV cache blocks. The KV pool size is fixed
for the lifetime of the engine.
### How `--max-model-len` works
Sets the maximum total sequence length (input + output tokens). Longer sequences
require more KV cache per request. If the requested `max-model-len` needs more
KV cache than the budget allows, vLLM errors at startup:
```
ValueError: ... X GiB KV cache is needed, which is larger than the available
KV cache memory (Y GiB). ...
```
Reducing `--max-model-len` is the most effective way to reduce VRAM when the
model fits but the KV cache doesn't.
### How `--kv-cache-memory-bytes` works
When set, this overrides the automatic KV cache sizing from
`gpu-memory-utilization`. The engine allocates exactly this many bytes for KV
cache regardless of the fraction. This means `gpu-memory-utilization` still
controls the *overall* VRAM budget (and thus whether the model fits), but the
KV cache portion is pinned to the explicit byte value.
Consequence for profiling: if a script uses `--kv-cache-memory-bytes`,
changing `DYN_GPU_MEMORY_FRACTION_OVERRIDE` (which maps to
`--gpu-memory-utilization`) won't change the KV cache size, only the leftover
headroom for activations and overhead.
### Estimating total GPU usage
```
total_vram ≈ model_weights + kv_cache + activations + overhead
model_weights ≈ num_params * bytes_per_param
(e.g. 7B * 2 bytes for BF16 ≈ 14 GiB)
kv_cache_per_token = 2 * num_layers * num_kv_heads * head_dim * bytes_per_element
(the factor of 2 is for K and V tensors)
kv_cache_total = kv_cache_per_token * max_model_len * max_concurrent_seqs
overhead ≈ engine-dependent (auto-computed by estimate_worker_vram):
vllm: 1.2 + 1.0 * sqrt(params_b) GiB (0.6B≈2.0, 8B≈4.0)
sglang: 2.5 + 1.5 * sqrt(params_b) GiB (0.6B≈3.7, 8B≈6.7)
trtllm: 2.0 + 1.2 * sqrt(params_b) GiB (0.6B≈2.9, 8B≈5.4)
```
Rule of thumb: set `gpu-memory-utilization` so that
`total_vram * fraction >= model_weights + 2 GiB`. The rest becomes KV cache.
---
## 2. sglang
### How `--mem-fraction-static` works
Like vLLM, this is a fraction of **total** GPU VRAM:
```
budget = total_vram * mem_fraction_static
KV cache pool = budget - model_weights
```
The budget covers model weights and the KV cache pool. Activations and CUDA
graph buffers are allocated *outside* this budget from the remaining VRAM.
This is slightly different from vLLM (which includes activations in the budget).
sglang recommends keeping 5-8 GiB free for activations and overhead. If you
see OOM errors, decrease `--mem-fraction-static` by 0.01-0.05 increments.
### How `--context-length` works
Equivalent to vLLM's `--max-model-len`. Defaults to the model's native context
window. Reducing it shrinks the per-request KV cache requirement and allows more
concurrent sequences.
### Estimating total GPU usage
```
total_vram ≈ model_weights + kv_cache_pool + activations_and_overhead
kv_cache_pool = total_vram * mem_fraction_static - model_weights
activations_and_overhead ≈ 1-8 GiB (depends on model size, batch size, seq len;
~1-2 GiB for small models like 0.6B,
~5-8 GiB for larger models like 8B+ with CUDA graphs)
```
---
## 3. TensorRT-LLM
### How `free_gpu_memory_fraction` works
This is a fraction of **free** VRAM (not total). The engine:
1. Loads model weights and builds the TRT engine (fixed cost).
2. Queries remaining free GPU memory.
3. Allocates `free_memory * free_gpu_memory_fraction` for the KV cache pool.
```
kv_cache = free_vram_after_model_load * free_gpu_memory_fraction
```
This means the same fraction yields different absolute KV cache sizes depending
on how much VRAM the model consumed. A 5 GiB model on a 48 GiB GPU leaves
~43 GiB free; fraction=0.24 gives ~10 GiB KV cache. A 30 GiB model leaves
~18 GiB free; fraction=0.24 gives only ~4 GiB.
Set via YAML config, CLI, or env var:
```bash
--override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'
DYN_TRTLLM_OVERRIDE_ENGINE_ARGS='{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'
```
### How `max_seq_len` works
Maximum total sequence length. Defaults to the model's native context.
Sequences exceeding this limit are rejected at runtime.
**VRAM impact: none (PyTorch backend).** Reducing max_seq_len from 40960 to
2048 had zero effect on total VRAM or KV cache size in testing (Qwen3-0.6B,
trtllm 1.3.0rc5). The PyTorch backend does not pre-allocate internal buffers
proportional to max_seq_len; KV cache size is determined solely by
`free_gpu_memory_fraction`. This differs from vLLM/sglang where reducing
context length measurably reduces memory.
Override via:
```bash
--override-engine-args '{"max_seq_len": 4096}'
```
### Override gotcha: sub-dict replacement
Overriding any field inside `kv_cache_config` **replaces the entire sub-dict**.
If your YAML has `enable_block_reuse: true` and you override only
`free_gpu_memory_fraction`, you lose `enable_block_reuse`. Always re-include
all fields you need:
```json
{"kv_cache_config": {"free_gpu_memory_fraction": 0.15, "enable_block_reuse": true}}
```
### How `max_num_tokens` works
Maximum batched input tokens per iteration. Primarily a throughput knob.
**VRAM impact: none.** Reducing it from 8192 → 256 had no measurable effect on
total VRAM (41,643 MiB at 256 vs 41,465 MiB at 8192 — within noise; the slight
*increase* is because a smaller activation footprint leaves more free VRAM for
the fraction to claim as KV cache).
### `max_gpu_total_bytes` (broken)
Intended as an absolute byte cap for the KV cache. As of trtllm 1.3.0rc5, this
field is **ignored**. Setting a 5 GiB cap with `free_gpu_memory_fraction=0.95`
still allocated ~42 GiB of KV cache. Setting `free_gpu_memory_fraction=0.0`
and relying only on `max_gpu_total_bytes` fails with
`"Impossible to fit any sequence in kvCache"`. Do not rely on this field.
### Override precedence
```
--override-engine-args JSON > --extra-engine-args YAML > CLI flags
```
The `DYN_TRTLLM_OVERRIDE_ENGINE_ARGS` env var is equivalent to
`--override-engine-args` and avoids shell quoting issues with scripts whose
arg parsers consume unknown flags before passing `"$@"`.
### Estimating total GPU usage
```
total_vram ≈ model_weights + engine_overhead + kv_cache
model_weights ≈ num_params * bytes_per_param / tensor_parallel_size
engine_overhead ≈ 2.0 + 1.2 * sqrt(params_b) GiB (CUDA context + TRT buffers + activations)
kv_cache = free_vram_after_model_load * free_gpu_memory_fraction
```
Engine overhead is auto-computed by `estimate_worker_vram` when called with the
`trtllm` engine name. Examples: 0.6B → 2.9 GiB, 8B → 5.4 GiB, 30B → 8.6 GiB.
### Empirical validation (Qwen3-0.6B, RTX 6000 Ada 48 GiB, trtllm 1.3.0rc5)
Controlled test: single worker via agg.sh, one override at a time.
| # | Override | Total VRAM | KV Cache | Tokens |
|---|---------|-----------|----------|--------|
| 1 | Baseline (YAML frac=0.85) | 41,465 MiB | 38.04 GiB | 356,160 |
| 2 | `free_gpu_memory_fraction=0.15` | 9,383 MiB | 6.71 GiB | 62,848 |
| 3 | `max_num_tokens=256` | 41,643 MiB | 38.26 GiB | 358,208 |
| 4 | `max_seq_len=4096` | 41,469 MiB | 38.05 GiB | 356,192 |
| 5 | `max_seq_len=2048` | 41,469 MiB | 38.05 GiB | 356,192 |
| 6 | seq=4096 + frac=0.15 | 9,383 MiB | 6.71 GiB | 62,848 |
| 7 | tokens=256 + seq=4096 + frac=0.15 | 9,377 MiB | 6.75 GiB | 63,200 |
**Conclusion:** `free_gpu_memory_fraction` is the **sole effective knob** for
trtllm VRAM control. Neither `max_seq_len` nor `max_num_tokens` reduces memory.
Combining overrides (test 7) produces no additional benefit over the fraction
alone (test 2).
---
## Why vLLM/sglang fractions are NOT interchangeable with TensorRT-LLM
Consider wanting 10 GiB of KV cache on a 48 GiB GPU with a 5 GiB model:
| Engine | Fraction meaning | Calculation | Result |
|---|---|---|---|
| vLLM | 10/48 = 0.21 of total | `48 * 0.21 = 10 GiB` total budget; subtracting the 5 GiB model leaves only 5 GiB of KV cache | Wrong — need a higher fraction |
| sglang | Same as vLLM | Same math | Same problem |
| TensorRT-LLM | 10/43 = 0.23 of free | `43 * 0.23 = 10 GiB` KV cache | Correct |
For vLLM/sglang, you actually need `(model + kv) / total = (5 + 10) / 48 = 0.31`
to get 10 GiB of KV cache with a 5 GiB model.
The helper functions in `gpu_utils.sh` handle these differences:
- `gpu_gb_to_total_fraction`: for vLLM/sglang (fraction of total VRAM)
- `gpu_gb_to_free_fraction`: for TensorRT-LLM (fraction of free VRAM)
- `gpu_worker_fraction <engine>`: unified wrapper — reads `_EW_*` vars from
`estimate_worker_vram` and calls the right function for the engine.
Launch scripts use `gpu_worker_fraction` so they all follow the same pattern:
```bash
estimate_worker_vram "$MODEL" "$SEQ_LEN" "$CONCURRENCY" trtllm
GPU_MEM_FRACTION=$(gpu_worker_fraction trtllm)
```
---
## KV Cache Memory Per Token
The formula for KV cache memory per token is the same across all engines:
```
kv_bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * bytes_per_element
```
| Model | Layers | KV Heads | Head Dim | Dtype | Per Token |
|---|---|---|---|---|---|
| Qwen3-0.6B | 28 | 8 | 128 | BF16 | 112 KiB |
| Llama-3.1-8B | 32 | 8 | 128 | BF16 | 128 KiB |
| Llama-3.1-70B | 80 | 8 | 128 | BF16 | 320 KiB |
| Qwen2.5-VL-7B | 28 | 4 | 128 | BF16 | 56 KiB |
To estimate KV cache for a given context length:
```
kv_cache_gib = kv_bytes_per_token * max_model_len * max_concurrent_seqs / (1024^3)
```
---
## `DYN_GPU_MEMORY_FRACTION_OVERRIDE`
Environment variable used by Dynamo's VRAM profiler to binary-search the minimum
memory fraction a script needs.
- Maps to `--gpu-memory-utilization` in vLLM and `--mem-fraction-static` in sglang.
- For TensorRT-LLM, maps to `kv_cache_config.free_gpu_memory_fraction` via
`--override-engine-args`.
- Launch scripts use `gpu_worker_fraction <engine>` to compute the default
fraction; the override bypasses this and splits the raw value between workers.
- Scripts that use `--kv-cache-memory-bytes` (vLLM) bypass the fraction-based KV
cache sizing, making the profiler's fraction override ineffective for KV cache.
Those scripts should warn when `DYN_GPU_MEMORY_FRACTION_OVERRIDE` is set.
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Shared GPU utility functions for launch scripts.
#
# Usage:
# source "$(dirname "$(readlink -f "$0")")/../common/gpu_utils.sh"
# # or with SCRIPT_DIR already set:
# source "$SCRIPT_DIR/../common/gpu_utils.sh"
#
# Functions:
# get_model_params <model> Set _MP_* vars for a known model's architecture
# estimate_worker_vram <model> ... Set _EW_* vars with per-worker VRAM estimate
# gpu_worker_fraction <engine> Convert _EW_* estimate → engine-appropriate fraction
# gpu_gb_to_total_fraction <gib> Convert absolute GiB → fraction of TOTAL VRAM (vLLM/sglang)
# gpu_gb_to_free_fraction <gib> Convert absolute GiB → fraction of FREE VRAM (TensorRT-LLM)
# get_model_params <model_name>
#
# Sets _MP_* variables for a known model's architecture:
# _MP_PARAMS_B Total parameters in billions (all experts for MoE)
# _MP_WEIGHT_BYTES Bytes per weight element (2=BF16/FP16, 1=FP8)
# _MP_LAYERS Number of transformer layers
# _MP_KV_HEADS Number of key-value heads (GQA groups)
# _MP_HEAD_DIM Dimension per attention head
#
# KV cache is assumed BF16 (2 bytes per element) regardless of weight dtype,
# since FP8 KV cache (--kv-cache-dtype fp8) is opt-in and not the default.
#
# To add a model: look up config.json on HuggingFace for num_hidden_layers,
# num_key_value_heads, and head_dim. For VL/multimodal models, use the
# text_config section. For MoE, _MP_PARAMS_B is the TOTAL param count
# (all experts are loaded into VRAM).
#
# Usage:
# get_model_params "Qwen/Qwen3-0.6B"
# echo "$_MP_LAYERS layers, $_MP_KV_HEADS KV heads"
get_model_params() {
    # Publish the architecture of a known model through the global _MP_*
    # variables. Each case arm loads one spec tuple into the function's own
    # positional parameters, in the order:
    #   params_b  weight_bytes  layers  kv_heads  head_dim
    # Unknown models print a hint on stderr and return 1 without touching
    # any _MP_* variable.
    local name="${1:?usage: get_model_params <model_name>}"

    case "$name" in
        "Qwen/Qwen3-0.6B")
            set -- 0.6 2 28 8 128 ;;
        "Qwen/Qwen2.5-VL-7B-Instruct")
            set -- 8.3 2 28 4 128 ;;
        "Qwen/Qwen3-VL-8B-Instruct")
            set -- 9.2 2 36 8 128 ;;
        "Qwen/Qwen3-30B-A3B" | "Qwen/Qwen3-30B-A3B-Instruct")
            set -- 30.5 2 48 4 128 ;;
        "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8")
            # Same architecture as the BF16 30B-A3B entry, 1-byte weights.
            set -- 30.5 1 48 4 128 ;;
        "meta-llama/Meta-Llama-3.1-8B-Instruct")
            set -- 8.0 2 32 8 128 ;;
        "llava-hf/llava-1.5-7b-hf")
            set -- 7.1 2 32 32 128 ;;
        *)
            echo "get_model_params: unknown model '$name'" >&2
            echo "Add it to get_model_params() in gpu_utils.sh" >&2
            return 1 ;;
    esac

    # Single assignment site: unpack the tuple selected above.
    _MP_PARAMS_B="$1"
    _MP_WEIGHT_BYTES="$2"
    _MP_LAYERS="$3"
    _MP_KV_HEADS="$4"
    _MP_HEAD_DIM="$5"
}
# estimate_worker_vram <model> [max_model_len] [max_concurrent_seqs] [engine_or_overhead]
#
# Calls get_model_params, then sets:
# _EW_WEIGHTS_GIB Estimated model weight memory
# _EW_KV_GIB Estimated KV cache memory
# _EW_OVERHEAD_GIB Overhead used (auto-computed or explicit)
# _EW_TOTAL_GIB Estimated total per-worker VRAM (weights + kv + overhead)
#
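# Formula (weights and kv are computed in bytes, then divided by 1024^3 and
# reported in GiB; overhead is already in GiB):
#   weights = params_b * 1e9 * weight_bytes
#   kv      = 2 (K and V) * layers * kv_heads * head_dim * 2 (BF16 bytes) * seq_len * seqs
#   total   = weights + kv + overhead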
#
# Arguments:
# model HuggingFace model name (required)
# max_model_len Max tokens per sequence (default: 4096)
# max_concurrent_seqs Concurrent sequences to budget for (default: 2)
# engine_or_overhead Engine name OR explicit GiB value (default: 2.0)
#
# If the 4th argument is an engine name (vllm, sglang, trtllm), overhead is
# auto-computed from model parameters:
# overhead = base + scale * sqrt(params_b)
#
# Per-engine constants (calibrated from measurements on RTX 6000 Ada 48 GiB):
# vllm: base=1.2, scale=1.0 → 0.6B≈2.0, 8B≈4.0, 30B≈6.7
# sglang: base=2.5, scale=1.5 → 0.6B≈3.7, 8B≈6.7, 30B≈10.8
# trtllm: base=2.0, scale=1.2 → 0.6B≈2.9, 8B≈5.4, 30B≈8.6
#
# If the 4th argument is a number, it's used directly (backward compatible).
# If omitted, defaults to 2.0 (backward compatible).
#
# See examples/common/gpu_utils.md for the full derivation.
#
# Usage:
# estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 vllm # auto overhead
# estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 trtllm # auto overhead
# estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 3.5 # explicit 3.5 GiB
# estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 # default 2.0 GiB
# echo "$_EW_TOTAL_GIB GiB (w=$_EW_WEIGHTS_GIB kv=$_EW_KV_GIB oh=$_EW_OVERHEAD_GIB)"
estimate_worker_vram() {
    # Estimate the VRAM one worker needs for <model> and publish it through
    # the global _EW_WEIGHTS_GIB, _EW_KV_GIB, _EW_OVERHEAD_GIB and
    # _EW_TOTAL_GIB variables (GiB, one decimal place).
    #
    # Arguments:
    #   $1  model name accepted by get_model_params (required)
    #   $2  max tokens per sequence (default: 4096)
    #   $3  concurrent sequences to budget for (default: 2)
    #   $4  engine name (vllm, sglang, trtllm) or explicit overhead in GiB
    #       (default: 2.0)
    # Returns:
    #   0 on success; 1 if the model is unknown, or if an argument is neither
    #   a known engine name nor a plain non-negative number. Invalid arguments
    #   are rejected before any _EW_* variable is written.
    local model="${1:?usage: estimate_worker_vram <model> [seq_len] [seqs] [engine_or_overhead]}"
    local seqlen="${2:-4096}"
    local seqs="${3:-2}"
    local engine_or_overhead="${4:-2.0}"
    # Plain decimal number: "4096", "2.0", ".5". Kept in a variable so the
    # regex is not subject to shell quoting rules inside [[ =~ ]].
    local number_re='^([0-9]+[.]?[0-9]*|[.][0-9]+)$'

    get_model_params "$model" || return 1

    # awk silently coerces non-numeric strings to 0, which would zero out the
    # KV cache term; refuse such input instead of producing a wrong estimate.
    if [[ ! "$seqlen" =~ $number_re ]]; then
        echo "estimate_worker_vram: max_model_len must be a number, got '$seqlen'" >&2
        return 1
    fi
    if [[ ! "$seqs" =~ $number_re ]]; then
        echo "estimate_worker_vram: max_concurrent_seqs must be a number, got '$seqs'" >&2
        return 1
    fi

    # overhead = base + scale * sqrt(params_b), with per-engine constants.
    local overhead
    case "$engine_or_overhead" in
        vllm)   overhead=$(awk -v p="$_MP_PARAMS_B" 'BEGIN { printf "%.1f", 1.2 + 1.0 * sqrt(p) }') ;;
        sglang) overhead=$(awk -v p="$_MP_PARAMS_B" 'BEGIN { printf "%.1f", 2.5 + 1.5 * sqrt(p) }') ;;
        trtllm) overhead=$(awk -v p="$_MP_PARAMS_B" 'BEGIN { printf "%.1f", 2.0 + 1.2 * sqrt(p) }') ;;
        *)
            # Anything else must be an explicit GiB value. A mistyped engine
            # name would otherwise be treated as an overhead of 0.
            if [[ ! "$engine_or_overhead" =~ $number_re ]]; then
                echo "estimate_worker_vram: '$engine_or_overhead' is neither a supported engine (vllm, sglang, trtllm) nor a GiB value" >&2
                return 1
            fi
            overhead="$engine_or_overhead" ;;
    esac

    local estimate
    estimate=$(awk \
        -v pb="$_MP_PARAMS_B" -v wbytes="$_MP_WEIGHT_BYTES" \
        -v layers="$_MP_LAYERS" -v heads="$_MP_KV_HEADS" -v dim="$_MP_HEAD_DIM" \
        -v seqlen="$seqlen" -v seqs="$seqs" -v overhead="$overhead" \
        'BEGIN {
            gib = 1024 * 1024 * 1024
            w = pb * 1e9 * wbytes / gib
            kv = 2 * layers * heads * dim * 2 * seqlen * seqs / gib
            printf "%.1f %.1f %.1f", w, kv, w + kv + overhead
        }') || return 1

    _EW_OVERHEAD_GIB="$overhead"
    # Pin IFS for this read only: callers running with a strict-mode IFS
    # (no space) would otherwise get all three numbers in the first variable.
    IFS=' ' read -r _EW_WEIGHTS_GIB _EW_KV_GIB _EW_TOTAL_GIB <<< "$estimate"
}
# gpu_worker_fraction <engine> [gpu_index]
#
# Unified fraction calculator for all engines. Reads the _EW_* variables
# set by estimate_worker_vram and returns the engine-appropriate fraction.
#
# Engine semantics (see examples/common/gpu_utils.md):
# vllm/sglang — fraction of TOTAL VRAM. The engine budgets weights + KV +
# activations inside this limit. We pass _EW_TOTAL_GIB.
# trtllm — fraction of FREE VRAM (after model load). The engine uses
# this only for KV cache. We pass _EW_KV_GIB.
#
# This lets every launch script use the same pattern:
# estimate_worker_vram "$MODEL" "$SEQ_LEN" "$CONCURRENCY" "$OVERHEAD_GIB"
# GPU_MEM_FRACTION=$(gpu_worker_fraction "<engine>")
#
# Usage:
# gpu_worker_fraction vllm # uses _EW_TOTAL_GIB, fraction of total
# gpu_worker_fraction sglang # same as vllm
# gpu_worker_fraction trtllm # uses _EW_KV_GIB, fraction of free
# gpu_worker_fraction trtllm 1 # query GPU index 1
gpu_worker_fraction() {
    # Print the memory fraction to hand to <engine>, based on the _EW_*
    # estimate produced by a prior estimate_worker_vram call.
    #
    # Arguments:
    #   $1  engine: vllm | sglang | trtllm (required)
    #   $2  GPU index forwarded to the converter (default: 0)
    # Globals read:
    #   _EW_TOTAL_GIB  (vllm, sglang)
    #   _EW_KV_GIB     (trtllm)
    # Returns:
    #   1 for an unknown engine or when the needed _EW_* value is missing;
    #   otherwise the exit status of the converter function.
    local engine="${1:?usage: gpu_worker_fraction <engine> [gpu_index]}"
    local gpu_idx="${2:-0}"
    local converter budget_gib

    # ${var:-} keeps callers running under `set -u` from aborting on an
    # unset estimate before the explicit check below can report it.
    case "$engine" in
        vllm|sglang)
            converter=gpu_gb_to_total_fraction
            budget_gib="${_EW_TOTAL_GIB:-}" ;;
        trtllm)
            converter=gpu_gb_to_free_fraction
            budget_gib="${_EW_KV_GIB:-}" ;;
        *)
            echo "gpu_worker_fraction: unknown engine '$engine'" >&2
            echo "Supported: vllm, sglang, trtllm" >&2
            return 1 ;;
    esac

    # Without this guard an empty estimate surfaces as the converter's own
    # "usage:" error, which hides the real cause.
    if [[ -z "$budget_gib" ]]; then
        echo "gpu_worker_fraction: no VRAM estimate available; call estimate_worker_vram first" >&2
        return 1
    fi

    "$converter" "$budget_gib" "$gpu_idx"
}
# gpu_gb_to_total_fraction <gib> [gpu_index]
#
# For vLLM / sglang: --gpu-memory-utilization is a fraction of TOTAL GPU memory.
# The engine budgets model weights + KV cache + activations within that limit.
#
# Prints the fraction of total GPU VRAM that <gib> GiB represents.
# Useful for converting portable absolute memory requirements to
# engine-specific fraction parameters (--gpu-memory-utilization, etc).
#
# Examples:
# gpu_gb_to_total_fraction 4 # on 48 GiB GPU → 0.09
# gpu_gb_to_total_fraction 16 # on 48 GiB GPU → 0.34
# gpu_gb_to_total_fraction 4 1 # query GPU index 1 instead of 0
#
# The result is ceil-rounded to 2 decimal places with a minimum of 0.05
# and a maximum of 0.95.
gpu_gb_to_total_fraction() {
    # Print the fraction of a GPU's TOTAL memory that <gib> GiB represents,
    # rounded up to 2 decimals and clamped to [0.05, 0.95].
    #
    # Arguments:
    #   $1  requested memory in GiB (required)
    #   $2  GPU index passed to `nvidia-smi -i` (default: 0)
    # Outputs:
    #   stdout  the fraction, e.g. "0.34"
    #   stderr  a warning banner when <gib> exceeds the GPU's total memory
    # Returns:
    #   1, with nothing on stdout, when nvidia-smi fails or does not report a
    #   positive integer; otherwise the exit status of the final awk.
    local gib="${1:?usage: gpu_gb_to_total_fraction <gib> [gpu_index]}"
    local gpu_idx="${2:-0}"
    # Positive integer, optionally padded with whitespace. A numeric -eq test
    # is not enough: on a non-numeric reply (nvidia-smi may print a
    # placeholder such as "[N/A]") it is an arithmetic error, and the
    # division below would then be by zero.
    local positive_int_re='^[[:space:]]*0*[1-9][0-9]*[[:space:]]*$'
    local total_mib

    # The "|| total_mib=" fallback keeps a failing nvidia-smi from aborting
    # callers that run under `set -e` before the error below is printed.
    total_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i "$gpu_idx" 2>/dev/null) || total_mib=""
    if [[ ! "$total_mib" =~ $positive_int_re ]]; then
        echo "gpu_gb_to_total_fraction: failed to query GPU $gpu_idx total memory" >&2
        return 1
    fi

    local total_gib
    total_gib=$(awk -v t="$total_mib" 'BEGIN { printf "%.1f", t / 1024 }')

    # awk exits 0 (shell "true") when the request is larger than the card.
    if awk -v gib="$gib" -v total="$total_mib" 'BEGIN { exit (gib * 1024 > total) ? 0 : 1 }'; then
        echo "" >&2
        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
        echo "WARNING: Requested ${gib} GiB but GPU $gpu_idx only has ${total_gib} GiB total." >&2
        echo "The model likely won't fit. Consider a GPU with more VRAM" >&2
        echo "or reduce the model size (quantization, smaller model, etc)." >&2
        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
        echo "" >&2
    fi

    # fraction = gib * 1024 / total_mib, clamped to [0.05, 0.95].
    # int(x + 0.99) acts as a ceil that tolerates floating-point noise: an
    # excess smaller than 0.01 of a percentage point is not rounded up.
    awk -v gib="$gib" -v total="$total_mib" 'BEGIN {
        frac = (gib * 1024) / total
        frac = int(frac * 100 + 0.99) / 100
        if (frac < 0.05) frac = 0.05
        if (frac > 0.95) frac = 0.95
        printf "%.2f\n", frac
    }'
}
# gpu_gb_to_free_fraction <gib> [gpu_index]
#
# For TensorRT-LLM: --free-gpu-memory-fraction (CLI) and
# kv_cache_config.free_gpu_memory_fraction (YAML) are fractions of FREE
# memory AFTER model weights are loaded — NOT fractions of total VRAM.
# The engine loads model weights first, queries remaining free memory,
# then allocates fraction * free_after_model for the KV cache.
#
# Why gpu_gb_to_total_fraction won't work for TensorRT-LLM:
# gpu_gb_to_total_fraction(10) on a 48 GiB GPU → 0.21 (fraction of total).
# Passing 0.21 as free_gpu_memory_fraction after a 5 GiB model loads
# would allocate 0.21 * 43 GiB ≈ 9 GiB — close but not exact.
# For larger models the error grows: a 30 GiB model leaves 18 GiB free,
# so 0.21 * 18 ≈ 3.8 GiB — far less than the 10 GiB intended.
#
# This function queries CURRENT free memory from nvidia-smi (read in MiB) and
# computes gib * 1024 / free_mib. The result is a best-effort estimate:
# TensorRT-LLM will see less free memory than we measure here (model weights
# haven't loaded yet), so the actual KV cache allocation will be smaller
# than <gib>.
# For rough sizing this is fine; for precise control use the YAML config
# with a known model size.
#
# For disagg_same_gpu (two workers sharing one GPU), launch workers
# sequentially: start the first, wait for it to finish loading (poll
# nvidia-smi or logs), then query free memory again and compute the
# fraction for the second worker. This gives predictable per-worker
# KV cache sizes on any GPU.
#
# Override at launch via CLI or env var:
# --override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.15}}'
# DYN_TRTLLM_OVERRIDE_ENGINE_ARGS='{"kv_cache_config":{"free_gpu_memory_fraction": 0.15}}'
#
# GOTCHA: overriding any field inside kv_cache_config REPLACES the entire
# sub-dict from the YAML. You must re-include all fields you care about
# (e.g. enable_block_reuse, dtype) or they'll be lost.
#
# Examples:
# gpu_gb_to_free_fraction 10 # on 48 GiB GPU with 46 GiB free → 0.22
# gpu_gb_to_free_fraction 10 1 # query GPU index 1 instead of 0
#
# The result is ceil-rounded to 2 decimal places, clamped [0.01, 0.95].
# The floor is 0.01 (not 0.05 like gpu_gb_to_total_fraction) because this
# fraction only controls KV cache, so small values are valid.
gpu_gb_to_free_fraction() {
    # Print the fraction of a GPU's currently FREE memory that <gib> GiB
    # represents, rounded up to 2 decimals and clamped to [0.01, 0.95].
    #
    # Arguments:
    #   $1  requested KV cache size in GiB (required)
    #   $2  GPU index passed to `nvidia-smi -i` (default: 0)
    # Outputs:
    #   stdout  the fraction, e.g. "0.22"
    #   stderr  a warning banner when <gib> exceeds the free memory
    # Returns:
    #   1, with nothing on stdout, when nvidia-smi fails or does not report a
    #   positive integer; otherwise the exit status of the final awk.
    local gib="${1:?usage: gpu_gb_to_free_fraction <gib> [gpu_index]}"
    local gpu_idx="${2:-0}"
    # Positive integer, optionally padded with whitespace. A numeric -eq test
    # is not enough: on a non-numeric reply (nvidia-smi may print a
    # placeholder such as "[N/A]") it is an arithmetic error, and the
    # division below would then be by zero.
    local positive_int_re='^[[:space:]]*0*[1-9][0-9]*[[:space:]]*$'
    local free_mib

    # The "|| free_mib=" fallback keeps a failing nvidia-smi from aborting
    # callers that run under `set -e` before the error below is printed.
    free_mib=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i "$gpu_idx" 2>/dev/null) || free_mib=""
    if [[ ! "$free_mib" =~ $positive_int_re ]]; then
        echo "gpu_gb_to_free_fraction: failed to query GPU $gpu_idx free memory" >&2
        return 1
    fi

    local free_gib
    free_gib=$(awk -v f="$free_mib" 'BEGIN { printf "%.1f", f / 1024 }')

    # awk exits 0 (shell "true") when the request exceeds what is free now.
    if awk -v gib="$gib" -v free="$free_mib" 'BEGIN { exit (gib * 1024 > free) ? 0 : 1 }'; then
        echo "" >&2
        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
        echo "WARNING: Requested ${gib} GiB KV cache but GPU $gpu_idx only has ${free_gib} GiB free." >&2
        echo "After model loading, even less will be available." >&2
        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
        echo "" >&2
    fi

    # fraction = gib * 1024 / free_mib, clamped to [0.01, 0.95].
    # int(x + 0.99) acts as a ceil that tolerates floating-point noise: an
    # excess smaller than 0.01 of a percentage point is not rounded up.
    awk -v gib="$gib" -v free="$free_mib" 'BEGIN {
        frac = (gib * 1024) / free
        frac = int(frac * 100 + 0.99) / 100
        if (frac < 0.01) frac = 0.01
        if (frac > 0.95) frac = 0.95
        printf "%.2f\n", frac
    }'
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment