feat: add an auto GPU VRAM estimator for disagg-same-GPU (#6868)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat: add an auto GPU VRAM estimator for disagg-same-GPU (#6868)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
423e4b45 · Keiven C · GitHub · f0ac86e8 · 423e4b45 · 423e4b45
Unverified Commit 423e4b45 authored Mar 10, 2026 by Keiven C Committed by GitHub Mar 10, 2026
5 changed files
--- a/examples/backends/sglang/launch/disagg_same_gpu.sh
+++ b/examples/backends/sglang/launch/disagg_same_gpu.sh
@@ -2,31 +2,37 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Disaggregated serving on a single GPU (prefill + decode share memory).
+# Disaggregated prefill/decode on a SINGLE GPU.
-# GPUs: 1 (requires 16+ GB VRAM)
+# Per-worker VRAM is estimated from model parameters below. Override individual
+# knobs (CONTEXT_LENGTH, MAX_RUNNING_REQUESTS) via env vars, or set
+# DYN_GPU_MEMORY_FRACTION_OVERRIDE to bypass the calculation entirely.
 #
-# Usage: ./disagg_same_gpu.sh [GPU_MEM_FRACTION]
+# Measured reference (Qwen/Qwen3-0.6B, --context-length 4096, RTX 6000 Ada 48 GiB):
-#   GPU_MEM_FRACTION: Fraction of GPU memory to use per worker (default: 0.45)
+#   estimate (from gpu_utils.sh) : ~5.7 GiB per worker (w=1.1 + kv=0.9 + oh=3.7)
-#   Example: ./disagg_same_gpu.sh 0.45
+#   actual (nvidia-smi)          : ~5.3 GiB per worker (~10.9 GiB total)
+#   fraction per worker (48 GiB)  : 0.12
+#   KV cache                      : 25,536-29,712 tokens per worker
+#   Handles full 4096-token context with --max-running-requests 2.
-# GPU memory fraction to use per worker (default: 0.45 = 45% each = 90% total for both workers)
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-GPU_MEM_FRACTION="${1:-0.45}"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
-# Check GPU memory before starting disaggregated mode on single GPU
+MODEL="Qwen/Qwen3-0.6B"
-FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
-if [ $? -ne 0 ]; then
+# ---- Tunable (override via env vars) ----
-  echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
+CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
-  exit 1
+MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-2}"
-fi
-REQUIRED_GB=16
+# ---- Estimate per-worker VRAM (see examples/common/gpu_utils.md) ----
-# Use Python for floating-point comparison to avoid bc dependency
+# Sets _EW_WEIGHTS_GIB, _EW_KV_GIB, _EW_OVERHEAD_GIB, _EW_TOTAL_GIB
-if python3 -c "import sys; sys.exit(0 if float('$FREE_GPU_GB') >= $REQUIRED_GB else 1)"; then
+estimate_worker_vram "$MODEL" "$CONTEXT_LENGTH" "$MAX_RUNNING_REQUESTS" sglang
-  echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"
+# DYN_GPU_MEMORY_FRACTION_OVERRIDE takes precedence (profiler binary search).
+# In single-GPU mode, split the override evenly between the two workers.
+if [[ -n "${DYN_GPU_MEMORY_FRACTION_OVERRIDE:-}" ]]; then
+    GPU_MEM_FRACTION=$(awk -v f="$DYN_GPU_MEMORY_FRACTION_OVERRIDE" 'BEGIN { printf "%.2f", f / 2 }')
 else
-  echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB"
+    GPU_MEM_FRACTION=$(gpu_worker_fraction sglang)
-  echo "Please free up GPU memory before running disaggregated mode on single GPU."
-  exit 1
 fi
 # Setup cleanup trap
@@ -39,14 +45,15 @@ cleanup() {
 trap cleanup EXIT INT TERM
-MODEL="Qwen/Qwen3-0.6B"
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 echo "=========================================="
 echo "Launching Disaggregated (same GPU)"
 echo "=========================================="
 echo "Model:       $MODEL"
 echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "GPU Mem:     ${GPU_MEM_FRACTION} per worker"
+echo "Context len: $CONTEXT_LENGTH"
+echo "GPU Mem:     ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
+echo "  estimate:  weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
 echo "=========================================="
 echo ""
 echo "Example test command:"
@@ -69,8 +76,8 @@ DYNAMO_PID=$!
 # run prefill worker with metrics on port 8081
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
@@ -78,12 +85,13 @@ python3 -m dynamo.sglang \
  --disaggregation-bootstrap-port 12345 \
  --host 0.0.0.0 \
  --disaggregation-transfer-backend nixl \
-  --mem-fraction-static ${GPU_MEM_FRACTION} \
+  --mem-fraction-static "${GPU_MEM_FRACTION}" \
-  --chunked-prefill-size 4096 \
+  --context-length "$CONTEXT_LENGTH" \
-  --max-prefill-tokens 4096 \
+  --chunked-prefill-size "$CONTEXT_LENGTH" \
+  --max-prefill-tokens "$CONTEXT_LENGTH" \
  --enable-memory-saver \
  --delete-ckpt-after-loading \
-  --max-running-requests 2 \
+  --max-running-requests "$MAX_RUNNING_REQUESTS" \
  --enable-metrics &
 PREFILL_PID=$!
@@ -99,8 +107,8 @@ sleep 5
 # run decode worker with metrics on port 8082 (foreground)
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
@@ -108,11 +116,11 @@ python3 -m dynamo.sglang \
  --disaggregation-bootstrap-port 12345 \
  --host 0.0.0.0 \
  --disaggregation-transfer-backend nixl \
-  --mem-fraction-static ${GPU_MEM_FRACTION} \
+  --mem-fraction-static "${GPU_MEM_FRACTION}" \
-  --chunked-prefill-size 4096 \
+  --context-length "$CONTEXT_LENGTH" \
-  --max-prefill-tokens 4096 \
+  --chunked-prefill-size "$CONTEXT_LENGTH" \
+  --max-prefill-tokens "$CONTEXT_LENGTH" \
  --enable-memory-saver \
  --delete-ckpt-after-loading \
-  --max-running-requests 2 \
+  --max-running-requests "$MAX_RUNNING_REQUESTS" \
  --enable-metrics
--- a/examples/backends/trtllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh
 #!/bin/bash
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+#
+# Disaggregated prefill/decode on a SINGLE GPU.
+# Per-worker VRAM is estimated from model parameters below. Override individual
+# knobs (MAX_SEQ_LEN, MAX_CONCURRENT_SEQS) via env vars, or set
+# DYN_GPU_MEMORY_FRACTION_OVERRIDE to bypass the calculation entirely.
+#
+# NOTE — trtllm fraction semantics differ from vllm/sglang:
+#   vllm/sglang:  fraction of TOTAL VRAM  (weights + KV + activations all inside)
+#   trtllm:       fraction of FREE  VRAM  (KV cache only, after model load)
+# gpu_worker_fraction("trtllm") handles this — see gpu_utils.sh / gpu_utils.md.
+#
+# Measured reference (Qwen/Qwen3-0.6B, --max-seq-len 4096, RTX 6000 Ada 48 GiB):
+#   estimate (from gpu_utils.sh) : ~8.0 GiB per worker (~16.0 GiB total)
+#   actual (nvidia-smi)          : ~7.4 GiB per worker (~14.8 GiB total)
+#   fraction per worker (free)   : 0.05
+#   Overestimating is intentional -- better to pad than OOM.
-# Disaggregated mode on single GPU - for testing only
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-# Both prefill and decode workers share the same GPU with reduced memory
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
-# Check GPU memory availability
+MODEL="Qwen/Qwen3-0.6B"
-FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
-if [ $? -ne 0 ]; then
-    echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
-    exit 1
-fi
-REQUIRED_GB=16
+# ---- Tunable (override via env vars) ----
-# Use bash arithmetic instead of bc to avoid external dependency
+MAX_SEQ_LEN="${MAX_SEQ_LEN:-4096}"
-FREE_GPU_INT=$(python3 -c "print(int(float('$FREE_GPU_GB')))" 2>/dev/null)
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
-if [ $? -ne 0 ]; then
-    echo "Error: Failed to parse GPU memory value."
-    exit 1
-fi
-if (( FREE_GPU_INT < REQUIRED_GB )); then
+# ---- Estimate per-worker VRAM (see examples/common/gpu_utils.md) ----
-    echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB"
+# Sets _EW_WEIGHTS_GIB, _EW_KV_GIB, _EW_OVERHEAD_GIB, _EW_TOTAL_GIB
-    echo "Please free up GPU memory before running disaggregated mode on single GPU."
+estimate_worker_vram "$MODEL" "$MAX_SEQ_LEN" "$MAX_CONCURRENT_SEQS" trtllm
-    exit 1
-fi
-echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"
+# DYN_GPU_MEMORY_FRACTION_OVERRIDE takes precedence (profiler binary search).
+# In single-GPU mode, split the override evenly between the two workers.
+if [[ -n "${DYN_GPU_MEMORY_FRACTION_OVERRIDE:-}" ]]; then
+    GPU_MEM_FRACTION=$(awk -v f="$DYN_GPU_MEMORY_FRACTION_OVERRIDE" 'BEGIN { printf "%.2f", f / 2 }')
+else
+    GPU_MEM_FRACTION=$(gpu_worker_fraction trtllm)
+fi
 # Environment variables with defaults
 export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
-export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"}
-export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/decode.yaml"}
 export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
 export MODALITY=${MODALITY:-"text"}
@@ -69,14 +78,28 @@ while [[ $# -gt 0 ]]; do
    esac
 done
-# Enable tracing if requested
+# Build --override-engine-args JSON.
-TRACE_ARGS=()
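+# Always override free_gpu_memory_fraction so the script controls KV cache size,
+# matching how vllm (--gpu-memory-utilization) and sglang (--mem-fraction-static)
+# pass memory parameters from the launch script.
+# NOTE: overriding any field of kv_cache_config replaces the whole sub-dict, so
+# other kv_cache_config fields set in the engine YAML (e.g. enable_block_reuse)
+# are dropped unless they are repeated here. See "Override gotcha" in
+# examples/common/gpu_utils.md.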
+OVERRIDE_PAIRS="\"kv_cache_config\": {\"free_gpu_memory_fraction\": ${GPU_MEM_FRACTION}}"
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
-    TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
+    OVERRIDE_PAIRS="${OVERRIDE_PAIRS}, \"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\""
 fi
+OVERRIDE_ARGS=(--override-engine-args "{${OVERRIDE_PAIRS}}")
+echo "=========================================="
+echo "Launching Disaggregated on Same GPU (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Max seq len: $MAX_SEQ_LEN"
+echo "GPU Mem:     ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
+echo "  estimate:  weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
+echo "=========================================="
 # run frontend
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 OTEL_SERVICE_NAME=dynamo-frontend \
@@ -88,25 +111,24 @@ OTEL_SERVICE_NAME=dynamo-worker-prefill \
 CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 python3 -m dynamo.trtllm \
-  --model-path "$MODEL_PATH" \
+  --model-path "$MODEL" \
-  --served-model-name "$SERVED_MODEL_NAME" \
+  --served-model-name "$MODEL" \
  --extra-engine-args  "$PREFILL_ENGINE_ARGS" \
  --modality "$MODALITY" \
  --publish-events-and-metrics \
  --disaggregation-mode prefill \
-  "${TRACE_ARGS[@]}" &
+  "${OVERRIDE_ARGS[@]}" &
 PREFILL_PID=$!
-# run decode worker (shares GPU with prefill)
+# run decode worker (shares GPU with prefill) - foreground
 OTEL_SERVICE_NAME=dynamo-worker-decode \
 CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 python3 -m dynamo.trtllm \
-  --model-path "$MODEL_PATH" \
+  --model-path "$MODEL" \
-  --served-model-name "$SERVED_MODEL_NAME" \
+  --served-model-name "$MODEL" \
  --extra-engine-args  "$DECODE_ENGINE_ARGS" \
  --modality "$MODALITY" \
  --publish-events-and-metrics \
  --disaggregation-mode decode \
-  "${TRACE_ARGS[@]}"
+  "${OVERRIDE_ARGS[@]}"
--- a/examples/backends/vllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/vllm/launch/disagg_same_gpu.sh
@@ -2,38 +2,39 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Usage: ./disagg_same_gpu.sh
+# Disaggregated prefill/decode on a SINGLE GPU.
-# Automatically calculates GPU memory fraction so each worker gets 4GB
+# Per-worker VRAM is estimated from model parameters below. Override individual
+# knobs (MAX_MODEL_LEN, MAX_CONCURRENT_SEQS) via env vars, or set
+# DYN_GPU_MEMORY_FRACTION_OVERRIDE to bypass the calculation entirely.
+#
+# Measured reference (Qwen/Qwen3-0.6B, --max-model-len 4096, RTX 6000 Ada 48 GiB):
+#   estimate (from gpu_utils.sh) : ~4.0 GiB per worker (~8.0 GiB total)
+#   actual (nvidia-smi)          : ~3.4 GiB per worker (~6.7 GiB total)
+#   fraction per worker (for 48 GiB) : 0.09
+#   The ~1.3 GiB pad comes from the overhead term (CUDA ctx + activations).
+#   Overestimating is intentional -- better to pad than OOM.
-# Get total and free GPU memory
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-GPU_MEM_INFO=$(python3 -c "import torch; free, total = torch.cuda.mem_get_info(); print(f'{free/1024**3:.2f} {total/1024**3:.2f}')" 2>/dev/null)
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
-if [ $? -ne 0 ]; then
-  echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
-  exit 1
-fi
-FREE_GPU_GB=$(echo $GPU_MEM_INFO | awk '{print $1}')
+MODEL="Qwen/Qwen3-0.6B"
-TOTAL_GPU_GB=$(echo $GPU_MEM_INFO | awk '{print $2}')
-# Each worker needs 4GB
+# ---- Tunable (override via env vars) ----
-REQUIRED_GB_PER_WORKER=4
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
-REQUIRED_GB_TOTAL=8
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
-# Calculate fraction needed per worker (4GB / total GPU memory)
+# ---- Estimate per-worker VRAM (see examples/common/gpu_utils.md) ----
-GPU_MEM_FRACTION=$(python3 -c "print(f'{$REQUIRED_GB_PER_WORKER / $TOTAL_GPU_GB:.3f}')")
+# Sets _EW_WEIGHTS_GIB, _EW_KV_GIB, _EW_OVERHEAD_GIB, _EW_TOTAL_GIB
+estimate_worker_vram "$MODEL" "$MAX_MODEL_LEN" "$MAX_CONCURRENT_SEQS" vllm
-# Check if we have enough free memory
+# DYN_GPU_MEMORY_FRACTION_OVERRIDE takes precedence (profiler binary search).
-if python3 -c "import sys; sys.exit(0 if float('$FREE_GPU_GB') >= $REQUIRED_GB_TOTAL else 1)"; then
+# In single-GPU mode, split the override evenly between the two workers.
-  echo "GPU memory check passed: ${FREE_GPU_GB}GB free / ${TOTAL_GPU_GB}GB total (required: ${REQUIRED_GB_TOTAL}GB)"
+if [[ -n "${DYN_GPU_MEMORY_FRACTION_OVERRIDE:-}" ]]; then
-  echo "Using ${GPU_MEM_FRACTION} memory fraction per worker (${REQUIRED_GB_PER_WORKER}GB each)"
+    GPU_MEM_FRACTION=$(awk -v f="$DYN_GPU_MEMORY_FRACTION_OVERRIDE" 'BEGIN { printf "%.2f", f / 2 }')
 else
-  echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB_TOTAL}GB, Available: ${FREE_GPU_GB}GB"
+    GPU_MEM_FRACTION=$(gpu_worker_fraction vllm)
-  echo "Please free up GPU memory before running disaggregated mode on single GPU."
-  exit 1
 fi
-MODEL="Qwen/Qwen3-0.6B"
 # Setup cleanup trap
 cleanup() {
    echo "Cleaning up background processes..."
@@ -49,6 +50,9 @@ echo "Launching Disaggregated on Same GPU (1 GPU)"
 echo "=========================================="
 echo "Model:       $MODEL"
 echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "Max seq len: $MAX_MODEL_LEN"
+echo "GPU Mem:     ${GPU_MEM_FRACTION} per worker (~${_EW_TOTAL_GIB} GiB each)"
+echo "  estimate:  weights=${_EW_WEIGHTS_GIB} + kv=${_EW_KV_GIB} + overhead=${_EW_OVERHEAD_GIB} GiB"
 echo "=========================================="
 echo ""
 echo "Example test command:"
@@ -79,8 +83,8 @@ python3 -m dynamo.vllm \
  --enforce-eager \
  --disaggregation-mode decode \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
-  --gpu-memory-utilization ${GPU_MEM_FRACTION} \
+  --gpu-memory-utilization "${GPU_MEM_FRACTION}" \
-  --max-model-len 16384 &
+  --max-model-len "$MAX_MODEL_LEN" &
 DECODE_PID=$!
 # Wait for decode worker to initialize before starting prefill worker
@@ -101,7 +105,6 @@ python3 -m dynamo.vllm \
  --enforce-eager \
  --disaggregation-mode prefill \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
-  --gpu-memory-utilization ${GPU_MEM_FRACTION} \
+  --gpu-memory-utilization "${GPU_MEM_FRACTION}" \
-  --max-model-len 16384 \
+  --max-model-len "$MAX_MODEL_LEN" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--- a/examples/common/gpu_utils.md
+++ b/examples/common/gpu_utils.md
+# GPU Memory Parameters by Engine
+How vLLM, sglang, and TensorRT-LLM interpret memory-related parameters, and how
+to estimate total GPU VRAM usage for each.
+---
+## Quick Reference
+| Parameter | vLLM | sglang | TensorRT-LLM |
+|---|---|---|---|
+| Memory fraction | `--gpu-memory-utilization` | `--mem-fraction-static` | `free_gpu_memory_fraction` (YAML/override) |
+| Fraction base | Total VRAM | Total VRAM | Free VRAM (after model load) |
+| Default fraction | 0.90 | 0.90 | 0.90 |
+| Max sequence length | `--max-model-len` | `--context-length` | `max_seq_len` (YAML/override) |
+| KV cache size override | `--kv-cache-memory-bytes` | N/A | `max_gpu_total_bytes` (broken in 1.3.0rc5) |
+---
+## 1. vLLM
+### How `--gpu-memory-utilization` works
+This is a fraction of **total** GPU VRAM. The engine budgets everything within
+this limit:
+```
+budget = total_vram * gpu_memory_utilization
+KV cache = budget - model_weights - peak_activations - framework_overhead
+```
+At startup, vLLM profiles actual model weight and activation memory, then
+pre-allocates the remaining budget as KV cache blocks. The KV pool size is fixed
+for the lifetime of the engine.
+### How `--max-model-len` works
+Sets the maximum total sequence length (input + output tokens). Longer sequences
+require more KV cache per request. If the requested `max-model-len` needs more
+KV cache than the budget allows, vLLM errors at startup:
+```
+ValueError: ... X GiB KV cache is needed, which is larger than the available
+KV cache memory (Y GiB). ...
+```
+Reducing `--max-model-len` is the most effective way to reduce VRAM when the
+model fits but the KV cache doesn't.
+### How `--kv-cache-memory-bytes` works
+When set, this overrides the automatic KV cache sizing from
+`gpu-memory-utilization`. The engine allocates exactly this many bytes for KV
+cache regardless of the fraction. This means `gpu-memory-utilization` still
+controls the *overall* VRAM budget (and thus whether the model fits), but the
+KV cache portion is pinned to the explicit byte value.
+Consequence for profiling: if a script uses `--kv-cache-memory-bytes`,
+changing `DYN_GPU_MEMORY_FRACTION_OVERRIDE` (which maps to
+`--gpu-memory-utilization`) won't change the KV cache size, only the leftover
+headroom for activations and overhead.
+### Estimating total GPU usage
+```
+used_vram ≈ model_weights + kv_cache + activations + overhead
+            (what the worker actually occupies; total_vram elsewhere = GPU capacity)
+model_weights ≈ num_params * bytes_per_param
+                (e.g. 7B * 2 bytes for BF16 ≈ 14 GiB)
+kv_cache_per_token = 2 * num_layers * num_kv_heads * head_dim * bytes_per_element
+                     (the factor of 2 is for K and V tensors)
+kv_cache_total = kv_cache_per_token * max_model_len * max_concurrent_seqs
+overhead ≈ engine-dependent (auto-computed by estimate_worker_vram):
+           vllm:   1.2 + 1.0 * sqrt(params_b) GiB  (0.6B≈2.0, 8B≈4.0)
+           sglang: 2.5 + 1.5 * sqrt(params_b) GiB  (0.6B≈3.7, 8B≈6.7)
+           trtllm: 2.0 + 1.2 * sqrt(params_b) GiB  (0.6B≈2.9, 8B≈5.4)
+```
+Rule of thumb: set `gpu-memory-utilization` so that
+`total_vram * fraction >= model_weights + 2 GiB`. The rest becomes KV cache.
+---
+## 2. sglang
+### How `--mem-fraction-static` works
+Like vLLM, this is a fraction of **total** GPU VRAM:
+```
+budget = total_vram * mem_fraction_static
+KV cache pool = budget - model_weights
+```
+The budget covers model weights and the KV cache pool. Activations and CUDA
+graph buffers are allocated *outside* this budget from the remaining VRAM.
+This is slightly different from vLLM (which includes activations in the budget).
+sglang recommends keeping 5-8 GiB free for activations and overhead. If you
+see OOM errors, decrease `--mem-fraction-static` by 0.01-0.05 increments.
+### How `--context-length` works
+Equivalent to vLLM's `--max-model-len`. Defaults to the model's native context
+window. Reducing it shrinks the per-request KV cache requirement and allows more
+concurrent sequences.
+### Estimating total GPU usage
+```
+used_vram ≈ model_weights + kv_cache_pool + activations_and_overhead
+kv_cache_pool = total_vram * mem_fraction_static - model_weights
+                (total_vram = GPU capacity; used_vram = what the worker occupies)
+activations_and_overhead ≈ 1-8 GiB (depends on model size, batch size, seq len;
+                           ~1-2 GiB for small models like 0.6B,
+                           ~5-8 GiB for larger models like 8B+ with CUDA graphs)
+```
+---
+## 3. TensorRT-LLM
+### How `free_gpu_memory_fraction` works
+This is a fraction of **free** VRAM (not total). The engine:
+1. Loads model weights and builds the TRT engine (fixed cost).
+2. Queries remaining free GPU memory.
+3. Allocates `free_memory * free_gpu_memory_fraction` for the KV cache pool.
+```
+kv_cache = free_vram_after_model_load * free_gpu_memory_fraction
+```
+This means the same fraction yields different absolute KV cache sizes depending
+on how much VRAM the model consumed. A 5 GiB model on a 48 GiB GPU leaves
+~43 GiB free; fraction=0.24 gives ~10 GiB KV cache. A 30 GiB model leaves
+~18 GiB free; fraction=0.24 gives only ~4 GiB.
+Set via YAML config, CLI, or env var:
+```bash
+--override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'
+DYN_TRTLLM_OVERRIDE_ENGINE_ARGS='{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'
+```
+### How `max_seq_len` works
+Maximum total sequence length. Defaults to the model's native context.
+Sequences exceeding this limit are rejected at runtime.
+**VRAM impact: none (PyTorch backend).** Reducing max_seq_len from 40960 to
+2048 had zero effect on total VRAM or KV cache size in testing (Qwen3-0.6B,
+trtllm 1.3.0rc5). The PyTorch backend does not pre-allocate internal buffers
+proportional to max_seq_len; KV cache size is determined solely by
+`free_gpu_memory_fraction`. This differs from vLLM/sglang where reducing
+context length measurably reduces memory.
+Override via:
+```bash
+--override-engine-args '{"max_seq_len": 4096}'
+```
+### Override gotcha: sub-dict replacement
+Overriding any field inside `kv_cache_config` **replaces the entire sub-dict**.
+If your YAML has `enable_block_reuse: true` and you override only
+`free_gpu_memory_fraction`, you lose `enable_block_reuse`. Always re-include
+all fields you need:
+```json
+{"kv_cache_config": {"free_gpu_memory_fraction": 0.15, "enable_block_reuse": true}}
+```
+### How `max_num_tokens` works
+Maximum batched input tokens per iteration. Primarily a throughput knob.
+**VRAM impact: none.** Reducing from 8192 → 256 did not lower total VRAM
+(41,465 MiB at baseline vs 41,643 MiB at 256 — a difference under 0.5%). The
+slight *increase* is expected: a smaller activation footprint leaves more free
+memory, so the same fraction claims marginally more KV cache.
+### `max_gpu_total_bytes` (broken)
+Intended as an absolute byte cap for KV cache. As of trtllm 1.3.0rc5, this
+field is **ignored**. Setting 5 GiB cap with `free_gpu_memory_fraction=0.95`
+still allocated ~42 GiB of KV cache. Setting `free_gpu_memory_fraction=0.0`
+with only `max_gpu_total_bytes` causes `"Impossible to fit any sequence in
+kvCache"`. Do not rely on this field.
+### Override precedence
+```
+--override-engine-args JSON  >  --extra-engine-args YAML  >  CLI flags
+```
+The `DYN_TRTLLM_OVERRIDE_ENGINE_ARGS` env var is equivalent to
+`--override-engine-args` and avoids shell quoting issues with scripts whose
+arg parsers consume unknown flags before passing `"$@"`.
+### Estimating total GPU usage
+```
+used_vram ≈ model_weights + engine_overhead + kv_cache   (usage, not GPU capacity)
+model_weights ≈ num_params * bytes_per_param / tensor_parallel_size
+engine_overhead ≈ 2.0 + 1.2 * sqrt(params_b) GiB  (CUDA context + TRT buffers + activations)
+kv_cache = free_vram_after_model_load * free_gpu_memory_fraction
+```
+Engine overhead is auto-computed by `estimate_worker_vram` when called with the
+`trtllm` engine name.  Examples: 0.6B → 2.9 GiB, 8B → 5.4 GiB, 30B → 8.6 GiB.
+### Empirical validation (Qwen3-0.6B, RTX 6000 Ada 48 GiB, trtllm 1.3.0rc5)
+Controlled test: single worker via agg.sh, one override at a time.
+| # | Override | Total VRAM | KV Cache | Tokens |
+|---|---------|-----------|----------|--------|
+| 1 | Baseline (YAML frac=0.85) | 41,465 MiB | 38.04 GiB | 356,160 |
+| 2 | `free_gpu_memory_fraction=0.15` | 9,383 MiB | 6.71 GiB | 62,848 |
+| 3 | `max_num_tokens=256` | 41,643 MiB | 38.26 GiB | 358,208 |
+| 4 | `max_seq_len=4096` | 41,469 MiB | 38.05 GiB | 356,192 |
+| 5 | `max_seq_len=2048` | 41,469 MiB | 38.05 GiB | 356,192 |
+| 6 | seq=4096 + frac=0.15 | 9,383 MiB | 6.71 GiB | 62,848 |
+| 7 | tokens=256 + seq=4096 + frac=0.15 | 9,377 MiB | 6.75 GiB | 63,200 |
+**Conclusion:** `free_gpu_memory_fraction` is the **sole effective knob** for
+trtllm VRAM control. Neither `max_seq_len` nor `max_num_tokens` reduce memory.
+Combined overrides (test 7) produce no additional benefit over fraction alone
+(test 2).
+---
+## Why vLLM/sglang fractions are NOT interchangeable with TensorRT-LLM
+Consider wanting 10 GiB of KV cache on a 48 GiB GPU with a 5 GiB model:
+| Engine | Fraction meaning | Calculation | Result |
+|---|---|---|---|
+| vLLM | 10/48 = 0.21 of total | `48 * 0.21 = 10 GiB` budget (minus model = 5 GiB KV) | Wrong — need higher fraction |
+| sglang | Same as vLLM | Same math | Same problem |
+| TensorRT-LLM | 10/43 = 0.23 of free | `43 * 0.23 = 10 GiB` KV cache | Correct |
+For vLLM/sglang, you actually need `(model + kv) / total = (5 + 10) / 48 = 0.31`
+to get 10 GiB of KV cache with a 5 GiB model.
+The helper functions in `gpu_utils.sh` handle these differences:
+- `gpu_gb_to_total_fraction`: for vLLM/sglang (fraction of total VRAM)
+- `gpu_gb_to_free_fraction`: for TensorRT-LLM (fraction of free VRAM)
+- `gpu_worker_fraction <engine>`: unified wrapper — reads `_EW_*` vars from
+  `estimate_worker_vram` and calls the right function for the engine.
+Launch scripts use `gpu_worker_fraction` so they all follow the same pattern:
+```bash
+estimate_worker_vram "$MODEL" "$SEQ_LEN" "$CONCURRENCY" trtllm
+GPU_MEM_FRACTION=$(gpu_worker_fraction trtllm)
+```
+---
+## KV Cache Memory Per Token
+The formula for KV cache memory per token is the same across all engines:
+```
+kv_bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * bytes_per_element
+```
+| Model | Layers | KV Heads | Head Dim | Dtype | Per Token |
+|---|---|---|---|---|---|
+| Qwen3-0.6B | 28 | 8 | 128 | BF16 | 112 KiB |
+| Llama-3.1-8B | 32 | 8 | 128 | BF16 | 128 KiB |
+| Llama-3.1-70B | 80 | 8 | 128 | BF16 | 320 KiB |
+| Qwen2.5-VL-7B | 28 | 4 | 128 | BF16 | 56 KiB |
+To estimate KV cache for a given context length:
+```
+kv_cache_gib = kv_bytes_per_token * max_model_len * max_concurrent_seqs / (1024^3)
+```
+---
+## `DYN_GPU_MEMORY_FRACTION_OVERRIDE`
+Environment variable used by Dynamo's VRAM profiler to binary-search the minimum
+memory fraction a script needs.
+- Maps to `--gpu-memory-utilization` in vLLM and `--mem-fraction-static` in sglang.
+- For TensorRT-LLM, maps to `kv_cache_config.free_gpu_memory_fraction` via
+  `--override-engine-args`.
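+- Launch scripts use `gpu_worker_fraction <engine>` to compute the default
+  fraction. When the override is set, that calculation is bypassed; the
+  single-GPU disaggregated scripts halve the raw value so that each of the two
+  workers receives an equal share.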
+- Scripts that use `--kv-cache-memory-bytes` (vLLM) bypass the fraction-based KV
+  cache sizing, making the profiler's fraction override ineffective for KV cache.
+  Those scripts should warn when `DYN_GPU_MEMORY_FRACTION_OVERRIDE` is set.
--- a/examples/common/gpu_utils.sh
+++ b/examples/common/gpu_utils.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Shared GPU utility functions for launch scripts.
+#
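+# Usage (adjust the relative path to the caller's depth under examples/):
+#   source "$(dirname "$(readlink -f "$0")")/../common/gpu_utils.sh"
+#   # or with SCRIPT_DIR already set:
+#   source "$SCRIPT_DIR/../common/gpu_utils.sh"
+#   # e.g. from examples/backends/<engine>/launch/:
+#   source "$SCRIPT_DIR/../../../common/gpu_utils.sh"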
+#
+# Functions:
+#   get_model_params <model>           Set _MP_* vars for a known model's architecture
+#   estimate_worker_vram <model> ...   Set _EW_* vars with per-worker VRAM estimate
+#   gpu_worker_fraction <engine>       Convert _EW_* estimate → engine-appropriate fraction
+#   gpu_gb_to_total_fraction <gib>     Convert absolute GiB → fraction of TOTAL VRAM (vLLM/sglang)
+#   gpu_gb_to_free_fraction <gib>      Convert absolute GiB → fraction of FREE VRAM (TensorRT-LLM)
# get_model_params <model_name>
#
# Looks up the architecture of a known model and publishes it through global
# _MP_* variables:
#   _MP_PARAMS_B       Total parameters in billions (all experts for MoE)
#   _MP_WEIGHT_BYTES   Bytes per weight element (2=BF16/FP16, 1=FP8)
#   _MP_LAYERS         Number of transformer layers
#   _MP_KV_HEADS       Number of key-value heads (GQA groups)
#   _MP_HEAD_DIM       Dimension per attention head
#
# KV cache is assumed BF16 (2 bytes per element) regardless of weight dtype,
# since FP8 KV cache (--kv-cache-dtype fp8) is opt-in and not the default.
#
# To add a model: look up config.json on HuggingFace for num_hidden_layers,
# num_key_value_heads, and head_dim. For VL/multimodal models, use the
# text_config section. For MoE, _MP_PARAMS_B is the TOTAL param count
# (all experts are loaded into VRAM).
#
# Arguments:
#   $1  HuggingFace model name (required; must match a case arm exactly).
#       A missing argument aborts with the usage message.
# Returns:
#   0  model is known; all five _MP_* variables are set.
#   1  model is unknown; a diagnostic is written to stderr and all _MP_*
#      variables are left unset, so values from an earlier successful call
#      can never be mistaken for this model's.
#
# Usage:
#   get_model_params "Qwen/Qwen3-0.6B" || exit 1
#   echo "$_MP_LAYERS layers, $_MP_KV_HEADS KV heads"
get_model_params() {
    local model="${1:?usage: get_model_params <model_name>}"

    # Discard the results of any previous call before looking anything up.
    # Every known-model arm below assigns all five variables, so they remain
    # unset only on the failure path.
    unset _MP_PARAMS_B _MP_WEIGHT_BYTES _MP_LAYERS _MP_KV_HEADS _MP_HEAD_DIM

    case "$model" in
        Qwen/Qwen3-0.6B)
            _MP_PARAMS_B=0.6;  _MP_WEIGHT_BYTES=2
            _MP_LAYERS=28;  _MP_KV_HEADS=8;   _MP_HEAD_DIM=128 ;;
        Qwen/Qwen2.5-VL-7B-Instruct)
            _MP_PARAMS_B=8.3;  _MP_WEIGHT_BYTES=2
            _MP_LAYERS=28;  _MP_KV_HEADS=4;   _MP_HEAD_DIM=128 ;;
        Qwen/Qwen3-VL-8B-Instruct)
            _MP_PARAMS_B=9.2;  _MP_WEIGHT_BYTES=2
            _MP_LAYERS=36;  _MP_KV_HEADS=8;   _MP_HEAD_DIM=128 ;;
        Qwen/Qwen3-30B-A3B | Qwen/Qwen3-30B-A3B-Instruct)
            _MP_PARAMS_B=30.5; _MP_WEIGHT_BYTES=2
            _MP_LAYERS=48;  _MP_KV_HEADS=4;   _MP_HEAD_DIM=128 ;;
        Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)
            # FP8 checkpoint: one byte per weight element.
            _MP_PARAMS_B=30.5; _MP_WEIGHT_BYTES=1
            _MP_LAYERS=48;  _MP_KV_HEADS=4;   _MP_HEAD_DIM=128 ;;
        meta-llama/Meta-Llama-3.1-8B-Instruct)
            _MP_PARAMS_B=8.0;  _MP_WEIGHT_BYTES=2
            _MP_LAYERS=32;  _MP_KV_HEADS=8;   _MP_HEAD_DIM=128 ;;
        llava-hf/llava-1.5-7b-hf)
            _MP_PARAMS_B=7.1;  _MP_WEIGHT_BYTES=2
            _MP_LAYERS=32;  _MP_KV_HEADS=32;  _MP_HEAD_DIM=128 ;;
        *)
            echo "get_model_params: unknown model '$model'" >&2
            echo "Add it to get_model_params() in gpu_utils.sh" >&2
            return 1 ;;
    esac
}
+# estimate_worker_vram <model> [max_model_len] [max_concurrent_seqs] [engine_or_overhead]
+#
+# Calls get_model_params, then sets:
+#   _EW_WEIGHTS_GIB    Estimated model weight memory
+#   _EW_KV_GIB         Estimated KV cache memory
+#   _EW_OVERHEAD_GIB   Overhead used (auto-computed or explicit)
+#   _EW_TOTAL_GIB      Estimated total per-worker VRAM (weights + kv + overhead)
+#
+# Formula (each term is computed in bytes, then divided by 2^30 to get GiB):
+#   weights = params_b * 1e9 * weight_bytes
+#   kv      = 2(K and V) * layers * kv_heads * head_dim * 2(BF16 bytes) * seq_len * seqs
+#   total   = weights + kv + overhead
+#
+# Arguments:
+#   model               HuggingFace model name (required)
+#   max_model_len       Max tokens per sequence (default: 4096)
+#   max_concurrent_seqs Concurrent sequences to budget for (default: 2)
+#   engine_or_overhead  Engine name OR explicit GiB value (default: 2.0)
+#
+# If the 4th argument is an engine name (vllm, sglang, trtllm), overhead is
+# auto-computed from model parameters:
+#   overhead = base + scale * sqrt(params_b)
+#
+# Per-engine constants (calibrated from measurements on RTX 6000 Ada 48 GiB):
+#   vllm:   base=1.2, scale=1.0  → 0.6B≈2.0, 8B≈4.0, 30B≈6.7
+#   sglang: base=2.5, scale=1.5  → 0.6B≈3.7, 8B≈6.7, 30B≈10.8
+#   trtllm: base=2.0, scale=1.2  → 0.6B≈2.9, 8B≈5.4, 30B≈8.6
+#
+# If the 4th argument is a number, it's used directly (backward compatible).
+# If omitted, defaults to 2.0 (backward compatible).
+#
+# See examples/common/gpu_utils.md for the full derivation.
+#
+# Usage:
+#   estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 vllm      # auto overhead
+#   estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 trtllm     # auto overhead
+#   estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 3.5        # explicit 3.5 GiB
+#   estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2            # default 2.0 GiB
+#   echo "$_EW_TOTAL_GIB GiB (w=$_EW_WEIGHTS_GIB kv=$_EW_KV_GIB oh=$_EW_OVERHEAD_GIB)"
+estimate_worker_vram() {
+    # Estimate per-worker VRAM for <model> and publish the result in the
+    # _EW_WEIGHTS_GIB / _EW_KV_GIB / _EW_OVERHEAD_GIB / _EW_TOTAL_GIB globals.
+    # Returns 1 (with a message on stderr) for an unknown model or an invalid
+    # argument instead of silently producing a too-small estimate.
+    local model="${1:?usage: estimate_worker_vram <model> [seq_len] [seqs] [engine_or_overhead]}"
+    local seqlen="${2:-4096}"
+    local seqs="${3:-2}"
+    local engine_or_overhead="${4:-2.0}"
+    local int_re='^[0-9]+$'
+    local num_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
+    get_model_params "$model" || return 1
+    # awk coerces non-numeric strings to 0, which would silently zero the KV
+    # term, so reject bad sequence arguments here.
+    if [[ ! "$seqlen" =~ $int_re || ! "$seqs" =~ $int_re ]]; then
+        echo "estimate_worker_vram: seq_len and seqs must be non-negative integers (got '$seqlen' and '$seqs')" >&2
+        return 1
+    fi
+    # Per-engine overhead model: overhead = base + scale * sqrt(params_b).
+    local base="" scale="" overhead=""
+    case "$engine_or_overhead" in
+        vllm)   base=1.2; scale=1.0 ;;
+        sglang) base=2.5; scale=1.5 ;;
+        trtllm) base=2.0; scale=1.2 ;;
+        *)
+            # Anything else must be an explicit GiB value. A misspelled engine
+            # name (e.g. "vLLM") used to be accepted and counted as 0 GiB.
+            if [[ ! "$engine_or_overhead" =~ $num_re ]]; then
+                echo "estimate_worker_vram: invalid engine_or_overhead '$engine_or_overhead'" >&2
+                echo "Supported: vllm, sglang, trtllm, or a non-negative number of GiB" >&2
+                return 1
+            fi
+            overhead="$engine_or_overhead" ;;
+    esac
+    if [[ -z "$overhead" ]]; then
+        overhead=$(awk -v p="$_MP_PARAMS_B" -v b="$base" -v s="$scale" \
+            'BEGIN { printf "%.1f", b + s * sqrt(p) }')
+    fi
+    _EW_OVERHEAD_GIB="$overhead"
+    # IFS is pinned for the read so the three fields are split correctly even
+    # when the calling script runs with a restricted IFS such as $'\n\t'.
+    IFS=' ' read -r _EW_WEIGHTS_GIB _EW_KV_GIB _EW_TOTAL_GIB <<< "$(awk \
+        -v pb="$_MP_PARAMS_B" -v wbytes="$_MP_WEIGHT_BYTES" \
+        -v layers="$_MP_LAYERS" -v heads="$_MP_KV_HEADS" -v dim="$_MP_HEAD_DIM" \
+        -v seqlen="$seqlen" -v seqs="$seqs" -v overhead="$overhead" \
+        'BEGIN {
+            gib = 1024 * 1024 * 1024
+            w   = pb * 1e9 * wbytes / gib
+            kv  = 2 * layers * heads * dim * 2 * seqlen * seqs / gib
+            printf "%.1f %.1f %.1f", w, kv, w + kv + overhead
+        }')"
+}
+# gpu_worker_fraction <engine> [gpu_index]
+#
+# Unified fraction calculator for all engines.  Reads the _EW_* variables
+# set by estimate_worker_vram and returns the engine-appropriate fraction.
+#
+# Engine semantics (see examples/common/gpu_utils.md):
+#   vllm/sglang  — fraction of TOTAL VRAM.  The engine budgets weights + KV +
+#                  activations inside this limit.  We pass _EW_TOTAL_GIB.
+#   trtllm       — fraction of FREE VRAM (after model load).  The engine uses
+#                  this only for KV cache.  We pass _EW_KV_GIB.
+#
+# This lets every launch script use the same pattern:
+#   estimate_worker_vram "$MODEL" "$SEQ_LEN" "$CONCURRENCY" "$OVERHEAD_GIB"
+#   GPU_MEM_FRACTION=$(gpu_worker_fraction "<engine>")
+#
+# Usage:
+#   gpu_worker_fraction vllm        # uses _EW_TOTAL_GIB, fraction of total
+#   gpu_worker_fraction sglang      # same as vllm
+#   gpu_worker_fraction trtllm      # uses _EW_KV_GIB, fraction of free
+#   gpu_worker_fraction trtllm 1    # query GPU index 1
+gpu_worker_fraction() {
+    # Print the memory fraction to hand to <engine>, based on the _EW_* values
+    # published by estimate_worker_vram. Returns 1 for an unknown engine or
+    # when no estimate is available.
+    local engine="${1:?usage: gpu_worker_fraction <engine> [gpu_index]}"
+    local gpu_idx="${2:-0}"
+    local budget_gib
+    # Pick the estimate the engine's flag is defined against: the full worker
+    # footprint for vllm/sglang, the KV cache alone for trtllm.
+    case "$engine" in
+        vllm|sglang)
+            budget_gib="${_EW_TOTAL_GIB:-}" ;;
+        trtllm)
+            budget_gib="${_EW_KV_GIB:-}" ;;
+        *)
+            echo "gpu_worker_fraction: unknown engine '$engine'" >&2
+            echo "Supported: vllm, sglang, trtllm" >&2
+            return 1 ;;
+    esac
+    # Without this guard an empty estimate reaches the helper's ${1:?...}
+    # check, which prints an unrelated usage message and aborts a
+    # non-interactive shell instead of returning.
+    if [[ -z "$budget_gib" ]]; then
+        echo "gpu_worker_fraction: no VRAM estimate available; call estimate_worker_vram first" >&2
+        return 1
+    fi
+    if [[ "$engine" == "trtllm" ]]; then
+        gpu_gb_to_free_fraction "$budget_gib" "$gpu_idx"
+    else
+        gpu_gb_to_total_fraction "$budget_gib" "$gpu_idx"
+    fi
+}
+# gpu_gb_to_total_fraction <gib> [gpu_index]
+#
+# For vLLM / sglang: --gpu-memory-utilization is a fraction of TOTAL GPU memory.
+# The engine budgets model weights + KV cache + activations within that limit.
+#
+# Prints the fraction of total GPU VRAM that <gib> GiB represents.
+# Useful for converting portable absolute memory requirements to
+# engine-specific fraction parameters (--gpu-memory-utilization, etc).
+#
+# Examples:
+#   gpu_gb_to_total_fraction 4        # on 48 GiB GPU → 0.09
+#   gpu_gb_to_total_fraction 16       # on 48 GiB GPU → 0.34
+#   gpu_gb_to_total_fraction 4 1      # query GPU index 1 instead of 0
+#
+# The result is ceil-rounded to 2 decimal places with a minimum of 0.05
+# and a maximum of 0.95.
+gpu_gb_to_total_fraction() {
+    # Print <gib> as a fraction of the GPU's TOTAL memory, rounded up to two
+    # decimals and clamped to [0.05, 0.95]. Returns 1 when <gib> is not a
+    # number or the total cannot be read from nvidia-smi.
+    local gib=${1:?usage: gpu_gb_to_total_fraction <gib> [gpu_index]}
+    local gpu_idx=${2:-0}
+    local num_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
+    local mib_re='^[1-9][0-9]*$'
+    # awk coerces non-numeric input to 0, which would quietly yield the
+    # minimum fraction; fail loudly instead.
+    if [[ ! "$gib" =~ $num_re ]]; then
+        echo "gpu_gb_to_total_fraction: <gib> must be a non-negative number (got '$gib')" >&2
+        return 1
+    fi
+    local total_mib
+    total_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i "$gpu_idx" 2>/dev/null)
+    # Require a positive integer. A regex is used rather than -eq because
+    # nvidia-smi can print placeholders such as "[N/A]", which make an
+    # arithmetic comparison error out instead of evaluating to false.
+    if [[ ! "$total_mib" =~ $mib_re ]]; then
+        echo "gpu_gb_to_total_fraction: failed to query GPU $gpu_idx total memory" >&2
+        return 1
+    fi
+    local total_gib
+    total_gib=$(awk -v t="$total_mib" 'BEGIN { printf "%.1f", t / 1024 }')
+    # Warn (but still print a clamped fraction) when the request exceeds the card.
+    if awk -v gib="$gib" -v total="$total_mib" 'BEGIN { exit (gib * 1024 > total) ? 0 : 1 }'; then
+        echo "" >&2
+        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
+        echo "WARNING: Requested ${gib} GiB but GPU $gpu_idx only has ${total_gib} GiB total." >&2
+        echo "The model likely won't fit. Consider a GPU with more VRAM" >&2
+        echo "or reduce the model size (quantization, smaller model, etc)." >&2
+        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
+        echo "" >&2
+    fi
+    # fraction = gib * 1024 / total_mib, ceil to 2 decimals, clamp [0.05, 0.95]
+    awk -v gib="$gib" -v total="$total_mib" 'BEGIN {
+        hundredths = (gib * 1024) / total * 100
+        # True ceiling. The previous int(x + 0.99) rounded DOWN whenever the
+        # remainder was below 0.01. The epsilon keeps exact values such as
+        # 25.000000000000004 from being bumped by floating-point noise.
+        up = int(hundredths)
+        if (hundredths - up > 1e-9) up++
+        frac = up / 100
+        if (frac < 0.05) frac = 0.05
+        if (frac > 0.95) frac = 0.95
+        printf "%.2f\n", frac
+    }'
+}
+# gpu_gb_to_free_fraction <gib> [gpu_index]
+#
+# For TensorRT-LLM: --free-gpu-memory-fraction (CLI) and
+# kv_cache_config.free_gpu_memory_fraction (YAML) are fractions of FREE
+# memory AFTER model weights are loaded — NOT fractions of total VRAM.
+# The engine loads model weights first, queries remaining free memory,
+# then allocates  fraction * free_after_model  for the KV cache.
+#
+# Why gpu_gb_to_total_fraction won't work for TensorRT-LLM:
+#   gpu_gb_to_total_fraction(10) on a 48 GiB GPU → 0.21 (fraction of total).
+#   Passing 0.21 as free_gpu_memory_fraction after a 5 GiB model loads
+#   would allocate 0.21 * 43 GiB ≈ 9 GiB — close but not exact.
+#   For larger models the error grows: a 30 GiB model leaves 18 GiB free,
+#   so 0.21 * 18 ≈ 3.8 GiB — far less than the 10 GiB intended.
+#
+# This function queries CURRENT free memory from nvidia-smi and computes
+# (gib * 1024) / free_mib. The result is a best-effort estimate: TensorRT-LLM will
+# see less free memory than we measure here (model weights haven't loaded
+# yet), so the actual KV cache allocation will be smaller than <gib>.
+# For rough sizing this is fine; for precise control use the YAML config
+# with a known model size.
+#
+# For disagg_same_gpu (two workers sharing one GPU), launch workers
+# sequentially: start the first, wait for it to finish loading (poll
+# nvidia-smi or logs), then query free memory again and compute the
+# fraction for the second worker. This gives predictable per-worker
+# KV cache sizes on any GPU.
+#
+# Override at launch via CLI or env var:
+#   --override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.15}}'
+#   DYN_TRTLLM_OVERRIDE_ENGINE_ARGS='{"kv_cache_config":{"free_gpu_memory_fraction": 0.15}}'
+#
+# GOTCHA: overriding any field inside kv_cache_config REPLACES the entire
+# sub-dict from the YAML. You must re-include all fields you care about
+# (e.g. enable_block_reuse, dtype) or they'll be lost.
+#
+# Examples:
+#   gpu_gb_to_free_fraction 10       # on 48 GiB GPU with 46 GiB free → 0.22
+#   gpu_gb_to_free_fraction 10 1     # query GPU index 1 instead of 0
+#
+# The result is ceil-rounded to 2 decimal places, clamped [0.01, 0.95].
+# The floor is 0.01 (not 0.05 like gpu_gb_to_total_fraction) because this
+# fraction only controls KV cache, so small values are valid.
+gpu_gb_to_free_fraction() {
+    # Print <gib> as a fraction of the GPU's currently FREE memory, rounded up
+    # to two decimals and clamped to [0.01, 0.95]. Returns 1 when <gib> is not
+    # a number or the free amount cannot be read from nvidia-smi.
+    local gib=${1:?usage: gpu_gb_to_free_fraction <gib> [gpu_index]}
+    local gpu_idx=${2:-0}
+    local num_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
+    local mib_re='^[1-9][0-9]*$'
+    # awk coerces non-numeric input to 0, which would quietly yield the
+    # minimum fraction; fail loudly instead.
+    if [[ ! "$gib" =~ $num_re ]]; then
+        echo "gpu_gb_to_free_fraction: <gib> must be a non-negative number (got '$gib')" >&2
+        return 1
+    fi
+    local free_mib
+    free_mib=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i "$gpu_idx" 2>/dev/null)
+    # Require a positive integer. A regex is used rather than -eq because
+    # nvidia-smi can print placeholders such as "[N/A]", which make an
+    # arithmetic comparison error out instead of evaluating to false.
+    if [[ ! "$free_mib" =~ $mib_re ]]; then
+        echo "gpu_gb_to_free_fraction: failed to query GPU $gpu_idx free memory" >&2
+        return 1
+    fi
+    local free_gib
+    free_gib=$(awk -v f="$free_mib" 'BEGIN { printf "%.1f", f / 1024 }')
+    # Warn (but still print a clamped fraction) when the request exceeds what is free.
+    if awk -v gib="$gib" -v free="$free_mib" 'BEGIN { exit (gib * 1024 > free) ? 0 : 1 }'; then
+        echo "" >&2
+        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
+        echo "WARNING: Requested ${gib} GiB KV cache but GPU $gpu_idx only has ${free_gib} GiB free." >&2
+        echo "After model loading, even less will be available." >&2
+        echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
+        echo "" >&2
+    fi
+    # fraction = gib * 1024 / free_mib, ceil to 2 decimals, clamp [0.01, 0.95]
+    awk -v gib="$gib" -v free="$free_mib" 'BEGIN {
+        hundredths = (gib * 1024) / free * 100
+        # True ceiling. The previous int(x + 0.99) rounded DOWN whenever the
+        # remainder was below 0.01. The epsilon keeps exact values from being
+        # bumped by floating-point noise.
+        up = int(hundredths)
+        if (hundredths - up > 1e-9) up++
+        frac = up / 100
+        if (frac < 0.01) frac = 0.01
+        if (frac > 0.95) frac = 0.95
+        printf "%.2f\n", frac
+    }'
+}