#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Shared GPU utility functions for launch scripts.
#
# CLI:
#   ./gpu_utils.sh <engine> --model <name> [options...]   Print GPU fraction
#   ./gpu_utils.sh --self-test                            Run self-test suite
#
# Source:
#   source "$(dirname "$(readlink -f "$0")")/../common/gpu_utils.sh"
#   # or with SCRIPT_DIR already set:
#   source "$SCRIPT_DIR/../common/gpu_utils.sh"
#
# Functions (all return via stdout — no hidden globals):
#   build_gpu_mem_args <engine> ...                     Prints fraction (or empty)
#   get_model_params <model>                            Prints "pb wb layers kvh hd"
#   estimate_worker_vram <model> ...                    Prints "w_gib kv_gib oh_gib total_gib"
#   gpu_worker_fraction <engine> <total_gib> <kv_gib>   Prints engine-appropriate fraction
#   gpu_peak_to_engine_fraction <engine> <peak_gib>     Prints fraction (subtracts engine overhead)
#   gpu_gb_to_total_fraction <gib>                      Prints fraction of TOTAL VRAM (vLLM/sglang)
#   gpu_gb_to_free_fraction <gib>                       Prints fraction of FREE VRAM (TensorRT-LLM)

# build_gpu_mem_args <engine> [options...]
#
# Prints the computed memory fraction to stdout (empty line if none).
# Callers capture with: GPU_MEM_FRACTION=$(build_gpu_mem_args ...)
#
# Priority:
#   1. _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE (profiler binary search)
#   2. Engine flag passed to this function (user already chose a value)
#   3. estimate_worker_vram + gpu_worker_fraction (model architecture)
#   4. Empty (let engine use its own default)
#
# Options (each flag accepts engine-specific aliases):
#   --model NAME                 Model name (required).
#                                aliases: --model-path (sglang, trtllm)
#   --max-model-len N            Max tokens per sequence (default: 4096).
#                                aliases: --context-length (sglang)
#                                         --max-seq-len (trtllm)
#   --max-num-seqs N             Concurrent sequences to budget for (default: 2).
#                                aliases: --max-running-requests (sglang)
#                                         --max-batch-size (trtllm)
#   --gpu-memory-utilization F   User override (vllm flag name). Skipped when empty.
#   --mem-fraction-static F      User override (sglang flag name).
#   --workers-per-gpu N          Divide the fraction by N (for shared-GPU disagg).
#
# Usage:
#   # Simple single-worker (agg.sh)
#   GPU_MEM_FRACTION=$(build_gpu_mem_args vllm \
#       --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
#   python -m dynamo.vllm --model "$MODEL" \
#       ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} &
#
#   # Two workers sharing one GPU (disagg_same_gpu.sh)
#   GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --workers-per-gpu 2)
#   python -m dynamo.vllm ... --gpu-memory-utilization "${GPU_MEM_FRACTION}" &
#
#   # sglang
#   GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL" --workers-per-gpu 2)
#   python -m dynamo.sglang ... --mem-fraction-static "${GPU_MEM_FRACTION}" &
#
#   # trtllm (fraction goes into JSON, not CLI)
#   GPU_MEM_FRACTION=$(build_gpu_mem_args trtllm --model "$MODEL" --workers-per-gpu 2)
#   OVERRIDE_ARGS=(--override-engine-args "{\"kv_cache_config\":{\"free_gpu_memory_fraction\":${GPU_MEM_FRACTION}}}")
#
# Returns:
#   0 on success (stdout may be an empty line when no fraction applies).
#   1 on an unknown option, an option given without its value, a missing
#     --model, or a --workers-per-gpu value that is not a non-negative integer.
build_gpu_mem_args() {
    local engine="${1:?usage: build_gpu_mem_args <engine> --model <name> [options...]}"
    shift

    local model=""
    local max_model_len="4096"
    local max_seqs="2"
    local workers_per_gpu=1
    local user_frac=""

    while [[ $# -gt 0 ]]; do
        # Every recognised option takes exactly one value.  Check for it up
        # front: `shift 2` with a single remaining argument fails WITHOUT
        # shifting, which would otherwise spin this loop forever.
        case "$1" in
            --model|--model-path|\
            --max-model-len|--context-length|--max-seq-len|\
            --max-num-seqs|--max-running-requests|--max-batch-size|\
            --gpu-memory-utilization|--mem-fraction-static|\
            --workers-per-gpu)
                if [[ $# -lt 2 ]]; then
                    echo "build_gpu_mem_args: option '$1' requires a value" >&2
                    return 1
                fi
                ;;
            *)
                echo "build_gpu_mem_args: unknown option '$1'" >&2
                return 1
                ;;
        esac

        case "$1" in
            --model|--model-path)
                model="$2" ;;
            --max-model-len|--context-length|--max-seq-len)
                max_model_len="$2" ;;
            --max-num-seqs|--max-running-requests|--max-batch-size)
                max_seqs="$2" ;;
            --gpu-memory-utilization|--mem-fraction-static)
                user_frac="$2" ;;
            --workers-per-gpu)
                workers_per_gpu="$2" ;;
        esac
        shift 2
    done

    if [[ -z "$model" ]]; then
        echo "build_gpu_mem_args: --model is required" >&2
        return 1
    fi

    if [[ ! "$workers_per_gpu" =~ ^[0-9]+$ ]]; then
        echo "build_gpu_mem_args: --workers-per-gpu must be a non-negative integer, got '$workers_per_gpu'" >&2
        return 1
    fi

    local frac=""
    if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
        # Priority 1: profiler override.
        frac="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE"
    elif [[ -n "$user_frac" ]]; then
        # Priority 2: the user already chose a value.
        frac="$user_frac"
    else
        # Priority 3: estimate from model architecture.  The estimator's
        # stderr is suppressed on purpose: an unknown model simply yields an
        # empty fraction (priority 4, engine default).
        local est_kv="" est_total=""
        if read -r _ est_kv _ est_total \
                <<< "$(estimate_worker_vram "$model" "$max_model_len" "$max_seqs" "$engine" 2>/dev/null)" \
            && [[ -n "$est_total" ]]; then
            frac=$(gpu_worker_fraction "$engine" "$est_total" "$est_kv")
        fi
    fi

    # --workers-per-gpu divides profiler/user/estimator results only.
    # 10# forces base 10 so a value such as "08" is not parsed as octal.
    if [[ -n "$frac" ]] && (( 10#$workers_per_gpu > 1 )); then
        frac=$(awk -v f="$frac" -v n="$workers_per_gpu" 'BEGIN { printf "%.2f", f / n }')
    fi

    printf '%s\n' "$frac"
}

# get_model_params <model>
#
# Prints "params_b weight_bytes layers kv_heads head_dim" to stdout.
# Returns 1 (prints nothing) if the model is unknown.
#
# Fields:
#   params_b      Total parameters in billions (all experts for MoE)
#   weight_bytes  Bytes per weight element (2=BF16/FP16, 1=FP8)
#   layers        Number of transformer layers
#   kv_heads      Number of key-value heads (GQA groups)
#   head_dim      Dimension per attention head
#
# KV cache is assumed BF16 (2 bytes per element) regardless of weight dtype,
# since FP8 KV cache (--kv-cache-dtype fp8) is opt-in and not the default.
#
# To add a model:
#   1. Find config.json at https://huggingface.co/<model>/raw/main/config.json
#      For VL/multimodal models, architecture params are under text_config.
#   2. Map fields:
#        layers   ← num_hidden_layers
#        kv_heads ← num_key_value_heads
#        head_dim ← head_dim (or hidden_size / num_attention_heads)
#   3. params_b: total parameter count in billions. Derive from:
#        - safetensors file size: size_bytes / weight_bytes / 1e9
#          (single file: ls -l model.safetensors; sharded: metadata.total_size
#          in model.safetensors.index.json)
#        - or the model card / paper
#      For MoE: params_b is the TOTAL count (all experts loaded into VRAM).
#   4. weight_bytes: 2 for BF16/FP16, 1 for FP8/INT8.
#
# Usage:
#   read -r pb wb layers kvh hd <<< "$(get_model_params "Qwen/Qwen3-0.6B")"
#   echo "$layers layers, $kvh KV heads"
#
# Arguments:
#   $1  HuggingFace model name (required; exact match against the table below)
# Outputs:
#   stdout: "params_b weight_bytes layers kv_heads head_dim" for a known model
#   stderr: a hint on how to add the model when it is unknown
# Returns:
#   0 for a known model, 1 for an unknown one.
get_model_params() {
    local model="${1:?usage: get_model_params <model>}"
    local pb wb layers kvh hd

    case "$model" in
        # https://huggingface.co/Qwen/Qwen3-0.6B/raw/main/config.json
        Qwen/Qwen3-0.6B)
            pb=0.6;  wb=2; layers=28; kvh=8;  hd=128 ;;
        # https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/raw/main/config.json (text_config)
        # params_b from model.safetensors.index.json metadata.total_size / 2 / 1e9
        Qwen/Qwen2-VL-2B-Instruct)
            pb=2.2;  wb=2; layers=28; kvh=2;  hd=128 ;;
        # https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/raw/main/config.json (text_config)
        Qwen/Qwen2.5-VL-7B-Instruct)
            pb=8.3;  wb=2; layers=28; kvh=4;  hd=128 ;;
        # https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct/raw/main/config.json (text_config)
        # params_b from model.safetensors size / 2 / 1e9
        Qwen/Qwen3-VL-2B-Instruct)
            pb=2.1;  wb=2; layers=28; kvh=8;  hd=128 ;;
        # https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct/raw/main/config.json (text_config)
        Qwen/Qwen3-VL-8B-Instruct)
            pb=9.2;  wb=2; layers=36; kvh=8;  hd=128 ;;
        # https://huggingface.co/Qwen/Qwen3-30B-A3B/raw/main/config.json
        Qwen/Qwen3-30B-A3B|\
        Qwen/Qwen3-30B-A3B-Instruct)
            pb=30.5; wb=2; layers=48; kvh=4;  hd=128 ;;
        # Same architecture as Qwen3-30B-A3B but FP8 quantized (1 byte per weight)
        Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)
            pb=30.5; wb=1; layers=48; kvh=4;  hd=128 ;;
        # https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/raw/main/config.json
        meta-llama/Meta-Llama-3.1-8B-Instruct)
            pb=8.0;  wb=2; layers=32; kvh=8;  hd=128 ;;
        # https://huggingface.co/deepseek-ai/deepseek-llm-7b-base/raw/main/config.json
        # MHA (not GQA): num_key_value_heads == num_attention_heads == 32
        deepseek-ai/deepseek-llm-7b-base)
            pb=6.9;  wb=2; layers=30; kvh=32; hd=128 ;;
        # https://huggingface.co/llava-hf/llava-1.5-7b-hf/raw/main/config.json (text_config)
        # MHA: num_key_value_heads == num_attention_heads == 32
        llava-hf/llava-1.5-7b-hf)
            pb=7.1;  wb=2; layers=32; kvh=32; hd=128 ;;
        *)
            echo "get_model_params: unknown model '$model'" >&2
            echo "Add it to get_model_params() in gpu_utils.sh" >&2
            return 1
            ;;
    esac

    printf '%s %s %s %s %s\n' "$pb" "$wb" "$layers" "$kvh" "$hd"
}

# estimate_worker_vram <model> [max_model_len] [max_concurrent_seqs] [engine_or_overhead]
#
# Prints "weights_gib kv_gib overhead_gib total_gib" to stdout.
# Returns 1 (prints nothing) if the model is unknown to get_model_params.
#
# Formula:
#   weights = params_b * 1e9 * weight_bytes
#   kv      = 2 * layers * kv_heads * head_dim * 2(BF16) * seq_len * seqs
#   total   = weights + kv + overhead
#
# Arguments:
#   model                HuggingFace model name (required)
#   max_model_len        Max tokens per sequence (default: 4096)
#   max_concurrent_seqs  Concurrent sequences to budget for (default: 2)
#   engine_or_overhead   Engine name OR explicit GiB value (default: 2.0)
#
# If the 4th argument is an engine name (vllm, sglang, trtllm), overhead is
# auto-computed from model parameters:
#   overhead = base + scale * sqrt(params_b)
#
# Per-engine constants (calibrated from measurements on RTX 6000 Ada 48 GiB):
#   vllm:   base=1.2, scale=1.0  → 0.6B≈2.0, 8B≈4.0, 30B≈6.7
#   sglang: base=2.5, scale=1.5  → 0.6B≈3.7, 8B≈6.7, 30B≈10.8
#   trtllm: base=2.0, scale=1.2  → 0.6B≈2.9, 8B≈5.4, 30B≈8.6
#
# If the 4th argument is a number, it's used directly (backward compatible).
# If omitted, defaults to 2.0 (backward compatible).
#
# See examples/common/gpu_utils.md for the full derivation.
#
# Usage:
#   read -r w kv oh total <<< "$(estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 vllm)"
#   echo "$total GiB (w=$w kv=$kv oh=$oh)"
#
# Returns:
#   0 on success.
#   1 if get_model_params fails for <model>, or if the 4th argument is neither
#     a supported engine name nor a non-negative number (previously such a
#     value was silently counted as 0 GiB of overhead).
estimate_worker_vram() {
    local model="${1:?usage: estimate_worker_vram <model> [seq_len] [seqs] [engine_or_overhead]}"
    local seqlen="${2:-4096}"
    local seqs="${3:-2}"
    local engine_or_overhead="${4:-2.0}"

    local mp_out
    mp_out=$(get_model_params "$model") || return 1

    local pb wb layers kvh hd
    read -r pb wb layers kvh hd <<< "$mp_out"

    # Resolve overhead: per-engine (base, scale) constants, or an explicit
    # GiB number supplied by the caller.
    local base="" scale="" overhead=""
    local num_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
    case "$engine_or_overhead" in
        vllm)   base=1.2; scale=1.0 ;;
        sglang) base=2.5; scale=1.5 ;;
        trtllm) base=2.0; scale=1.2 ;;
        *)
            if [[ ! "$engine_or_overhead" =~ $num_re ]]; then
                echo "estimate_worker_vram: expected engine (vllm, sglang, trtllm) or overhead in GiB, got '$engine_or_overhead'" >&2
                return 1
            fi
            overhead="$engine_or_overhead"
            ;;
    esac

    if [[ -z "$overhead" ]]; then
        # overhead = base + scale * sqrt(params_b), rounded to one decimal
        # BEFORE it is added into the total (matches the printed column).
        overhead=$(awk -v p="$pb" -v b="$base" -v s="$scale" \
            'BEGIN { printf "%.1f", b + s * sqrt(p) }')
    fi

    awk -v pb="$pb" -v wbytes="$wb" \
        -v layers="$layers" -v heads="$kvh" -v dim="$hd" \
        -v seqlen="$seqlen" -v seqs="$seqs" -v overhead="$overhead" \
        'BEGIN {
            gib = 1024 * 1024 * 1024
            w   = pb * 1e9 * wbytes / gib
            # 2 (K and V) * layers * heads * dim * 2 bytes (BF16) per token
            kv  = 2 * layers * heads * dim * 2 * seqlen * seqs / gib
            printf "%.1f %.1f %.1f %.1f", w, kv, overhead, w + kv + overhead
        }'
}

# gpu_worker_fraction <engine> <total_gib> <kv_gib> [gpu_index]
#
# Convert estimated GiB into the engine-appropriate GPU memory fraction.
#
# Engine semantics (see examples/common/gpu_utils.md):
#   vllm/sglang — fraction of TOTAL VRAM (uses total_gib).
#   trtllm      — fraction of FREE VRAM after model load (uses kv_gib).
#
# Usage:
#   gpu_worker_fraction vllm 4.0 0.9       # fraction of total
#   gpu_worker_fraction trtllm 4.0 0.9     # fraction of free
#   gpu_worker_fraction trtllm 4.0 0.9 1   # query GPU index 1
#
# Arguments:
#   $1  engine: vllm, sglang or trtllm
#   $2  total_gib, forwarded to gpu_gb_to_total_fraction for vllm/sglang
#   $3  kv_gib, forwarded to gpu_gb_to_free_fraction for trtllm
#   $4  gpu_index (default: 0)
# Returns:
#   The status of the delegated converter, or 1 for an unknown engine.
gpu_worker_fraction() {
    local usage="usage: gpu_worker_fraction <engine> <total_gib> <kv_gib> [gpu_index]"
    local engine="${1:?$usage}"
    local total_gib="${2:?$usage}"
    local kv_gib="${3:?$usage}"
    local gpu_idx="${4:-0}"

    case "$engine" in
        vllm|sglang)
            gpu_gb_to_total_fraction "$total_gib" "$gpu_idx"
            ;;
        trtllm)
            gpu_gb_to_free_fraction "$kv_gib" "$gpu_idx"
            ;;
        *)
            echo "gpu_worker_fraction: unknown engine '$engine'" >&2
            echo "Supported: vllm, sglang, trtllm" >&2
            return 1
            ;;
    esac
}

# gpu_peak_to_engine_fraction <engine> <peak_gib> [gpu_index]
#
# Convert a measured/profiled GPU peak (total VRAM including CUDA context,
# activations, etc.) into the engine-specific memory fraction flag.
#
# Each engine's fraction controls only a SUBSET of GPU memory (e.g. vLLM's
# --gpu-memory-utilization covers weights + KV cache but not CUDA context).
# This function subtracts the engine-specific overhead so the fraction
# targets the right internal budget, keeping the real peak stable across
# re-profiles.
#
# Overhead constants (GiB outside the engine's budget):
#   vllm    2.0  CUDA ctx ~0.6 + activations/sampler ~0.5 + PyTorch alloc ~0.5
#   sglang  2.0  (assumed same as vllm; refine when profiled)
#   trtllm  0.0  free-fraction is measured after model load, no subtraction needed
#
# Usage:
#   gpu_peak_to_engine_fraction vllm 8.6       # on 48 GiB → 0.14
#   gpu_peak_to_engine_fraction vllm 20.9      # on 48 GiB → 0.40
#   gpu_peak_to_engine_fraction vllm 8.6 1     # query GPU index 1
#
# Returns:
#   The status of the delegated converter; 1 for an unknown engine or a
#   <peak_gib> that is not a non-negative number (previously a non-numeric
#   peak was silently treated as the 1 GiB floor).
gpu_peak_to_engine_fraction() {
    local usage="usage: gpu_peak_to_engine_fraction <engine> <peak_gib> [gpu_index]"
    local engine="${1:?$usage}"
    local peak_gib="${2:?$usage}"
    local gpu_idx="${3:-0}"

    # One dispatch decides both the overhead to subtract and which converter
    # turns the remaining budget into a fraction.
    local overhead converter
    case "$engine" in
        vllm|sglang)
            overhead=2.0
            converter=gpu_gb_to_total_fraction
            ;;
        trtllm)
            overhead=0.0
            converter=gpu_gb_to_free_fraction
            ;;
        *)
            echo "gpu_peak_to_engine_fraction: unknown engine '$engine'" >&2
            echo "Supported: vllm, sglang, trtllm" >&2
            return 1
            ;;
    esac

    local num_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
    if [[ ! "$peak_gib" =~ $num_re ]]; then
        echo "gpu_peak_to_engine_fraction: <peak_gib> must be a non-negative number, got '$peak_gib'" >&2
        return 1
    fi

    # budget = peak - overhead, floored at 1 GiB.
    local budget
    budget=$(awk -v g="$peak_gib" -v oh="$overhead" \
        'BEGIN { b = g - oh; if (b < 1) b = 1; printf "%.1f", b }')

    "$converter" "$budget" "$gpu_idx"
}

# gpu_gb_to_total_fraction <gib> [gpu_index]
#
# For vLLM / sglang: --gpu-memory-utilization is a fraction of TOTAL GPU memory.
# The engine budgets model weights + KV cache + activations within that limit.
#
# Prints the fraction of total GPU VRAM that <gib> GiB represents.
# Useful for converting portable absolute memory requirements to
# engine-specific fraction parameters (--gpu-memory-utilization, etc).
#
# Examples:
#   gpu_gb_to_total_fraction 4        # on 48 GiB GPU → 0.09
#   gpu_gb_to_total_fraction 16       # on 48 GiB GPU → 0.34
#   gpu_gb_to_total_fraction 4 1      # query GPU index 1 instead of 0
#
# The result is ceil-rounded to 2 decimal places with a minimum of 0.05
# and a maximum of 0.95.
# Arguments:
#   $1  gib        requested memory in GiB (non-negative number)
#   $2  gpu_index  GPU to query with nvidia-smi (default: 0)
# Outputs:
#   stdout: fraction with two decimals, clamped to [0.05, 0.95]
#   stderr: a warning banner when <gib> exceeds the GPU's total memory
# Returns:
#   0 on success; 1 when <gib> is not a number or when nvidia-smi does not
#   report a positive integer MiB value (missing tool, "[N/A]", bad index).
gpu_gb_to_total_fraction() {
    local gib="${1:?usage: gpu_gb_to_total_fraction <gib> [gpu_index]}"
    local gpu_idx="${2:-0}"

    local num_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
    if [[ ! "$gib" =~ $num_re ]]; then
        echo "gpu_gb_to_total_fraction: <gib> must be a non-negative number, got '$gib'" >&2
        return 1
    fi

    local total_mib
    total_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i "$gpu_idx" 2>/dev/null)
    # Keep the first line only and drop stray whitespace before validating.
    total_mib="${total_mib%%$'\n'*}"
    total_mib="${total_mib//[[:space:]]/}"
    # A regex check (rather than `-eq 0` alone) keeps non-numeric replies such
    # as "[N/A]" from raising an arithmetic error.
    if [[ ! "$total_mib" =~ ^[0-9]+$ ]] || (( 10#$total_mib == 0 )); then
        echo "gpu_gb_to_total_fraction: failed to query GPU $gpu_idx total memory" >&2
        return 1
    fi

    if awk -v gib="$gib" -v total="$total_mib" 'BEGIN { exit (gib * 1024 > total) ? 0 : 1 }'; then
        local total_gib
        total_gib=$(awk -v t="$total_mib" 'BEGIN { printf "%.1f", t / 1024 }')
        {
            echo ""
            echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            echo "WARNING: Requested ${gib} GiB but GPU $gpu_idx only has ${total_gib} GiB total."
            echo "The model likely won't fit. Consider a GPU with more VRAM"
            echo "or reduce the model size (quantization, smaller model, etc)."
            echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            echo ""
        } >&2
    fi

    # fraction = gib * 1024 / total_mib, ceil to 2 decimals, clamp [0.05, 0.95]
    awk -v gib="$gib" -v total="$total_mib" 'BEGIN {
        frac = (gib * 1024) / total
        # ceil to 2 decimal places (+0.99 rather than a strict ceil, so tiny
        # floating-point excess does not bump the result a whole step)
        frac = int(frac * 100 + 0.99) / 100
        if (frac < 0.05) frac = 0.05
        if (frac > 0.95) frac = 0.95
        printf "%.2f\n", frac
    }'
}

# gpu_gb_to_free_fraction <gib> [gpu_index]
#
# For TensorRT-LLM: --free-gpu-memory-fraction (CLI) and
# kv_cache_config.free_gpu_memory_fraction (YAML) are fractions of FREE
# memory AFTER model weights are loaded — NOT fractions of total VRAM.
# The engine loads model weights first, queries remaining free memory,
# then allocates fraction * free_after_model for the KV cache.
#
# Why gpu_gb_to_total_fraction won't work for TensorRT-LLM:
#   gpu_gb_to_total_fraction(10) on a 48 GiB GPU → 0.21 (fraction of total).
#   Passing 0.21 as free_gpu_memory_fraction after a 5 GiB model loads
#   would allocate 0.21 * 43 GiB ≈ 9 GiB — close but not exact.
#   For larger models the error grows: a 30 GiB model leaves 18 GiB free,
#   so 0.21 * 18 ≈ 3.8 GiB — far less than the 10 GiB intended.
#
# This function queries CURRENT free memory from nvidia-smi and computes
# gib / free_mib. The result is a best-effort estimate: TensorRT-LLM will
# see less free memory than we measure here (model weights haven't loaded
# yet), so the actual KV cache allocation will be smaller than <gib>.
# For rough sizing this is fine; for precise control use the YAML config
# with a known model size.
#
# For disagg_same_gpu (two workers sharing one GPU), launch workers
# sequentially: start the first, wait for it to finish loading (poll
# nvidia-smi or logs), then query free memory again and compute the
# fraction for the second worker. This gives predictable per-worker
# KV cache sizes on any GPU.
#
# Override at launch via CLI or env var:
#   --override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.15}}'
#   DYN_TRTLLM_OVERRIDE_ENGINE_ARGS='{"kv_cache_config":{"free_gpu_memory_fraction": 0.15}}'
#
# GOTCHA: overriding any field inside kv_cache_config REPLACES the entire
# sub-dict from the YAML. You must re-include all fields you care about
# (e.g. enable_block_reuse, dtype) or they'll be lost.
#
# Examples:
#   gpu_gb_to_free_fraction 10        # on 48 GiB GPU with 46 GiB free → 0.22
#   gpu_gb_to_free_fraction 10 1      # query GPU index 1 instead of 0
#
# The result is ceil-rounded to 2 decimal places, clamped [0.01, 0.95].
# The floor is 0.01 (not 0.05 like gpu_gb_to_total_fraction) because this
# fraction only controls KV cache, so small values are valid.
# Arguments:
#   $1  gib        requested KV-cache memory in GiB (non-negative number)
#   $2  gpu_index  GPU to query with nvidia-smi (default: 0)
# Outputs:
#   stdout: fraction with two decimals, clamped to [0.01, 0.95]
#   stderr: a warning banner when <gib> exceeds the GPU's current free memory
# Returns:
#   0 on success; 1 when <gib> is not a number or when nvidia-smi does not
#   report a positive integer MiB value (missing tool, "[N/A]", bad index).
gpu_gb_to_free_fraction() {
    local gib="${1:?usage: gpu_gb_to_free_fraction <gib> [gpu_index]}"
    local gpu_idx="${2:-0}"

    local num_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
    if [[ ! "$gib" =~ $num_re ]]; then
        echo "gpu_gb_to_free_fraction: <gib> must be a non-negative number, got '$gib'" >&2
        return 1
    fi

    local free_mib
    free_mib=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i "$gpu_idx" 2>/dev/null)
    # Keep the first line only and drop stray whitespace before validating.
    free_mib="${free_mib%%$'\n'*}"
    free_mib="${free_mib//[[:space:]]/}"
    # A regex check (rather than `-eq 0` alone) keeps non-numeric replies such
    # as "[N/A]" from raising an arithmetic error.
    if [[ ! "$free_mib" =~ ^[0-9]+$ ]] || (( 10#$free_mib == 0 )); then
        echo "gpu_gb_to_free_fraction: failed to query GPU $gpu_idx free memory" >&2
        return 1
    fi

    if awk -v gib="$gib" -v free="$free_mib" 'BEGIN { exit (gib * 1024 > free) ? 0 : 1 }'; then
        local free_gib
        free_gib=$(awk -v f="$free_mib" 'BEGIN { printf "%.1f", f / 1024 }')
        {
            echo ""
            echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            echo "WARNING: Requested ${gib} GiB KV cache but GPU $gpu_idx only has ${free_gib} GiB free."
            echo "After model loading, even less will be available."
            echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            echo ""
        } >&2
    fi

    # fraction = gib * 1024 / free_mib, ceil to 2 decimals, clamp [0.01, 0.95]
    awk -v gib="$gib" -v free="$free_mib" 'BEGIN {
        frac = (gib * 1024) / free
        frac = int(frac * 100 + 0.99) / 100
        if (frac < 0.01) frac = 0.01
        if (frac > 0.95) frac = 0.95
        printf "%.2f\n", frac
    }'
}

# ---------------------------------------------------------------------------
# Self-test: bash gpu_utils.sh --self-test
#
# Prints one PASS/FAIL line per assertion plus a summary; returns 0 only when
# every assertion passed.  Several assertions go through the nvidia-smi based
# converters, so they need a queryable GPU to pass.
# ---------------------------------------------------------------------------
_gpu_utils_self_test() {
    local pass=0 fail=0

    # _assert <label> <expected> <actual> — updates pass/fail of the caller.
    _assert() {
        local label="$1" expected="$2" actual="$3"
        if [[ "$expected" == "$actual" ]]; then
            # $(( )) assignment instead of ((pass++)): the latter returns
            # status 1 when the old value is 0.
            pass=$((pass + 1))
            echo "  PASS  $label"
        else
            fail=$((fail + 1))
            echo "  FAIL  $label  (expected='$expected' actual='$actual')"
        fi
    }

    echo "=== get_model_params ==="
    local out
    out=$(get_model_params "Qwen/Qwen3-0.6B")
    _assert "known model returns 5 fields" "0.6 2 28 8 128" "$out"

    out=$(get_model_params "nope/unknown" 2>/dev/null)
    _assert "unknown model returns empty" "" "$out"

    get_model_params "nope/unknown" >/dev/null 2>&1
    _assert "unknown model exits 1" "1" "$?"

    echo ""
    echo "=== estimate_worker_vram ==="
    out=$(estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 vllm)
    _assert "returns 4 space-separated fields" "4" "$(echo "$out" | wc -w | tr -d ' ')"

    local w kv oh total
    read -r w kv oh total <<< "$out"
    _assert "weights > 0" "yes" "$(awk -v v="$w" 'BEGIN { print (v > 0) ? "yes" : "no" }')"
    _assert "total > weights" "yes" "$(awk -v t="$total" -v w="$w" 'BEGIN { print (t > w) ? "yes" : "no" }')"

    out=$(estimate_worker_vram "nope/unknown" 2>/dev/null)
    _assert "unknown model returns empty" "" "$out"

    local out_vllm out_sglang
    out_vllm=$(estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 vllm)
    out_sglang=$(estimate_worker_vram "Qwen/Qwen3-0.6B" 4096 2 sglang)
    _assert "sglang overhead > vllm overhead" "yes" \
        "$(awk -v v="$out_vllm" -v s="$out_sglang" 'BEGIN { split(v, a); split(s, b); print (b[3]+0 > a[3]+0) ? "yes" : "no" }')"

    echo ""
    echo "=== build_gpu_mem_args: estimator path (known model) ==="
    local frac
    frac=$(build_gpu_mem_args vllm --model "Qwen/Qwen3-0.6B" --max-model-len 4096 --max-num-seqs 2)
    _assert "FRACTION non-empty" "yes" "$([[ -n "$frac" ]] && echo yes || echo no)"

    echo ""
    echo "=== build_gpu_mem_args: unknown model, no default ==="
    frac=$(build_gpu_mem_args vllm --model "nope/unknown")
    _assert "FRACTION empty" "" "$frac"

    echo ""
    echo "=== build_gpu_mem_args: profiler wins over all ==="
    frac=$(_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE=0.55 \
        build_gpu_mem_args vllm --model "Qwen/Qwen3-0.6B" --gpu-memory-utilization 0.70)
    _assert "FRACTION = profiler (beats user flag)" "0.55" "$frac"

    echo ""
    echo "=== build_gpu_mem_args: user flag wins over estimator ==="
    frac=$(build_gpu_mem_args vllm --model "Qwen/Qwen3-0.6B" --gpu-memory-utilization 0.70)
    _assert "FRACTION = user flag" "0.70" "$frac"

    echo ""
    echo "=== build_gpu_mem_args: empty user flag falls through ==="
    frac=$(build_gpu_mem_args vllm --model "Qwen/Qwen3-0.6B" --max-model-len 4096 --max-num-seqs 2 --gpu-memory-utilization "")
    _assert "FRACTION = estimator" "yes" "$([[ -n "$frac" ]] && echo yes || echo no)"

    echo ""
    echo "=== build_gpu_mem_args: --workers-per-gpu divides estimator ==="
    local undivided
    undivided=$(build_gpu_mem_args vllm --model "Qwen/Qwen3-0.6B" --max-model-len 4096 --max-num-seqs 2)
    frac=$(build_gpu_mem_args vllm --model "Qwen/Qwen3-0.6B" --max-model-len 4096 --max-num-seqs 2 --workers-per-gpu 2)
    local expected_half
    expected_half=$(awk -v f="$undivided" 'BEGIN { printf "%.2f", f / 2 }')
    _assert "FRACTION halved" "$expected_half" "$frac"

    echo ""
    echo "=== build_gpu_mem_args: --workers-per-gpu divides profiler ==="
    frac=$(_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE=0.80 \
        build_gpu_mem_args vllm --model "Qwen/Qwen3-0.6B" --workers-per-gpu 2)
    _assert "FRACTION = 0.80/2 = 0.40" "0.40" "$frac"

    echo ""
    echo "=== build_gpu_mem_args: sglang engine (sglang flag names) ==="
    frac=$(build_gpu_mem_args sglang --model-path "Qwen/Qwen3-0.6B" --context-length 4096 --max-running-requests 2)
    _assert "sglang FRACTION non-empty" "yes" "$([[ -n "$frac" ]] && echo yes || echo no)"

    echo ""
    echo "=== build_gpu_mem_args: trtllm engine (trtllm flag names) ==="
    frac=$(build_gpu_mem_args trtllm --model-path "Qwen/Qwen3-0.6B" --max-seq-len 4096 --max-batch-size 2)
    _assert "trtllm FRACTION non-empty" "yes" "$([[ -n "$frac" ]] && echo yes || echo no)"

    echo ""
    echo "=== build_gpu_mem_args: --mem-fraction-static user flag (sglang) ==="
    frac=$(build_gpu_mem_args sglang --model-path "Qwen/Qwen3-0.6B" --mem-fraction-static 0.60)
    _assert "FRACTION = user flag" "0.60" "$frac"

    echo ""
    echo "=== build_gpu_mem_args: missing --model ==="
    build_gpu_mem_args vllm 2>/dev/null
    _assert "missing --model exits 1" "1" "$?"

    echo ""
    echo "=== gpu_worker_fraction: explicit args ==="
    frac=$(gpu_worker_fraction vllm 4.0 0.9)
    _assert "vllm returns non-empty" "yes" "$([[ -n "$frac" ]] && echo yes || echo no)"
    frac=$(gpu_worker_fraction trtllm 4.0 0.9)
    _assert "trtllm returns non-empty" "yes" "$([[ -n "$frac" ]] && echo yes || echo no)"

    gpu_worker_fraction badengine 4.0 0.9 >/dev/null 2>&1
    _assert "bad engine exits 1" "1" "$?"

    echo ""
    echo "=========================================="
    echo "Results: $pass passed, $fail failed"
    echo "=========================================="

    # The nested helper is global once defined; remove it so running the
    # self-test does not leave _assert behind in the caller's namespace.
    unset -f _assert

    [[ "$fail" -eq 0 ]]
}

# CLI mode: only when executed directly (not sourced by another script)
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
    if [[ "${1:-}" == "--self-test" ]]; then
        _gpu_utils_self_test
        exit $?
    fi

    # Any other arguments are handed to build_gpu_mem_args unchanged, except
    # an explicit help request, which prints the same text as no arguments.
    if [[ $# -gt 0 && "$1" != "-h" && "$1" != "--help" ]]; then
        build_gpu_mem_args "$@"
        exit $?
    fi

    cat <<'HELP'
gpu_utils.sh — GPU memory fraction estimator

Usage:
  ./gpu_utils.sh <engine> --model <name> [options...]
  ./gpu_utils.sh --self-test

Engines: vllm, sglang, trtllm

Examples:
  ./gpu_utils.sh vllm --model Qwen/Qwen3-0.6B
  ./gpu_utils.sh vllm --model Qwen/Qwen3-0.6B --max-model-len 4096 --max-num-seqs 2
  ./gpu_utils.sh vllm --model Qwen/Qwen3-0.6B --workers-per-gpu 2
  ./gpu_utils.sh sglang --model Qwen/Qwen3-0.6B --context-length 8192
  ./gpu_utils.sh trtllm --model meta-llama/Meta-Llama-3.1-8B-Instruct --max-seq-len 4096

Options:
  --model NAME                 Model name (required)
                               aliases: --model-path
  --max-model-len N            Max sequence length (default: 4096)
                               aliases: --context-length, --max-seq-len
  --max-num-seqs N             Concurrent sequences (default: 2)
                               aliases: --max-running-requests, --max-batch-size
  --gpu-memory-utilization F   Override fraction (vllm flag)
                               aliases: --mem-fraction-static
  --workers-per-gpu N          Divide fraction by N (shared-GPU disagg)
  --self-test                  Run built-in test suite

Output: prints the fraction to stdout (empty if model is unknown).
HELP
    exit 0
fi