Unverified commit 4cdc49c2, authored by Keiven C, committed by GitHub
Browse files

refactor: split build_gpu_mem_args into engine-specific functions (#7916)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent b1c18bb1
......@@ -63,7 +63,7 @@ python -m dynamo.frontend &
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
......
......@@ -64,7 +64,7 @@ python -m dynamo.frontend \
# run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
......
......@@ -66,7 +66,7 @@ python -m dynamo.frontend \
# run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
--model $MODEL \
......
......@@ -66,14 +66,14 @@ python -m dynamo.frontend &
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
$GPU_MEM_ARGS \
--enable-lora \
--max-lora-rank 64 &
......
......@@ -29,7 +29,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
......@@ -42,7 +42,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
$GPU_MEM_ARGS \
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
......@@ -19,7 +19,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
......@@ -31,7 +31,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
$GPU_MEM_ARGS \
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
......@@ -71,7 +71,7 @@ case "$MODEL_NAME" in
MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
esac
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL_NAME" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# Start vLLM worker with vision model
# --enforce-eager: Quick deployment (remove for production)
......@@ -81,7 +81,7 @@ ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimoda
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}"
$GPU_MEM_ARGS $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}"
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -52,7 +52,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
......@@ -65,7 +65,7 @@ DYN_HEALTH_CHECK_ENABLED=true \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} &
$GPU_MEM_ARGS &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -26,7 +26,7 @@ python -m dynamo.frontend \
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
......
......@@ -31,7 +31,7 @@ python -m dynamo.frontend \
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
#
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
......
......@@ -8,7 +8,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh" # gpu_gb_to_total_fraction
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh" # build_vllm_gpu_mem_args
source "$SCRIPT_DIR/../../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
export VLLM_TARGET_DEVICE=xpu
......@@ -35,7 +35,7 @@ done
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
......@@ -51,7 +51,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} "${EXTRA_ARGS[@]}" &
$GPU_MEM_ARGS "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -24,7 +24,7 @@ Instead, we use **absolute KV cache caps**:
|--------|----------------------|---------|
| vLLM | `--kv-cache-memory-bytes N` | `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` |
| SGLang | `--max-total-tokens N` | `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` |
| TensorRT-LLM | *(future TODO)* | — |
| TensorRT-LLM | `--override-engine-args '{"kv_cache_config":{"max_tokens":N}}'` | `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` |
---
......@@ -36,7 +36,7 @@ Instead, we use **absolute KV cache caps**:
| Fraction base | Total VRAM | Total VRAM | Free VRAM (post-load) |
| Default | 0.90 | 0.90 | 0.90 |
| Max seq len | `--max-model-len` | `--context-length` | `max_seq_len` |
| KV cache override | `--kv-cache-memory-bytes` | `--max-total-tokens` | *(broken in 1.3.0rc5)* |
| KV cache override | `--kv-cache-memory-bytes` | `--max-total-tokens` | `KvCacheConfig.max_tokens` via `--override-engine-args` |
---
......@@ -76,40 +76,55 @@ only — they do **not** change KV cache allocation.
`free_gpu_memory_fraction` is a fraction of **free** VRAM after model load.
Set via YAML or `--override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'`.
Deterministic KV cache control via `build_gpu_mem_args` is a future TODO.
Deterministic KV cache control uses `build_trtllm_override_args_with_mem` in
`gpu_utils.sh`, which builds JSON for `--override-engine-args`. Token-based
(`_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`) or byte-based
(`_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`) caps are supported. If the
launch script already passes `--override-engine-args`, the function merges
the GPU config into the existing JSON via `--merge-with-json`.
---
## `build_gpu_mem_args` and Env Vars
## Engine-Specific GPU Memory Functions
Launch scripts source `gpu_utils.sh` and call `build_gpu_mem_args` to pick
Launch scripts source `gpu_utils.sh` and call engine-specific functions to pick
up env-var overrides during profiling and parallel execution:
```bash
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
# vLLM
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
# SGLang
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# TRT-LLM (JSON merging, separate function)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
python -m dynamo.trtllm --model-path "$MODEL" ${OVERRIDE_JSON:+--override-engine-args "$OVERRIDE_JSON"} &
```
When the env var is set, `build_gpu_mem_args` returns the corresponding flag.
When the env var is set, the function returns the corresponding flag.
Otherwise it returns empty and the engine uses its default allocation.
| Env var | Engine | CLI flag produced |
|---------|--------|-------------------|
| `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` | vLLM | `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01` |
| `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` | SGLang | `--max-total-tokens N` |
| Env var | Function | Output |
|---------|----------|--------|
| `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` | `build_vllm_gpu_mem_args` | `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01` |
| `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` | `build_sglang_gpu_mem_args` | `--max-total-tokens N` |
| `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` | `build_trtllm_override_args_with_mem` | `{"kv_cache_config": {"max_tokens": N}}` (JSON) |
| `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` | `build_trtllm_override_args_with_mem` | `{"kv_cache_config": {"max_gpu_total_bytes": N}}` (JSON) |
For multi-worker single-GPU scripts, pass `--workers-per-gpu N` to divide
the allocation: `build_gpu_mem_args vllm --workers-per-gpu 2`.
All functions return per-process args. In multi-worker-per-GPU setups
(e.g. `disagg_same_gpu.sh`), each worker gets the same override value.
The profiler finds the per-worker budget directly.
**Profiler** (`profile_pytest.py`): binary-searches the KV cap to find the
minimum passing value, applies a 2x safety factor, outputs pytest markers
(`@pytest.mark.requested_vllm_kv_cache_bytes(N)` or
`@pytest.mark.requested_sglang_kv_tokens(N)`).
(`@pytest.mark.requested_vllm_kv_cache_bytes(N)`,
`@pytest.mark.requested_sglang_kv_tokens(N)`, or
`@pytest.mark.requested_trtllm_kv_tokens(N)`).
**Scheduler** (`pytest_parallel_gpu.py`): reads the markers at runtime and
sets the env var per-test. See `tests/README.md` for details.
......@@ -10,64 +10,57 @@
# source "$SCRIPT_DIR/../common/gpu_utils.sh"
#
# Functions (all return via stdout):
# build_gpu_mem_args <engine> [--workers-per-gpu N]
# Returns engine-specific CLI args for GPU memory control based on
# environment variable overrides. Empty if no overrides.
#
# Supported engines: vllm, sglang
#
# build_vllm_gpu_mem_args
# vLLM: _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01
#
# build_sglang_gpu_mem_args
# SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N
#
# Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging)
#
# TODO: Split into build_vllm_gpu_mem_args and build_sglang_gpu_mem_args
#
# Usage:
# # vLLM / SGLang
# GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
# GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
# python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
#
# GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
# GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
build_gpu_mem_args() {
local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}"
shift
# TensorRT-LLM uses build_trtllm_override_args_with_mem instead
if [[ "$engine" == "trtllm" ]]; then
echo "build_gpu_mem_args: TensorRT-LLM not supported. Use build_trtllm_override_args_with_mem instead." >&2
return 1
fi
local workers_per_gpu=1
while [[ $# -gt 0 ]]; do
case "$1" in
--workers-per-gpu) workers_per_gpu="$2"; shift 2 ;;
*) echo "build_gpu_mem_args: unknown option '$1'" >&2; return 1 ;;
esac
done
# --- SGLang: token-based KV cache cap ---
if [[ "$engine" == "sglang" && -n "${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS:-}" ]]; then
echo "--max-total-tokens ${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS}"
# ---------------------------------------------------------------------------
# build_vllm_gpu_mem_args
# Returns vLLM CLI args for GPU memory control.
# Empty if _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES is not set.
#
# --kv-cache-memory-bytes is per-process: each vLLM worker gets the same
# value, even in multi-worker-per-GPU setups (e.g. disagg_same_gpu.sh).
# The profiler finds the per-worker budget directly.
#
# --gpu-memory-utilization 0.01 prevents vLLM's startup check from rejecting
# the launch when co-resident tests use >10% of VRAM (vLLM checks free memory
# against the fraction *before* applying the byte cap).
# ---------------------------------------------------------------------------
build_vllm_gpu_mem_args() {
if [[ -n "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" ]]; then
echo "--kv-cache-memory-bytes ${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES} --gpu-memory-utilization 0.01"
return 0
fi
# --- vLLM: byte-based KV cache cap ---
# --gpu-memory-utilization 0.01 prevents vLLM's startup check from rejecting
# the launch when co-resident tests use >10% of VRAM (vLLM checks free memory
# against the fraction *before* applying the byte cap).
if [[ "$engine" == "vllm" && -n "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" ]]; then
local kv_bytes="$_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"
if [[ "$workers_per_gpu" -gt 1 ]]; then
kv_bytes=$(awk -v b="$kv_bytes" -v n="$workers_per_gpu" 'BEGIN { printf "%d", b / n }')
fi
echo "--kv-cache-memory-bytes $kv_bytes --gpu-memory-utilization 0.01"
echo ""
}
# ---------------------------------------------------------------------------
# build_sglang_gpu_mem_args
# Returns SGLang CLI args for GPU memory control.
# Empty if _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS is not set.
# ---------------------------------------------------------------------------
build_sglang_gpu_mem_args() {
    # Emits the SGLang KV-cache cap flag on stdout when the profiler override
    # _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS is set to a non-empty value.
    # An unset and an empty variable are treated identically.
    local token_cap="${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS:-}"

    if [[ -z "$token_cap" ]]; then
        # No override — engine uses its default allocation
        echo ""
        return 0
    fi

    echo "--max-total-tokens ${token_cap}"
}
......@@ -160,45 +153,46 @@ _gpu_utils_self_test() {
local result
# --- build_vllm_gpu_mem_args (direct) ---
echo "=== vLLM: kv bytes override ==="
result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
build_gpu_mem_args vllm)
build_vllm_gpu_mem_args)
_assert "kv bytes" "--kv-cache-memory-bytes 942054000 --gpu-memory-utilization 0.01" "$result"
echo ""
echo "=== vLLM: kv bytes with --workers-per-gpu 2 ==="
result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
build_gpu_mem_args vllm --workers-per-gpu 2)
_assert "kv bytes / 2" "--kv-cache-memory-bytes 471027000 --gpu-memory-utilization 0.01" "$result"
echo ""
echo "=== vLLM: no override = empty ==="
result=$(build_gpu_mem_args vllm)
result=$(build_vllm_gpu_mem_args)
_assert "empty (engine default)" "" "$result"
echo ""
echo "=== vLLM: sglang token env ignored ==="
result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=23824 \
build_gpu_mem_args vllm)
build_vllm_gpu_mem_args)
_assert "vllm ignores token cap" "" "$result"
# --- build_sglang_gpu_mem_args (direct) ---
echo ""
echo "=== sglang: token cap env ==="
result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=1024 \
build_gpu_mem_args sglang)
build_sglang_gpu_mem_args)
_assert "token cap" "--max-total-tokens 1024" "$result"
echo ""
echo "=== sglang: no override = empty ==="
result=$(build_gpu_mem_args sglang)
result=$(build_sglang_gpu_mem_args)
_assert "empty (engine default)" "" "$result"
echo ""
echo "=== sglang: vllm kv bytes env ignored ==="
result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
build_gpu_mem_args sglang)
build_sglang_gpu_mem_args)
_assert "sglang ignores kv bytes" "" "$result"
# --- build_trtllm_override_args_with_mem ---
echo ""
echo "=== trtllm: token cap env ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \
......@@ -239,16 +233,6 @@ _gpu_utils_self_test() {
result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}')
_assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result"
echo ""
echo "=== missing engine ==="
(build_gpu_mem_args 2>/dev/null)
_assert "missing engine exits non-zero" "1" "$?"
echo ""
echo "=== trtllm rejected (use build_trtllm_override_args_with_mem) ==="
(build_gpu_mem_args trtllm 2>/dev/null)
_assert "trtllm rejected" "1" "$?"
echo ""
echo "=========================================="
echo "Results: $pass passed, $fail failed"
......
......@@ -95,7 +95,7 @@ python -m dynamo.frontend &
python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
CUDA_VISIBLE_DEVICES=0 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
......
......@@ -96,7 +96,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \
python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
CUDA_VISIBLE_DEVICES=0 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \
......
......@@ -192,7 +192,7 @@ Markers differ by engine:
- **`profiled_vram_gib(N)`** — actual peak from nvidia-smi. Used for `--max-vram-gib` filtering and scheduler budget.
- **`requested_trtllm_kv_tokens(N)`** — max KV cache tokens for text models. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Deterministic and parallel-safe.
- **`requested_trtllm_vram_gib(N)`** — max VRAM in GiB for non-text workloads (video/image diffusion). Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args` JSON. Note: diffusion models don't use KV cache, so this parameter may have no effect — `profiled_vram_gib` alone is sufficient for scheduler budget tracking.
- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_gpu_mem_args` used by vLLM/SGLang).
- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args`).
`--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)).
......@@ -203,9 +203,9 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes
1. **VRAM budget**: xdist has no GPU memory awareness — two 20 GiB tests on a 48 GiB GPU will OOM.
2. **Profiling race**: engines snapshot free memory during init; concurrent startups corrupt each other. The scheduler staggers launches (VRAM stability check) and retries transient failures.
3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this.
- **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
- **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
- **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` (not `build_gpu_mem_args`) because TRT-LLM requires JSON merging for `--override-engine-args`.
- **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. Uses `build_vllm_gpu_mem_args` in `gpu_utils.sh`.
- **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. Uses `build_sglang_gpu_mem_args` in `gpu_utils.sh`.
- **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate function because TRT-LLM requires JSON merging).
```bash
# Dry-run: preview which tests fit and the GPU plan
......@@ -546,19 +546,23 @@ The profiler automatically detects the engine type and uses the appropriate bina
- **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers.
- **TRT-LLM**: bisects `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (token count) → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Same logic as SGLang (token-based bisection, 2x safety). Outputs `profiled_vram_gib` and `requested_trtllm_kv_tokens` markers. For non-text models (video/image diffusion) that don't use KV cache, use `--no-find-min-vram` for a single-pass VRAM measurement — binary search won't work because the model doesn't log KV token allocation.
**Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`).
**Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_vllm_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01`).
**Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).
**Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_sglang_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).
**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_gpu_mem_args` because TRT-LLM requires JSON merging.
**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args` because TRT-LLM requires JSON merging.
### Engine-specific mapping
Launch scripts call `build_gpu_mem_args` (vLLM/SGLang) or `build_trtllm_override_args_with_mem` (TRT-LLM) from `examples/common/gpu_utils.sh`, which check env var overrides and return the appropriate CLI flags:
Launch scripts call engine-specific functions from `examples/common/gpu_utils.sh` which check env var overrides and return the appropriate CLI flags:
```bash
# vLLM / SGLang
GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
# vLLM
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
# SGLang
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# TRT-LLM (requires JSON merging, separate function)
......
......@@ -330,7 +330,7 @@ vllm_configs = {
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
# NOTE: disagg_multimodal_e_pd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PD_GPU_MEM env vars in single-GPU mode.
# PD worker honors build_gpu_mem_args for parallel execution.
# PD worker honors build_vllm_gpu_mem_args for parallel execution.
"multimodal_e_pd_qwen": VLLMConfig(
name="multimodal_e_pd_qwen",
directory=vllm_dir,
......@@ -414,7 +414,7 @@ vllm_configs = {
# total on this GPU.
# NOTE: disagg_multimodal_epd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM env vars.
# P/D workers honor build_gpu_mem_args for parallel execution.
# P/D workers honor build_vllm_gpu_mem_args for parallel execution.
"multimodal_disagg_qwen": VLLMConfig(
name="multimodal_disagg_qwen",
directory=vllm_dir,
......
......@@ -25,8 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation.
**IMPORTANT**: The test under profile **MUST** read the appropriate KV cache
override — either directly (see ``test_mock_gpu_alloc.py``) or via launch
scripts that call ``build_gpu_mem_args`` (vLLM/SGLang) or
``build_trtllm_override_args_with_mem`` (TensorRT-LLM). If the test
scripts that call ``build_vllm_gpu_mem_args`` / ``build_sglang_gpu_mem_args``
or ``build_trtllm_override_args_with_mem`` (TensorRT-LLM). If the test
ignores the override, every probe will pass at the same peak and the profiler
will warn that the binary search is unreliable.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment