refactor: split build_gpu_mem_args into engine-specific functions (#7916)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

refactor: split build_gpu_mem_args into engine-specific functions (#7916)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
4cdc49c2 · Keiven C · GitHub · b1c18bb1 · 4cdc49c2 · 4cdc49c2
Unverified Commit 4cdc49c2 authored Apr 07, 2026 by Keiven C Committed by GitHub Apr 07, 2026
18 changed files
--- a/examples/backends/vllm/launch/lora/agg_lora.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora.sh
@@ -63,7 +63,7 @@ python -m dynamo.frontend &
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

-GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
    --max-model-len "$MAX_MODEL_LEN" \

--- a/examples/backends/vllm/launch/lora/agg_lora_router.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora_router.sh
@@ -64,7 +64,7 @@ python -m dynamo.frontend \

 # run workers
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
-# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
+# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
 CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --model $MODEL \

--- a/examples/backends/vllm/launch/lora/xpu/agg_lora_router_xpu.sh
+++ b/examples/backends/vllm/launch/lora/xpu/agg_lora_router_xpu.sh
@@ -66,7 +66,7 @@ python -m dynamo.frontend \

 # run workers
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
-# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
+# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
 ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
    --model $MODEL \

--- a/examples/backends/vllm/launch/lora/xpu/agg_lora_xpu.sh
+++ b/examples/backends/vllm/launch/lora/xpu/agg_lora_xpu.sh
@@ -66,14 +66,14 @@ python -m dynamo.frontend &
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

-GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
    --block-size "${BLOCK_SIZE:-64}" \
-    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
+    $GPU_MEM_ARGS \
    --enable-lora \
    --max-lora-rank 64 &


--- a/examples/backends/vllm/launch/xpu/agg_lmcache_multiproc_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_lmcache_multiproc_xpu.sh
@@ -29,7 +29,7 @@ MODEL="Qwen/Qwen3-0.6B"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

-GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
@@ -42,7 +42,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-seqs "$MAX_CONCURRENT_SEQS" \
  --block-size "${BLOCK_SIZE:-64}" \
-  ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
+  $GPU_MEM_ARGS \
  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest

--- a/examples/backends/vllm/launch/xpu/agg_lmcache_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_lmcache_xpu.sh
@@ -19,7 +19,7 @@ MODEL="Qwen/Qwen3-0.6B"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

-GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
@@ -31,7 +31,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-seqs "$MAX_CONCURRENT_SEQS" \
  --block-size "${BLOCK_SIZE:-64}" \
-  ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
+  $GPU_MEM_ARGS \
  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest

--- a/examples/backends/vllm/launch/xpu/agg_multimodal_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_multimodal_xpu.sh
@@ -71,7 +71,7 @@ case "$MODEL_NAME" in
        MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
 esac

-GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL_NAME" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 # Start vLLM worker with vision model
 # --enforce-eager: Quick deployment (remove for production)
@@ -81,7 +81,7 @@ ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimoda
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
    --block-size "${BLOCK_SIZE:-64}" \
-    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}"
+    $GPU_MEM_ARGS $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}"

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
 wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_request_planes_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_request_planes_xpu.sh
@@ -52,7 +52,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
 export DYN_REQUEST_PLANE=$REQUEST_PLANE
 echo "Using request plane mode: $REQUEST_PLANE"

-GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
@@ -65,7 +65,7 @@ DYN_HEALTH_CHECK_ENABLED=true \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
    --block-size "${BLOCK_SIZE:-64}" \
-    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} &
+    $GPU_MEM_ARGS &

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
 wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_router_approx_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_router_approx_xpu.sh
@@ -26,7 +26,7 @@ python -m dynamo.frontend \
 #
 # If multiple workers are launched, they must not share the same system/metrics port.
 # Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
-# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
+# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults

 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \

--- a/examples/backends/vllm/launch/xpu/agg_router_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_router_xpu.sh
@@ -31,7 +31,7 @@ python -m dynamo.frontend \
 #
 # If multiple workers are launched, they must not share the same system/metrics port.
 # Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
-# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
+# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
 #
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \

--- a/examples/backends/vllm/launch/xpu/agg_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_xpu.sh
@@ -8,7 +8,7 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT

 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"   # gpu_gb_to_total_fraction
+source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"   # build_vllm_gpu_mem_args
 source "$SCRIPT_DIR/../../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit

 export VLLM_TARGET_DEVICE=xpu
@@ -35,7 +35,7 @@ done
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

-GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
@@ -51,7 +51,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
    --block-size "${BLOCK_SIZE:-64}" \
-    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} "${EXTRA_ARGS[@]}" &
+    $GPU_MEM_ARGS "${EXTRA_ARGS[@]}" &

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
 wait_any_exit
--- a/examples/common/gpu_utils.md
+++ b/examples/common/gpu_utils.md
@@ -24,7 +24,7 @@ Instead, we use **absolute KV cache caps**:
 |--------|----------------------|---------|
 | vLLM | `--kv-cache-memory-bytes N` | `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` |
 | SGLang | `--max-total-tokens N` | `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` |
-| TensorRT-LLM | *(future TODO)* | — |
+| TensorRT-LLM | `--override-engine-args '{"kv_cache_config":{"max_tokens":N}}'` | `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` |

 ---

@@ -36,7 +36,7 @@ Instead, we use **absolute KV cache caps**:
 | Fraction base | Total VRAM | Total VRAM | Free VRAM (post-load) |
 | Default | 0.90 | 0.90 | 0.90 |
 | Max seq len | `--max-model-len` | `--context-length` | `max_seq_len` |
-| KV cache override | `--kv-cache-memory-bytes` | `--max-total-tokens` | *(broken in 1.3.0rc5)* |
+| KV cache override | `--kv-cache-memory-bytes` | `--max-total-tokens` | `KvCacheConfig.max_tokens` via `--override-engine-args` |

 ---

@@ -76,40 +76,55 @@ only — they do **not** change KV cache allocation.
 `free_gpu_memory_fraction` is a fraction of **free** VRAM after model load.
 Set via YAML or `--override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'`.

-Deterministic KV cache control via `build_gpu_mem_args` is a future TODO.
+Deterministic KV cache control uses `build_trtllm_override_args_with_mem` in
+`gpu_utils.sh`, which builds JSON for `--override-engine-args`. Token-based
+(`_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`) or byte-based
+(`_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`) caps are supported. If the
+launch script already passes `--override-engine-args`, the function merges
+the GPU config into the existing JSON via `--merge-with-json`.

 ---

-## `build_gpu_mem_args` and Env Vars
+## Engine-Specific GPU Memory Functions

-Launch scripts source `gpu_utils.sh` and call `build_gpu_mem_args` to pick
+Launch scripts source `gpu_utils.sh` and call engine-specific functions to pick
 up env-var overrides during profiling and parallel execution:

 ```bash
 source "$SCRIPT_DIR/../../../common/gpu_utils.sh"

-GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
+# vLLM
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &

-GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
+# SGLang
+GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
 python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
+
+# TRT-LLM (JSON merging, separate function)
+OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
+python -m dynamo.trtllm --model-path "$MODEL" ${OVERRIDE_JSON:+--override-engine-args "$OVERRIDE_JSON"} &
 ```

-When the env var is set, `build_gpu_mem_args` returns the corresponding flag.
+When its env var is set, each function returns the corresponding flag (or, for TRT-LLM, JSON).
 Otherwise it returns empty and the engine uses its default allocation.

-| Env var | Engine | CLI flag produced |
-|---------|--------|-------------------|
-| `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` | vLLM | `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01` |
-| `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` | SGLang | `--max-total-tokens N` |
+| Env var | Function | Output |
+|---------|----------|--------|
+| `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` | `build_vllm_gpu_mem_args` | `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01` |
+| `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` | `build_sglang_gpu_mem_args` | `--max-total-tokens N` |
+| `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` | `build_trtllm_override_args_with_mem` | `{"kv_cache_config": {"max_tokens": N}}` (JSON) |
+| `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` | `build_trtllm_override_args_with_mem` | `{"kv_cache_config": {"max_gpu_total_bytes": N}}` (JSON) |

-For multi-worker single-GPU scripts, pass `--workers-per-gpu N` to divide
-the allocation: `build_gpu_mem_args vllm --workers-per-gpu 2`.
+All functions return per-process args. In multi-worker-per-GPU setups
+(e.g. `disagg_same_gpu.sh`), each worker gets the same override value.
+The profiler finds the per-worker budget directly.

 **Profiler** (`profile_pytest.py`): binary-searches the KV cap to find the
 minimum passing value, applies a 2x safety factor, outputs pytest markers
-(`@pytest.mark.requested_vllm_kv_cache_bytes(N)` or
-`@pytest.mark.requested_sglang_kv_tokens(N)`).
+(`@pytest.mark.requested_vllm_kv_cache_bytes(N)`,
+`@pytest.mark.requested_sglang_kv_tokens(N)`, or
+`@pytest.mark.requested_trtllm_kv_tokens(N)`).

 **Scheduler** (`pytest_parallel_gpu.py`): reads the markers at runtime and
 sets the env var per-test. See `tests/README.md` for details.
--- a/examples/common/gpu_utils.sh
+++ b/examples/common/gpu_utils.sh
@@ -10,64 +10,57 @@
 #   source "$SCRIPT_DIR/../common/gpu_utils.sh"
 #
 # Functions (all return via stdout):
-#   build_gpu_mem_args <engine> [--workers-per-gpu N]
-#       Returns engine-specific CLI args for GPU memory control based on
-#       environment variable overrides. Empty if no overrides.
-#
-#       Supported engines: vllm, sglang
 #
+#   build_vllm_gpu_mem_args
 #       vLLM:   _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01
+#
+#   build_sglang_gpu_mem_args
 #       SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N
 #
 #       Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging)
 #
-#       TODO: Split into build_vllm_gpu_mem_args and build_sglang_gpu_mem_args
-#
 # Usage:
-#   # vLLM / SGLang
-#   GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
+#   GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
 #   python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
 #
-#   GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
+#   GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 #   python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
-build_gpu_mem_args() {
-    local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}"
-    shift
-
-    # TensorRT-LLM uses build_trtllm_override_args_with_mem instead
-    if [[ "$engine" == "trtllm" ]]; then
-        echo "build_gpu_mem_args: TensorRT-LLM not supported. Use build_trtllm_override_args_with_mem instead." >&2
-        return 1
-    fi

-    local workers_per_gpu=1
-    while [[ $# -gt 0 ]]; do
-        case "$1" in
-            --workers-per-gpu) workers_per_gpu="$2"; shift 2 ;;
-            *) echo "build_gpu_mem_args: unknown option '$1'" >&2; return 1 ;;
-        esac
-    done

-    # --- SGLang: token-based KV cache cap ---
-    if [[ "$engine" == "sglang" && -n "${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS:-}" ]]; then
-        echo "--max-total-tokens ${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS}"
+# ---------------------------------------------------------------------------
+# build_vllm_gpu_mem_args
+#   Returns vLLM CLI args for GPU memory control.
+#   Empty if _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES is not set.
+#
+#   --kv-cache-memory-bytes is per-process: each vLLM worker gets the same
+#   value, even in multi-worker-per-GPU setups (e.g. disagg_same_gpu.sh).
+#   The profiler finds the per-worker budget directly.
+#
+#   --gpu-memory-utilization 0.01 prevents vLLM's startup check from rejecting
+#   the launch when co-resident tests use >10% of VRAM: with the default 0.90,
+#   vLLM checks free memory against the fraction *before* applying the byte cap.
+# ---------------------------------------------------------------------------
+build_vllm_gpu_mem_args() {
+    if [[ -n "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" ]]; then
+        echo "--kv-cache-memory-bytes ${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES} --gpu-memory-utilization 0.01"
        return 0
    fi

-    # --- vLLM: byte-based KV cache cap ---
-    # --gpu-memory-utilization 0.01 prevents vLLM's startup check from rejecting
-    # the launch when co-resident tests use >10% of VRAM (vLLM checks free memory
-    # against the fraction *before* applying the byte cap).
-    if [[ "$engine" == "vllm" && -n "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" ]]; then
-        local kv_bytes="$_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"
-        if [[ "$workers_per_gpu" -gt 1 ]]; then
-            kv_bytes=$(awk -v b="$kv_bytes" -v n="$workers_per_gpu" 'BEGIN { printf "%d", b / n }')
-        fi
-        echo "--kv-cache-memory-bytes $kv_bytes --gpu-memory-utilization 0.01"
+    echo ""
+}
+
+
+# ---------------------------------------------------------------------------
+# build_sglang_gpu_mem_args
+#   Returns SGLang CLI args for GPU memory control.
+#   Empty if _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS is not set.
+# ---------------------------------------------------------------------------
+build_sglang_gpu_mem_args() {
+    if [[ -n "${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS:-}" ]]; then
+        echo "--max-total-tokens ${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS}"
        return 0
    fi

-    # No override — engine uses its default allocation
    echo ""
 }

@@ -160,45 +153,46 @@ _gpu_utils_self_test() {

    local result

+    # --- build_vllm_gpu_mem_args (direct) ---
+
    echo "=== vLLM: kv bytes override ==="
    result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
-        build_gpu_mem_args vllm)
+        build_vllm_gpu_mem_args)
    _assert "kv bytes" "--kv-cache-memory-bytes 942054000 --gpu-memory-utilization 0.01" "$result"

-    echo ""
-    echo "=== vLLM: kv bytes with --workers-per-gpu 2 ==="
-    result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
-        build_gpu_mem_args vllm --workers-per-gpu 2)
-    _assert "kv bytes / 2" "--kv-cache-memory-bytes 471027000 --gpu-memory-utilization 0.01" "$result"
-
    echo ""
    echo "=== vLLM: no override = empty ==="
-    result=$(build_gpu_mem_args vllm)
+    result=$(build_vllm_gpu_mem_args)
    _assert "empty (engine default)" "" "$result"

    echo ""
    echo "=== vLLM: sglang token env ignored ==="
    result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=23824 \
-        build_gpu_mem_args vllm)
+        build_vllm_gpu_mem_args)
    _assert "vllm ignores token cap" "" "$result"

+    # --- build_sglang_gpu_mem_args (direct) ---
+
    echo ""
    echo "=== sglang: token cap env ==="
    result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=1024 \
-        build_gpu_mem_args sglang)
+        build_sglang_gpu_mem_args)
    _assert "token cap" "--max-total-tokens 1024" "$result"

    echo ""
    echo "=== sglang: no override = empty ==="
-    result=$(build_gpu_mem_args sglang)
+    result=$(build_sglang_gpu_mem_args)
    _assert "empty (engine default)" "" "$result"

    echo ""
    echo "=== sglang: vllm kv bytes env ignored ==="
    result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
-        build_gpu_mem_args sglang)
+        build_sglang_gpu_mem_args)
    _assert "sglang ignores kv bytes" "" "$result"

+
+    # --- build_trtllm_override_args_with_mem ---
+
    echo ""
    echo "=== trtllm: token cap env ==="
    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \
@@ -239,16 +233,6 @@ _gpu_utils_self_test() {
    result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}')
    _assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result"

-    echo ""
-    echo "=== missing engine ==="
-    (build_gpu_mem_args 2>/dev/null)
-    _assert "missing engine exits non-zero" "1" "$?"
-
-    echo ""
-    echo "=== trtllm rejected (use build_trtllm_override_args_with_mem) ==="
-    (build_gpu_mem_args trtllm 2>/dev/null)
-    _assert "trtllm rejected" "1" "$?"
-
    echo ""
    echo "=========================================="
    echo "Results: $pass passed, $fail failed"

--- a/examples/multimodal/launch/audio_agg.sh
+++ b/examples/multimodal/launch/audio_agg.sh
@@ -95,7 +95,7 @@ python -m dynamo.frontend &
 python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &

 # run E/P/D workers
-GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 CUDA_VISIBLE_DEVICES=0 \
    DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \

--- a/examples/multimodal/launch/audio_disagg.sh
+++ b/examples/multimodal/launch/audio_disagg.sh
@@ -96,7 +96,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \
    python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &

 # run E/P/D workers
-GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)

 CUDA_VISIBLE_DEVICES=0 \
    DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \

--- a/tests/README.md
+++ b/tests/README.md
@@ -192,7 +192,7 @@ Markers differ by engine:
 - **`profiled_vram_gib(N)`** — actual peak from nvidia-smi. Used for `--max-vram-gib` filtering and scheduler budget.
 - **`requested_trtllm_kv_tokens(N)`** — max KV cache tokens for text models. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Deterministic and parallel-safe.
 - **`requested_trtllm_vram_gib(N)`** — max VRAM in GiB for non-text workloads (video/image diffusion). Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args` JSON. Note: diffusion models don't use KV cache, so this parameter may have no effect — `profiled_vram_gib` alone is sufficient for scheduler budget tracking.
- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_gpu_mem_args` used by vLLM/SGLang).
+- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args`).

 `--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)).

@@ -203,9 +203,9 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes
 1. **VRAM budget**: xdist has no GPU memory awareness — two 20 GiB tests on a 48 GiB GPU will OOM.
 2. **Profiling race**: engines snapshot free memory during init; concurrent startups corrupt each other. The scheduler staggers launches (VRAM stability check) and retries transient failures.
 3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this.
-   - **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
-   - **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
-   - **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` (not `build_gpu_mem_args`) because TRT-LLM requires JSON merging for `--override-engine-args`.
+   - **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. Uses `build_vllm_gpu_mem_args` in `gpu_utils.sh`.
+   - **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. Uses `build_sglang_gpu_mem_args` in `gpu_utils.sh`.
+   - **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate function because TRT-LLM requires JSON merging).

 ```bash
 # Dry-run: preview which tests fit and the GPU plan
@@ -546,19 +546,23 @@ The profiler automatically detects the engine type and uses the appropriate bina
 - **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers.
 - **TRT-LLM**: bisects `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (token count) → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Same logic as SGLang (token-based bisection, 2x safety). Outputs `profiled_vram_gib` and `requested_trtllm_kv_tokens` markers. For non-text models (video/image diffusion) that don't use KV cache, use `--no-find-min-vram` for a single-pass VRAM measurement — binary search won't work because the model doesn't log KV token allocation.

-**Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`).
+**Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_vllm_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01`).

-**Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).
+**Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_sglang_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).

-**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_gpu_mem_args` because TRT-LLM requires JSON merging.
+**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args` because TRT-LLM requires JSON merging.

 ### Engine-specific mapping

-Launch scripts call `build_gpu_mem_args` (vLLM/SGLang) or `build_trtllm_override_args_with_mem` (TRT-LLM) from `examples/common/gpu_utils.sh`, which check env var overrides and return the appropriate CLI flags:
+Launch scripts call engine-specific functions from `examples/common/gpu_utils.sh`, which check env var overrides and return the appropriate CLI flags (or, for TRT-LLM, the JSON for `--override-engine-args`):

 ```bash
-# vLLM / SGLang
-GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
+# vLLM
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
+python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
+
+# SGLang
+GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
 python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &

 # TRT-LLM (requires JSON merging, separate function)

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -330,7 +330,7 @@ vllm_configs = {
    # NOTE: Pack all workers on 1 GPU for lower CI resource requirements
    # NOTE: disagg_multimodal_e_pd.sh uses explicit --gpu-memory-utilization via
    # DYN_ENCODE_GPU_MEM / DYN_PD_GPU_MEM env vars in single-GPU mode.
-    # PD worker honors build_gpu_mem_args for parallel execution.
+    # PD worker honors build_vllm_gpu_mem_args for parallel execution.
    "multimodal_e_pd_qwen": VLLMConfig(
        name="multimodal_e_pd_qwen",
        directory=vllm_dir,
@@ -414,7 +414,7 @@ vllm_configs = {
    # total on this GPU.
    # NOTE: disagg_multimodal_epd.sh uses explicit --gpu-memory-utilization via
    # DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM env vars.
-    # P/D workers honor build_gpu_mem_args for parallel execution.
+    # P/D workers honor build_vllm_gpu_mem_args for parallel execution.
    "multimodal_disagg_qwen": VLLMConfig(
        name="multimodal_disagg_qwen",
        directory=vllm_dir,

--- a/tests/utils/profile_pytest.py
+++ b/tests/utils/profile_pytest.py
@@ -25,8 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation.

 **IMPORTANT**: The test under profile **MUST** read the appropriate KV cache
 override — either directly (see ``test_mock_gpu_alloc.py``) or via launch
-scripts that call ``build_gpu_mem_args`` (vLLM/SGLang) or
-``build_trtllm_override_args_with_mem`` (TensorRT-LLM).  If the test
+scripts that call ``build_vllm_gpu_mem_args`` / ``build_sglang_gpu_mem_args``
+or ``build_trtllm_override_args_with_mem`` (TensorRT-LLM).  If the test
 ignores the override, every probe will pass at the same peak and the profiler
 will warn that the binary search is unreliable.