Unverified Commit 4cdc49c2 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

refactor: split build_gpu_mem_args into engine-specific functions (#7916)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent b1c18bb1
...@@ -63,7 +63,7 @@ python -m dynamo.frontend & ...@@ -63,7 +63,7 @@ python -m dynamo.frontend &
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \ python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
......
...@@ -64,7 +64,7 @@ python -m dynamo.frontend \ ...@@ -64,7 +64,7 @@ python -m dynamo.frontend \
# run workers # run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
......
...@@ -66,7 +66,7 @@ python -m dynamo.frontend \ ...@@ -66,7 +66,7 @@ python -m dynamo.frontend \
# run workers # run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \ ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
......
...@@ -66,14 +66,14 @@ python -m dynamo.frontend & ...@@ -66,14 +66,14 @@ python -m dynamo.frontend &
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS") GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \ python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \ --block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \ $GPU_MEM_ARGS \
--enable-lora \ --enable-lora \
--max-lora-rank 64 & --max-lora-rank 64 &
......
...@@ -29,7 +29,7 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -29,7 +29,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS") GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
...@@ -42,7 +42,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ ...@@ -42,7 +42,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \ --block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \ $GPU_MEM_ARGS \
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' & --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
...@@ -19,7 +19,7 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -19,7 +19,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS") GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
...@@ -31,7 +31,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ ...@@ -31,7 +31,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \ --block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \ $GPU_MEM_ARGS \
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' & --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
...@@ -71,7 +71,7 @@ case "$MODEL_NAME" in ...@@ -71,7 +71,7 @@ case "$MODEL_NAME" in
MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;; MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
esac esac
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL_NAME" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS") GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# Start vLLM worker with vision model # Start vLLM worker with vision model
# --enforce-eager: Quick deployment (remove for production) # --enforce-eager: Quick deployment (remove for production)
...@@ -81,7 +81,7 @@ ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimoda ...@@ -81,7 +81,7 @@ ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimoda
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \ --block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}" $GPU_MEM_ARGS $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}"
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit wait_any_exit
...@@ -52,7 +52,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" ...@@ -52,7 +52,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
export DYN_REQUEST_PLANE=$REQUEST_PLANE export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE" echo "Using request plane mode: $REQUEST_PLANE"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS") GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
...@@ -65,7 +65,7 @@ DYN_HEALTH_CHECK_ENABLED=true \ ...@@ -65,7 +65,7 @@ DYN_HEALTH_CHECK_ENABLED=true \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \ --block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} & $GPU_MEM_ARGS &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit wait_any_exit
...@@ -26,7 +26,7 @@ python -m dynamo.frontend \ ...@@ -26,7 +26,7 @@ python -m dynamo.frontend \
# #
# If multiple workers are launched, they must not share the same system/metrics port. # If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set. # Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \ ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
......
...@@ -31,7 +31,7 @@ python -m dynamo.frontend \ ...@@ -31,7 +31,7 @@ python -m dynamo.frontend \
# #
# If multiple workers are launched, they must not share the same system/metrics port. # If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set. # Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# #
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \ ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
......
...@@ -8,7 +8,7 @@ set -e ...@@ -8,7 +8,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh" # gpu_gb_to_total_fraction source "$SCRIPT_DIR/../../../../common/gpu_utils.sh" # build_vllm_gpu_mem_args
source "$SCRIPT_DIR/../../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
export VLLM_TARGET_DEVICE=xpu export VLLM_TARGET_DEVICE=xpu
...@@ -35,7 +35,7 @@ done ...@@ -35,7 +35,7 @@ done
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS") GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
...@@ -51,7 +51,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ ...@@ -51,7 +51,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \ --block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} "${EXTRA_ARGS[@]}" & $GPU_MEM_ARGS "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit wait_any_exit
...@@ -24,7 +24,7 @@ Instead, we use **absolute KV cache caps**: ...@@ -24,7 +24,7 @@ Instead, we use **absolute KV cache caps**:
|--------|----------------------|---------| |--------|----------------------|---------|
| vLLM | `--kv-cache-memory-bytes N` | `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` | | vLLM | `--kv-cache-memory-bytes N` | `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` |
| SGLang | `--max-total-tokens N` | `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` | | SGLang | `--max-total-tokens N` | `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` |
| TensorRT-LLM | *(future TODO)* | — | | TensorRT-LLM | `--override-engine-args '{"kv_cache_config":{"max_tokens":N}}'` | `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` |
--- ---
...@@ -36,7 +36,7 @@ Instead, we use **absolute KV cache caps**: ...@@ -36,7 +36,7 @@ Instead, we use **absolute KV cache caps**:
| Fraction base | Total VRAM | Total VRAM | Free VRAM (post-load) | | Fraction base | Total VRAM | Total VRAM | Free VRAM (post-load) |
| Default | 0.90 | 0.90 | 0.90 | | Default | 0.90 | 0.90 | 0.90 |
| Max seq len | `--max-model-len` | `--context-length` | `max_seq_len` | | Max seq len | `--max-model-len` | `--context-length` | `max_seq_len` |
| KV cache override | `--kv-cache-memory-bytes` | `--max-total-tokens` | *(broken in 1.3.0rc5)* | | KV cache override | `--kv-cache-memory-bytes` | `--max-total-tokens` | `KvCacheConfig.max_tokens` via `--override-engine-args` |
--- ---
...@@ -76,40 +76,55 @@ only — they do **not** change KV cache allocation. ...@@ -76,40 +76,55 @@ only — they do **not** change KV cache allocation.
`free_gpu_memory_fraction` is a fraction of **free** VRAM after model load. `free_gpu_memory_fraction` is a fraction of **free** VRAM after model load.
Set via YAML or `--override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'`. Set via YAML or `--override-engine-args '{"kv_cache_config":{"free_gpu_memory_fraction": 0.24}}'`.
Deterministic KV cache control via `build_gpu_mem_args` is a future TODO. Deterministic KV cache control uses `build_trtllm_override_args_with_mem` in
`gpu_utils.sh`, which builds JSON for `--override-engine-args`. Token-based
(`_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`) or byte-based
(`_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`) caps are supported. If the
launch script already passes `--override-engine-args`, the function merges
the GPU config into the existing JSON via `--merge-with-json`.
--- ---
## `build_gpu_mem_args` and Env Vars ## Engine-Specific GPU Memory Functions
Launch scripts source `gpu_utils.sh` and call `build_gpu_mem_args` to pick Launch scripts source `gpu_utils.sh` and call engine-specific functions to pick
up env-var overrides during profiling and parallel execution: up env-var overrides during profiling and parallel execution:
```bash ```bash
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) # vLLM
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS & python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
GPU_MEM_ARGS=$(build_gpu_mem_args sglang) # SGLang
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS & python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# TRT-LLM (JSON merging, separate function)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
python -m dynamo.trtllm --model-path "$MODEL" ${OVERRIDE_JSON:+--override-engine-args "$OVERRIDE_JSON"} &
``` ```
When the env var is set, `build_gpu_mem_args` returns the corresponding flag. When the env var is set, the function returns the corresponding flag.
Otherwise it returns empty and the engine uses its default allocation. Otherwise it returns empty and the engine uses its default allocation.
| Env var | Engine | CLI flag produced | | Env var | Function | Output |
|---------|--------|-------------------| |---------|----------|--------|
| `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` | vLLM | `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01` | | `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` | `build_vllm_gpu_mem_args` | `--kv-cache-memory-bytes N --gpu-memory-utilization 0.01` |
| `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` | SGLang | `--max-total-tokens N` | | `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` | `build_sglang_gpu_mem_args` | `--max-total-tokens N` |
| `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` | `build_trtllm_override_args_with_mem` | `{"kv_cache_config": {"max_tokens": N}}` (JSON) |
| `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` | `build_trtllm_override_args_with_mem` | `{"kv_cache_config": {"max_gpu_total_bytes": N}}` (JSON) |
For multi-worker single-GPU scripts, pass `--workers-per-gpu N` to divide All functions return per-process args. In multi-worker-per-GPU setups
the allocation: `build_gpu_mem_args vllm --workers-per-gpu 2`. (e.g. `disagg_same_gpu.sh`), each worker gets the same override value.
The profiler finds the per-worker budget directly.
**Profiler** (`profile_pytest.py`): binary-searches the KV cap to find the **Profiler** (`profile_pytest.py`): binary-searches the KV cap to find the
minimum passing value, applies a 2x safety factor, outputs pytest markers minimum passing value, applies a 2x safety factor, outputs pytest markers
(`@pytest.mark.requested_vllm_kv_cache_bytes(N)` or (`@pytest.mark.requested_vllm_kv_cache_bytes(N)`,
`@pytest.mark.requested_sglang_kv_tokens(N)`). `@pytest.mark.requested_sglang_kv_tokens(N)`, or
`@pytest.mark.requested_trtllm_kv_tokens(N)`).
**Scheduler** (`pytest_parallel_gpu.py`): reads the markers at runtime and **Scheduler** (`pytest_parallel_gpu.py`): reads the markers at runtime and
sets the env var per-test. See `tests/README.md` for details. sets the env var per-test. See `tests/README.md` for details.
...@@ -10,64 +10,57 @@ ...@@ -10,64 +10,57 @@
# source "$SCRIPT_DIR/../common/gpu_utils.sh" # source "$SCRIPT_DIR/../common/gpu_utils.sh"
# #
# Functions (all return via stdout): # Functions (all return via stdout):
# build_gpu_mem_args <engine> [--workers-per-gpu N]
# Returns engine-specific CLI args for GPU memory control based on
# environment variable overrides. Empty if no overrides.
#
# Supported engines: vllm, sglang
# #
# build_vllm_gpu_mem_args
# vLLM: _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01 # vLLM: _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01
#
# build_sglang_gpu_mem_args
# SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N # SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N
# #
# Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging) # Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging)
# #
# TODO: Split into build_vllm_gpu_mem_args and build_sglang_gpu_mem_args
#
# Usage: # Usage:
# # vLLM / SGLang # GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
# GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
# python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS & # python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# #
# GPU_MEM_ARGS=$(build_gpu_mem_args vllm) # GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS & # python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
build_gpu_mem_args() {
local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}"
shift
# TensorRT-LLM uses build_trtllm_override_args_with_mem instead
if [[ "$engine" == "trtllm" ]]; then
echo "build_gpu_mem_args: TensorRT-LLM not supported. Use build_trtllm_override_args_with_mem instead." >&2
return 1
fi
local workers_per_gpu=1
while [[ $# -gt 0 ]]; do
case "$1" in
--workers-per-gpu) workers_per_gpu="$2"; shift 2 ;;
*) echo "build_gpu_mem_args: unknown option '$1'" >&2; return 1 ;;
esac
done
# --- SGLang: token-based KV cache cap --- # ---------------------------------------------------------------------------
if [[ "$engine" == "sglang" && -n "${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS:-}" ]]; then # build_vllm_gpu_mem_args
echo "--max-total-tokens ${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS}" # Returns vLLM CLI args for GPU memory control.
# Empty if _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES is not set.
#
# --kv-cache-memory-bytes is per-process: each vLLM worker gets the same
# value, even in multi-worker-per-GPU setups (e.g. disagg_same_gpu.sh).
# The profiler finds the per-worker budget directly.
#
# --gpu-memory-utilization 0.01 prevents vLLM's startup check from rejecting
# the launch when co-resident tests use >10% of VRAM (vLLM checks free memory
# against the fraction *before* applying the byte cap).
# ---------------------------------------------------------------------------
build_vllm_gpu_mem_args() {
if [[ -n "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" ]]; then
echo "--kv-cache-memory-bytes ${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES} --gpu-memory-utilization 0.01"
return 0 return 0
fi fi
# --- vLLM: byte-based KV cache cap --- echo ""
# --gpu-memory-utilization 0.01 prevents vLLM's startup check from rejecting }
# the launch when co-resident tests use >10% of VRAM (vLLM checks free memory
# against the fraction *before* applying the byte cap).
if [[ "$engine" == "vllm" && -n "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" ]]; then # ---------------------------------------------------------------------------
local kv_bytes="$_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES" # build_sglang_gpu_mem_args
if [[ "$workers_per_gpu" -gt 1 ]]; then # Returns SGLang CLI args for GPU memory control.
kv_bytes=$(awk -v b="$kv_bytes" -v n="$workers_per_gpu" 'BEGIN { printf "%d", b / n }') # Empty if _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS is not set.
fi # ---------------------------------------------------------------------------
echo "--kv-cache-memory-bytes $kv_bytes --gpu-memory-utilization 0.01" build_sglang_gpu_mem_args() {
if [[ -n "${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS:-}" ]]; then
echo "--max-total-tokens ${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS}"
return 0 return 0
fi fi
# No override — engine uses its default allocation
echo "" echo ""
} }
...@@ -160,45 +153,46 @@ _gpu_utils_self_test() { ...@@ -160,45 +153,46 @@ _gpu_utils_self_test() {
local result local result
# --- build_vllm_gpu_mem_args (direct) ---
echo "=== vLLM: kv bytes override ===" echo "=== vLLM: kv bytes override ==="
result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \ result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
build_gpu_mem_args vllm) build_vllm_gpu_mem_args)
_assert "kv bytes" "--kv-cache-memory-bytes 942054000 --gpu-memory-utilization 0.01" "$result" _assert "kv bytes" "--kv-cache-memory-bytes 942054000 --gpu-memory-utilization 0.01" "$result"
echo ""
echo "=== vLLM: kv bytes with --workers-per-gpu 2 ==="
result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
build_gpu_mem_args vllm --workers-per-gpu 2)
_assert "kv bytes / 2" "--kv-cache-memory-bytes 471027000 --gpu-memory-utilization 0.01" "$result"
echo "" echo ""
echo "=== vLLM: no override = empty ===" echo "=== vLLM: no override = empty ==="
result=$(build_gpu_mem_args vllm) result=$(build_vllm_gpu_mem_args)
_assert "empty (engine default)" "" "$result" _assert "empty (engine default)" "" "$result"
echo "" echo ""
echo "=== vLLM: sglang token env ignored ===" echo "=== vLLM: sglang token env ignored ==="
result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=23824 \ result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=23824 \
build_gpu_mem_args vllm) build_vllm_gpu_mem_args)
_assert "vllm ignores token cap" "" "$result" _assert "vllm ignores token cap" "" "$result"
# --- build_sglang_gpu_mem_args (direct) ---
echo "" echo ""
echo "=== sglang: token cap env ===" echo "=== sglang: token cap env ==="
result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=1024 \ result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=1024 \
build_gpu_mem_args sglang) build_sglang_gpu_mem_args)
_assert "token cap" "--max-total-tokens 1024" "$result" _assert "token cap" "--max-total-tokens 1024" "$result"
echo "" echo ""
echo "=== sglang: no override = empty ===" echo "=== sglang: no override = empty ==="
result=$(build_gpu_mem_args sglang) result=$(build_sglang_gpu_mem_args)
_assert "empty (engine default)" "" "$result" _assert "empty (engine default)" "" "$result"
echo "" echo ""
echo "=== sglang: vllm kv bytes env ignored ===" echo "=== sglang: vllm kv bytes env ignored ==="
result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \ result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
build_gpu_mem_args sglang) build_sglang_gpu_mem_args)
_assert "sglang ignores kv bytes" "" "$result" _assert "sglang ignores kv bytes" "" "$result"
# --- build_trtllm_override_args_with_mem ---
echo "" echo ""
echo "=== trtllm: token cap env ===" echo "=== trtllm: token cap env ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \ result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \
...@@ -239,16 +233,6 @@ _gpu_utils_self_test() { ...@@ -239,16 +233,6 @@ _gpu_utils_self_test() {
result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}') result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}')
_assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result" _assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result"
echo ""
echo "=== missing engine ==="
(build_gpu_mem_args 2>/dev/null)
_assert "missing engine exits non-zero" "1" "$?"
echo ""
echo "=== trtllm rejected (use build_trtllm_override_args_with_mem) ==="
(build_gpu_mem_args trtllm 2>/dev/null)
_assert "trtllm rejected" "1" "$?"
echo "" echo ""
echo "==========================================" echo "=========================================="
echo "Results: $pass passed, $fail failed" echo "Results: $pass passed, $fail failed"
......
...@@ -95,7 +95,7 @@ python -m dynamo.frontend & ...@@ -95,7 +95,7 @@ python -m dynamo.frontend &
python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" & python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers # run E/P/D workers
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
......
...@@ -96,7 +96,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \ ...@@ -96,7 +96,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \
python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" & python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers # run E/P/D workers
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \
......
...@@ -192,7 +192,7 @@ Markers differ by engine: ...@@ -192,7 +192,7 @@ Markers differ by engine:
- **`profiled_vram_gib(N)`** — actual peak from nvidia-smi. Used for `--max-vram-gib` filtering and scheduler budget. - **`profiled_vram_gib(N)`** — actual peak from nvidia-smi. Used for `--max-vram-gib` filtering and scheduler budget.
- **`requested_trtllm_kv_tokens(N)`** — max KV cache tokens for text models. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Deterministic and parallel-safe. - **`requested_trtllm_kv_tokens(N)`** — max KV cache tokens for text models. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Deterministic and parallel-safe.
- **`requested_trtllm_vram_gib(N)`** — max VRAM in GiB for non-text workloads (video/image diffusion). Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args` JSON. Note: diffusion models don't use KV cache, so this parameter may have no effect — `profiled_vram_gib` alone is sufficient for scheduler budget tracking. - **`requested_trtllm_vram_gib(N)`** — max VRAM in GiB for non-text workloads (video/image diffusion). Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args` JSON. Note: diffusion models don't use KV cache, so this parameter may have no effect — `profiled_vram_gib` alone is sufficient for scheduler budget tracking.
- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_gpu_mem_args` used by vLLM/SGLang). - TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args`).
`--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)). `--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)).
...@@ -203,9 +203,9 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes ...@@ -203,9 +203,9 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes
1. **VRAM budget**: xdist has no GPU memory awareness — two 20 GiB tests on a 48 GiB GPU will OOM. 1. **VRAM budget**: xdist has no GPU memory awareness — two 20 GiB tests on a 48 GiB GPU will OOM.
2. **Profiling race**: engines snapshot free memory during init; concurrent startups corrupt each other. The scheduler staggers launches (VRAM stability check) and retries transient failures. 2. **Profiling race**: engines snapshot free memory during init; concurrent startups corrupt each other. The scheduler staggers launches (VRAM stability check) and retries transient failures.
3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this. 3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this.
- **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. - **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. Uses `build_vllm_gpu_mem_args` in `gpu_utils.sh`.
- **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. - **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. Uses `build_sglang_gpu_mem_args` in `gpu_utils.sh`.
- **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` (not `build_gpu_mem_args`) because TRT-LLM requires JSON merging for `--override-engine-args`. - **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate function because TRT-LLM requires JSON merging).
```bash ```bash
# Dry-run: preview which tests fit and the GPU plan # Dry-run: preview which tests fit and the GPU plan
...@@ -546,19 +546,23 @@ The profiler automatically detects the engine type and uses the appropriate bina ...@@ -546,19 +546,23 @@ The profiler automatically detects the engine type and uses the appropriate bina
- **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers. - **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers.
- **TRT-LLM**: bisects `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (token count) → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Same logic as SGLang (token-based bisection, 2x safety). Outputs `profiled_vram_gib` and `requested_trtllm_kv_tokens` markers. For non-text models (video/image diffusion) that don't use KV cache, use `--no-find-min-vram` for a single-pass VRAM measurement — binary search won't work because the model doesn't log KV token allocation. - **TRT-LLM**: bisects `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (token count) → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Same logic as SGLang (token-based bisection, 2x safety). Outputs `profiled_vram_gib` and `requested_trtllm_kv_tokens` markers. For non-text models (video/image diffusion) that don't use KV cache, use `--no-find-min-vram` for a single-pass VRAM measurement — binary search won't work because the model doesn't log KV token allocation.
**Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`). **Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_vllm_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`).
**Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`). **Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_sglang_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).
**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_gpu_mem_args` because TRT-LLM requires JSON merging. **Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args` because TRT-LLM requires JSON merging.
### Engine-specific mapping ### Engine-specific mapping
Launch scripts call `build_gpu_mem_args` (vLLM/SGLang) or `build_trtllm_override_args_with_mem` (TRT-LLM) from `examples/common/gpu_utils.sh`, which check env var overrides and return the appropriate CLI flags: Launch scripts call engine-specific functions from `examples/common/gpu_utils.sh`, which check env var overrides and return the appropriate CLI flags:
```bash ```bash
# vLLM / SGLang # vLLM
GPU_MEM_ARGS=$(build_gpu_mem_args sglang) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
# SGLang
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS & python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# TRT-LLM (requires JSON merging, separate function) # TRT-LLM (requires JSON merging, separate function)
......
...@@ -330,7 +330,7 @@ vllm_configs = { ...@@ -330,7 +330,7 @@ vllm_configs = {
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements # NOTE: Pack all workers on 1 GPU for lower CI resource requirements
# NOTE: disagg_multimodal_e_pd.sh uses explicit --gpu-memory-utilization via # NOTE: disagg_multimodal_e_pd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PD_GPU_MEM env vars in single-GPU mode. # DYN_ENCODE_GPU_MEM / DYN_PD_GPU_MEM env vars in single-GPU mode.
# PD worker honors build_gpu_mem_args for parallel execution. # PD worker honors build_vllm_gpu_mem_args for parallel execution.
"multimodal_e_pd_qwen": VLLMConfig( "multimodal_e_pd_qwen": VLLMConfig(
name="multimodal_e_pd_qwen", name="multimodal_e_pd_qwen",
directory=vllm_dir, directory=vllm_dir,
...@@ -414,7 +414,7 @@ vllm_configs = { ...@@ -414,7 +414,7 @@ vllm_configs = {
# total on this GPU. # total on this GPU.
# NOTE: disagg_multimodal_epd.sh uses explicit --gpu-memory-utilization via # NOTE: disagg_multimodal_epd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM env vars. # DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM env vars.
# P/D workers honor build_gpu_mem_args for parallel execution. # P/D workers honor build_vllm_gpu_mem_args for parallel execution.
"multimodal_disagg_qwen": VLLMConfig( "multimodal_disagg_qwen": VLLMConfig(
name="multimodal_disagg_qwen", name="multimodal_disagg_qwen",
directory=vllm_dir, directory=vllm_dir,
......
...@@ -25,8 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation. ...@@ -25,8 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation.
**IMPORTANT**: The test under profile **MUST** read the appropriate KV cache **IMPORTANT**: The test under profile **MUST** read the appropriate KV cache
override — either directly (see ``test_mock_gpu_alloc.py``) or via launch override — either directly (see ``test_mock_gpu_alloc.py``) or via launch
scripts that call ``build_gpu_mem_args`` (vLLM/SGLang) or scripts that call ``build_vllm_gpu_mem_args`` / ``build_sglang_gpu_mem_args``
``build_trtllm_override_args_with_mem`` (TensorRT-LLM). If the test or ``build_trtllm_override_args_with_mem`` (TensorRT-LLM). If the test
ignores the override, every probe will pass at the same peak and the profiler ignores the override, every probe will pass at the same peak and the profiler
will warn that the binary search is unreliable. will warn that the binary search is unreliable.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment