Unverified Commit ad2205eb authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat: add a few TRT-LLM example support for GPU-parallel test execution (#7880)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 2af062ec
......@@ -109,6 +109,23 @@ def build_kv_connector_config(config: Config):
return None
def _warn_override_collisions(target: dict, source: dict, path: str = "") -> None:
    """Warn about each existing *target* value that *source* is about to replace.

    Dicts present under the same key on both sides are walked recursively;
    *path* is the dotted prefix of the keys traversed so far and is used only
    to build the key name shown in the log message. Keys that are missing
    from *target*, and values that compare equal, produce no output.
    """
    for name, incoming in source.items():
        if name not in target:
            # Pure addition: nothing is being overwritten.
            continue

        dotted = f"{path}.{name}" if path else name
        current = target[name]

        if isinstance(incoming, dict) and isinstance(current, dict):
            # Both sides are mappings, so only their leaves can collide.
            _warn_override_collisions(current, incoming, dotted)
            continue

        if current != incoming:
            logging.warning(
                "override_engine_args will replace %s: %r -> %r",
                dotted,
                current,
                incoming,
            )
async def init_llm_worker(
runtime: DistributedRuntime,
config: Config,
......@@ -206,6 +223,7 @@ async def init_llm_worker(
overrides = json.loads(config.override_engine_args)
logging.info(f"Applying engine arg overrides: {overrides}")
_warn_override_collisions(arg_map, overrides)
deep_update(arg_map, overrides)
except json.JSONDecodeError as e:
logging.error(f"Failed to parse override_engine_args as JSON: {e}")
......
......@@ -6,7 +6,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_trtllm_override_args_with_mem
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
......@@ -42,12 +43,22 @@ while [[ $# -gt 0 ]]; do
esac
done
TRACE_ARGS=()
TRTLLM_OVERRIDE_ARGS=()
if [ "$ENABLE_OTEL" = true ]; then
export DYN_LOGGING_JSONL=true
export OTEL_EXPORT_ENABLED=1
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
OTEL_JSON="{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\"}"
# Merge GPU mem config with OTEL config
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem --merge-with-json "$OTEL_JSON")
else
# Just GPU mem config (if any)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
fi
# Add --override-engine-args if we have JSON
if [[ -n "$OVERRIDE_JSON" ]]; then
TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
......@@ -66,7 +77,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
"${TRACE_ARGS[@]}" \
"${TRTLLM_OVERRIDE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
......@@ -6,7 +6,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_trtllm_override_args_with_mem
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
......@@ -15,6 +16,15 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
# Build GPU memory JSON (returns bare JSON, no flag)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
# Add --override-engine-args if we have JSON
TRTLLM_OVERRIDE_ARGS=()
if [[ -n "$OVERRIDE_JSON" ]]; then
TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Metrics" "$MODEL_PATH" "$HTTP_PORT"
......@@ -29,7 +39,8 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
--publish-events-and-metrics &
--publish-events-and-metrics \
"${TRTLLM_OVERRIDE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -10,6 +10,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_trtllm_override_args_with_mem
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
......@@ -36,6 +37,15 @@ while [[ $# -gt 0 ]]; do
esac
done
# Build GPU memory JSON (returns bare JSON, no flag)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
# Add --override-engine-args if we have JSON
TRTLLM_OVERRIDE_ARGS=()
if [[ -n "$OVERRIDE_JSON" ]]; then
TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Video Diffusion Serving (1 GPU)" "$MODEL_PATH" "$HTTP_PORT" \
"Media URL: $MEDIA_OUTPUT_FS_URL"
......@@ -61,6 +71,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \
--modality video_diffusion \
--media-output-fs-url "$MEDIA_OUTPUT_FS_URL" \
"${TRTLLM_OVERRIDE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
......@@ -14,10 +14,17 @@
# Returns engine-specific CLI args for GPU memory control based on
# environment variable overrides. Empty if no overrides.
#
# Supported engines: vllm, sglang
#
# vLLM: _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01
# SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N
#
# Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging)
#
# TODO: Split into build_vllm_gpu_mem_args and build_sglang_gpu_mem_args
#
# Usage:
# # vLLM / SGLang
# GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
# python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
#
......@@ -27,6 +34,12 @@ build_gpu_mem_args() {
local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}"
shift
# TensorRT-LLM uses build_trtllm_override_args_with_mem instead
if [[ "$engine" == "trtllm" ]]; then
echo "build_gpu_mem_args: TensorRT-LLM not supported. Use build_trtllm_override_args_with_mem instead." >&2
return 1
fi
local workers_per_gpu=1
while [[ $# -gt 0 ]]; do
case "$1" in
......@@ -59,6 +72,76 @@ build_gpu_mem_args() {
}
# ---------------------------------------------------------------------------
# build_trtllm_override_args_with_mem [--merge-with-json JSON]
# TensorRT-LLM-specific: builds JSON for --override-engine-args with GPU memory config.
# Returns ONLY the bare JSON value (no --override-engine-args flag, no quotes).
#
# Separate function because TRT-LLM requires JSON merging for --override-engine-args
# (unlike vLLM/SGLang which use direct CLI flags).
#
# Environment variables (the token cap wins when both are set):
#   _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS     → {"kv_cache_config": {"max_tokens": N}}
#   _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES  → {"kv_cache_config": {"max_gpu_total_bytes": N}}
#
# If --merge-with-json is provided, merges GPU config with the existing JSON.
# With no GPU override the existing JSON is echoed back unchanged; with neither,
# nothing is printed (engine uses its default).
#
# Exit status:
#   0  success (including the "nothing to print" case)
#   1  unknown option, --merge-with-json without a value, or an override env
#      var that is not a non-negative JSON number (message on stderr, no stdout)
#
# NOTE: the merge is textual. If the JSON passed to --merge-with-json already
# contains a "kv_cache_config" key, the output carries that key twice and the
# consumer's JSON parser decides which one survives (Python's json.loads keeps
# the last one, i.e. the caller's value, dropping the GPU cap).
#
# Usage:
#   # TensorRT-LLM: simple case (no existing overrides)
#   JSON=$(build_trtllm_override_args_with_mem)
#   python -m dynamo.trtllm --model-path "$MODEL" ${JSON:+--override-engine-args "$JSON"} &
#
#   # TensorRT-LLM: merge with existing JSON
#   EXISTING='{"return_perf_metrics": true}'
#   JSON=$(build_trtllm_override_args_with_mem --merge-with-json "$EXISTING")
#   python -m dynamo.trtllm --model-path "$MODEL" --override-engine-args "$JSON" &
# ---------------------------------------------------------------------------
build_trtllm_override_args_with_mem() {
    local merge_json=""
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --merge-with-json)
                # `shift 2` with a single remaining argument fails WITHOUT
                # shifting, which would spin this loop forever; reject it here.
                if [[ $# -lt 2 ]]; then
                    echo "build_trtllm_override_args_with_mem: --merge-with-json requires a JSON argument" >&2
                    return 1
                fi
                merge_json="$2"
                shift 2
                ;;
            *) echo "build_trtllm_override_args_with_mem: unknown option '$1'" >&2; return 1 ;;
        esac
    done

    # Pick the override to apply: token-based (preferred, simpler to reason
    # about) takes precedence over byte-based (alternative, more precise).
    local cap_key="" cap_value=""
    if [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS:-}" ]]; then
        cap_key="max_tokens"
        cap_value="${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS}"
    elif [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES:-}" ]]; then
        cap_key="max_gpu_total_bytes"
        cap_value="${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES}"
    fi

    if [[ -z "$cap_key" ]]; then
        # No GPU override: return existing JSON as-is (prints nothing if empty).
        if [[ -n "$merge_json" ]]; then
            echo "$merge_json"
        fi
        return 0
    fi

    # The value is spliced into the JSON verbatim, so anything that is not a
    # plain non-negative JSON number would yield an unparsable document.
    local number_re='^(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?$'
    if ! [[ "$cap_value" =~ $number_re ]]; then
        echo "build_trtllm_override_args_with_mem: ${cap_key} override must be a non-negative number, got '${cap_value}'" >&2
        return 1
    fi

    local gpu_mem_json='"kv_cache_config": {"'"${cap_key}"'": '"${cap_value}"'}'

    if [[ -z "$merge_json" ]]; then
        # Just GPU mem config
        echo "{${gpu_mem_json}}"
        return 0
    fi

    # Merge: GPU mem config first, then existing config.
    # Trim surrounding whitespace first so the outer braces are found even
    # when the caller's JSON is padded, then strip those braces.
    local existing="$merge_json"
    existing="${existing#"${existing%%[![:space:]]*}"}"
    existing="${existing%"${existing##*[![:space:]]}"}"
    existing="${existing#\{}"
    existing="${existing%\}}"

    if [[ -n "${existing//[[:space:]]/}" ]]; then
        echo "{${gpu_mem_json}, ${existing}}"
    else
        # Existing object was empty ("{}" or blank): nothing to append.
        echo "{${gpu_mem_json}}"
    fi
}
# ---------------------------------------------------------------------------
# Self-test: bash gpu_utils.sh --self-test
# ---------------------------------------------------------------------------
......@@ -116,11 +199,56 @@ _gpu_utils_self_test() {
build_gpu_mem_args sglang)
_assert "sglang ignores kv bytes" "" "$result"
echo ""
echo "=== trtllm: token cap env ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \
build_trtllm_override_args_with_mem)
_assert "trtllm token cap" '{"kv_cache_config": {"max_tokens": 4096}}' "$result"
echo ""
echo "=== trtllm: byte cap env ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=1073741824 \
build_trtllm_override_args_with_mem)
_assert "trtllm byte cap" '{"kv_cache_config": {"max_gpu_total_bytes": 1073741824}}' "$result"
echo ""
echo "=== trtllm: no override = empty ==="
result=$(build_trtllm_override_args_with_mem)
_assert "empty (engine default)" "" "$result"
echo ""
echo "=== trtllm: token cap takes precedence over byte cap ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=999999 \
build_trtllm_override_args_with_mem)
_assert "trtllm token precedence" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"
echo ""
echo "=== trtllm: merge with existing JSON ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}')
_assert "trtllm merged" '{"kv_cache_config": {"max_tokens": 2048}, "return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}' "$result"
echo ""
echo "=== trtllm: merge with empty JSON object ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
build_trtllm_override_args_with_mem --merge-with-json '{}')
_assert "trtllm merge empty obj" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"
echo ""
echo "=== trtllm: no GPU override, but pass through existing JSON ==="
result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}')
_assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result"
echo ""
echo "=== missing engine ==="
(build_gpu_mem_args 2>/dev/null)
_assert "missing engine exits non-zero" "1" "$?"
echo ""
echo "=== trtllm rejected (use build_trtllm_override_args_with_mem) ==="
(build_gpu_mem_args trtllm 2>/dev/null)
_assert "trtllm rejected" "1" "$?"
echo ""
echo "=========================================="
echo "Results: $pass passed, $fail failed"
......
......@@ -234,10 +234,12 @@ markers = [
"gpu_8: marks tests to run on 8GPUs",
"xpu_1: marks tests to run on XPU",
"xpu_2: marks tests to run on 2XPUs",
# These 3 (profiled_vram_gib and requested_*) are used for parallel pytest executions:
# These 5 (profiled_vram_gib and requested_*) are used for parallel pytest executions:
"profiled_vram_gib(N): actual peak VRAM observed by nvidia-smi during profiling. Used for --max-vram-gib filtering and scheduler budget tracking",
"requested_vllm_kv_cache_bytes(N): exact KV cache bytes for vLLM (skips memory profiling). Sets _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES. Most deterministic method for parallel execution",
"requested_sglang_kv_tokens(N): max KV cache tokens for SGLang parallel execution. Sets _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS to cap --max-total-tokens and prevent over-allocation",
"requested_trtllm_kv_tokens(N): max KV cache tokens for TensorRT-LLM parallel execution. Sets _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS to cap KvCacheConfig.max_tokens via --override-engine-args",
"requested_trtllm_vram_gib(N): max VRAM in GiB for TensorRT-LLM parallel execution. Sets _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES to cap KvCacheConfig.max_gpu_total_bytes via --override-engine-args. Use for non-text workloads (video/image diffusion)",
"e2e: marks tests as end-to-end tests",
"integration: marks tests as integration tests",
"unit: marks tests as unit tests",
......
......@@ -117,6 +117,8 @@ Markers are required for all tests. They are used for test selection in CI and l
| VRAM (profiled) | profiled_vram_gib(N) | Actual peak VRAM observed by nvidia-smi during profiling (includes CUDA overhead). Used for `--max-vram-gib=N` filtering and GPU-parallel scheduler budget tracking. |
| vLLM KV cache bytes | requested_vllm_kv_cache_bytes(N) | (vLLM only) Exact KV cache bytes. Sets `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` → `--kv-cache-memory-bytes`. Deterministic, parallel-safe. |
| SGLang KV tokens | requested_sglang_kv_tokens(N) | (SGLang only) Max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` → `--max-total-tokens`. Deterministic, parallel-safe. |
| TRT-LLM KV tokens | requested_trtllm_kv_tokens(N) | (TRT-LLM only) Max KV cache tokens. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args`. Deterministic, parallel-safe. |
| TRT-LLM VRAM GiB | requested_trtllm_vram_gib(N) | (TRT-LLM only) Max VRAM in GiB. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args`. For non-text workloads (video/image diffusion) where token-based control doesn't apply. |
| Component/Framework | vllm, trtllm, sglang, kvbm, kvbm_concurrency, planner, router | Backend or component specificity |
| Infrastructure | k8s, deploy, fault_tolerance | Infrastructure/environment needs |
| Execution | parallel | Test can run in parallel with pytest-xdist. Must use dynamic port allocation (`alloc_ports`) and not share resources (e.g. filesystem) |
......@@ -147,6 +149,33 @@ def test_sglang_aggregated():
...
```
### Example (TRT-LLM with token cap)
```python
@pytest.mark.pre_merge
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.profiled_vram_gib(3.9) # actual nvidia-smi peak at recommended token count
@pytest.mark.requested_trtllm_kv_tokens(2592) # KV cache cap (2x safety over min=1296)
@pytest.mark.timeout(300)
@pytest.mark.trtllm
def test_trtllm_aggregated():
...
```
### Example (TRT-LLM diffusion — no KV cache)
```python
@pytest.mark.pre_merge
@pytest.mark.gpu_1
@pytest.mark.trtllm
# Diffusion models don't use KV cache, so requested_trtllm_kv_tokens doesn't apply
# and requested_trtllm_vram_gib (KvCacheConfig.max_gpu_total_bytes) has no effect —
# the VRAM is model weights + activations. Only profiled_vram_gib is meaningful.
@pytest.mark.profiled_vram_gib(17.1) # actual nvidia-smi peak
@pytest.mark.timeout(600)
def test_trtllm_video_diffusion():
...
```
### VRAM Markers and Filtering
Markers differ by engine:
......@@ -159,6 +188,12 @@ Markers differ by engine:
- **`profiled_vram_gib(N)`** — actual peak from nvidia-smi at the recommended token count. Used for `--max-vram-gib` filtering and scheduler budget.
- **`requested_sglang_kv_tokens(N)`** — max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` → `--max-total-tokens`. SGLang's default `--mem-fraction-static` is never overridden; the token cap is the sole allocation control. Deterministic and parallel-safe (see `examples/common/gpu_utils.md`).
**TRT-LLM** uses token-based control (text models) or byte-based control (diffusion models):
- **`profiled_vram_gib(N)`** — actual peak from nvidia-smi. Used for `--max-vram-gib` filtering and scheduler budget.
- **`requested_trtllm_kv_tokens(N)`** — max KV cache tokens for text models. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Deterministic and parallel-safe.
- **`requested_trtllm_vram_gib(N)`** — max VRAM in GiB for non-text workloads (video/image diffusion). Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args` JSON. Note: diffusion models don't use KV cache, so this parameter may have no effect — `profiled_vram_gib` alone is sufficient for scheduler budget tracking.
- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_gpu_mem_args` used by vLLM/SGLang).
`--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)).
### GPU-Parallel Execution
......@@ -170,6 +205,7 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes
3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this.
- **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
- **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
- **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` (not `build_gpu_mem_args`) because TRT-LLM requires JSON merging for `--override-engine-args`.
```bash
# Dry-run: preview which tests fit and the GPU plan
......@@ -508,18 +544,26 @@ The profiler automatically detects the engine type and uses the appropriate bina
- **vLLM**: bisects `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` (bytes) → `--kv-cache-memory-bytes`. Finds the minimum KV cache bytes where the test passes, applies a 2x safety factor. Outputs `profiled_vram_gib` and `requested_vllm_kv_cache_bytes` markers.
- **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers.
- **TRT-LLM**: bisects `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (token count) → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Same logic as SGLang (token-based bisection, 2x safety). Outputs `profiled_vram_gib` and `requested_trtllm_kv_tokens` markers. For non-text models (video/image diffusion) that don't use KV cache, use `--no-find-min-vram` for a single-pass VRAM measurement — binary search won't work because the model doesn't log KV token allocation.
**Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`).
**Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).
**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_gpu_mem_args` because TRT-LLM requires JSON merging.
### Engine-specific mapping
Launch scripts call `build_gpu_mem_args` (from `examples/common/gpu_utils.sh`) which checks env var overrides and returns the appropriate CLI flags:
Launch scripts call `build_gpu_mem_args` (vLLM/SGLang) or `build_trtllm_override_args_with_mem` (TRT-LLM) from `examples/common/gpu_utils.sh`. Both check env var overrides; the former returns the appropriate CLI flags, the latter returns the bare JSON value to pass to `--override-engine-args`:
```bash
# vLLM / SGLang
GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# TRT-LLM (requires JSON merging, separate function)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
python -m dynamo.trtllm --model-path "$MODEL" ${OVERRIDE_JSON:+--override-engine-args "$OVERRIDE_JSON"} &
```
Env vars control engine allocation during profiling and parallel test execution:
......@@ -536,7 +580,19 @@ Env vars control engine allocation during profiling and parallel test execution:
|---------|----------------------------------|-------|
| SGLang | `--max-total-tokens N` | Token-based KV cache cap |
Both use absolute caps (bytes and tokens) — deterministic and independent of current free memory, which is critical for parallel test execution. See `examples/common/gpu_utils.md`.
**`_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`** (integer) — TRT-LLM text models:
| Engine | Returned JSON | Notes |
|---------|--------------------------------------------------------|-------|
| TRT-LLM | `{"kv_cache_config": {"max_tokens": N}}` | Token-based KV cache cap via `--override-engine-args` |
**`_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`** (integer) — TRT-LLM non-text models:
| Engine | Returned JSON | Notes |
|---------|------------------------------------------------------------------|-------|
| TRT-LLM | `{"kv_cache_config": {"max_gpu_total_bytes": N}}` | Byte-based cap via `--override-engine-args`. For diffusion models. |
All use absolute caps — deterministic and independent of current free memory, which is critical for parallel test execution. See `examples/common/gpu_utils.md`.
### Usage
......@@ -550,6 +606,12 @@ python tests/utils/profile_pytest.py --gpu 1 tests/serve/test_vllm.py::test_serv
# SGLang: binary search for minimum KV cache tokens (automatic)
python tests/utils/profile_pytest.py tests/serve/test_sglang.py::test_sglang_deployment[aggregated-2] -xvs
# TRT-LLM: binary search for minimum KV cache tokens (text models)
python tests/utils/profile_pytest.py tests/serve/test_trtllm.py::test_deployment[aggregated-2] -xvs
# TRT-LLM: single-pass for diffusion models (no KV cache, binary search won't work)
python tests/utils/profile_pytest.py --no-find-min-vram tests/serve/test_trtllm.py::test_deployment[video_diffusion-2] -xvs
# Single-pass profiling (no binary search, just measure one run using default RAM)
python tests/utils/profile_pytest.py --no-find-min-vram tests/serve/test_vllm.py::test_serve_deployment[aggregated]
```
......@@ -626,6 +688,36 @@ MINIMUM KV TOKENS RESULT
========================================================================
```
### Example output (TRT-LLM — token-based bisection)
```bash
========================================================================
FIND MINIMUM KV TOKENS (TensorRT-LLM) (binary search)
========================================================================
GPU total : 48.0 GiB
GPU free : 47.1 GiB (in use: 0.9 GiB)
Test : tests/serve/test_trtllm.py::test_deployment[aggregated-2] -xvs
[probe 1] Validation run (no token cap, default fraction)
[PASS] peak 41.3 GiB, wall 48s, max_tokens=41472 (TensorRT-LLM), iter took 56s
...
[probe 6/12] tokens=1296
[PASS] tokens=1296, peak 3.7 GiB, wall 46s, iter took 54s
[EARLY STOP] Peak VRAM stable for last 3 probes
[final probe] Measuring VRAM at safe_tokens=2592
[PASS] tokens=2592, peak 3.9 GiB, wall 46s
========================================================================
MINIMUM KV TOKENS RESULT (TensorRT-LLM)
========================================================================
Minimum tokens : 1296 (raw bisection result)
Recommended : 2592 (2x safety)
Peak VRAM : 3.9 GiB (at 2592 tokens)
@pytest.mark.profiled_vram_gib(3.9)
@pytest.mark.requested_trtllm_kv_tokens(2592), # KV cache cap (2x safety over min=1296)
========================================================================
```
### How to use the recommendations
1. **Copy the `@pytest.mark.*` lines** into your test function or `pytestmark` list.
......
......@@ -100,6 +100,8 @@ def pytest_configure(config: pytest.Config) -> None:
vram_limit = config.getoption("max_vram_gib", default=None)
if vram_limit is None:
return
if config.option.collectonly:
return
# Delayed: vram_utils requires pynvml, otherwise conftest fails to load
# on CPU-only CI runners (e.g. ARM deploy tests) that lack nvidia-ml-py.
from tests.utils.pytest_parallel_gpu import _parse_cuda_visible
......@@ -482,8 +484,9 @@ def pytest_collection_modifyitems(config, items):
# - Tests whose profiled VRAM exceeds the limit are removed
# - Tests WITHOUT a VRAM marker are also removed (unknown VRAM = unsafe)
# Using deselect (not skip) so they never reach the xdist scheduler.
# Skip all VRAM logic during --collect-only (just listing tests).
vram_limit = config.getoption("--max-vram-gib", default=None)
if vram_limit is not None:
if vram_limit is not None and not config.option.collectonly:
keep = []
deselected = []
for item in items:
......@@ -497,7 +500,7 @@ def pytest_collection_modifyitems(config, items):
items[:] = keep
# Write test metadata for the GPU orchestrator to read.
if vram_limit is not None:
if vram_limit is not None and not config.option.collectonly:
# Delayed: see vram_utils pynvml note in pytest_configure
from tests.utils.vram_utils import print_gpu_plan, write_test_meta
......
......@@ -80,15 +80,20 @@ trtllm_configs = {
directory=trtllm_dir,
script_name="agg_metrics.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.gpu_1, # 1 GPU(s) used, peak 3.9 GiB
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.profiled_vram_gib(3.9), # actual nvidia-smi peak 3.9 GiB
pytest.mark.requested_trtllm_kv_tokens(
2592
), # KV cache cap (2x safety over min=1296)
pytest.mark.timeout(
300
), # 3x measured time (44.66s) + download time (150s)
],
model="Qwen/Qwen3-0.6B",
frontend_port=DefaultPort.FRONTEND.value,
delayed_start=5,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
......@@ -137,9 +142,19 @@ trtllm_configs = {
name="aggregated_logprobs",
directory=trtllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
marks=[
pytest.mark.gpu_1, # 1 GPU(s) used, peak 3.8 GiB
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.profiled_vram_gib(3.8), # actual nvidia-smi peak 3.8 GiB
pytest.mark.requested_trtllm_kv_tokens(
2592
), # KV cache cap (2x safety over min=1296)
pytest.mark.timeout(300), # 3x measured time (~44s) + download time (150s)
],
model="Qwen/Qwen3-0.6B",
frontend_port=DefaultPort.FRONTEND.value,
delayed_start=5,
request_payloads=[
chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
......@@ -360,9 +375,17 @@ trtllm_configs = {
"17",
],
marks=[
pytest.mark.gpu_1,
pytest.mark.gpu_1, # 1 GPU(s) used, peak 17.1 GiB
pytest.mark.trtllm,
pytest.mark.pre_merge,
# Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
# doesn't apply. requested_trtllm_vram_gib maps to
# KvCacheConfig.max_gpu_total_bytes which has no effect on the
# diffusion engine itself, but the parallel scheduler requires one
# of the KV/VRAM markers to accept the test. We set it to the
# profiled peak so the scheduler's VRAM budget is accurate.
pytest.mark.profiled_vram_gib(17.1), # actual nvidia-smi peak 17.1 GiB
pytest.mark.requested_trtllm_vram_gib(17.1),
pytest.mark.timeout(
600
), # Video generation is slow even at small resolution
......@@ -370,7 +393,7 @@ trtllm_configs = {
model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
frontend_port=DefaultPort.FRONTEND.value,
timeout=300,
delayed_start=60, # Model loading takes time
delayed_start=5,
request_payloads=[
VideoGenerationPayload(
body={
......
......@@ -15,8 +15,9 @@ in-process instrumentation. Using NVML directly (the same C library that
and allows high-frequency sampling.
In **binary-search mode** (the default), the profiler bisects the KV cache
allocation — ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` for vLLM (bytes) or
``_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`` for SGLang (tokens).
allocation — ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` for vLLM (bytes),
``_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`` for SGLang (tokens), or
``_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`` for TensorRT-LLM (tokens).
If the test passes, the allocation is lowered; if it OOMs, it is raised —
standard bisection to find the minimum the test needs. A safety factor
is applied and the peak ``memory.used`` from the last passing run becomes
......@@ -24,7 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation.
**IMPORTANT**: The test under profile **MUST** read the appropriate KV cache
override — either directly (see ``test_mock_gpu_alloc.py``) or via launch
scripts that call ``build_gpu_mem_args`` (e.g. ``agg.sh``). If the test
scripts that call ``build_gpu_mem_args`` (vLLM/SGLang) or
``build_trtllm_override_args_with_mem`` (TensorRT-LLM). If the test
ignores the override, every probe will pass at the same peak and the profiler
will warn that the binary search is unreliable.
......@@ -459,6 +461,7 @@ def _recommend_markers(
model_name: str | None = None,
num_runs: int = 1,
requested_sglang_kv_tokens: int | None = None,
requested_trtllm_kv_tokens: int | None = None,
requested_vllm_kv_cache_bytes: int | None = None,
min_kv_value: int | None = None,
) -> tuple[list[MarkerRecommendation], list[str]]:
......@@ -559,6 +562,14 @@ def _recommend_markers(
f"KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety{min_label})",
)
)
if requested_trtllm_kv_tokens is not None:
min_label = f" over min={min_kv_value}" if min_kv_value is not None else ""
recs.append(
MarkerRecommendation(
f"requested_trtllm_kv_tokens({requested_trtllm_kv_tokens})",
f"KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety{min_label})",
)
)
if requested_vllm_kv_cache_bytes is not None:
min_label = (
f" over min={min_kv_value:_}" if min_kv_value is not None else ""
......@@ -634,6 +645,7 @@ def _print_recommendations(
_SGLANG_NODEID_MARKERS = ["test_sglang", "sglang"]
_TRTLLM_NODEID_MARKERS = ["test_trtllm", "trtllm"]
def _is_sglang_test(pytest_args: list[str]) -> bool:
......@@ -643,6 +655,13 @@ def _is_sglang_test(pytest_args: list[str]) -> bool:
)
def _is_trtllm_test(pytest_args: list[str]) -> bool:
    """Return True when at least one pytest arg contains a TensorRT-LLM marker.

    Matching is a plain substring test of every entry in
    ``_TRTLLM_NODEID_MARKERS`` against every argument.
    """
    for arg in pytest_args:
        for marker in _TRTLLM_NODEID_MARKERS:
            if marker in arg:
                return True
    return False
_OOM_PATTERNS = [
"OutOfMemoryError",
"CUDA out of memory",
......@@ -673,6 +692,22 @@ def _extract_requested_sglang_kv_tokens(stdout: str) -> int | None:
return None
_TRTLLM_MAX_TOKENS_RE = re.compile(
r"\[MemUsageChange\] Allocated .* for max tokens in paged KV cache \((\d+)\)"
)
def _extract_requested_trtllm_kv_tokens(stdout: str) -> int | None:
"""Extract max_tokens from TensorRT-LLM engine output.
TensorRT-LLM logs: "[MemUsageChange] Allocated 0.22 GiB for max tokens in paged KV cache (2048)."
"""
match = _TRTLLM_MAX_TOKENS_RE.search(stdout)
if match:
return int(match.group(1))
return None
_DEFAULT_PROBE_TIMEOUT = 300 # 5 minutes max per profile run
......@@ -765,15 +800,17 @@ def _find_min_vram(
) -> int:
"""Binary search to find the minimum VRAM a test needs.
Three modes, two patterns:
Three modes, three patterns:
KV bisection (deterministic, no profiling race):
vLLM: bisects _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES (bytes)
SGLang: bisects _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS (tokens)
Both use the same _KV_SAFETY_FACTOR (2x) and the same bisect loop.
TensorRT-LLM: bisects _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS (tokens)
All use the same _KV_SAFETY_FACTOR (2x) and the same bisect loop.
The only differences are env var name, units, display, and bounds.
"""
is_sglang = _is_sglang_test(pytest_args)
is_trtllm = _is_trtllm_test(pytest_args)
gpu_info = _query_gpu_stats()
if not gpu_info:
......@@ -792,11 +829,13 @@ def _find_min_vram(
model_name = _extract_model_from_markers(pytest_args)
if not is_sglang:
if not is_sglang and not is_trtllm:
kv_bytes_mode = True
if kv_bytes_mode:
mode_label = "KV CACHE BYTES (vLLM, deterministic)"
elif is_trtllm:
mode_label = "KV TOKENS (TensorRT-LLM)"
else:
mode_label = "KV TOKENS (SGLang)"
print(f"\n--- FIND MINIMUM {mode_label} (binary search) ---")
......@@ -908,6 +947,16 @@ def _find_min_vram(
f" [PASS] peak {_format_mib(peak_mib)}, wall {wall:.0f}s, "
f"iter took {iter_elapsed:.0f}s"
)
else:
if is_trtllm:
max_tokens = _extract_requested_trtllm_kv_tokens(stdout)
if max_tokens is None:
print(
" [ERROR] Could not extract max_tokens from TensorRT-LLM output.\n"
" The launch script must log '[MemUsageChange] Allocated ... for max tokens in paged KV cache (N)'."
)
return 4
backend_label = "TensorRT-LLM"
else:
max_tokens = _extract_requested_sglang_kv_tokens(stdout)
if max_tokens is None:
......@@ -916,13 +965,14 @@ def _find_min_vram(
" The launch script must log 'max_total_tokens=N' (SGLang does this by default)."
)
return 4
backend_label = "SGLang"
page_size = 16
lo = page_size
hi = max_tokens
tolerance = page_size * 2
print(
f" [PASS] peak {_format_mib(peak_mib)}, wall {wall:.0f}s, "
f"max_total_tokens={max_tokens}, iter took {iter_elapsed:.0f}s"
f"max_tokens={max_tokens} ({backend_label}), iter took {iter_elapsed:.0f}s"
)
baseline_time = iter_elapsed
......@@ -968,6 +1018,14 @@ def _find_min_vram(
"_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES": str(mid_int),
}
probe_desc = f"kv_cache={mid_int // (1024**2)} MiB ({mid_int:,} bytes)"
elif is_trtllm:
mid_int = ((int(lo) + int(hi)) // 2 // page_size) * page_size
mid_int = max(mid_int, page_size)
probe_env = {
**_gpu_env,
"_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS": str(mid_int),
}
probe_desc = f"tokens={mid_int}"
else:
mid_int = ((int(lo) + int(hi)) // 2 // page_size) * page_size
mid_int = max(mid_int, page_size)
......@@ -1083,7 +1141,7 @@ def _find_min_vram(
# safe_kv_bytes which allocates more KV cache and thus more VRAM.
print(f" [final probe] Measuring VRAM at safe_kv_bytes={safe_kv_mib} MiB")
sys.stdout.flush()
rc_final, wall_final, reports_final, samples_final, stdout_final = _run_once(
rc_final, wall_final, reports_final, samples_final, _stdout_final = _run_once(
pytest_args,
interval=interval,
baseline_seconds=baseline_seconds,
......@@ -1141,14 +1199,24 @@ def _find_min_vram(
# safe_tokens which allocates more KV cache and thus more VRAM.
print(f" [final probe] Measuring VRAM at safe_tokens={safe_tokens}")
sys.stdout.flush()
rc_final, wall_final, reports_final, samples_final, stdout_final = _run_once(
if is_trtllm:
env_var_name = "_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS"
marker_name = "requested_trtllm_kv_tokens"
backend_label = "TensorRT-LLM"
else:
env_var_name = "_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS"
marker_name = "requested_sglang_kv_tokens"
backend_label = "SGLang"
rc_final, wall_final, reports_final, samples_final, _stdout_final = _run_once(
pytest_args,
interval=interval,
baseline_seconds=baseline_seconds,
teardown_seconds=teardown_seconds,
extra_env={
**_gpu_env,
"_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS": str(safe_tokens),
env_var_name: str(safe_tokens),
},
quiet=True,
run_label="final",
......@@ -1171,7 +1239,7 @@ def _find_min_vram(
)
print(f"\n{'=' * 72}")
print("MINIMUM KV TOKENS RESULT")
print(f"MINIMUM KV TOKENS RESULT ({backend_label})")
print(f"{'=' * 72}")
print(f" Minimum tokens : {min_tokens} (raw bisection result)")
print(f" Recommended : {safe_tokens} ({_KV_SAFETY_FACTOR:.0f}x safety)")
......@@ -1180,12 +1248,13 @@ def _find_min_vram(
)
print(f" {test_short}: @pytest.mark.profiled_vram_gib({peak_gib})")
print(
f" {test_short}: @pytest.mark.requested_sglang_kv_tokens({safe_tokens}), # KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety over min={min_tokens})"
f" {test_short}: @pytest.mark.{marker_name}({safe_tokens}), # KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety over min={min_tokens})"
)
print(f"{'=' * 72}")
# Marker recommendations
requested_sglang_kv_tokens = safe_tokens if is_sglang else None
requested_trtllm_kv_tokens = safe_tokens if is_trtllm else None
requested_vllm_kv_cache_bytes = safe_kv_bytes if kv_bytes_mode else None
min_kv_value = int(last_pass_value)
if recommend:
......@@ -1196,6 +1265,7 @@ def _find_min_vram(
model_name,
num_runs=len(pass_wall_times),
requested_sglang_kv_tokens=requested_sglang_kv_tokens,
requested_trtllm_kv_tokens=requested_trtllm_kv_tokens,
requested_vllm_kv_cache_bytes=requested_vllm_kv_cache_bytes,
min_kv_value=min_kv_value,
)
......@@ -1326,6 +1396,7 @@ def main(argv: list[str] | None = None) -> int:
model_name = _extract_model_from_markers(pytest_args)
is_sglang = _is_sglang_test(pytest_args)
is_trtllm = _is_trtllm_test(pytest_args)
rc, wall_secs, reports, samples, stdout = _run_once(
pytest_args,
......@@ -1333,20 +1404,24 @@ def main(argv: list[str] | None = None) -> int:
baseline_seconds=args.baseline_seconds,
teardown_seconds=args.teardown_seconds,
extra_env=gpu_env,
run_label="profile" if is_sglang else None,
run_label="profile" if (is_sglang or is_trtllm) else None,
)
_print_report(reports, rc, wall_secs, model_name=model_name)
if not args.no_recommend and reports:
requested_sglang_kv_tokens = None
requested_trtllm_kv_tokens = None
if is_sglang:
requested_sglang_kv_tokens = _extract_requested_sglang_kv_tokens(stdout)
if is_trtllm:
requested_trtllm_kv_tokens = _extract_requested_trtllm_kv_tokens(stdout)
recs, warnings = _recommend_markers(
reports,
wall_secs,
model_name=model_name,
requested_sglang_kv_tokens=requested_sglang_kv_tokens,
requested_trtllm_kv_tokens=requested_trtllm_kv_tokens,
)
_print_recommendations(recs, warnings, pytest_args=pytest_args)
......
......@@ -59,6 +59,8 @@ class _TestEntry:
timeout: float
requested_vllm_kv_cache_bytes: int | None = None
requested_sglang_kv_tokens: int | None = None
requested_trtllm_kv_tokens: int | None = None
requested_trtllm_vram_gib: float | None = None
skip_reason: str | None = None
w_id: int = 0
assigned_gpu: int | None = None
......@@ -117,6 +119,10 @@ def _fmt_req(test: _TestEntry) -> str:
"""Format the resource request value for display."""
if test.requested_sglang_kv_tokens is not None:
return f"req_kv_tokens={int(test.requested_sglang_kv_tokens)}"
if test.requested_trtllm_kv_tokens is not None:
return f"req_kv_tokens={int(test.requested_trtllm_kv_tokens)}"
if test.requested_trtllm_vram_gib is not None:
return f"req_vram={test.requested_trtllm_vram_gib:.1f} GiB"
if test.requested_vllm_kv_cache_bytes is not None:
gib = int(test.requested_vllm_kv_cache_bytes) / (1024**3)
return f"req_kv={gib:.2f} GiB"
......@@ -347,6 +353,8 @@ def run_parallel(
requested_vllm_kv_cache_bytes=m.get("requested_vllm_kv_cache_bytes"),
timeout=m.get("timeout", 600),
requested_sglang_kv_tokens=m.get("requested_sglang_kv_tokens"),
requested_trtllm_kv_tokens=m.get("requested_trtllm_kv_tokens"),
requested_trtllm_vram_gib=m.get("requested_trtllm_vram_gib"),
skip_reason=m.get("skip_reason"),
)
)
......@@ -367,19 +375,19 @@ def run_parallel(
for t in tests
if t.requested_vllm_kv_cache_bytes is None
and t.requested_sglang_kv_tokens is None
and t.requested_trtllm_kv_tokens is None
and t.requested_trtllm_vram_gib is None
and t.profiled_gib > 0
]
if no_kv:
_print(
f"\nERROR: {len(no_kv)} test(s) lack a requested_vllm_kv_cache_bytes "
f"or requested_sglang_kv_tokens marker and cannot run in parallel:"
f"\nERROR: {len(no_kv)} test(s) lack a requested_vllm_kv_cache_bytes, "
f"requested_sglang_kv_tokens, requested_trtllm_kv_tokens, "
f"or requested_trtllm_vram_gib marker and cannot run in parallel:"
)
for t in no_kv:
_print(f" {t.name}")
_print(
"\nAdd the appropriate marker via profile_pytest.py --kv-bytes, "
"then rerun."
)
_print("\nAdd the appropriate marker via profile_pytest.py, " "then rerun.")
return 1
# Identify tests in metadata that exceed the VRAM budget
......@@ -502,6 +510,13 @@ def run_parallel(
env["_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS"] = str(
int(test.requested_sglang_kv_tokens)
)
elif test.requested_trtllm_kv_tokens is not None:
env["_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS"] = str(
int(test.requested_trtllm_kv_tokens)
)
elif test.requested_trtllm_vram_gib is not None:
gib_to_bytes = int(test.requested_trtllm_vram_gib * 1024**3)
env["_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES"] = str(gib_to_bytes)
elif test.requested_vllm_kv_cache_bytes is not None:
env["_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"] = str(
int(test.requested_vllm_kv_cache_bytes)
......@@ -705,7 +720,8 @@ def run_parallel(
gi = entry.assigned_gpu
assert gi is not None
is_vllm = (
entry.requested_sglang_kv_tokens is None and entry.profiled_gib > 0
entry.requested_vllm_kv_cache_bytes is not None
and entry.profiled_gib > 0
)
# Per-GPU vLLM stagger — only between vLLM tests on the
......
......@@ -109,6 +109,12 @@ def write_test_meta(items, dest_dir: str | None = None) -> None:
kv_tokens_mark = item.get_closest_marker("requested_sglang_kv_tokens")
if kv_tokens_mark and kv_tokens_mark.args:
meta["requested_sglang_kv_tokens"] = kv_tokens_mark.args[0]
trtllm_tokens_mark = item.get_closest_marker("requested_trtllm_kv_tokens")
if trtllm_tokens_mark and trtllm_tokens_mark.args:
meta["requested_trtllm_kv_tokens"] = trtllm_tokens_mark.args[0]
trtllm_vram_mark = item.get_closest_marker("requested_trtllm_vram_gib")
if trtllm_vram_mark and trtllm_vram_mark.args:
meta["requested_trtllm_vram_gib"] = trtllm_vram_mark.args[0]
skip_mark = item.get_closest_marker("skip")
if skip_mark:
reason = skip_mark.kwargs.get("reason", "")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment