"ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "f5ba9e7f68baed45e78fe02d17e06fd67b07c4ac"
Unverified Commit ad2205eb authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat: add a few TRT-LLM example support for GPU-parallel test execution (#7880)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 2af062ec
...@@ -109,6 +109,23 @@ def build_kv_connector_config(config: Config): ...@@ -109,6 +109,23 @@ def build_kv_connector_config(config: Config):
return None return None
def _warn_override_collisions(target: dict, source: dict, path: str = "") -> None:
    """Warn about every value in *target* that *source* is about to replace.

    For each key of *source* that is also present in *target*, either descend
    into the pair (when both values are dicts) or log a warning if the two
    values differ. Keys missing from *target* and values that compare equal
    produce no output.

    Args:
        target: Mapping holding the current values.
        source: Mapping holding the override values.
        path: Dotted prefix of the keys leading to this nesting level;
            empty at the top level.
    """
    for name, incoming in source.items():
        if name not in target:
            # Brand-new key: nothing is being replaced.
            continue

        current = target[name]
        dotted = name if not path else f"{path}.{name}"

        if isinstance(incoming, dict) and isinstance(current, dict):
            # Both sides are mappings, so only nested entries can collide.
            _warn_override_collisions(current, incoming, dotted)
            continue

        if current != incoming:
            logging.warning(
                "override_engine_args will replace %s: %r -> %r",
                dotted,
                current,
                incoming,
            )
async def init_llm_worker( async def init_llm_worker(
runtime: DistributedRuntime, runtime: DistributedRuntime,
config: Config, config: Config,
...@@ -206,6 +223,7 @@ async def init_llm_worker( ...@@ -206,6 +223,7 @@ async def init_llm_worker(
overrides = json.loads(config.override_engine_args) overrides = json.loads(config.override_engine_args)
logging.info(f"Applying engine arg overrides: {overrides}") logging.info(f"Applying engine arg overrides: {overrides}")
_warn_override_collisions(arg_map, overrides)
deep_update(arg_map, overrides) deep_update(arg_map, overrides)
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
logging.error(f"Failed to parse override_engine_args as JSON: {e}") logging.error(f"Failed to parse override_engine_args as JSON: {e}")
......
...@@ -6,7 +6,8 @@ set -e ...@@ -6,7 +6,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_trtllm_override_args_with_mem
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
...@@ -42,12 +43,22 @@ while [[ $# -gt 0 ]]; do ...@@ -42,12 +43,22 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
TRACE_ARGS=() TRTLLM_OVERRIDE_ARGS=()
if [ "$ENABLE_OTEL" = true ]; then if [ "$ENABLE_OTEL" = true ]; then
export DYN_LOGGING_JSONL=true export DYN_LOGGING_JSONL=true
export OTEL_EXPORT_ENABLED=1 export OTEL_EXPORT_ENABLED=1
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317} export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }") OTEL_JSON="{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\"}"
# Merge GPU mem config with OTEL config
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem --merge-with-json "$OTEL_JSON")
else
# Just GPU mem config (if any)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
fi
# Add --override-engine-args if we have JSON
if [[ -n "$OVERRIDE_JSON" ]]; then
TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
fi fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
...@@ -66,7 +77,7 @@ python3 -m dynamo.trtllm \ ...@@ -66,7 +77,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \ --extra-engine-args "$AGG_ENGINE_ARGS" \
"${TRACE_ARGS[@]}" \ "${TRTLLM_OVERRIDE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" & "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
...@@ -6,7 +6,8 @@ set -e ...@@ -6,7 +6,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_trtllm_override_args_with_mem
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
...@@ -15,6 +16,15 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} ...@@ -15,6 +16,15 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
export MODALITY=${MODALITY:-"text"} export MODALITY=${MODALITY:-"text"}
# Build GPU memory JSON (returns bare JSON, no flag)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
# Add --override-engine-args if we have JSON
TRTLLM_OVERRIDE_ARGS=()
if [[ -n "$OVERRIDE_JSON" ]]; then
TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Metrics" "$MODEL_PATH" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving + Metrics" "$MODEL_PATH" "$HTTP_PORT"
...@@ -29,7 +39,8 @@ python3 -m dynamo.trtllm \ ...@@ -29,7 +39,8 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \ --extra-engine-args "$AGG_ENGINE_ARGS" \
--publish-events-and-metrics & --publish-events-and-metrics \
"${TRTLLM_OVERRIDE_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit wait_any_exit
...@@ -10,6 +10,7 @@ trap 'echo Cleaning up...; kill 0' EXIT ...@@ -10,6 +10,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_trtllm_override_args_with_mem
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
...@@ -36,6 +37,15 @@ while [[ $# -gt 0 ]]; do ...@@ -36,6 +37,15 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
# Build GPU memory JSON (returns bare JSON, no flag)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
# Add --override-engine-args if we have JSON
TRTLLM_OVERRIDE_ARGS=()
if [[ -n "$OVERRIDE_JSON" ]]; then
TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Video Diffusion Serving (1 GPU)" "$MODEL_PATH" "$HTTP_PORT" \ print_launch_banner --no-curl "Launching Video Diffusion Serving (1 GPU)" "$MODEL_PATH" "$HTTP_PORT" \
"Media URL: $MEDIA_OUTPUT_FS_URL" "Media URL: $MEDIA_OUTPUT_FS_URL"
...@@ -61,6 +71,7 @@ python3 -m dynamo.trtllm \ ...@@ -61,6 +71,7 @@ python3 -m dynamo.trtllm \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--modality video_diffusion \ --modality video_diffusion \
--media-output-fs-url "$MEDIA_OUTPUT_FS_URL" \ --media-output-fs-url "$MEDIA_OUTPUT_FS_URL" \
"${TRTLLM_OVERRIDE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" & "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
......
...@@ -14,10 +14,17 @@ ...@@ -14,10 +14,17 @@
# Returns engine-specific CLI args for GPU memory control based on # Returns engine-specific CLI args for GPU memory control based on
# environment variable overrides. Empty if no overrides. # environment variable overrides. Empty if no overrides.
# #
# vLLM: _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01 # Supported engines: vllm, sglang
#
# vLLM: _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01
# SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N # SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N
# #
# Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging)
#
# TODO: Split into build_vllm_gpu_mem_args and build_sglang_gpu_mem_args
#
# Usage: # Usage:
# # vLLM / SGLang
# GPU_MEM_ARGS=$(build_gpu_mem_args sglang) # GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
# python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS & # python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# #
...@@ -27,6 +34,12 @@ build_gpu_mem_args() { ...@@ -27,6 +34,12 @@ build_gpu_mem_args() {
local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}" local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}"
shift shift
# TensorRT-LLM uses build_trtllm_override_args_with_mem instead
if [[ "$engine" == "trtllm" ]]; then
echo "build_gpu_mem_args: TensorRT-LLM not supported. Use build_trtllm_override_args_with_mem instead." >&2
return 1
fi
local workers_per_gpu=1 local workers_per_gpu=1
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case "$1" in case "$1" in
...@@ -59,6 +72,76 @@ build_gpu_mem_args() { ...@@ -59,6 +72,76 @@ build_gpu_mem_args() {
} }
# ---------------------------------------------------------------------------
# build_trtllm_override_args_with_mem [--merge-with-json JSON]
# TensorRT-LLM-specific: builds JSON for --override-engine-args with GPU memory config.
# Returns ONLY the bare JSON value (no --override-engine-args flag, no quotes).
#
# Separate function because TRT-LLM requires JSON merging for --override-engine-args
# (unlike vLLM/SGLang which use direct CLI flags).
#
# Environment variables:
# _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS → {"kv_cache_config": {"max_tokens": N}}
# _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES → {"kv_cache_config": {"max_gpu_total_bytes": N}}
#
# If --merge-with-json is provided, merges GPU config with the existing JSON.
#
# Usage:
# # TensorRT-LLM: simple case (no existing overrides)
# JSON=$(build_trtllm_override_args_with_mem)
# python -m dynamo.trtllm --model-path "$MODEL" ${JSON:+--override-engine-args "$JSON"} &
#
# # TensorRT-LLM: merge with existing JSON
# EXISTING='{"return_perf_metrics": true}'
# JSON=$(build_trtllm_override_args_with_mem --merge-with-json "$EXISTING")
# python -m dynamo.trtllm --model-path "$MODEL" --override-engine-args "$JSON" &
# ---------------------------------------------------------------------------
build_trtllm_override_args_with_mem() {
    # Print the bare JSON value for TRT-LLM's --override-engine-args on stdout.
    #
    # Options:
    #   --merge-with-json JSON   existing override object to combine with the
    #                            GPU memory config.
    # Reads:
    #   _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS     (checked first)
    #   _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES  (used only if the token var is empty)
    # Output:
    #   merged JSON, GPU-only JSON, the --merge-with-json value unchanged, or
    #   nothing at all when neither an env override nor a JSON value was given.
    # Returns:
    #   1 (with a message on stderr) for an unknown option or a
    #   --merge-with-json that has no value; 0 otherwise.
    local merge_json=""
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --merge-with-json)
                # Require the value explicitly: with only one argument left,
                # `shift 2` fails without shifting, so $# never reaches 0 and
                # this loop would spin forever.
                if [[ $# -lt 2 ]]; then
                    echo "build_trtllm_override_args_with_mem: --merge-with-json requires a JSON argument" >&2
                    return 1
                fi
                merge_json="$2"
                shift 2
                ;;
            *) echo "build_trtllm_override_args_with_mem: unknown option '$1'" >&2; return 1 ;;
        esac
    done

    local gpu_mem_json=""

    # Token-based (preferred, simpler to reason about)
    if [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS:-}" ]]; then
        gpu_mem_json='"kv_cache_config": {"max_tokens": '"${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS}"'}'
    # Byte-based (alternative, more precise)
    elif [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES:-}" ]]; then
        gpu_mem_json='"kv_cache_config": {"max_gpu_total_bytes": '"${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES}"'}'
    fi

    if [[ -n "$gpu_mem_json" ]]; then
        if [[ -n "$merge_json" ]]; then
            # Merge: GPU mem config first, then existing config.
            # The merge is purely textual: one leading "{" and one trailing "}"
            # are stripped and the remainder is appended. Nothing is parsed, so
            # an existing "kv_cache_config" key would appear twice in the
            # output, and whitespace outside the braces prevents the strip.
            local existing="${merge_json#\{}"
            existing="${existing%\}}"
            if [[ -n "${existing//[[:space:]]/}" ]]; then
                echo "{${gpu_mem_json}, ${existing}}"
            else
                # Existing object was empty ("{}" or braces around whitespace).
                echo "{${gpu_mem_json}}"
            fi
        else
            # Just GPU mem config
            echo "{${gpu_mem_json}}"
        fi
    elif [[ -n "$merge_json" ]]; then
        # No GPU override, return existing JSON as-is
        echo "$merge_json"
    fi
    # No output if both are empty (engine uses default)
}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Self-test: bash gpu_utils.sh --self-test # Self-test: bash gpu_utils.sh --self-test
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
...@@ -116,11 +199,56 @@ _gpu_utils_self_test() { ...@@ -116,11 +199,56 @@ _gpu_utils_self_test() {
build_gpu_mem_args sglang) build_gpu_mem_args sglang)
_assert "sglang ignores kv bytes" "" "$result" _assert "sglang ignores kv bytes" "" "$result"
echo ""
echo "=== trtllm: token cap env ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \
build_trtllm_override_args_with_mem)
_assert "trtllm token cap" '{"kv_cache_config": {"max_tokens": 4096}}' "$result"
echo ""
echo "=== trtllm: byte cap env ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=1073741824 \
build_trtllm_override_args_with_mem)
_assert "trtllm byte cap" '{"kv_cache_config": {"max_gpu_total_bytes": 1073741824}}' "$result"
echo ""
echo "=== trtllm: no override = empty ==="
result=$(build_trtllm_override_args_with_mem)
_assert "empty (engine default)" "" "$result"
echo ""
echo "=== trtllm: token cap takes precedence over byte cap ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=999999 \
build_trtllm_override_args_with_mem)
_assert "trtllm token precedence" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"
echo ""
echo "=== trtllm: merge with existing JSON ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}')
_assert "trtllm merged" '{"kv_cache_config": {"max_tokens": 2048}, "return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}' "$result"
echo ""
echo "=== trtllm: merge with empty JSON object ==="
result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
build_trtllm_override_args_with_mem --merge-with-json '{}')
_assert "trtllm merge empty obj" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"
echo ""
echo "=== trtllm: no GPU override, but pass through existing JSON ==="
result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}')
_assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result"
echo "" echo ""
echo "=== missing engine ===" echo "=== missing engine ==="
(build_gpu_mem_args 2>/dev/null) (build_gpu_mem_args 2>/dev/null)
_assert "missing engine exits non-zero" "1" "$?" _assert "missing engine exits non-zero" "1" "$?"
echo ""
echo "=== trtllm rejected (use build_trtllm_override_args_with_mem) ==="
(build_gpu_mem_args trtllm 2>/dev/null)
_assert "trtllm rejected" "1" "$?"
echo "" echo ""
echo "==========================================" echo "=========================================="
echo "Results: $pass passed, $fail failed" echo "Results: $pass passed, $fail failed"
......
...@@ -234,10 +234,12 @@ markers = [ ...@@ -234,10 +234,12 @@ markers = [
"gpu_8: marks tests to run on 8GPUs", "gpu_8: marks tests to run on 8GPUs",
"xpu_1: marks tests to run on XPU", "xpu_1: marks tests to run on XPU",
"xpu_2: marks tests to run on 2XPUs", "xpu_2: marks tests to run on 2XPUs",
# These 3 (profiled_vram_gib and requested_*) are used for parallel pytest executions: # These 5 (profiled_vram_gib and requested_*) are used for parallel pytest executions:
"profiled_vram_gib(N): actual peak VRAM observed by nvidia-smi during profiling. Used for --max-vram-gib filtering and scheduler budget tracking", "profiled_vram_gib(N): actual peak VRAM observed by nvidia-smi during profiling. Used for --max-vram-gib filtering and scheduler budget tracking",
"requested_vllm_kv_cache_bytes(N): exact KV cache bytes for vLLM (skips memory profiling). Sets _PROFILE_PYTEST_KV_CACHE_BYTES. Most deterministic method for parallel execution", "requested_vllm_kv_cache_bytes(N): exact KV cache bytes for vLLM (skips memory profiling). Sets _PROFILE_PYTEST_KV_CACHE_BYTES. Most deterministic method for parallel execution",
"requested_sglang_kv_tokens(N): max KV cache tokens for SGLang parallel execution. Sets _OVERRIDE_SGLANG_MAX_TOTAL_TOKENS to cap --max-total-tokens and prevent over-allocation", "requested_sglang_kv_tokens(N): max KV cache tokens for SGLang parallel execution. Sets _OVERRIDE_SGLANG_MAX_TOTAL_TOKENS to cap --max-total-tokens and prevent over-allocation",
"requested_trtllm_kv_tokens(N): max KV cache tokens for TensorRT-LLM parallel execution. Sets _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS to cap KvCacheConfig.max_tokens via --override-engine-args",
"requested_trtllm_vram_gib(N): max VRAM in GiB for TensorRT-LLM parallel execution. Sets _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES to cap KvCacheConfig.max_gpu_total_bytes via --override-engine-args. Use for non-text workloads (video/image diffusion)",
"e2e: marks tests as end-to-end tests", "e2e: marks tests as end-to-end tests",
"integration: marks tests as integration tests", "integration: marks tests as integration tests",
"unit: marks tests as unit tests", "unit: marks tests as unit tests",
......
...@@ -117,6 +117,8 @@ Markers are required for all tests. They are used for test selection in CI and l ...@@ -117,6 +117,8 @@ Markers are required for all tests. They are used for test selection in CI and l
| VRAM (profiled) | profiled_vram_gib(N) | Actual peak VRAM observed by nvidia-smi during profiling (includes CUDA overhead). Used for `--max-vram-gib=N` filtering and GPU-parallel scheduler budget tracking. | | VRAM (profiled) | profiled_vram_gib(N) | Actual peak VRAM observed by nvidia-smi during profiling (includes CUDA overhead). Used for `--max-vram-gib=N` filtering and GPU-parallel scheduler budget tracking. |
| vLLM KV cache bytes | requested_vllm_kv_cache_bytes(N) | (vLLM only) Exact KV cache bytes. Sets `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES``--kv-cache-memory-bytes`. Deterministic, parallel-safe. | | vLLM KV cache bytes | requested_vllm_kv_cache_bytes(N) | (vLLM only) Exact KV cache bytes. Sets `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES``--kv-cache-memory-bytes`. Deterministic, parallel-safe. |
| SGLang KV tokens | requested_sglang_kv_tokens(N) | (SGLang only) Max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS``--max-total-tokens`. Deterministic, parallel-safe. | | SGLang KV tokens | requested_sglang_kv_tokens(N) | (SGLang only) Max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS``--max-total-tokens`. Deterministic, parallel-safe. |
| TRT-LLM KV tokens | requested_trtllm_kv_tokens(N) | (TRT-LLM only) Max KV cache tokens. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args`. Deterministic, parallel-safe. |
| TRT-LLM VRAM GiB | requested_trtllm_vram_gib(N) | (TRT-LLM only) Max VRAM in GiB. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args`. For non-text workloads (video/image diffusion) where token-based control doesn't apply. |
| Component/Framework | vllm, trtllm, sglang, kvbm, kvbm_concurrency, planner, router | Backend or component specificity | | Component/Framework | vllm, trtllm, sglang, kvbm, kvbm_concurrency, planner, router | Backend or component specificity |
| Infrastructure | k8s, deploy, fault_tolerance | Infrastructure/environment needs | | Infrastructure | k8s, deploy, fault_tolerance | Infrastructure/environment needs |
| Execution | parallel | Test can run in parallel with pytest-xdist. Must use dynamic port allocation (`alloc_ports`) and not share resources (e.g. filesystem) | | Execution | parallel | Test can run in parallel with pytest-xdist. Must use dynamic port allocation (`alloc_ports`) and not share resources (e.g. filesystem) |
...@@ -147,6 +149,33 @@ def test_sglang_aggregated(): ...@@ -147,6 +149,33 @@ def test_sglang_aggregated():
... ...
``` ```
### Example (TRT-LLM with token cap)
```python
@pytest.mark.pre_merge
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.profiled_vram_gib(3.9) # actual nvidia-smi peak at recommended token count
@pytest.mark.requested_trtllm_kv_tokens(2592) # KV cache cap (2x safety over min=1296)
@pytest.mark.timeout(300)
@pytest.mark.trtllm
def test_trtllm_aggregated():
...
```
### Example (TRT-LLM diffusion — no KV cache)
```python
@pytest.mark.pre_merge
@pytest.mark.gpu_1
@pytest.mark.trtllm
# Diffusion models don't use KV cache, so requested_trtllm_kv_tokens doesn't apply
# and requested_trtllm_vram_gib (KvCacheConfig.max_gpu_total_bytes) has no effect —
# the VRAM is model weights + activations. Only profiled_vram_gib is meaningful.
@pytest.mark.profiled_vram_gib(17.1) # actual nvidia-smi peak
@pytest.mark.timeout(600)
def test_trtllm_video_diffusion():
...
```
### VRAM Markers and Filtering ### VRAM Markers and Filtering
Markers differ by engine: Markers differ by engine:
...@@ -159,6 +188,12 @@ Markers differ by engine: ...@@ -159,6 +188,12 @@ Markers differ by engine:
- **`profiled_vram_gib(N)`** — actual peak from nvidia-smi at the recommended token count. Used for `--max-vram-gib` filtering and scheduler budget. - **`profiled_vram_gib(N)`** — actual peak from nvidia-smi at the recommended token count. Used for `--max-vram-gib` filtering and scheduler budget.
- **`requested_sglang_kv_tokens(N)`** — max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS``--max-total-tokens`. SGLang's default `--mem-fraction-static` is never overridden; the token cap is the sole allocation control. Deterministic and parallel-safe (see `examples/common/gpu_utils.md`). - **`requested_sglang_kv_tokens(N)`** — max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS``--max-total-tokens`. SGLang's default `--mem-fraction-static` is never overridden; the token cap is the sole allocation control. Deterministic and parallel-safe (see `examples/common/gpu_utils.md`).
**TRT-LLM** uses token-based control (text models) or byte-based control (diffusion models):
- **`profiled_vram_gib(N)`** — actual peak from nvidia-smi. Used for `--max-vram-gib` filtering and scheduler budget.
- **`requested_trtllm_kv_tokens(N)`** — max KV cache tokens for text models. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Deterministic and parallel-safe.
- **`requested_trtllm_vram_gib(N)`** — max VRAM in GiB for non-text workloads (video/image diffusion). Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args` JSON. Note: diffusion models don't use KV cache, so this parameter may have no effect — `profiled_vram_gib` alone is sufficient for scheduler budget tracking.
- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_gpu_mem_args` used by vLLM/SGLang).
`--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)). `--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)).
### GPU-Parallel Execution ### GPU-Parallel Execution
...@@ -170,6 +205,7 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes ...@@ -170,6 +205,7 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes
3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this. 3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this.
- **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N``--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. - **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N``--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
- **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N``--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe. - **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N``--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
- **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` (not `build_gpu_mem_args`) because TRT-LLM requires JSON merging for `--override-engine-args`.
```bash ```bash
# Dry-run: preview which tests fit and the GPU plan # Dry-run: preview which tests fit and the GPU plan
...@@ -508,18 +544,26 @@ The profiler automatically detects the engine type and uses the appropriate bina ...@@ -508,18 +544,26 @@ The profiler automatically detects the engine type and uses the appropriate bina
- **vLLM**: bisects `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` (bytes) → `--kv-cache-memory-bytes`. Finds the minimum KV cache bytes where the test passes, applies a 2x safety factor. Outputs `profiled_vram_gib` and `requested_vllm_kv_cache_bytes` markers. - **vLLM**: bisects `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` (bytes) → `--kv-cache-memory-bytes`. Finds the minimum KV cache bytes where the test passes, applies a 2x safety factor. Outputs `profiled_vram_gib` and `requested_vllm_kv_cache_bytes` markers.
- **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers. - **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers.
- **TRT-LLM**: bisects `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (token count) → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Same logic as SGLang (token-based bisection, 2x safety). Outputs `profiled_vram_gib` and `requested_trtllm_kv_tokens` markers. For non-text models (video/image diffusion) that don't use KV cache, use `--no-find-min-vram` for a single-pass VRAM measurement — binary search won't work because the model doesn't log KV token allocation.
**Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`). **Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`).
**Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`). **Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).
**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_gpu_mem_args` because TRT-LLM requires JSON merging.
### Engine-specific mapping ### Engine-specific mapping
Launch scripts call `build_gpu_mem_args` (from `examples/common/gpu_utils.sh`) which checks env var overrides and returns the appropriate CLI flags: Launch scripts call `build_gpu_mem_args` (vLLM/SGLang) or `build_trtllm_override_args_with_mem` (TRT-LLM) from `examples/common/gpu_utils.sh`, which check env var overrides and return the appropriate CLI flags (vLLM/SGLang) or the bare JSON value for `--override-engine-args` (TRT-LLM):
```bash ```bash
# vLLM / SGLang
GPU_MEM_ARGS=$(build_gpu_mem_args sglang) GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS & python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
# TRT-LLM (requires JSON merging, separate function)
OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
python -m dynamo.trtllm --model-path "$MODEL" ${OVERRIDE_JSON:+--override-engine-args "$OVERRIDE_JSON"} &
``` ```
Env vars control engine allocation during profiling and parallel test execution: Env vars control engine allocation during profiling and parallel test execution:
...@@ -536,7 +580,19 @@ Env vars control engine allocation during profiling and parallel test execution: ...@@ -536,7 +580,19 @@ Env vars control engine allocation during profiling and parallel test execution:
|---------|----------------------------------|-------| |---------|----------------------------------|-------|
| SGLang | `--max-total-tokens N` | Token-based KV cache cap | | SGLang | `--max-total-tokens N` | Token-based KV cache cap |
Both use absolute caps (bytes and tokens) — deterministic and independent of current free memory, which is critical for parallel test execution. See `examples/common/gpu_utils.md`. **`_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`** (integer) — TRT-LLM text models:
| Engine | Returned JSON | Notes |
|---------|--------------------------------------------------------|-------|
| TRT-LLM | `{"kv_cache_config": {"max_tokens": N}}` | Token-based KV cache cap via `--override-engine-args` |
**`_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`** (integer) — TRT-LLM non-text models:
| Engine | Returned JSON | Notes |
|---------|------------------------------------------------------------------|-------|
| TRT-LLM | `{"kv_cache_config": {"max_gpu_total_bytes": N}}` | Byte-based cap via `--override-engine-args`. For diffusion models. |
All use absolute caps — deterministic and independent of current free memory, which is critical for parallel test execution. See `examples/common/gpu_utils.md`.
### Usage ### Usage
...@@ -550,6 +606,12 @@ python tests/utils/profile_pytest.py --gpu 1 tests/serve/test_vllm.py::test_serv ...@@ -550,6 +606,12 @@ python tests/utils/profile_pytest.py --gpu 1 tests/serve/test_vllm.py::test_serv
# SGLang: binary search for minimum KV cache tokens (automatic) # SGLang: binary search for minimum KV cache tokens (automatic)
python tests/utils/profile_pytest.py tests/serve/test_sglang.py::test_sglang_deployment[aggregated-2] -xvs python tests/utils/profile_pytest.py tests/serve/test_sglang.py::test_sglang_deployment[aggregated-2] -xvs
# TRT-LLM: binary search for minimum KV cache tokens (text models)
python tests/utils/profile_pytest.py tests/serve/test_trtllm.py::test_deployment[aggregated-2] -xvs
# TRT-LLM: single-pass for diffusion models (no KV cache, binary search won't work)
python tests/utils/profile_pytest.py --no-find-min-vram tests/serve/test_trtllm.py::test_deployment[video_diffusion-2] -xvs
# Single-pass profiling (no binary search, just measure one run using default RAM) # Single-pass profiling (no binary search, just measure one run using default RAM)
python tests/utils/profile_pytest.py --no-find-min-vram tests/serve/test_vllm.py::test_serve_deployment[aggregated] python tests/utils/profile_pytest.py --no-find-min-vram tests/serve/test_vllm.py::test_serve_deployment[aggregated]
``` ```
...@@ -626,6 +688,36 @@ MINIMUM KV TOKENS RESULT ...@@ -626,6 +688,36 @@ MINIMUM KV TOKENS RESULT
======================================================================== ========================================================================
``` ```
### Example output (TRT-LLM — token-based bisection)
```bash
========================================================================
FIND MINIMUM KV TOKENS (TensorRT-LLM) (binary search)
========================================================================
GPU total : 48.0 GiB
GPU free : 47.1 GiB (in use: 0.9 GiB)
Test : tests/serve/test_trtllm.py::test_deployment[aggregated-2] -xvs
[probe 1] Validation run (no token cap, default fraction)
[PASS] peak 41.3 GiB, wall 48s, max_tokens=41472 (TensorRT-LLM), iter took 56s
...
[probe 6/12] tokens=1296
[PASS] tokens=1296, peak 3.7 GiB, wall 46s, iter took 54s
[EARLY STOP] Peak VRAM stable for last 3 probes
[final probe] Measuring VRAM at safe_tokens=2592
[PASS] tokens=2592, peak 3.9 GiB, wall 46s
========================================================================
MINIMUM KV TOKENS RESULT (TensorRT-LLM)
========================================================================
Minimum tokens : 1296 (raw bisection result)
Recommended : 2592 (2x safety)
Peak VRAM : 3.9 GiB (at 2592 tokens)
@pytest.mark.profiled_vram_gib(3.9)
@pytest.mark.requested_trtllm_kv_tokens(2592), # KV cache cap (2x safety over min=1296)
========================================================================
```
### How to use the recommendations ### How to use the recommendations
1. **Copy the `@pytest.mark.*` lines** into your test function or `pytestmark` list. 1. **Copy the `@pytest.mark.*` lines** into your test function or `pytestmark` list.
......
...@@ -100,6 +100,8 @@ def pytest_configure(config: pytest.Config) -> None: ...@@ -100,6 +100,8 @@ def pytest_configure(config: pytest.Config) -> None:
vram_limit = config.getoption("max_vram_gib", default=None) vram_limit = config.getoption("max_vram_gib", default=None)
if vram_limit is None: if vram_limit is None:
return return
if config.option.collectonly:
return
# Delayed: vram_utils requires pynvml, otherwise conftest fails to load # Delayed: vram_utils requires pynvml, otherwise conftest fails to load
# on CPU-only CI runners (e.g. ARM deploy tests) that lack nvidia-ml-py. # on CPU-only CI runners (e.g. ARM deploy tests) that lack nvidia-ml-py.
from tests.utils.pytest_parallel_gpu import _parse_cuda_visible from tests.utils.pytest_parallel_gpu import _parse_cuda_visible
...@@ -482,8 +484,9 @@ def pytest_collection_modifyitems(config, items): ...@@ -482,8 +484,9 @@ def pytest_collection_modifyitems(config, items):
# - Tests whose profiled VRAM exceeds the limit are removed # - Tests whose profiled VRAM exceeds the limit are removed
# - Tests WITHOUT a VRAM marker are also removed (unknown VRAM = unsafe) # - Tests WITHOUT a VRAM marker are also removed (unknown VRAM = unsafe)
# Using deselect (not skip) so they never reach the xdist scheduler. # Using deselect (not skip) so they never reach the xdist scheduler.
# Skip all VRAM logic during --collect-only (just listing tests).
vram_limit = config.getoption("--max-vram-gib", default=None) vram_limit = config.getoption("--max-vram-gib", default=None)
if vram_limit is not None: if vram_limit is not None and not config.option.collectonly:
keep = [] keep = []
deselected = [] deselected = []
for item in items: for item in items:
...@@ -497,7 +500,7 @@ def pytest_collection_modifyitems(config, items): ...@@ -497,7 +500,7 @@ def pytest_collection_modifyitems(config, items):
items[:] = keep items[:] = keep
# Write test metadata for the GPU orchestrator to read. # Write test metadata for the GPU orchestrator to read.
if vram_limit is not None: if vram_limit is not None and not config.option.collectonly:
# Delayed: see vram_utils pynvml note in pytest_configure # Delayed: see vram_utils pynvml note in pytest_configure
from tests.utils.vram_utils import print_gpu_plan, write_test_meta from tests.utils.vram_utils import print_gpu_plan, write_test_meta
......
...@@ -80,15 +80,20 @@ trtllm_configs = { ...@@ -80,15 +80,20 @@ trtllm_configs = {
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg_metrics.sh", script_name="agg_metrics.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1, # 1 GPU(s) used, peak 3.9 GiB
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.trtllm, pytest.mark.trtllm,
pytest.mark.profiled_vram_gib(3.9), # actual nvidia-smi peak 3.9 GiB
pytest.mark.requested_trtllm_kv_tokens(
2592
), # KV cache cap (2x safety over min=1296)
pytest.mark.timeout( pytest.mark.timeout(
300 300
), # 3x measured time (44.66s) + download time (150s) ), # 3x measured time (44.66s) + download time (150s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
delayed_start=5,
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(),
completion_payload_default(), completion_payload_default(),
...@@ -137,9 +142,19 @@ trtllm_configs = { ...@@ -137,9 +142,19 @@ trtllm_configs = {
name="aggregated_logprobs", name="aggregated_logprobs",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm], marks=[
pytest.mark.gpu_1, # 1 GPU(s) used, peak 3.8 GiB
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.profiled_vram_gib(3.8), # actual nvidia-smi peak 3.8 GiB
pytest.mark.requested_trtllm_kv_tokens(
2592
), # KV cache cap (2x safety over min=1296)
pytest.mark.timeout(300), # 3x measured time (~44s) + download time (150s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
delayed_start=5,
request_payloads=[ request_payloads=[
chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5), chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5), chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
...@@ -360,9 +375,17 @@ trtllm_configs = { ...@@ -360,9 +375,17 @@ trtllm_configs = {
"17", "17",
], ],
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1, # 1 GPU(s) used, peak 17.1 GiB
pytest.mark.trtllm, pytest.mark.trtllm,
pytest.mark.pre_merge, pytest.mark.pre_merge,
# Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
# doesn't apply. requested_trtllm_vram_gib maps to
# KvCacheConfig.max_gpu_total_bytes which has no effect on the
# diffusion engine itself, but the parallel scheduler requires one
# of the KV/VRAM markers to accept the test. We set it to the
# profiled peak so the scheduler's VRAM budget is accurate.
pytest.mark.profiled_vram_gib(17.1), # actual nvidia-smi peak 17.1 GiB
pytest.mark.requested_trtllm_vram_gib(17.1),
pytest.mark.timeout( pytest.mark.timeout(
600 600
), # Video generation is slow even at small resolution ), # Video generation is slow even at small resolution
...@@ -370,7 +393,7 @@ trtllm_configs = { ...@@ -370,7 +393,7 @@ trtllm_configs = {
model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers", model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
timeout=300, timeout=300,
delayed_start=60, # Model loading takes time delayed_start=5,
request_payloads=[ request_payloads=[
VideoGenerationPayload( VideoGenerationPayload(
body={ body={
......
...@@ -15,8 +15,9 @@ in-process instrumentation. Using NVML directly (the same C library that ...@@ -15,8 +15,9 @@ in-process instrumentation. Using NVML directly (the same C library that
and allows high-frequency sampling. and allows high-frequency sampling.
In **binary-search mode** (the default), the profiler bisects the KV cache In **binary-search mode** (the default), the profiler bisects the KV cache
allocation — ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` for vLLM (bytes) or allocation — ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` for vLLM (bytes),
``_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`` for SGLang (tokens). ``_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`` for SGLang (tokens), or
``_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`` for TensorRT-LLM (tokens).
If the test passes, the allocation is lowered; if it OOMs, it is raised — If the test passes, the allocation is lowered; if it OOMs, it is raised —
standard bisection to find the minimum the test needs. A safety factor standard bisection to find the minimum the test needs. A safety factor
is applied and the peak ``memory.used`` from the last passing run becomes is applied and the peak ``memory.used`` from the last passing run becomes
...@@ -24,7 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation. ...@@ -24,7 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation.
**IMPORTANT**: The test under profile **MUST** read the appropriate KV cache **IMPORTANT**: The test under profile **MUST** read the appropriate KV cache
override — either directly (see ``test_mock_gpu_alloc.py``) or via launch override — either directly (see ``test_mock_gpu_alloc.py``) or via launch
scripts that call ``build_gpu_mem_args`` (e.g. ``agg.sh``). If the test scripts that call ``build_gpu_mem_args`` (vLLM/SGLang) or
``build_trtllm_override_args_with_mem`` (TensorRT-LLM). If the test
ignores the override, every probe will pass at the same peak and the profiler ignores the override, every probe will pass at the same peak and the profiler
will warn that the binary search is unreliable. will warn that the binary search is unreliable.
...@@ -459,6 +461,7 @@ def _recommend_markers( ...@@ -459,6 +461,7 @@ def _recommend_markers(
model_name: str | None = None, model_name: str | None = None,
num_runs: int = 1, num_runs: int = 1,
requested_sglang_kv_tokens: int | None = None, requested_sglang_kv_tokens: int | None = None,
requested_trtllm_kv_tokens: int | None = None,
requested_vllm_kv_cache_bytes: int | None = None, requested_vllm_kv_cache_bytes: int | None = None,
min_kv_value: int | None = None, min_kv_value: int | None = None,
) -> tuple[list[MarkerRecommendation], list[str]]: ) -> tuple[list[MarkerRecommendation], list[str]]:
...@@ -559,6 +562,14 @@ def _recommend_markers( ...@@ -559,6 +562,14 @@ def _recommend_markers(
f"KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety{min_label})", f"KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety{min_label})",
) )
) )
if requested_trtllm_kv_tokens is not None:
min_label = f" over min={min_kv_value}" if min_kv_value is not None else ""
recs.append(
MarkerRecommendation(
f"requested_trtllm_kv_tokens({requested_trtllm_kv_tokens})",
f"KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety{min_label})",
)
)
if requested_vllm_kv_cache_bytes is not None: if requested_vllm_kv_cache_bytes is not None:
min_label = ( min_label = (
f" over min={min_kv_value:_}" if min_kv_value is not None else "" f" over min={min_kv_value:_}" if min_kv_value is not None else ""
...@@ -634,6 +645,7 @@ def _print_recommendations( ...@@ -634,6 +645,7 @@ def _print_recommendations(
_SGLANG_NODEID_MARKERS = ["test_sglang", "sglang"] _SGLANG_NODEID_MARKERS = ["test_sglang", "sglang"]
_TRTLLM_NODEID_MARKERS = ["test_trtllm", "trtllm"]
def _is_sglang_test(pytest_args: list[str]) -> bool: def _is_sglang_test(pytest_args: list[str]) -> bool:
...@@ -643,6 +655,13 @@ def _is_sglang_test(pytest_args: list[str]) -> bool: ...@@ -643,6 +655,13 @@ def _is_sglang_test(pytest_args: list[str]) -> bool:
) )
def _is_trtllm_test(pytest_args: list[str]) -> bool:
    """Report whether the pytest invocation targets a TensorRT-LLM test.

    Each argument is checked for any of the substrings listed in
    ``_TRTLLM_NODEID_MARKERS``; the first hit is enough to answer True.
    """
    for candidate in pytest_args:
        for needle in _TRTLLM_NODEID_MARKERS:
            if needle in candidate:
                return True
    return False
_OOM_PATTERNS = [ _OOM_PATTERNS = [
"OutOfMemoryError", "OutOfMemoryError",
"CUDA out of memory", "CUDA out of memory",
...@@ -673,6 +692,22 @@ def _extract_requested_sglang_kv_tokens(stdout: str) -> int | None: ...@@ -673,6 +692,22 @@ def _extract_requested_sglang_kv_tokens(stdout: str) -> int | None:
return None return None
_TRTLLM_MAX_TOKENS_RE = re.compile(
r"\[MemUsageChange\] Allocated .* for max tokens in paged KV cache \((\d+)\)"
)
def _extract_requested_trtllm_kv_tokens(stdout: str) -> int | None:
"""Extract max_tokens from TensorRT-LLM engine output.
TensorRT-LLM logs: "[MemUsageChange] Allocated 0.22 GiB for max tokens in paged KV cache (2048)."
"""
match = _TRTLLM_MAX_TOKENS_RE.search(stdout)
if match:
return int(match.group(1))
return None
_DEFAULT_PROBE_TIMEOUT = 300 # 5 minutes max per profile run _DEFAULT_PROBE_TIMEOUT = 300 # 5 minutes max per profile run
...@@ -765,15 +800,17 @@ def _find_min_vram( ...@@ -765,15 +800,17 @@ def _find_min_vram(
) -> int: ) -> int:
"""Binary search to find the minimum VRAM a test needs. """Binary search to find the minimum VRAM a test needs.
Three modes, two patterns: Three modes, three patterns:
KV bisection (deterministic, no profiling race): KV bisection (deterministic, no profiling race):
vLLM: bisects _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES (bytes) vLLM: bisects _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES (bytes)
SGLang: bisects _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS (tokens) SGLang: bisects _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS (tokens)
Both use the same _KV_SAFETY_FACTOR (2x) and the same bisect loop. TensorRT-LLM: bisects _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS (tokens)
All use the same _KV_SAFETY_FACTOR (2x) and the same bisect loop.
The only differences are env var name, units, display, and bounds. The only differences are env var name, units, display, and bounds.
""" """
is_sglang = _is_sglang_test(pytest_args) is_sglang = _is_sglang_test(pytest_args)
is_trtllm = _is_trtllm_test(pytest_args)
gpu_info = _query_gpu_stats() gpu_info = _query_gpu_stats()
if not gpu_info: if not gpu_info:
...@@ -792,11 +829,13 @@ def _find_min_vram( ...@@ -792,11 +829,13 @@ def _find_min_vram(
model_name = _extract_model_from_markers(pytest_args) model_name = _extract_model_from_markers(pytest_args)
if not is_sglang: if not is_sglang and not is_trtllm:
kv_bytes_mode = True kv_bytes_mode = True
if kv_bytes_mode: if kv_bytes_mode:
mode_label = "KV CACHE BYTES (vLLM, deterministic)" mode_label = "KV CACHE BYTES (vLLM, deterministic)"
elif is_trtllm:
mode_label = "KV TOKENS (TensorRT-LLM)"
else: else:
mode_label = "KV TOKENS (SGLang)" mode_label = "KV TOKENS (SGLang)"
print(f"\n--- FIND MINIMUM {mode_label} (binary search) ---") print(f"\n--- FIND MINIMUM {mode_label} (binary search) ---")
...@@ -909,20 +948,31 @@ def _find_min_vram( ...@@ -909,20 +948,31 @@ def _find_min_vram(
f"iter took {iter_elapsed:.0f}s" f"iter took {iter_elapsed:.0f}s"
) )
else: else:
max_tokens = _extract_requested_sglang_kv_tokens(stdout) if is_trtllm:
if max_tokens is None: max_tokens = _extract_requested_trtllm_kv_tokens(stdout)
print( if max_tokens is None:
" [ERROR] Could not extract max_total_tokens from SGLang output.\n" print(
" The launch script must log 'max_total_tokens=N' (SGLang does this by default)." " [ERROR] Could not extract max_tokens from TensorRT-LLM output.\n"
) " The launch script must log '[MemUsageChange] Allocated ... for max tokens in paged KV cache (N)'."
return 4 )
return 4
backend_label = "TensorRT-LLM"
else:
max_tokens = _extract_requested_sglang_kv_tokens(stdout)
if max_tokens is None:
print(
" [ERROR] Could not extract max_total_tokens from SGLang output.\n"
" The launch script must log 'max_total_tokens=N' (SGLang does this by default)."
)
return 4
backend_label = "SGLang"
page_size = 16 page_size = 16
lo = page_size lo = page_size
hi = max_tokens hi = max_tokens
tolerance = page_size * 2 tolerance = page_size * 2
print( print(
f" [PASS] peak {_format_mib(peak_mib)}, wall {wall:.0f}s, " f" [PASS] peak {_format_mib(peak_mib)}, wall {wall:.0f}s, "
f"max_total_tokens={max_tokens}, iter took {iter_elapsed:.0f}s" f"max_tokens={max_tokens} ({backend_label}), iter took {iter_elapsed:.0f}s"
) )
baseline_time = iter_elapsed baseline_time = iter_elapsed
...@@ -968,6 +1018,14 @@ def _find_min_vram( ...@@ -968,6 +1018,14 @@ def _find_min_vram(
"_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES": str(mid_int), "_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES": str(mid_int),
} }
probe_desc = f"kv_cache={mid_int // (1024**2)} MiB ({mid_int:,} bytes)" probe_desc = f"kv_cache={mid_int // (1024**2)} MiB ({mid_int:,} bytes)"
elif is_trtllm:
mid_int = ((int(lo) + int(hi)) // 2 // page_size) * page_size
mid_int = max(mid_int, page_size)
probe_env = {
**_gpu_env,
"_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS": str(mid_int),
}
probe_desc = f"tokens={mid_int}"
else: else:
mid_int = ((int(lo) + int(hi)) // 2 // page_size) * page_size mid_int = ((int(lo) + int(hi)) // 2 // page_size) * page_size
mid_int = max(mid_int, page_size) mid_int = max(mid_int, page_size)
...@@ -1083,7 +1141,7 @@ def _find_min_vram( ...@@ -1083,7 +1141,7 @@ def _find_min_vram(
# safe_kv_bytes which allocates more KV cache and thus more VRAM. # safe_kv_bytes which allocates more KV cache and thus more VRAM.
print(f" [final probe] Measuring VRAM at safe_kv_bytes={safe_kv_mib} MiB") print(f" [final probe] Measuring VRAM at safe_kv_bytes={safe_kv_mib} MiB")
sys.stdout.flush() sys.stdout.flush()
rc_final, wall_final, reports_final, samples_final, stdout_final = _run_once( rc_final, wall_final, reports_final, samples_final, _stdout_final = _run_once(
pytest_args, pytest_args,
interval=interval, interval=interval,
baseline_seconds=baseline_seconds, baseline_seconds=baseline_seconds,
...@@ -1141,14 +1199,24 @@ def _find_min_vram( ...@@ -1141,14 +1199,24 @@ def _find_min_vram(
# safe_tokens which allocates more KV cache and thus more VRAM. # safe_tokens which allocates more KV cache and thus more VRAM.
print(f" [final probe] Measuring VRAM at safe_tokens={safe_tokens}") print(f" [final probe] Measuring VRAM at safe_tokens={safe_tokens}")
sys.stdout.flush() sys.stdout.flush()
rc_final, wall_final, reports_final, samples_final, stdout_final = _run_once(
if is_trtllm:
env_var_name = "_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS"
marker_name = "requested_trtllm_kv_tokens"
backend_label = "TensorRT-LLM"
else:
env_var_name = "_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS"
marker_name = "requested_sglang_kv_tokens"
backend_label = "SGLang"
rc_final, wall_final, reports_final, samples_final, _stdout_final = _run_once(
pytest_args, pytest_args,
interval=interval, interval=interval,
baseline_seconds=baseline_seconds, baseline_seconds=baseline_seconds,
teardown_seconds=teardown_seconds, teardown_seconds=teardown_seconds,
extra_env={ extra_env={
**_gpu_env, **_gpu_env,
"_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS": str(safe_tokens), env_var_name: str(safe_tokens),
}, },
quiet=True, quiet=True,
run_label="final", run_label="final",
...@@ -1171,7 +1239,7 @@ def _find_min_vram( ...@@ -1171,7 +1239,7 @@ def _find_min_vram(
) )
print(f"\n{'=' * 72}") print(f"\n{'=' * 72}")
print("MINIMUM KV TOKENS RESULT") print(f"MINIMUM KV TOKENS RESULT ({backend_label})")
print(f"{'=' * 72}") print(f"{'=' * 72}")
print(f" Minimum tokens : {min_tokens} (raw bisection result)") print(f" Minimum tokens : {min_tokens} (raw bisection result)")
print(f" Recommended : {safe_tokens} ({_KV_SAFETY_FACTOR:.0f}x safety)") print(f" Recommended : {safe_tokens} ({_KV_SAFETY_FACTOR:.0f}x safety)")
...@@ -1180,12 +1248,13 @@ def _find_min_vram( ...@@ -1180,12 +1248,13 @@ def _find_min_vram(
) )
print(f" {test_short}: @pytest.mark.profiled_vram_gib({peak_gib})") print(f" {test_short}: @pytest.mark.profiled_vram_gib({peak_gib})")
print( print(
f" {test_short}: @pytest.mark.requested_sglang_kv_tokens({safe_tokens}), # KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety over min={min_tokens})" f" {test_short}: @pytest.mark.{marker_name}({safe_tokens}), # KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety over min={min_tokens})"
) )
print(f"{'=' * 72}") print(f"{'=' * 72}")
# Marker recommendations # Marker recommendations
requested_sglang_kv_tokens = safe_tokens if is_sglang else None requested_sglang_kv_tokens = safe_tokens if is_sglang else None
requested_trtllm_kv_tokens = safe_tokens if is_trtllm else None
requested_vllm_kv_cache_bytes = safe_kv_bytes if kv_bytes_mode else None requested_vllm_kv_cache_bytes = safe_kv_bytes if kv_bytes_mode else None
min_kv_value = int(last_pass_value) min_kv_value = int(last_pass_value)
if recommend: if recommend:
...@@ -1196,6 +1265,7 @@ def _find_min_vram( ...@@ -1196,6 +1265,7 @@ def _find_min_vram(
model_name, model_name,
num_runs=len(pass_wall_times), num_runs=len(pass_wall_times),
requested_sglang_kv_tokens=requested_sglang_kv_tokens, requested_sglang_kv_tokens=requested_sglang_kv_tokens,
requested_trtllm_kv_tokens=requested_trtllm_kv_tokens,
requested_vllm_kv_cache_bytes=requested_vllm_kv_cache_bytes, requested_vllm_kv_cache_bytes=requested_vllm_kv_cache_bytes,
min_kv_value=min_kv_value, min_kv_value=min_kv_value,
) )
...@@ -1326,6 +1396,7 @@ def main(argv: list[str] | None = None) -> int: ...@@ -1326,6 +1396,7 @@ def main(argv: list[str] | None = None) -> int:
model_name = _extract_model_from_markers(pytest_args) model_name = _extract_model_from_markers(pytest_args)
is_sglang = _is_sglang_test(pytest_args) is_sglang = _is_sglang_test(pytest_args)
is_trtllm = _is_trtllm_test(pytest_args)
rc, wall_secs, reports, samples, stdout = _run_once( rc, wall_secs, reports, samples, stdout = _run_once(
pytest_args, pytest_args,
...@@ -1333,20 +1404,24 @@ def main(argv: list[str] | None = None) -> int: ...@@ -1333,20 +1404,24 @@ def main(argv: list[str] | None = None) -> int:
baseline_seconds=args.baseline_seconds, baseline_seconds=args.baseline_seconds,
teardown_seconds=args.teardown_seconds, teardown_seconds=args.teardown_seconds,
extra_env=gpu_env, extra_env=gpu_env,
run_label="profile" if is_sglang else None, run_label="profile" if (is_sglang or is_trtllm) else None,
) )
_print_report(reports, rc, wall_secs, model_name=model_name) _print_report(reports, rc, wall_secs, model_name=model_name)
if not args.no_recommend and reports: if not args.no_recommend and reports:
requested_sglang_kv_tokens = None requested_sglang_kv_tokens = None
requested_trtllm_kv_tokens = None
if is_sglang: if is_sglang:
requested_sglang_kv_tokens = _extract_requested_sglang_kv_tokens(stdout) requested_sglang_kv_tokens = _extract_requested_sglang_kv_tokens(stdout)
if is_trtllm:
requested_trtllm_kv_tokens = _extract_requested_trtllm_kv_tokens(stdout)
recs, warnings = _recommend_markers( recs, warnings = _recommend_markers(
reports, reports,
wall_secs, wall_secs,
model_name=model_name, model_name=model_name,
requested_sglang_kv_tokens=requested_sglang_kv_tokens, requested_sglang_kv_tokens=requested_sglang_kv_tokens,
requested_trtllm_kv_tokens=requested_trtllm_kv_tokens,
) )
_print_recommendations(recs, warnings, pytest_args=pytest_args) _print_recommendations(recs, warnings, pytest_args=pytest_args)
......
...@@ -59,6 +59,8 @@ class _TestEntry: ...@@ -59,6 +59,8 @@ class _TestEntry:
timeout: float timeout: float
requested_vllm_kv_cache_bytes: int | None = None requested_vllm_kv_cache_bytes: int | None = None
requested_sglang_kv_tokens: int | None = None requested_sglang_kv_tokens: int | None = None
requested_trtllm_kv_tokens: int | None = None
requested_trtllm_vram_gib: float | None = None
skip_reason: str | None = None skip_reason: str | None = None
w_id: int = 0 w_id: int = 0
assigned_gpu: int | None = None assigned_gpu: int | None = None
...@@ -117,6 +119,10 @@ def _fmt_req(test: _TestEntry) -> str: ...@@ -117,6 +119,10 @@ def _fmt_req(test: _TestEntry) -> str:
"""Format the resource request value for display.""" """Format the resource request value for display."""
if test.requested_sglang_kv_tokens is not None: if test.requested_sglang_kv_tokens is not None:
return f"req_kv_tokens={int(test.requested_sglang_kv_tokens)}" return f"req_kv_tokens={int(test.requested_sglang_kv_tokens)}"
if test.requested_trtllm_kv_tokens is not None:
return f"req_kv_tokens={int(test.requested_trtllm_kv_tokens)}"
if test.requested_trtllm_vram_gib is not None:
return f"req_vram={test.requested_trtllm_vram_gib:.1f} GiB"
if test.requested_vllm_kv_cache_bytes is not None: if test.requested_vllm_kv_cache_bytes is not None:
gib = int(test.requested_vllm_kv_cache_bytes) / (1024**3) gib = int(test.requested_vllm_kv_cache_bytes) / (1024**3)
return f"req_kv={gib:.2f} GiB" return f"req_kv={gib:.2f} GiB"
...@@ -347,6 +353,8 @@ def run_parallel( ...@@ -347,6 +353,8 @@ def run_parallel(
requested_vllm_kv_cache_bytes=m.get("requested_vllm_kv_cache_bytes"), requested_vllm_kv_cache_bytes=m.get("requested_vllm_kv_cache_bytes"),
timeout=m.get("timeout", 600), timeout=m.get("timeout", 600),
requested_sglang_kv_tokens=m.get("requested_sglang_kv_tokens"), requested_sglang_kv_tokens=m.get("requested_sglang_kv_tokens"),
requested_trtllm_kv_tokens=m.get("requested_trtllm_kv_tokens"),
requested_trtllm_vram_gib=m.get("requested_trtllm_vram_gib"),
skip_reason=m.get("skip_reason"), skip_reason=m.get("skip_reason"),
) )
) )
...@@ -367,19 +375,19 @@ def run_parallel( ...@@ -367,19 +375,19 @@ def run_parallel(
for t in tests for t in tests
if t.requested_vllm_kv_cache_bytes is None if t.requested_vllm_kv_cache_bytes is None
and t.requested_sglang_kv_tokens is None and t.requested_sglang_kv_tokens is None
and t.requested_trtllm_kv_tokens is None
and t.requested_trtllm_vram_gib is None
and t.profiled_gib > 0 and t.profiled_gib > 0
] ]
if no_kv: if no_kv:
_print( _print(
f"\nERROR: {len(no_kv)} test(s) lack a requested_vllm_kv_cache_bytes " f"\nERROR: {len(no_kv)} test(s) lack a requested_vllm_kv_cache_bytes, "
f"or requested_sglang_kv_tokens marker and cannot run in parallel:" f"requested_sglang_kv_tokens, requested_trtllm_kv_tokens, "
f"or requested_trtllm_vram_gib marker and cannot run in parallel:"
) )
for t in no_kv: for t in no_kv:
_print(f" {t.name}") _print(f" {t.name}")
_print( _print("\nAdd the appropriate marker via profile_pytest.py, " "then rerun.")
"\nAdd the appropriate marker via profile_pytest.py --kv-bytes, "
"then rerun."
)
return 1 return 1
# Identify tests in metadata that exceed the VRAM budget # Identify tests in metadata that exceed the VRAM budget
...@@ -502,6 +510,13 @@ def run_parallel( ...@@ -502,6 +510,13 @@ def run_parallel(
env["_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS"] = str( env["_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS"] = str(
int(test.requested_sglang_kv_tokens) int(test.requested_sglang_kv_tokens)
) )
elif test.requested_trtllm_kv_tokens is not None:
env["_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS"] = str(
int(test.requested_trtllm_kv_tokens)
)
elif test.requested_trtllm_vram_gib is not None:
gib_to_bytes = int(test.requested_trtllm_vram_gib * 1024**3)
env["_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES"] = str(gib_to_bytes)
elif test.requested_vllm_kv_cache_bytes is not None: elif test.requested_vllm_kv_cache_bytes is not None:
env["_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"] = str( env["_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"] = str(
int(test.requested_vllm_kv_cache_bytes) int(test.requested_vllm_kv_cache_bytes)
...@@ -705,7 +720,8 @@ def run_parallel( ...@@ -705,7 +720,8 @@ def run_parallel(
gi = entry.assigned_gpu gi = entry.assigned_gpu
assert gi is not None assert gi is not None
is_vllm = ( is_vllm = (
entry.requested_sglang_kv_tokens is None and entry.profiled_gib > 0 entry.requested_vllm_kv_cache_bytes is not None
and entry.profiled_gib > 0
) )
# Per-GPU vLLM stagger — only between vLLM tests on the # Per-GPU vLLM stagger — only between vLLM tests on the
......
...@@ -109,6 +109,12 @@ def write_test_meta(items, dest_dir: str | None = None) -> None: ...@@ -109,6 +109,12 @@ def write_test_meta(items, dest_dir: str | None = None) -> None:
kv_tokens_mark = item.get_closest_marker("requested_sglang_kv_tokens") kv_tokens_mark = item.get_closest_marker("requested_sglang_kv_tokens")
if kv_tokens_mark and kv_tokens_mark.args: if kv_tokens_mark and kv_tokens_mark.args:
meta["requested_sglang_kv_tokens"] = kv_tokens_mark.args[0] meta["requested_sglang_kv_tokens"] = kv_tokens_mark.args[0]
trtllm_tokens_mark = item.get_closest_marker("requested_trtllm_kv_tokens")
if trtllm_tokens_mark and trtllm_tokens_mark.args:
meta["requested_trtllm_kv_tokens"] = trtllm_tokens_mark.args[0]
trtllm_vram_mark = item.get_closest_marker("requested_trtllm_vram_gib")
if trtllm_vram_mark and trtllm_vram_mark.args:
meta["requested_trtllm_vram_gib"] = trtllm_vram_mark.args[0]
skip_mark = item.get_closest_marker("skip") skip_mark = item.get_closest_marker("skip")
if skip_mark: if skip_mark:
reason = skip_mark.kwargs.get("reason", "") reason = skip_mark.kwargs.get("reason", "")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment