feat: add a few TRT-LLM example support for GPU-parallel test execution (#7880)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat: add a few TRT-LLM example support for GPU-parallel test execution (#7880)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
ad2205eb · Keiven C · GitHub · 2af062ec · ad2205eb · ad2205eb
Unverified Commit ad2205eb authored Apr 06, 2026 by Keiven C Committed by GitHub Apr 06, 2026
12 changed files
--- a/components/src/dynamo/trtllm/workers/llm_worker.py
+++ b/components/src/dynamo/trtllm/workers/llm_worker.py
@@ -109,6 +109,23 @@ def build_kv_connector_config(config: Config):
    return None


+def _warn_override_collisions(target: dict, source: dict, path: str = "") -> None:
+    """Log warnings for keys in *source* that will overwrite existing values in *target*."""
+    for key, new_val in source.items():
+        full_key = f"{path}.{key}" if path else key
+        if key in target:
+            old_val = target[key]
+            if isinstance(new_val, dict) and isinstance(old_val, dict):
+                _warn_override_collisions(old_val, new_val, full_key)
+            elif old_val != new_val:
+                logging.warning(
+                    "override_engine_args will replace %s: %r -> %r",
+                    full_key,
+                    old_val,
+                    new_val,
+                )
+
+
 async def init_llm_worker(
    runtime: DistributedRuntime,
    config: Config,
@@ -206,6 +223,7 @@ async def init_llm_worker(
            overrides = json.loads(config.override_engine_args)
            logging.info(f"Applying engine arg overrides: {overrides}")

+            _warn_override_collisions(arg_map, overrides)
            deep_update(arg_map, overrides)
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse override_engine_args as JSON: {e}")

--- a/examples/backends/trtllm/launch/agg.sh
+++ b/examples/backends/trtllm/launch/agg.sh
@@ -6,7 +6,8 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT

 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_trtllm_override_args_with_mem
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit

 # Environment variables with defaults
 export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
@@ -42,12 +43,22 @@ while [[ $# -gt 0 ]]; do
    esac
 done

-TRACE_ARGS=()
+TRTLLM_OVERRIDE_ARGS=()
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
-    TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
+    OTEL_JSON="{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\"}"
+    # Merge GPU mem config with OTEL config
+    OVERRIDE_JSON=$(build_trtllm_override_args_with_mem --merge-with-json "$OTEL_JSON")
+else
+    # Just GPU mem config (if any)
+    OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
+fi
+
+# Add --override-engine-args if we have JSON
+if [[ -n "$OVERRIDE_JSON" ]]; then
+    TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
 fi

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
@@ -66,7 +77,7 @@ python3 -m dynamo.trtllm \
  --served-model-name "$SERVED_MODEL_NAME" \
  --modality "$MODALITY" \
  --extra-engine-args "$AGG_ENGINE_ARGS" \
-  "${TRACE_ARGS[@]}" \
+  "${TRTLLM_OVERRIDE_ARGS[@]}" \
  "${EXTRA_ARGS[@]}" &

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest

--- a/examples/backends/trtllm/launch/agg_metrics.sh
+++ b/examples/backends/trtllm/launch/agg_metrics.sh
@@ -6,7 +6,8 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT

 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_trtllm_override_args_with_mem
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit

 # Environment variables with defaults
 export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
@@ -15,6 +16,15 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
 export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
 export MODALITY=${MODALITY:-"text"}

+# Build GPU memory JSON (returns bare JSON, no flag)
+OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
+
+# Add --override-engine-args if we have JSON
+TRTLLM_OVERRIDE_ARGS=()
+if [[ -n "$OVERRIDE_JSON" ]]; then
+    TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
+fi
+
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Aggregated Serving + Metrics" "$MODEL_PATH" "$HTTP_PORT"

@@ -29,7 +39,8 @@ python3 -m dynamo.trtllm \
  --served-model-name "$SERVED_MODEL_NAME" \
  --modality "$MODALITY" \
  --extra-engine-args "$AGG_ENGINE_ARGS" \
-  --publish-events-and-metrics &
+  --publish-events-and-metrics \
+  "${TRTLLM_OVERRIDE_ARGS[@]}" &

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
 wait_any_exit
--- a/examples/backends/trtllm/launch/agg_video_diffusion.sh
+++ b/examples/backends/trtllm/launch/agg_video_diffusion.sh
@@ -10,6 +10,7 @@ trap 'echo Cleaning up...; kill 0' EXIT

 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_trtllm_override_args_with_mem

 # Environment variables with defaults
 export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
@@ -36,6 +37,15 @@ while [[ $# -gt 0 ]]; do
    esac
 done

+# Build GPU memory JSON (returns bare JSON, no flag)
+OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
+
+# Add --override-engine-args if we have JSON
+TRTLLM_OVERRIDE_ARGS=()
+if [[ -n "$OVERRIDE_JSON" ]]; then
+    TRTLLM_OVERRIDE_ARGS=(--override-engine-args "$OVERRIDE_JSON")
+fi
+
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner --no-curl "Launching Video Diffusion Serving (1 GPU)" "$MODEL_PATH" "$HTTP_PORT" \
    "Media URL:   $MEDIA_OUTPUT_FS_URL"
@@ -61,6 +71,7 @@ python3 -m dynamo.trtllm \
  --served-model-name "$SERVED_MODEL_NAME" \
  --modality video_diffusion \
  --media-output-fs-url "$MEDIA_OUTPUT_FS_URL" \
+  "${TRTLLM_OVERRIDE_ARGS[@]}" \
  "${EXTRA_ARGS[@]}" &

 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest

--- a/examples/common/gpu_utils.sh
+++ b/examples/common/gpu_utils.sh
@@ -14,10 +14,17 @@
 #       Returns engine-specific CLI args for GPU memory control based on
 #       environment variable overrides. Empty if no overrides.
 #
+#       Supported engines: vllm, sglang
+#
 #       vLLM:   _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES      → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01
 #       SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N
 #
+#       Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging)
+#
+#       TODO: Split into build_vllm_gpu_mem_args and build_sglang_gpu_mem_args
+#
 # Usage:
+#   # vLLM / SGLang
 #   GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
 #   python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
 #
@@ -27,6 +34,12 @@ build_gpu_mem_args() {
    local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}"
    shift

+    # TensorRT-LLM uses build_trtllm_override_args_with_mem instead
+    if [[ "$engine" == "trtllm" ]]; then
+        echo "build_gpu_mem_args: TensorRT-LLM not supported. Use build_trtllm_override_args_with_mem instead." >&2
+        return 1
+    fi
+
    local workers_per_gpu=1
    while [[ $# -gt 0 ]]; do
        case "$1" in
@@ -59,6 +72,76 @@ build_gpu_mem_args() {
 }


+# ---------------------------------------------------------------------------
+# build_trtllm_override_args_with_mem [--merge-with-json JSON]
+#   TensorRT-LLM-specific: builds JSON for --override-engine-args with GPU memory config.
+#   Prints ONLY the bare JSON value on stdout (no --override-engine-args flag, no
+#   surrounding quotes). Prints nothing when there is no GPU override and no JSON
+#   to merge, so the engine keeps its defaults.
+#
+#   Separate function because TRT-LLM requires JSON merging for --override-engine-args
+#   (unlike vLLM/SGLang which use direct CLI flags).
+#
+#   Environment variables (non-negative integers; the token cap wins if both are set):
+#     _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS    → {"kv_cache_config": {"max_tokens": N}}
+#     _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES → {"kv_cache_config": {"max_gpu_total_bytes": N}}
+#
+#   If --merge-with-json is provided, the GPU config is placed in front of the
+#   members of the given JSON object. The merge is textual (no JSON parser), so:
+#     - the argument must be a single JSON object ("{...}"); surrounding
+#       whitespace is tolerated;
+#     - if it already contains "kv_cache_config" the output has that key twice;
+#       a warning is printed to stderr because a parser that keeps the last
+#       duplicate would drop the GPU memory cap.
+#   Without a GPU override the given JSON is printed unchanged.
+#
+#   Returns 1 (message on stderr, nothing on stdout) for an unknown option, a
+#   missing --merge-with-json value, a non-integer override value, or a merge
+#   argument that is not a JSON object.
+#
+# Usage:
+#   # TensorRT-LLM: simple case (no existing overrides)
+#   JSON=$(build_trtllm_override_args_with_mem)
+#   python -m dynamo.trtllm --model-path "$MODEL" ${JSON:+--override-engine-args "$JSON"} &
+#
+#   # TensorRT-LLM: merge with existing JSON
+#   EXISTING='{"return_perf_metrics": true}'
+#   JSON=$(build_trtllm_override_args_with_mem --merge-with-json "$EXISTING")
+#   python -m dynamo.trtllm --model-path "$MODEL" --override-engine-args "$JSON" &
+# ---------------------------------------------------------------------------
+build_trtllm_override_args_with_mem() {
+    local merge_json=""
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --merge-with-json)
+                # Check for the value explicitly: with only one argument left,
+                # `shift 2` fails without shifting and this loop would never end.
+                if [[ $# -lt 2 ]]; then
+                    echo "build_trtllm_override_args_with_mem: --merge-with-json requires a JSON argument" >&2
+                    return 1
+                fi
+                merge_json="$2"
+                shift 2
+                ;;
+            *) echo "build_trtllm_override_args_with_mem: unknown option '$1'" >&2; return 1 ;;
+        esac
+    done
+
+    local cap_key="" cap_value=""
+
+    # Token-based (preferred, simpler to reason about)
+    if [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS:-}" ]]; then
+        cap_key="max_tokens"
+        cap_value="${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS}"
+    # Byte-based (alternative, more precise)
+    elif [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES:-}" ]]; then
+        cap_key="max_gpu_total_bytes"
+        cap_value="${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES}"
+    fi
+
+    if [[ -z "$cap_key" ]]; then
+        # No GPU override: pass the existing JSON through untouched, or print
+        # nothing at all so the engine uses its default.
+        if [[ -n "$merge_json" ]]; then
+            printf '%s\n' "$merge_json"
+        fi
+        return 0
+    fi
+
+    # The value is spliced into JSON verbatim, so anything but digits would
+    # produce an invalid document.
+    if [[ ! "$cap_value" =~ ^[0-9]+$ ]]; then
+        echo "build_trtllm_override_args_with_mem: ${cap_key} override must be a non-negative integer, got '${cap_value}'" >&2
+        return 1
+    fi
+
+    local gpu_mem_json='"kv_cache_config": {"'"${cap_key}"'": '"${cap_value}"'}'
+
+    if [[ -z "$merge_json" ]]; then
+        # Just GPU mem config
+        printf '%s\n' "{${gpu_mem_json}}"
+        return 0
+    fi
+
+    # Trim surrounding whitespace so the brace stripping below sees the real
+    # first and last characters of the object.
+    local existing="${merge_json#"${merge_json%%[![:space:]]*}"}"
+    existing="${existing%"${existing##*[![:space:]]}"}"
+    if [[ "$existing" != "{"*"}" ]]; then
+        echo "build_trtllm_override_args_with_mem: --merge-with-json expects a JSON object, got '${merge_json}'" >&2
+        return 1
+    fi
+
+    # Strip the outer braces, leaving only the object's members
+    existing="${existing#\{}"
+    existing="${existing%\}}"
+
+    if [[ "$existing" == *'"kv_cache_config"'* ]]; then
+        echo "build_trtllm_override_args_with_mem: warning: merged JSON already mentions kv_cache_config; a duplicate key may override the GPU memory cap" >&2
+    fi
+
+    # Merge: GPU mem config first, then existing config
+    if [[ -n "${existing//[[:space:]]/}" ]]; then
+        printf '%s\n' "{${gpu_mem_json}, ${existing}}"
+    else
+        printf '%s\n' "{${gpu_mem_json}}"
+    fi
+}
+
+
 # ---------------------------------------------------------------------------
 # Self-test: bash gpu_utils.sh --self-test
 # ---------------------------------------------------------------------------
@@ -116,11 +199,56 @@ _gpu_utils_self_test() {
        build_gpu_mem_args sglang)
    _assert "sglang ignores kv bytes" "" "$result"

+    echo ""
+    echo "=== trtllm: token cap env ==="
+    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \
+        build_trtllm_override_args_with_mem)
+    _assert "trtllm token cap" '{"kv_cache_config": {"max_tokens": 4096}}' "$result"
+
+    echo ""
+    echo "=== trtllm: byte cap env ==="
+    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=1073741824 \
+        build_trtllm_override_args_with_mem)
+    _assert "trtllm byte cap" '{"kv_cache_config": {"max_gpu_total_bytes": 1073741824}}' "$result"
+
+    echo ""
+    echo "=== trtllm: no override = empty ==="
+    result=$(build_trtllm_override_args_with_mem)
+    _assert "empty (engine default)" "" "$result"
+
+    echo ""
+    echo "=== trtllm: token cap takes precedence over byte cap ==="
+    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=999999 \
+        build_trtllm_override_args_with_mem)
+    _assert "trtllm token precedence" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"
+
+    echo ""
+    echo "=== trtllm: merge with existing JSON ==="
+    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
+        build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}')
+    _assert "trtllm merged" '{"kv_cache_config": {"max_tokens": 2048}, "return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}' "$result"
+
+    echo ""
+    echo "=== trtllm: merge with empty JSON object ==="
+    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
+        build_trtllm_override_args_with_mem --merge-with-json '{}')
+    _assert "trtllm merge empty obj" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"
+
+    echo ""
+    echo "=== trtllm: no GPU override, but pass through existing JSON ==="
+    result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}')
+    _assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result"
+
    echo ""
    echo "=== missing engine ==="
    (build_gpu_mem_args 2>/dev/null)
    _assert "missing engine exits non-zero" "1" "$?"

+    echo ""
+    echo "=== trtllm rejected (use build_trtllm_override_args_with_mem) ==="
+    (build_gpu_mem_args trtllm 2>/dev/null)
+    _assert "trtllm rejected" "1" "$?"
+
    echo ""
    echo "=========================================="
    echo "Results: $pass passed, $fail failed"

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -234,10 +234,12 @@ markers = [
    "gpu_8: marks tests to run on 8GPUs",
    "xpu_1: marks tests to run on XPU",
    "xpu_2: marks tests to run on 2XPUs",
-    # These 3 (profiled_vram_gib and requested_*) are used for parallel pytest executions:
+    # These 5 (profiled_vram_gib and requested_*) are used for parallel pytest executions:
    "profiled_vram_gib(N): actual peak VRAM observed by nvidia-smi during profiling. Used for --max-vram-gib filtering and scheduler budget tracking",
    "requested_vllm_kv_cache_bytes(N): exact KV cache bytes for vLLM (skips memory profiling). Sets _PROFILE_PYTEST_KV_CACHE_BYTES. Most deterministic method for parallel execution",
    "requested_sglang_kv_tokens(N): max KV cache tokens for SGLang parallel execution. Sets _OVERRIDE_SGLANG_MAX_TOTAL_TOKENS to cap --max-total-tokens and prevent over-allocation",
+    "requested_trtllm_kv_tokens(N): max KV cache tokens for TensorRT-LLM parallel execution. Sets _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS to cap KvCacheConfig.max_tokens via --override-engine-args",
+    "requested_trtllm_vram_gib(N): max VRAM in GiB for TensorRT-LLM parallel execution. Sets _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES to cap KvCacheConfig.max_gpu_total_bytes via --override-engine-args. Use for non-text workloads (video/image diffusion)",
    "e2e: marks tests as end-to-end tests",
    "integration: marks tests as integration tests",
    "unit: marks tests as unit tests",

--- a/tests/README.md
+++ b/tests/README.md
@@ -117,6 +117,8 @@ Markers are required for all tests. They are used for test selection in CI and l
 | VRAM (profiled)         | profiled_vram_gib(N)                                                         | Actual peak VRAM observed by nvidia-smi during profiling (includes CUDA overhead). Used for `--max-vram-gib=N` filtering and GPU-parallel scheduler budget tracking. |
 | vLLM KV cache bytes     | requested_vllm_kv_cache_bytes(N)                                             | (vLLM only) Exact KV cache bytes. Sets `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` → `--kv-cache-memory-bytes`. Deterministic, parallel-safe. |
 | SGLang KV tokens        | requested_sglang_kv_tokens(N)                                                          | (SGLang only) Max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` → `--max-total-tokens`. Deterministic, parallel-safe. |
+| TRT-LLM KV tokens      | requested_trtllm_kv_tokens(N)                                                          | (TRT-LLM only) Max KV cache tokens. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args`. Deterministic, parallel-safe. |
+| TRT-LLM VRAM GiB       | requested_trtllm_vram_gib(N)                                                           | (TRT-LLM only) Max VRAM in GiB. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args`. For non-text workloads (video/image diffusion) where token-based control doesn't apply. |
 | Component/Framework     | vllm, trtllm, sglang, kvbm, kvbm_concurrency, planner, router   | Backend or component specificity   |
 | Infrastructure          | k8s, deploy, fault_tolerance                                     | Infrastructure/environment needs   |
 | Execution               | parallel                                                         | Test can run in parallel with pytest-xdist. Must use dynamic port allocation (`alloc_ports`) and not share resources (e.g. filesystem) |
@@ -147,6 +149,33 @@ def test_sglang_aggregated():
    ...
 ```

+### Example (TRT-LLM with token cap)
+```python
+@pytest.mark.pre_merge
+@pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.profiled_vram_gib(3.9)   # actual nvidia-smi peak at recommended token count
+@pytest.mark.requested_trtllm_kv_tokens(2592)   # KV cache cap (2x safety over min=1296)
+@pytest.mark.timeout(300)
+@pytest.mark.trtllm
+def test_trtllm_aggregated():
+    ...
+```
+
+### Example (TRT-LLM diffusion — no KV cache)
+```python
+@pytest.mark.pre_merge
+@pytest.mark.gpu_1
+@pytest.mark.trtllm
+# Diffusion models don't use KV cache, so requested_trtllm_kv_tokens doesn't apply
+# and requested_trtllm_vram_gib (KvCacheConfig.max_gpu_total_bytes) has no effect —
+# the VRAM is model weights + activations. Only profiled_vram_gib is meaningful.
+@pytest.mark.profiled_vram_gib(17.1)  # actual nvidia-smi peak
+@pytest.mark.timeout(600)
+def test_trtllm_video_diffusion():
+    ...
+```
+
 ### VRAM Markers and Filtering

 Markers differ by engine:
@@ -159,6 +188,12 @@ Markers differ by engine:
 - **`profiled_vram_gib(N)`** — actual peak from nvidia-smi at the recommended token count. Used for `--max-vram-gib` filtering and scheduler budget.
 - **`requested_sglang_kv_tokens(N)`** — max KV cache tokens. Sets `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` → `--max-total-tokens`. SGLang's default `--mem-fraction-static` is never overridden; the token cap is the sole allocation control. Deterministic and parallel-safe (see `examples/common/gpu_utils.md`).

+**TRT-LLM** uses token-based control (text models) or byte-based control (diffusion models):
+- **`profiled_vram_gib(N)`** — actual peak from nvidia-smi. Used for `--max-vram-gib` filtering and scheduler budget.
+- **`requested_trtllm_kv_tokens(N)`** — max KV cache tokens for text models. Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Deterministic and parallel-safe.
+- **`requested_trtllm_vram_gib(N)`** — max VRAM in GiB for non-text workloads (video/image diffusion). Sets `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES` → `KvCacheConfig.max_gpu_total_bytes` via `--override-engine-args` JSON. Note: diffusion models don't use KV cache, so this parameter may have no effect — `profiled_vram_gib` alone is sufficient for scheduler budget tracking.
+- TRT-LLM requires JSON merging for `--override-engine-args`, handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (separate from `build_gpu_mem_args` used by vLLM/SGLang).
+
 `--max-vram-gib=N` deselects tests whose `profiled_vram_gib` exceeds N. Tests without a VRAM marker are also deselected (unknown VRAM = unsafe for parallel). To add a test to the pool, profile it with `tests/utils/profile_pytest.py` (see [GPU VRAM Profiler](#gpu-vram-profiler-profile_pytestpy)).

 ### GPU-Parallel Execution
@@ -170,6 +205,7 @@ GPU tests run concurrently via a custom VRAM-aware scheduler (`tests/utils/pytes
 3. **Engine-specific allocation**: each test gets a constrained allocation so it uses only its budgeted share. xdist has no mechanism for this.
   - **vLLM**: `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES = N` → `--kv-cache-memory-bytes` (from `requested_vllm_kv_cache_bytes` marker). Byte-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
   - **SGLang**: `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS = N` → `--max-total-tokens` (from `requested_sglang_kv_tokens` marker). Token-based cap is deterministic and doesn't depend on current free memory, making it inherently parallel-safe.
+   - **TRT-LLM**: `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS = N` → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON (from `requested_trtllm_kv_tokens` marker). Token-based cap is deterministic and parallel-safe. Uses `build_trtllm_override_args_with_mem` (not `build_gpu_mem_args`) because TRT-LLM requires JSON merging for `--override-engine-args`.

 ```bash
 # Dry-run: preview which tests fit and the GPU plan
@@ -508,18 +544,26 @@ The profiler automatically detects the engine type and uses the appropriate bina

 - **vLLM**: bisects `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES` (bytes) → `--kv-cache-memory-bytes`. Finds the minimum KV cache bytes where the test passes, applies a 2x safety factor. Outputs `profiled_vram_gib` and `requested_vllm_kv_cache_bytes` markers.
 - **SGLang**: bisects `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS` (token count) → `--max-total-tokens`. Finds the minimum KV cache tokens where the test passes, applies a 2x safety factor, then runs a final probe at the safe token count to measure the actual VRAM. Outputs `profiled_vram_gib` and `requested_sglang_kv_tokens` markers.
+- **TRT-LLM**: bisects `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (token count) → `KvCacheConfig.max_tokens` via `--override-engine-args` JSON. Same logic as SGLang (token-based bisection, 2x safety). Outputs `profiled_vram_gib` and `requested_trtllm_kv_tokens` markers. For non-text models (video/image diffusion) that don't use KV cache, use `--no-find-min-vram` for a single-pass VRAM measurement — binary search won't work because the model doesn't log KV token allocation.

 **Requirement (vLLM):** The launch script must honor `_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--kv-cache-memory-bytes N`).

 **Requirement (SGLang):** The launch script must honor `_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`. This is handled by `build_gpu_mem_args` in `gpu_utils.sh` (returns `--max-total-tokens N`).

+**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_gpu_mem_args` because TRT-LLM requires JSON merging.
+
 ### Engine-specific mapping

-Launch scripts call `build_gpu_mem_args` (from `examples/common/gpu_utils.sh`) which checks env var overrides and returns the appropriate CLI flags:
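+Launch scripts call `build_gpu_mem_args` (vLLM/SGLang) or `build_trtllm_override_args_with_mem` (TRT-LLM) from `examples/common/gpu_utils.sh`. Both check env var overrides: `build_gpu_mem_args` returns the appropriate CLI flags, while `build_trtllm_override_args_with_mem` returns the bare JSON value to pass to `--override-engine-args`: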

 ```bash
+# vLLM / SGLang
 GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
 python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
+
+# TRT-LLM (requires JSON merging, separate function)
+OVERRIDE_JSON=$(build_trtllm_override_args_with_mem)
+python -m dynamo.trtllm --model-path "$MODEL" ${OVERRIDE_JSON:+--override-engine-args "$OVERRIDE_JSON"} &
 ```

 Env vars control engine allocation during profiling and parallel test execution:
@@ -536,7 +580,19 @@ Env vars control engine allocation during profiling and parallel test execution:
 |---------|----------------------------------|-------|
 | SGLang  | `--max-total-tokens N`           | Token-based KV cache cap |

-Both use absolute caps (bytes and tokens) — deterministic and independent of current free memory, which is critical for parallel test execution. See `examples/common/gpu_utils.md`.
+**`_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`** (integer) — TRT-LLM text models:
+
+| Engine  | Returned JSON                                          | Notes |
+|---------|--------------------------------------------------------|-------|
+| TRT-LLM | `{"kv_cache_config": {"max_tokens": N}}`              | Token-based KV cache cap via `--override-engine-args` |
+
+**`_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`** (integer) — TRT-LLM non-text models:
+
+| Engine  | Returned JSON                                                    | Notes |
+|---------|------------------------------------------------------------------|-------|
+| TRT-LLM | `{"kv_cache_config": {"max_gpu_total_bytes": N}}`               | Byte-based cap via `--override-engine-args`. For diffusion models. |
+
+All use absolute caps — deterministic and independent of current free memory, which is critical for parallel test execution. See `examples/common/gpu_utils.md`.

 ### Usage

@@ -550,6 +606,12 @@ python tests/utils/profile_pytest.py --gpu 1 tests/serve/test_vllm.py::test_serv
 # SGLang: binary search for minimum KV cache tokens (automatic)
 python tests/utils/profile_pytest.py tests/serve/test_sglang.py::test_sglang_deployment[aggregated-2] -xvs

+# TRT-LLM: binary search for minimum KV cache tokens (text models)
+python tests/utils/profile_pytest.py tests/serve/test_trtllm.py::test_deployment[aggregated-2] -xvs
+
+# TRT-LLM: single-pass for diffusion models (no KV cache, binary search won't work)
+python tests/utils/profile_pytest.py --no-find-min-vram tests/serve/test_trtllm.py::test_deployment[video_diffusion-2] -xvs
+
 # Single-pass profiling (no binary search, just measure one run using default RAM)
 python tests/utils/profile_pytest.py --no-find-min-vram tests/serve/test_vllm.py::test_serve_deployment[aggregated]
 ```
@@ -626,6 +688,36 @@ MINIMUM KV TOKENS RESULT
 ========================================================================
 ```

+### Example output (TRT-LLM — token-based bisection)
+
+```bash
+========================================================================
+FIND MINIMUM KV TOKENS (TensorRT-LLM) (binary search)
+========================================================================
+  GPU total : 48.0 GiB
+  GPU free  : 47.1 GiB  (in use: 0.9 GiB)
+  Test      : tests/serve/test_trtllm.py::test_deployment[aggregated-2] -xvs
+
+  [probe 1] Validation run (no token cap, default fraction)
+  [PASS] peak 41.3 GiB, wall 48s, max_tokens=41472 (TensorRT-LLM), iter took 56s
+  ...
+  [probe 6/12] tokens=1296
+  [PASS] tokens=1296, peak 3.7 GiB, wall 46s, iter took 54s
+  [EARLY STOP] Peak VRAM stable for last 3 probes
+  [final probe] Measuring VRAM at safe_tokens=2592
+  [PASS] tokens=2592, peak 3.9 GiB, wall 46s
+
+========================================================================
+MINIMUM KV TOKENS RESULT (TensorRT-LLM)
+========================================================================
+  Minimum tokens  : 1296 (raw bisection result)
+  Recommended     : 2592 (2x safety)
+  Peak VRAM       : 3.9 GiB (at 2592 tokens)
+  @pytest.mark.profiled_vram_gib(3.9)
+  @pytest.mark.requested_trtllm_kv_tokens(2592),  # KV cache cap (2x safety over min=1296)
+========================================================================
+```
+
 ### How to use the recommendations

 1. **Copy the `@pytest.mark.*` lines** into your test function or `pytestmark` list.

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -100,6 +100,8 @@ def pytest_configure(config: pytest.Config) -> None:
    vram_limit = config.getoption("max_vram_gib", default=None)
    if vram_limit is None:
        return
+    if config.option.collectonly:
+        return
    # Delayed: vram_utils requires pynvml, otherwise conftest fails to load
    # on CPU-only CI runners (e.g. ARM deploy tests) that lack nvidia-ml-py.
    from tests.utils.pytest_parallel_gpu import _parse_cuda_visible
@@ -482,8 +484,9 @@ def pytest_collection_modifyitems(config, items):
    #   - Tests whose profiled VRAM exceeds the limit are removed
    #   - Tests WITHOUT a VRAM marker are also removed (unknown VRAM = unsafe)
    # Using deselect (not skip) so they never reach the xdist scheduler.
+    # Skip all VRAM logic during --collect-only (just listing tests).
    vram_limit = config.getoption("--max-vram-gib", default=None)
-    if vram_limit is not None:
+    if vram_limit is not None and not config.option.collectonly:
        keep = []
        deselected = []
        for item in items:
@@ -497,7 +500,7 @@ def pytest_collection_modifyitems(config, items):
            items[:] = keep

    # Write test metadata for the GPU orchestrator to read.
-    if vram_limit is not None:
+    if vram_limit is not None and not config.option.collectonly:
        # Delayed: see vram_utils pynvml note in pytest_configure
        from tests.utils.vram_utils import print_gpu_plan, write_test_meta


--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -80,15 +80,20 @@ trtllm_configs = {
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
-            pytest.mark.gpu_1,
+            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
+            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
+            pytest.mark.requested_trtllm_kv_tokens(
+                2592
+            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
+        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
@@ -137,9 +142,19 @@ trtllm_configs = {
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
+        marks=[
+            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
+            pytest.mark.pre_merge,
+            pytest.mark.trtllm,
+            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
+            pytest.mark.requested_trtllm_kv_tokens(
+                2592
+            ),  # KV cache cap (2x safety over min=1296)
+            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
+        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
+        delayed_start=5,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
@@ -360,9 +375,17 @@ trtllm_configs = {
            "17",
        ],
        marks=[
-            pytest.mark.gpu_1,
+            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
+            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
+            # doesn't apply.  requested_trtllm_vram_gib maps to
+            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
+            # diffusion engine itself, but the parallel scheduler requires one
+            # of the KV/VRAM markers to accept the test.  We set it to the
+            # profiled peak so the scheduler's VRAM budget is accurate.
+            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
+            pytest.mark.requested_trtllm_vram_gib(17.1),
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
@@ -370,7 +393,7 @@ trtllm_configs = {
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
-        delayed_start=60,  # Model loading takes time
+        delayed_start=5,
        request_payloads=[
            VideoGenerationPayload(
                body={

--- a/tests/utils/profile_pytest.py
+++ b/tests/utils/profile_pytest.py
@@ -15,8 +15,9 @@ in-process instrumentation.  Using NVML directly (the same C library that
 and allows high-frequency sampling.

 In **binary-search mode** (the default), the profiler bisects the KV cache
-allocation — ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` for vLLM (bytes) or
-``_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`` for SGLang (tokens).
+allocation — ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` for vLLM (bytes),
+``_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS`` for SGLang (tokens), or
+``_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS`` for TensorRT-LLM (tokens).
 If the test passes, the allocation is lowered; if it OOMs, it is raised —
 standard bisection to find the minimum the test needs.  A safety factor
 is applied and the peak ``memory.used`` from the last passing run becomes
@@ -24,7 +25,8 @@ the ``@pytest.mark.profiled_vram_gib`` recommendation.

 **IMPORTANT**: The test under profile **MUST** read the appropriate KV cache
 override — either directly (see ``test_mock_gpu_alloc.py``) or via launch
-scripts that call ``build_gpu_mem_args`` (e.g. ``agg.sh``).  If the test
+scripts that call ``build_gpu_mem_args`` (vLLM/SGLang) or
+``build_trtllm_override_args_with_mem`` (TensorRT-LLM).  If the test
 ignores the override, every probe will pass at the same peak and the profiler
 will warn that the binary search is unreliable.

@@ -459,6 +461,7 @@ def _recommend_markers(
    model_name: str | None = None,
    num_runs: int = 1,
    requested_sglang_kv_tokens: int | None = None,
+    requested_trtllm_kv_tokens: int | None = None,
    requested_vllm_kv_cache_bytes: int | None = None,
    min_kv_value: int | None = None,
 ) -> tuple[list[MarkerRecommendation], list[str]]:
@@ -559,6 +562,14 @@ def _recommend_markers(
                    f"KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety{min_label})",
                )
            )
+        if requested_trtllm_kv_tokens is not None:
+            min_label = f" over min={min_kv_value}" if min_kv_value is not None else ""
+            recs.append(
+                MarkerRecommendation(
+                    f"requested_trtllm_kv_tokens({requested_trtllm_kv_tokens})",
+                    f"KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety{min_label})",
+                )
+            )
        if requested_vllm_kv_cache_bytes is not None:
            min_label = (
                f" over min={min_kv_value:_}" if min_kv_value is not None else ""
@@ -634,6 +645,7 @@ def _print_recommendations(


 _SGLANG_NODEID_MARKERS = ["test_sglang", "sglang"]
+_TRTLLM_NODEID_MARKERS = ["test_trtllm", "trtllm"]


 def _is_sglang_test(pytest_args: list[str]) -> bool:
@@ -643,6 +655,13 @@ def _is_sglang_test(pytest_args: list[str]) -> bool:
    )


+def _is_trtllm_test(pytest_args: list[str]) -> bool:
+    """Return True when any pytest arg contains a TensorRT-LLM node-ID substring."""
+    return any(
+        any(tag in arg for tag in _TRTLLM_NODEID_MARKERS) for arg in pytest_args
+    )
+
+
 _OOM_PATTERNS = [
    "OutOfMemoryError",
    "CUDA out of memory",
@@ -673,6 +692,22 @@ def _extract_requested_sglang_kv_tokens(stdout: str) -> int | None:
    return None


+_TRTLLM_MAX_TOKENS_RE = re.compile(
+    r"\[MemUsageChange\] Allocated .* for max tokens in paged KV cache \((\d+)\)"
+)
+
+
+def _extract_requested_trtllm_kv_tokens(stdout: str) -> int | None:
+    """Return the paged KV cache token count logged by TensorRT-LLM, if any.
+
+    Uses the first match of _TRTLLM_MAX_TOKENS_RE in *stdout*, e.g. the line
+    "[MemUsageChange] Allocated 0.22 GiB for max tokens in paged KV cache (2048)."
+    and returns its parenthesised integer; returns None when nothing matches.
+    """
+    found = _TRTLLM_MAX_TOKENS_RE.search(stdout)
+    return int(found.group(1)) if found else None
+
+
 _DEFAULT_PROBE_TIMEOUT = 300  # 5 minutes max per profile run


@@ -765,15 +800,17 @@ def _find_min_vram(
 ) -> int:
    """Binary search to find the minimum VRAM a test needs.

-    Three modes, two patterns:
+    Three modes, three patterns:

    KV bisection (deterministic, no profiling race):
      vLLM:         bisects _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES (bytes)
      SGLang:       bisects _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS (tokens)
-      Both use the same _KV_SAFETY_FACTOR (2x) and the same bisect loop.
+      TensorRT-LLM: bisects _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS (tokens)
+      All use the same _KV_SAFETY_FACTOR (2x) and the same bisect loop.
      The only differences are env var name, units, display, and bounds.
    """
    is_sglang = _is_sglang_test(pytest_args)
+    is_trtllm = _is_trtllm_test(pytest_args)

    gpu_info = _query_gpu_stats()
    if not gpu_info:
@@ -792,11 +829,13 @@ def _find_min_vram(

    model_name = _extract_model_from_markers(pytest_args)

-    if not is_sglang:
+    if not is_sglang and not is_trtllm:
        kv_bytes_mode = True

    if kv_bytes_mode:
        mode_label = "KV CACHE BYTES (vLLM, deterministic)"
+    elif is_trtllm:
+        mode_label = "KV TOKENS (TensorRT-LLM)"
    else:
        mode_label = "KV TOKENS (SGLang)"
    print(f"\n--- FIND MINIMUM {mode_label} (binary search) ---")
@@ -908,6 +947,16 @@ def _find_min_vram(
            f"  [PASS] peak {_format_mib(peak_mib)}, wall {wall:.0f}s, "
            f"iter took {iter_elapsed:.0f}s"
        )
+    else:
+        if is_trtllm:
+            max_tokens = _extract_requested_trtllm_kv_tokens(stdout)
+            if max_tokens is None:
+                print(
+                    "  [ERROR] Could not extract max_tokens from TensorRT-LLM output.\n"
+                    "  The launch script must log '[MemUsageChange] Allocated ... for max tokens in paged KV cache (N)'."
+                )
+                return 4
+            backend_label = "TensorRT-LLM"
        else:
            max_tokens = _extract_requested_sglang_kv_tokens(stdout)
            if max_tokens is None:
@@ -916,13 +965,14 @@ def _find_min_vram(
                    "  The launch script must log 'max_total_tokens=N' (SGLang does this by default)."
                )
                return 4
+            backend_label = "SGLang"
        page_size = 16
        lo = page_size
        hi = max_tokens
        tolerance = page_size * 2
        print(
            f"  [PASS] peak {_format_mib(peak_mib)}, wall {wall:.0f}s, "
-            f"max_total_tokens={max_tokens}, iter took {iter_elapsed:.0f}s"
+            f"max_tokens={max_tokens} ({backend_label}), iter took {iter_elapsed:.0f}s"
        )

    baseline_time = iter_elapsed
@@ -968,6 +1018,14 @@ def _find_min_vram(
                "_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES": str(mid_int),
            }
            probe_desc = f"kv_cache={mid_int // (1024**2)} MiB ({mid_int:,} bytes)"
+        elif is_trtllm:
+            mid_int = ((int(lo) + int(hi)) // 2 // page_size) * page_size
+            mid_int = max(mid_int, page_size)
+            probe_env = {
+                **_gpu_env,
+                "_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS": str(mid_int),
+            }
+            probe_desc = f"tokens={mid_int}"
        else:
            mid_int = ((int(lo) + int(hi)) // 2 // page_size) * page_size
            mid_int = max(mid_int, page_size)
@@ -1083,7 +1141,7 @@ def _find_min_vram(
        # safe_kv_bytes which allocates more KV cache and thus more VRAM.
        print(f"  [final probe] Measuring VRAM at safe_kv_bytes={safe_kv_mib} MiB")
        sys.stdout.flush()
-        rc_final, wall_final, reports_final, samples_final, stdout_final = _run_once(
+        rc_final, wall_final, reports_final, samples_final, _stdout_final = _run_once(
            pytest_args,
            interval=interval,
            baseline_seconds=baseline_seconds,
@@ -1141,14 +1199,24 @@ def _find_min_vram(
        # safe_tokens which allocates more KV cache and thus more VRAM.
        print(f"  [final probe] Measuring VRAM at safe_tokens={safe_tokens}")
        sys.stdout.flush()
-        rc_final, wall_final, reports_final, samples_final, stdout_final = _run_once(
+
+        if is_trtllm:
+            env_var_name = "_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS"
+            marker_name = "requested_trtllm_kv_tokens"
+            backend_label = "TensorRT-LLM"
+        else:
+            env_var_name = "_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS"
+            marker_name = "requested_sglang_kv_tokens"
+            backend_label = "SGLang"
+
+        rc_final, wall_final, reports_final, samples_final, _stdout_final = _run_once(
            pytest_args,
            interval=interval,
            baseline_seconds=baseline_seconds,
            teardown_seconds=teardown_seconds,
            extra_env={
                **_gpu_env,
-                "_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS": str(safe_tokens),
+                env_var_name: str(safe_tokens),
            },
            quiet=True,
            run_label="final",
@@ -1171,7 +1239,7 @@ def _find_min_vram(
            )

        print(f"\n{'=' * 72}")
-        print("MINIMUM KV TOKENS RESULT")
+        print(f"MINIMUM KV TOKENS RESULT ({backend_label})")
        print(f"{'=' * 72}")
        print(f"  Minimum tokens  : {min_tokens} (raw bisection result)")
        print(f"  Recommended     : {safe_tokens} ({_KV_SAFETY_FACTOR:.0f}x safety)")
@@ -1180,12 +1248,13 @@ def _find_min_vram(
        )
        print(f"  {test_short}: @pytest.mark.profiled_vram_gib({peak_gib})")
        print(
-            f"  {test_short}: @pytest.mark.requested_sglang_kv_tokens({safe_tokens}),  # KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety over min={min_tokens})"
+            f"  {test_short}: @pytest.mark.{marker_name}({safe_tokens}),  # KV cache cap ({_KV_SAFETY_FACTOR:.0f}x safety over min={min_tokens})"
        )
    print(f"{'=' * 72}")

    # Marker recommendations
    requested_sglang_kv_tokens = safe_tokens if is_sglang else None
+    requested_trtllm_kv_tokens = safe_tokens if is_trtllm else None
    requested_vllm_kv_cache_bytes = safe_kv_bytes if kv_bytes_mode else None
    min_kv_value = int(last_pass_value)
    if recommend:
@@ -1196,6 +1265,7 @@ def _find_min_vram(
            model_name,
            num_runs=len(pass_wall_times),
            requested_sglang_kv_tokens=requested_sglang_kv_tokens,
+            requested_trtllm_kv_tokens=requested_trtllm_kv_tokens,
            requested_vllm_kv_cache_bytes=requested_vllm_kv_cache_bytes,
            min_kv_value=min_kv_value,
        )
@@ -1326,6 +1396,7 @@ def main(argv: list[str] | None = None) -> int:

    model_name = _extract_model_from_markers(pytest_args)
    is_sglang = _is_sglang_test(pytest_args)
+    is_trtllm = _is_trtllm_test(pytest_args)

    rc, wall_secs, reports, samples, stdout = _run_once(
        pytest_args,
@@ -1333,20 +1404,24 @@ def main(argv: list[str] | None = None) -> int:
        baseline_seconds=args.baseline_seconds,
        teardown_seconds=args.teardown_seconds,
        extra_env=gpu_env,
-        run_label="profile" if is_sglang else None,
+        run_label="profile" if (is_sglang or is_trtllm) else None,
    )

    _print_report(reports, rc, wall_secs, model_name=model_name)

    if not args.no_recommend and reports:
        requested_sglang_kv_tokens = None
+        requested_trtllm_kv_tokens = None
        if is_sglang:
            requested_sglang_kv_tokens = _extract_requested_sglang_kv_tokens(stdout)
+        if is_trtllm:
+            requested_trtllm_kv_tokens = _extract_requested_trtllm_kv_tokens(stdout)
        recs, warnings = _recommend_markers(
            reports,
            wall_secs,
            model_name=model_name,
            requested_sglang_kv_tokens=requested_sglang_kv_tokens,
+            requested_trtllm_kv_tokens=requested_trtllm_kv_tokens,
        )
        _print_recommendations(recs, warnings, pytest_args=pytest_args)


--- a/tests/utils/pytest_parallel_gpu.py
+++ b/tests/utils/pytest_parallel_gpu.py
@@ -59,6 +59,8 @@ class _TestEntry:
    timeout: float
    requested_vllm_kv_cache_bytes: int | None = None
    requested_sglang_kv_tokens: int | None = None
+    requested_trtllm_kv_tokens: int | None = None
+    requested_trtllm_vram_gib: float | None = None
    skip_reason: str | None = None
    w_id: int = 0
    assigned_gpu: int | None = None
@@ -117,6 +119,10 @@ def _fmt_req(test: _TestEntry) -> str:
    """Format the resource request value for display."""
    if test.requested_sglang_kv_tokens is not None:
        return f"req_kv_tokens={int(test.requested_sglang_kv_tokens)}"
+    if test.requested_trtllm_kv_tokens is not None:
+        return f"req_kv_tokens={int(test.requested_trtllm_kv_tokens)}"
+    if test.requested_trtllm_vram_gib is not None:
+        return f"req_vram={test.requested_trtllm_vram_gib:.1f} GiB"
    if test.requested_vllm_kv_cache_bytes is not None:
        gib = int(test.requested_vllm_kv_cache_bytes) / (1024**3)
        return f"req_kv={gib:.2f} GiB"
@@ -347,6 +353,8 @@ def run_parallel(
                requested_vllm_kv_cache_bytes=m.get("requested_vllm_kv_cache_bytes"),
                timeout=m.get("timeout", 600),
                requested_sglang_kv_tokens=m.get("requested_sglang_kv_tokens"),
+                requested_trtllm_kv_tokens=m.get("requested_trtllm_kv_tokens"),
+                requested_trtllm_vram_gib=m.get("requested_trtllm_vram_gib"),
                skip_reason=m.get("skip_reason"),
            )
        )
@@ -367,19 +375,19 @@ def run_parallel(
        for t in tests
        if t.requested_vllm_kv_cache_bytes is None
        and t.requested_sglang_kv_tokens is None
+        and t.requested_trtllm_kv_tokens is None
+        and t.requested_trtllm_vram_gib is None
        and t.profiled_gib > 0
    ]
    if no_kv:
        _print(
-            f"\nERROR: {len(no_kv)} test(s) lack a requested_vllm_kv_cache_bytes "
-            f"or requested_sglang_kv_tokens marker and cannot run in parallel:"
+            f"\nERROR: {len(no_kv)} test(s) lack a requested_vllm_kv_cache_bytes, "
+            f"requested_sglang_kv_tokens, requested_trtllm_kv_tokens, "
+            f"or requested_trtllm_vram_gib marker and cannot run in parallel:"
        )
        for t in no_kv:
            _print(f"  {t.name}")
-        _print(
-            "\nAdd the appropriate marker via profile_pytest.py --kv-bytes, "
-            "then rerun."
-        )
+        _print("\nAdd the appropriate marker via profile_pytest.py, " "then rerun.")
        return 1

    # Identify tests in metadata that exceed the VRAM budget
@@ -502,6 +510,13 @@ def run_parallel(
            env["_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS"] = str(
                int(test.requested_sglang_kv_tokens)
            )
+        elif test.requested_trtllm_kv_tokens is not None:
+            env["_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS"] = str(
+                int(test.requested_trtllm_kv_tokens)
+            )
+        elif test.requested_trtllm_vram_gib is not None:
+            gib_to_bytes = int(test.requested_trtllm_vram_gib * 1024**3)
+            env["_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES"] = str(gib_to_bytes)
        elif test.requested_vllm_kv_cache_bytes is not None:
            env["_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"] = str(
                int(test.requested_vllm_kv_cache_bytes)
@@ -705,7 +720,8 @@ def run_parallel(
                gi = entry.assigned_gpu
                assert gi is not None
                is_vllm = (
-                    entry.requested_sglang_kv_tokens is None and entry.profiled_gib > 0
+                    entry.requested_vllm_kv_cache_bytes is not None
+                    and entry.profiled_gib > 0
                )

                # Per-GPU vLLM stagger — only between vLLM tests on the

--- a/tests/utils/vram_utils.py
+++ b/tests/utils/vram_utils.py
@@ -109,6 +109,12 @@ def write_test_meta(items, dest_dir: str | None = None) -> None:
        kv_tokens_mark = item.get_closest_marker("requested_sglang_kv_tokens")
        if kv_tokens_mark and kv_tokens_mark.args:
            meta["requested_sglang_kv_tokens"] = kv_tokens_mark.args[0]
+        trtllm_tokens_mark = item.get_closest_marker("requested_trtllm_kv_tokens")
+        if trtllm_tokens_mark and trtllm_tokens_mark.args:
+            meta["requested_trtllm_kv_tokens"] = trtllm_tokens_mark.args[0]
+        trtllm_vram_mark = item.get_closest_marker("requested_trtllm_vram_gib")
+        if trtllm_vram_mark and trtllm_vram_mark.args:
+            meta["requested_trtllm_vram_gib"] = trtllm_vram_mark.args[0]
        skip_mark = item.get_closest_marker("skip")
        if skip_mark:
            reason = skip_mark.kwargs.get("reason", "")