feat: GPU VRAM profiler + profiled test markers for 1 GPU SGLang (part 3) (#7508)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat: GPU VRAM profiler + profiled test markers for 1 GPU SGLang (part 3) (#7508)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
6ed8ba0a · Keiven C · GitHub · a04a9401 · 6ed8ba0a · 6ed8ba0a
Unverified Commit 6ed8ba0a authored Mar 20, 2026 by Keiven C Committed by GitHub Mar 20, 2026
10 changed files
--- a/examples/backends/sglang/launch/agg.sh
+++ b/examples/backends/sglang/launch/agg.sh
@@ -9,7 +9,8 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_gpu_mem_args
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
 # Default values
 MODEL="Qwen/Qwen3-0.6B"
@@ -54,6 +55,8 @@ if [ "$ENABLE_OTEL" = true ]; then
    TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
 fi
+GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT"
@@ -72,6 +75,7 @@ python3 -m dynamo.sglang \
  --trust-remote-code \
  --skip-tokenizer-init \
  --enable-metrics \
+  ${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
  "${TRACE_ARGS[@]}" \
  "${EXTRA_ARGS[@]}" &

--- a/examples/backends/sglang/launch/agg_embed.sh
+++ b/examples/backends/sglang/launch/agg_embed.sh
@@ -9,14 +9,24 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_gpu_mem_args
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
+# Default values
+MODEL="Qwen/Qwen3-Embedding-4B"
 # Parse command line arguments
+EXTRA_ARGS=()
 while [[ $# -gt 0 ]]; do
    case $1 in
+        --model-path)
+            MODEL="$2"
+            shift 2
+            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
+            echo "  --model-path <name>  Specify model (default: $MODEL)"
            echo "  -h, --help           Show this help message"
            echo ""
            echo "Note: System metrics are enabled by default on port 8081 (worker)"
@@ -24,14 +34,14 @@ while [[ $# -gt 0 ]]; do
            exit 0
            ;;
        *)
-            echo "Unknown option: $1"
+            EXTRA_ARGS+=("$1")
-            echo "Use --help for usage information"
+            shift
-            exit 1
            ;;
    esac
 done
-MODEL="Qwen/Qwen3-Embedding-4B"
+GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL" 2>/dev/null || true)
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT"
@@ -52,13 +62,15 @@ python3 -m dynamo.frontend &
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
 python3 -m dynamo.sglang \
  --embedding-worker \
-  --model-path Qwen/Qwen3-Embedding-4B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-Embedding-4B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
  --use-sglang-tokenizer \
-  --enable-metrics &
+  ${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
+  --enable-metrics \
+  "${EXTRA_ARGS[@]}" &
 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
 wait_any_exit
--- a/examples/backends/sglang/launch/agg_router.sh
+++ b/examples/backends/sglang/launch/agg_router.sh
@@ -9,7 +9,8 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_gpu_mem_args
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
 # Parse command line arguments
 ENABLE_OTEL=false
@@ -52,6 +53,9 @@ if [ "$ENABLE_OTEL" = true ]; then
 fi
 MODEL="Qwen/Qwen3-0.6B"
+GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
@@ -75,22 +79,24 @@ fi
 OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \
 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
+  ${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
  "${KV_EVENTS_ARGS_1[@]}" \
  --enable-metrics \
  "${TRACE_ARGS[@]}" &
 OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
+  ${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
  "${KV_EVENTS_ARGS_2[@]}" \
  --enable-metrics \
  "${TRACE_ARGS[@]}" &

--- a/examples/backends/sglang/launch/disagg.sh
+++ b/examples/backends/sglang/launch/disagg.sh
@@ -9,7 +9,8 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_gpu_mem_args
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
 # Parse command line arguments
 ENABLE_OTEL=false
@@ -46,6 +47,9 @@ if [ "$ENABLE_OTEL" = true ]; then
 fi
 MODEL="Qwen/Qwen3-0.6B"
+GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_PORT"
@@ -61,8 +65,8 @@ python3 -m dynamo.frontend &
 # harnesses can set one simple pair for disaggregated deployments.
 OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
@@ -71,14 +75,15 @@ python3 -m dynamo.sglang \
  --host 0.0.0.0 \
  --port 40000 \
  --disaggregation-transfer-backend nixl \
+  ${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
  --enable-metrics \
  "${TRACE_ARGS[@]}" &
 # run decode worker
 OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
@@ -86,6 +91,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
  --disaggregation-bootstrap-port 12345 \
  --host 0.0.0.0 \
  --disaggregation-transfer-backend nixl \
+  ${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
  --enable-metrics \
  "${TRACE_ARGS[@]}" &

--- a/examples/backends/sglang/launch/multimodal_disagg.sh
+++ b/examples/backends/sglang/launch/multimodal_disagg.sh
@@ -9,7 +9,8 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_gpu_mem_args
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
 # Default values
 MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
@@ -86,6 +87,14 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
 DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
 DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
+# Profiler override: rescale prefill/decode fractions so they sum to the override, keeping their ratio.
+# Encode worker has no --mem-fraction-static in single-gpu mode, so it's unaffected.
+if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" && "$SINGLE_GPU" == "true" ]]; then
+    _TOTAL_FRAC=$(awk -v p="$DYN_PREFILL_GPU_MEM" -v d="$DYN_DECODE_GPU_MEM" 'BEGIN { printf "%.4f", p + d }')
+    DYN_PREFILL_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v p="$DYN_PREFILL_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * p / t }')
+    DYN_DECODE_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v d="$DYN_DECODE_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * d / t }')
+fi
 ENCODE_EXTRA_ARGS=""
 PREFILL_EXTRA_ARGS=""
 DECODE_EXTRA_ARGS=""

--- a/examples/backends/sglang/launch/multimodal_epd.sh
+++ b/examples/backends/sglang/launch/multimodal_epd.sh
@@ -9,7 +9,8 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
-source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"   # build_gpu_mem_args
+source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
 # Default values
 MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
@@ -78,6 +79,14 @@ DYN_WORKER_GPU=${DYN_WORKER_GPU:-1}
 DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
 DYN_WORKER_GPU_MEM=${DYN_WORKER_GPU_MEM:-0.9}
+# Profiler override: split _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE between workers
+# preserving the ratio set by the env vars.
+if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" && "$SINGLE_GPU" == "true" ]]; then
+    _TOTAL_FRAC=$(awk -v e="$DYN_ENCODE_GPU_MEM" -v w="$DYN_WORKER_GPU_MEM" 'BEGIN { printf "%.4f", e + w }')
+    DYN_ENCODE_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v e="$DYN_ENCODE_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * e / t }')
+    DYN_WORKER_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v w="$DYN_WORKER_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * w / t }')
+fi
 ENCODE_EXTRA_ARGS=""
 WORKER_EXTRA_ARGS=""

--- a/examples/common/gpu_utils.md
+++ b/examples/common/gpu_utils.md
@@ -76,7 +76,7 @@ kv_cache_total = kv_cache_per_token * max_model_len * max_concurrent_seqs
 overhead ≈ engine-dependent (auto-computed by estimate_worker_vram):
           vllm:   1.2 + 1.0 * sqrt(params_b) GiB  (0.6B≈2.0, 8B≈4.0)
-           sglang: 2.5 + 1.5 * sqrt(params_b) GiB  (0.6B≈3.7, 8B≈6.7)
+           sglang: 1.5 + 1.0 * sqrt(params_b) GiB  (0.6B≈2.3, 8B≈4.3)
           trtllm: 2.0 + 1.2 * sqrt(params_b) GiB  (0.6B≈2.9, 8B≈5.4)
 ```
@@ -104,11 +104,27 @@ This is slightly different from vLLM (which includes activations in the budget).
 sglang recommends keeping 5-8 GiB free for activations and overhead. If you
 see OOM errors, decrease `--mem-fraction-static` by 0.01-0.05 increments.
-### How `--context-length` works
+### How `--context-length` and `--max-running-requests` work
-Equivalent to vLLM's `--max-model-len`. Defaults to the model's native context
+Unlike vLLM (where `--max-model-len` directly affects KV cache sizing), sglang's
-window. Reducing it shrinks the per-request KV cache requirement and allows more
+`--context-length` and `--max-running-requests` do **not** affect KV cache
-concurrent sequences.
+allocation. The KV cache pool is sized entirely from `--mem-fraction-static`:
+```
+kv_cache_pool = total_vram * mem_fraction_static - model_weights
+```
+Profiling confirmed this: changing `--context-length` from 512 to 40960 produced
+identical `max_total_num_tokens` values (269,136 on a 48 GiB GPU at fraction 0.95).
+These flags only affect **request scheduling**:
+- `--context-length` caps the per-request token usage from the KV pool
+- `--max-running-requests` limits concurrent request slots (allocated from
+  memory outside the `--mem-fraction-static` budget)
+Setting `--max-running-requests` too high at high fractions can cause OOM because
+the request slot pool competes for the small amount of memory left after KV cache
+allocation.
 ### Estimating total GPU usage
@@ -117,9 +133,9 @@ total_vram ≈ model_weights + kv_cache_pool + activations_and_overhead
 kv_cache_pool = total_vram * mem_fraction_static - model_weights
-activations_and_overhead ≈ 1-8 GiB (depends on model size, batch size, seq len;
+activations_and_overhead ≈ 1-2 GiB for small models (0.6B-4B)
-                           ~1-2 GiB for small models like 0.6B,
+                           ~3-5 GiB for larger models (7B+)
-                           ~5-8 GiB for larger models like 8B+ with CUDA graphs)
+  (CUDA context, graphs, request pools — allocated outside mem_fraction_static)
 ```
 ---

--- a/examples/common/gpu_utils.sh
+++ b/examples/common/gpu_utils.sh
@@ -182,6 +182,11 @@ get_model_params() {
        # MHA (not GQA): num_key_value_heads == num_attention_heads == 32
        deepseek-ai/deepseek-llm-7b-base)
            pb=6.9;  wb=2; layers=30; kvh=32; hd=128 ;;
+        # https://huggingface.co/Qwen/Qwen3-Embedding-4B/raw/main/config.json
+        # params_b from model.safetensors.index.json metadata.total_size / 2 / 1e9
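+        # head_dim = hidden_size(2560) / num_attention_heads(32) = 80 — NOTE(review): derived value; if config.json sets an explicit "head_dim" (Qwen3 configs commonly do, e.g. 128), use that instead — verify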
+        Qwen/Qwen3-Embedding-4B)
+            pb=4.0;  wb=2; layers=36; kvh=8;  hd=80 ;;
        # https://huggingface.co/llava-hf/llava-1.5-7b-hf/raw/main/config.json  (text_config)
        # MHA: num_key_value_heads == num_attention_heads == 32
        llava-hf/llava-1.5-7b-hf)
@@ -216,9 +221,13 @@ get_model_params() {
 #
 # Per-engine constants (calibrated from measurements on RTX 6000 Ada 48 GiB):
 #   vllm:   base=1.2, scale=1.0  → 0.6B≈2.0, 8B≈4.0, 30B≈6.7
-#   sglang: base=2.5, scale=1.5  → 0.6B≈3.7, 8B≈6.7, 30B≈10.8
+#   sglang: base=1.5, scale=1.0  → 0.6B≈2.3, 8B≈4.3, 30B≈7.0
 #   trtllm: base=2.0, scale=1.2  → 0.6B≈2.9, 8B≈5.4, 30B≈8.6
 #
+# sglang overhead was re-calibrated via profile_pytest.py bisection on
+# RTX 6000 Ada 48 GiB. Observed CUDA overhead (outside --mem-fraction-static):
+#   Qwen3-0.6B: ~1.8 GiB. Previous coefficients (2.5, 1.5) over-estimated by ~2x.
+#
 # If the 4th argument is a number, it's used directly (backward compatible).
 # If omitted, defaults to 2.0 (backward compatible).
 #
@@ -241,7 +250,7 @@ estimate_worker_vram() {
    local overhead
    case "$engine_or_overhead" in
        vllm)   overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 1.2 + 1.0 * sqrt(p) }') ;;
-        sglang) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 2.5 + 1.5 * sqrt(p) }') ;;
+        sglang) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 1.5 + 1.0 * sqrt(p) }') ;;
        trtllm) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 2.0 + 1.2 * sqrt(p) }') ;;
        *)      overhead="$engine_or_overhead" ;;
    esac

--- a/tests/README.md
+++ b/tests/README.md
@@ -456,11 +456,13 @@ The profiler sets the `_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE` environment variable
 | Engine  | CLI flag                         | Launch script support |
 |---------|----------------------------------|-----------------------|
-| vLLM    | `--gpu-memory-utilization`       | Implemented in `agg.sh`, `disagg.sh`, etc. |
+| vLLM    | `--gpu-memory-utilization`       | Implemented in `agg.sh`, `disagg.sh`, etc. via `build_gpu_mem_args` |
-| SGLang  | `--mem-fraction-static`          | Not yet implemented (TODO) |
+| SGLang  | `--mem-fraction-static`          | Implemented in `agg.sh`, `agg_embed.sh`, `disagg.sh`, `agg_router.sh`, `disagg_same_gpu.sh` via `build_gpu_mem_args`. Multimodal scripts (`multimodal_epd.sh`, `multimodal_disagg.sh`) split the override proportionally between workers. |
 | TRT-LLM | `--free-gpu-memory-fraction`    | Not yet implemented (has its own `DYN_TRTLLM_FREE_GPU_MEMORY_FRACTION`, TODO: unify) |
-Scripts that already hard-code their own memory fraction (e.g. `agg_multimodal.sh` with 0.85) have a TODO to honor `_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE` in the future. If the profiler detects constant VRAM across all probes (meaning the env var is ignored), it prints a warning and skips marker recommendations.
+**Note on sglang:** Unlike vLLM (where `--max-model-len` affects KV cache sizing), sglang's `--mem-fraction-static` is the sole knob for KV cache allocation. `--context-length` and `--max-running-requests` only affect request scheduling, not memory allocation. See `examples/common/gpu_utils.md` for details.
+If the profiler detects constant VRAM across all probes (meaning the env var is ignored), it prints a warning and skips marker recommendations.
 ### Usage

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -45,7 +45,10 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
 # SGLang test configurations
 # NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
-# TODO: Parallelize these tests to reduce total execution time
+# TODO: Now that these tests use dynamic ports and each config has a max_vram_gib marker,
+# optimize the runtime by bin-packing multiple engine deployments in parallel on the same GPU.
+# A future collector/launcher can sum max_vram_gib values to decide how many tests fit
+# concurrently without exceeding available VRAM.
 sglang_configs = {
    "aggregated": SGLangConfig(
        # Uses backend agg.sh (with metrics enabled) for testing standard
@@ -55,8 +58,9 @@ sglang_configs = {
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
+            pytest.mark.max_vram_gib(6.1),  # observed peak 5.6 GiB (+10% safety)
+            pytest.mark.timeout(240),  # profiled 34.4s on A6000
            pytest.mark.pre_merge,
-            pytest.mark.timeout(240),  # 3x measured time (39s) + download time (120s)
        ],
        model="Qwen/Qwen3-0.6B",
        env={},
@@ -76,7 +80,7 @@ sglang_configs = {
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
-        ],
+        ],  # TODO(gpu_2): profile max_vram, timeout, add markers (separate PR)
        model="Qwen/Qwen3-0.6B",
        env={},
        frontend_port=DefaultPort.FRONTEND.value,
@@ -96,8 +100,10 @@ sglang_configs = {
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.skip(reason="unstable"),
+            # TODO: profile to get max_vram and timeout (currently skipped)
        ],
        model="Qwen/Qwen3-0.6B",
+        delayed_start=30,
        env={},
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
@@ -126,7 +132,7 @@ sglang_configs = {
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
-        ],
+        ],  # TODO(gpu_2): profile max_vram, timeout, add markers (separate PR)
        model="Qwen/Qwen3-0.6B",
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
@@ -154,9 +160,9 @@ sglang_configs = {
        script_name="template_verifier.sh",
        marks=[
            pytest.mark.gpu_1,
+            pytest.mark.timeout(240),  # profiled 11.7s on A6000 (no GPU model load)
            pytest.mark.pre_merge,
            pytest.mark.nightly,
-            pytest.mark.timeout(240),  # 3x measured time (20s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        env={},
@@ -167,13 +173,21 @@ sglang_configs = {
            )
        ],
    ),
-    # NOTE: Pack all workers on 1 GPU for lower CI resource requirements
+    # NOTE: Pack all workers on 1 GPU for lower CI resource requirements.
+    # NOTE: multimodal_epd.sh uses explicit --mem-fraction-static via DYN_ENCODE_GPU_MEM
+    # / DYN_WORKER_GPU_MEM env vars; with --single-gpu, _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE rescales both.
+    # TODO(review): an earlier note said the workers combined use ~23.6 GiB regardless of overrides; re-check vs. the 12.1 GiB peak below.
    "multimodal_e_pd_qwen": SGLangConfig(
        # E/P/D architecture: Encode, Prefill, Decode workers all on GPU 0
        name="multimodal_e_pd_qwen",
        directory=sglang_dir,
        script_name="multimodal_epd.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.max_vram_gib(13.3),  # observed peak 12.1 GiB (+10% safety)
+            pytest.mark.timeout(360),  # profiled 31.0s on A6000
+            pytest.mark.pre_merge,
+        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
        timeout=360,
@@ -212,8 +226,9 @@ sglang_configs = {
        script_name="multimodal_disagg.sh",
        marks=[
            pytest.mark.gpu_1,
+            pytest.mark.max_vram_gib(17.7),  # observed peak 16.1 GiB (+10% safety)
+            pytest.mark.timeout(360),  # profiled 36.0s on A6000
            pytest.mark.pre_merge,
-            pytest.mark.timeout(360),
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
@@ -246,9 +261,10 @@ sglang_configs = {
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
+            pytest.mark.max_vram_gib(21.0),  # observed peak 19.1 GiB (+10% safety)
+            pytest.mark.timeout(300),  # profiled 41.3s on A6000
            pytest.mark.pre_merge,
            pytest.mark.nightly,
-            pytest.mark.timeout(300),
        ],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=[
@@ -284,9 +300,10 @@ sglang_configs = {
        script_name="agg_embed.sh",
        marks=[
            pytest.mark.gpu_1,
+            pytest.mark.max_vram_gib(12.1),  # observed peak 11.0 GiB (+10% safety)
+            pytest.mark.timeout(270),  # profiled 25.5s on A6000
            pytest.mark.pre_merge,
            pytest.mark.nightly,
-            pytest.mark.timeout(270),  # 3x measured time (29s) + download time (180s)
        ],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
@@ -321,10 +338,9 @@ sglang_configs = {
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
+            pytest.mark.max_vram_gib(16.2),  # observed peak 14.8 GiB (+10% safety)
+            pytest.mark.timeout(420),  # profiled 73s on A6000
            pytest.mark.post_merge,
-            pytest.mark.timeout(
-                420
-            ),  # Total test timeout: 2x measured average (79.36s) + download time (240s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
@@ -346,6 +362,7 @@ sglang_configs = {
            pytest.mark.post_merge,
            pytest.mark.timeout(240),
            pytest.mark.skip(reason="DYN-2261"),
+            # TODO: profile to get max_vram (currently skipped)
        ],
        model="Qwen/Qwen3-0.6B",
        env={"DYN_ENABLE_ANTHROPIC_API": "1"},