fix: disagg_same_gpu - profiling markers, GPU pinning, and memory args (#7996)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

fix: disagg_same_gpu - profiling markers, GPU pinning, and memory args (#7996)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
e3e728a8 · Keiven C · GitHub · 47c4bd46 · e3e728a8 · e3e728a8
Unverified Commit e3e728a8 authored Apr 13, 2026 by Keiven C Committed by GitHub Apr 13, 2026
12 changed files
--- a/components/src/dynamo/sglang/init_llm.py
+++ b/components/src/dynamo/sglang/init_llm.py
@@ -166,8 +166,11 @@ async def init_prefill(
    # Use pre-created engine if provided (snapshot mode)
    if snapshot_engine is not None:
        engine = snapshot_engine
+        load_time = 0.0
    else:
+        start_time = time.time()
        engine = sgl.Engine(server_args=server_args)
+        load_time = time.time() - start_time

    generate_endpoint = runtime.endpoint(
        f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
@@ -179,6 +182,8 @@ async def init_prefill(
        engine, config, generate_endpoint
    )

+    publisher.component_gauges.set_model_load_time(load_time)
+
    if server_args.node_rank >= 1:
        await handle_non_leader_node(engine, publisher, metrics_task)
        return

--- a/examples/backends/sglang/launch/disagg_same_gpu.sh
+++ b/examples/backends/sglang/launch/disagg_same_gpu.sh
@@ -24,8 +24,13 @@ MODEL="Qwen/Qwen3-0.6B"
 # ---- Tunable (override via env vars) ----
 CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
 MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-2}"
+MAX_TOTAL_TOKENS="${MAX_TOTAL_TOKENS:-25000}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

 GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
+if [[ -z "$GPU_MEM_ARGS" ]]; then
+    GPU_MEM_ARGS="--max-total-tokens $MAX_TOTAL_TOKENS"
+fi

 source "$SCRIPT_DIR/../../../common/launch_utils.sh"

@@ -35,14 +40,15 @@ HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Disaggregated (same GPU)" "$MODEL" "$HTTP_PORT" \
    "Workers:     2 (prefill + decode, fraction is per worker)"

-# run ingress with KV router mode for disaggregated setup
+# run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
-python3 -m dynamo.frontend --router-mode kv &
+python3 -m dynamo.frontend &

 # NOTE: Each worker picks a random NCCL port (get_free_port) for torch.distributed.
 # This has a TOCTOU race — the port can be grabbed before init_process_group binds it,
 # causing sporadic EADDRINUSE.  Pass --nccl-port <unique_port> per worker to avoid this.
 # run prefill worker with metrics on port 8081
+CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 python3 -m dynamo.sglang \
  --model-path "$MODEL" \
@@ -63,16 +69,15 @@ python3 -m dynamo.sglang \
  --max-running-requests "$MAX_RUNNING_REQUESTS" \
  --enable-metrics &

-# Wait for prefill worker to initialize before starting decode worker
-# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
-# The prefill worker needs time to:
-# 1. Load model weights and allocate its memory fraction
-# 2. Initialize KV cache with --delete-ckpt-after-loading to free checkpoint memory
-# 3. Register with NATS service discovery so decode worker can find it
-echo "Waiting for prefill worker to initialize..."
-sleep 5
+# Wait for prefill worker to initialize before starting decode worker.
+# Both workers share one GPU with --delete-ckpt-after-loading; without this
+# wait they compete for GPU memory during model loading and the scheduler OOMs.
+# || true: don't let set -e kill the script on timeout (wait_for_ready returns 1).
+PREFILL_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
+wait_for_ready "http://localhost:${PREFILL_SYSTEM_PORT}/health" 45 || true

 # run decode worker with metrics on port 8082
+CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 python3 -m dynamo.sglang \
  --model-path "$MODEL" \

--- a/examples/backends/trtllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh
@@ -3,20 +3,14 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # Disaggregated prefill/decode on a SINGLE GPU.
-# Per-worker VRAM is controlled via env vars (MAX_SEQ_LEN, MAX_CONCURRENT_SEQS).
-# TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
-# merging is supported.
+# Per-worker VRAM is controlled via absolute KV token caps (not fractions).
+# Profiler overrides (_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS) are handled via
+# build_trtllm_override_args_with_mem; standalone runs use MAX_TOTAL_TOKENS.
 #
-# NOTE — trtllm fraction semantics differ from vllm/sglang:
-#   vllm/sglang:  fraction of TOTAL VRAM  (weights + KV + activations all inside)
-#   trtllm:       fraction of FREE  VRAM  (KV cache only, after model load)
-# build_vllm_gpu_mem_args / build_sglang_gpu_mem_args handle this — see gpu_utils.sh / gpu_utils.md.
-#
-# Measured reference (Qwen/Qwen3-0.6B, --max-seq-len 4096, RTX 6000 Ada 48 GiB):
-#   estimate (from gpu_utils.sh) : ~8.0 GiB per worker (~16.0 GiB total)
-#   actual (nvidia-smi)          : ~7.4 GiB per worker (~14.8 GiB total)
-#   fraction per worker (free)   : 0.05
-#   Overestimating is intentional -- better to pad than OOM.
+# Measured reference (Qwen/Qwen3-0.6B, RTX 6000 Ada 48 GiB):
+#   peak VRAM (nvidia-smi)     : ~6.6 GiB total (both workers)
+#   default MAX_TOTAL_TOKENS   : 25000 per worker
+#   min tokens (profiled)      : 256 per worker

 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
@@ -29,10 +23,7 @@ MODEL="Qwen/Qwen3-0.6B"
 # ---- Tunable (override via env vars) ----
 MAX_SEQ_LEN="${MAX_SEQ_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
-
-# TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
-# merging is supported.
-GPU_MEM_FRACTION="${GPU_MEM_FRACTION:-}"
+MAX_TOTAL_TOKENS="${MAX_TOTAL_TOKENS:-25000}"

 # Environment variables with defaults
 export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
@@ -67,20 +58,35 @@ while [[ $# -gt 0 ]]; do
 done

 # Build --override-engine-args JSON.
-# Always override free_gpu_memory_fraction so the script controls KV cache size,
-# matching how vllm (--gpu-memory-utilization) and sglang (--mem-fraction-static)
-# pass memory parameters from the launch script.
-OVERRIDE_PAIRS=""
-if [[ -n "$GPU_MEM_FRACTION" ]]; then
-    OVERRIDE_PAIRS="\"kv_cache_config\": {\"free_gpu_memory_fraction\": ${GPU_MEM_FRACTION}}"
-fi
+#
+# KV cache control (always absolute caps, never fractions):
+#   1. Profiler env var (_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS or
+#      _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES) via build_trtllm_override_args_with_mem.
+#   2. MAX_TOTAL_TOKENS env var (default 25000) for standalone runs.
+
+# Collect non-memory override pairs (otel, etc.)
+NON_MEM_PAIRS=""
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
-    OVERRIDE_PAIRS="${OVERRIDE_PAIRS}, \"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\""
+    NON_MEM_PAIRS="\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\""
+fi
+
+if [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS:-}" ]] || [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES:-}" ]]; then
+    # Profiler provides absolute cap
+    BASE_JSON=""
+    [[ -n "$NON_MEM_PAIRS" ]] && BASE_JSON="{${NON_MEM_PAIRS}}"
+    FINAL_JSON=$(build_trtllm_override_args_with_mem ${BASE_JSON:+--merge-with-json "$BASE_JSON"})
+    OVERRIDE_ARGS=(--override-engine-args "$FINAL_JSON")
+else
+    # No profiler — use absolute token cap from MAX_TOTAL_TOKENS
+    OVERRIDE_PAIRS="\"kv_cache_config\": {\"max_tokens\": ${MAX_TOTAL_TOKENS}}"
+    if [[ -n "$NON_MEM_PAIRS" ]]; then
+        OVERRIDE_PAIRS="${OVERRIDE_PAIRS}, $NON_MEM_PAIRS"
+    fi
+    OVERRIDE_ARGS=(--override-engine-args "{${OVERRIDE_PAIRS}}")
 fi
-OVERRIDE_ARGS=(--override-engine-args "{${OVERRIDE_PAIRS}}")

 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner "Launching Disaggregated on Same GPU (1 GPU)" "$MODEL" "$HTTP_PORT" \
@@ -104,6 +110,13 @@ python3 -m dynamo.trtllm \
  --disaggregation-mode prefill \
  "${OVERRIDE_ARGS[@]}" &

+# Wait for prefill worker to load model and allocate KV cache before starting
+# decode.  Both workers share one GPU; without this wait they compete for GPU
+# memory during model loading, which can cause OOM.
+# || true: don't let set -e kill the script on timeout (wait_for_ready returns 1).
+PREFILL_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
+wait_for_ready "http://localhost:${PREFILL_SYSTEM_PORT}/health" 45 || true
+
 # run decode worker (shares GPU with prefill)
 OTEL_SERVICE_NAME=dynamo-worker-decode \
 CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \

--- a/examples/backends/vllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/vllm/launch/disagg_same_gpu.sh
@@ -24,8 +24,20 @@ MODEL="Qwen/Qwen3-0.6B"
 # ---- Tunable (override via env vars) ----
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+# Inherit GPU from parent (profiler/test harness sets CUDA_VISIBLE_DEVICES);
+# default to GPU 0 for standalone use.
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
+# Per-worker KV cache byte cap (deterministic, GPU-size independent).
+# Profiled safe value: 1_023_525_000 bytes (~976 MiB, 2x over the profiled min of 511_762_432 bytes, ~488 MiB).
+# --gpu-memory-utilization 0.01 prevents vLLM's startup free-memory check from
+# rejecting the launch when a co-resident worker already holds VRAM.
+# The profiler/parallel runner overrides via _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES.
+DEFAULT_KV_CACHE_BYTES="${DEFAULT_KV_CACHE_BYTES:-1023525000}"

 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
+if [[ -z "$GPU_MEM_ARGS" ]]; then
+    GPU_MEM_ARGS="--kv-cache-memory-bytes $DEFAULT_KV_CACHE_BYTES --gpu-memory-utilization 0.01"
+fi

 source "$SCRIPT_DIR/../../../common/launch_utils.sh"

@@ -41,8 +53,8 @@ python3 -m dynamo.frontend &
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
 # For disaggregated deployments we standardize on DYN_SYSTEM_PORT1/2 instead of
 # *_PREFILL/*_DECODE env names so test harnesses can set one simple pair.
+CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
-CUDA_VISIBLE_DEVICES=0 \
 python3 -m dynamo.vllm \
  --model "$MODEL" \
  --enforce-eager \
@@ -51,19 +63,17 @@ python3 -m dynamo.vllm \
  $GPU_MEM_ARGS \
  --max-model-len "$MAX_MODEL_LEN" &

-# Wait for decode worker to initialize before starting prefill worker
-# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
-# The decode worker needs time to:
-# 1. Load model weights and allocate its memory fraction
-# 2. Initialize KV cache
-# 3. Register with NATS service discovery so prefill worker can find it
-echo "Waiting for decode worker to initialize..."
-sleep 10
+# Wait for decode worker to initialize before starting prefill worker.
+# Both workers share one GPU; without this wait they compete for GPU memory
+# during model loading and the scheduler OOMs.
+# || true: don't let set -e kill the script on timeout (wait_for_ready returns 1).
+DECODE_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
+wait_for_ready "http://localhost:${DECODE_SYSTEM_PORT}/health" 45 || true

 # run prefill worker with metrics on port 8082
+CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
-CUDA_VISIBLE_DEVICES=0 \
 python3 -m dynamo.vllm \
  --model "$MODEL" \
  --enforce-eager \

--- a/examples/common/launch_utils.sh
+++ b/examples/common/launch_utils.sh
@@ -183,6 +183,34 @@ CURL_EOF
    echo "=========================================="
 }

+# wait_for_ready <url> [timeout_seconds]
+#
+# Polls an HTTP endpoint until curl reports success or timeout is reached.
+# Because the probe uses `curl -f`, any HTTP status below 400 counts as
+# ready; connection errors and statuses >= 400 count as not ready.
+# Useful for waiting for a worker to finish loading before starting the
+# next one (e.g. disaggregated same-GPU deployments where concurrent
+# model loading causes OOM).
+#
+# Args:
+#   url              HTTP URL to poll (e.g. http://localhost:8081/health)
+#   timeout_seconds  Max seconds to wait (default: 30)
+#
+# Returns 0 on success, 1 on timeout.  Progress is echoed to stdout; the
+# timeout warning goes to stderr.
+wait_for_ready() {
+    local _url="$1"
+    local _timeout="${2:-30}"
+    # SECONDS is bash's built-in elapsed-time counter; remember where we started.
+    local _start=$SECONDS
+    echo "Polling $_url (timeout: ${_timeout}s)..."
+    # The deadline is only checked between probes, so the total wait can
+    # overshoot the timeout by roughly one probe (<= 2s) plus one sleep (1s).
+    while (( SECONDS - _start < _timeout )); do
+        # -s: no progress output, -f: non-zero exit on HTTP status >= 400,
+        # --max-time 2: cap each individual probe at two seconds.
+        if curl -sf --max-time 2 "$_url" > /dev/null 2>&1; then
+            echo "Ready after $(( SECONDS - _start ))s"
+            return 0
+        fi
+        sleep 1
+    done
+    echo "WARNING: $_url not ready after ${_timeout}s" >&2
+    return 1
+}
+
 # print_curl_footer
 #
 # Prints a custom curl example wrapped in the standard framing (matching

--- a/tests/README.md
+++ b/tests/README.md
@@ -552,6 +552,14 @@ The profiler automatically detects the engine type and uses the appropriate bina

 **Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args` because TRT-LLM requires JSON merging.

+**Requirement (all engines):** Do not hardcode `CUDA_VISIBLE_DEVICES` in launch scripts. The profiler and parallel test runner set `CUDA_VISIBLE_DEVICES` to pin each test to a specific GPU. A script that overrides this (e.g. `CUDA_VISIBLE_DEVICES=0`) will ignore the assignment and land on the wrong GPU. Instead, inherit from the environment with a default:
+
+```bash
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
+```
+
+Then pass the variable to each worker: `CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python3 -m dynamo.vllm ...`. For multi-GPU scripts that assign distinct GPUs per worker, use named env vars with defaults (e.g. `PREFILL_CUDA_VISIBLE_DEVICES="${PREFILL_CUDA_VISIBLE_DEVICES:-0}"`).
+
 ### Engine-specific mapping

 Launch scripts call engine-specific functions from `examples/common/gpu_utils.sh` which check env var overrides and return the appropriate CLI flags:

--- a/tests/deploy/test_deploy.py
+++ b/tests/deploy/test_deploy.py
@@ -145,6 +145,13 @@ async def test_deployment(
    framework = deployment_target.framework
    profile = deployment_target.profile

+    # NIXL_ERR_BACKEND: vCluster CI nodes lack RDMA/UCX for inter-pod KV
+    # transfer.  Prefill workers crash in NixlWrapper.create_backend.
+    if framework == "vllm" and profile in ("disagg", "disagg_router"):
+        pytest.skip(
+            "NIXL_ERR_BACKEND: CI cluster lacks RDMA/UCX for inter-pod KV transfer"
+        )
+
    model = next((s.model for s in deployment_spec.services if s.model), None)
    if not model:
        pytest.fail(

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -32,6 +32,11 @@ from tests.utils.payload_builder import (
 logger = logging.getLogger(__name__)


+def _is_cuda13() -> bool:
+    """Return True when the ``CUDA_VERSION`` environment variable starts with "13"."""
+    # An unset variable falls back to "", which never matches the prefix.
+    v = os.environ.get("CUDA_VERSION", "")
+    return v.startswith("13")
+
+
 @dataclass
 class SGLangConfig(EngineConfig):
    """Configuration for SGLang test scenarios"""
@@ -106,29 +111,37 @@ sglang_configs = {
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
+            pytest.mark.profiled_vram_gib(9.9),  # actual profiled peak with kv-tokens
+            pytest.mark.requested_sglang_kv_tokens(
+                37472
+            ),  # KV cache cap (2x safety over min=18736)
+            # Local repro took ~289s wall time with worker readiness reaching
+            # "ready" at ~176s on a warm-cache RTX 6000 Ada.
+            pytest.mark.timeout(420),
            pytest.mark.pre_merge,
-            pytest.mark.skip(reason="unstable"),
-            # TODO: profile to get max_vram and timeout (currently skipped)
+            pytest.mark.skipif(
+                _is_cuda13(),
+                reason="torch-memory-saver preload .so links libcudart.so.12, missing in cuda13 images",
+            ),
        ],
        model="Qwen/Qwen3-0.6B",
-        delayed_start=30,
+        delayed_start=10,
+        health_check_workers=True,
        env={},
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
-            # Validate dynamo_component_* and sglang:* metrics from prefill worker
-            # (DefaultPort.SYSTEM1)
+            # Disagg workers expose fewer sglang:* metrics (~14 vs ~25 for aggregated)
+            # because each only runs half the scheduler pipeline.
            metric_payload_default(
                min_num_requests=6,
-                backend="sglang",
+                backend="sglang_disagg",
                port=DefaultPort.SYSTEM1.value,
            ),
-            # Validate dynamo_component_* and sglang:* metrics from decode worker
-            # (DefaultPort.SYSTEM2)
            metric_payload_default(
                min_num_requests=6,
-                backend="sglang",
+                backend="sglang_disagg",
                port=DefaultPort.SYSTEM2.value,
            ),
        ],

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -117,16 +117,19 @@ trtllm_configs = {
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
-            pytest.mark.gpu_1,
+            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
-            pytest.mark.skip(reason="unstable"),
-            pytest.mark.timeout(
-                480
-            ),  # 3x measured time (103.66s) + download time (150s)
+            pytest.mark.profiled_vram_gib(6.6),  # actual nvidia-smi peak 6.6 GiB
+            pytest.mark.requested_trtllm_kv_tokens(
+                512
+            ),  # KV cache cap (2x safety over min=256)
+            pytest.mark.timeout(432),  # ~6x profiled wall time 72s
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
+        delayed_start=10,
+        health_check_workers=True,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -306,6 +306,29 @@ vllm_configs = {
            completion_payload_default(),
        ],
    ),
+    "disaggregated_same_gpu": VLLMConfig(
+        name="disaggregated_same_gpu",
+        directory=vllm_dir,
+        script_name="disagg_same_gpu.sh",
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.profiled_vram_gib(7.3),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_023_525_000
+            ),  # KV cache cap (2x safety over min=511_762_432)
+            pytest.mark.timeout(300),  # ~6x observed 50s
+            # post_merge: cumulative sequential test time exceeds 35-min job budget.
+            # Move back to pre_merge once GPU tests run in parallel.
+            pytest.mark.post_merge,
+        ],
+        model="Qwen/Qwen3-0.6B",
+        delayed_start=10,
+        health_check_workers=True,
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+        ],
+    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,

--- a/tests/utils/engine_process.py
+++ b/tests/utils/engine_process.py
@@ -16,6 +16,7 @@ from tests.utils.payloads import BasePayload, check_health_generate, check_model

 logger = logging.getLogger(__name__)

+
 FRONTEND_PORT = (
    DefaultPort.FRONTEND.value
 )  # Do NOT use this in tests! Use allocate_port() instead.
@@ -49,6 +50,7 @@ class EngineConfig:
    frontend_port: int = DefaultPort.FRONTEND.value
    timeout: int = 600
    delayed_start: int = 0
+    health_check_workers: bool = False
    env: Dict[str, str] = field(default_factory=dict)
    stragglers: list[str] = field(default_factory=list)

@@ -169,14 +171,7 @@ class EngineProcess(ManagedProcess):
        if extra_env:
            env.update(extra_env)

-        return cls(
-            command=command,
-            env=env,
-            timeout=config.timeout,
-            display_output=True,
-            working_dir=config.directory,
-            health_check_ports=[],
-            health_check_urls=[
+        frontend_checks = [
            (
                f"http://localhost:{config.frontend_port}/v1/models",
                check_models_api,
@@ -185,8 +180,38 @@ class EngineProcess(ManagedProcess):
                f"http://localhost:{config.frontend_port}/health",
                check_health_generate,
            ),
-            ],
-            delayed_start=config.delayed_start,
+        ]
+
+        # For disagg-same-gpu deployments, health-check each worker's
+        # system port so we wait for ALL workers to be ready, not just the
+        # first one to register with the frontend.  Worker liveness checks
+        # run FIRST so the frontend has time to discover newly-registered
+        # workers before the frontend endpoint checks run.
+        #
+        # NOTE: DYN_SYSTEM_PORT* env vars are injected by the dynamic port
+        # fixtures for ALL tests, so we gate on health_check_workers (only
+        # set by same-gpu disagg configs) to avoid health-checking ports
+        # that don't serve /health in regular multi-GPU tests.
+        delayed = config.delayed_start
+        worker_checks: list[tuple] = []
+        if config.health_check_workers:
+            for key, val in sorted(env.items()):
+                if key.startswith("DYN_SYSTEM_PORT") and val.isdigit():
+                    worker_checks.append((f"http://localhost:{val}/health", None))
+            if worker_checks:
+                delayed = 0
+
+        health_urls = worker_checks + frontend_checks
+
+        return cls(
+            command=command,
+            env=env,
+            timeout=config.timeout,
+            display_output=True,
+            working_dir=config.directory,
+            health_check_ports=[],
+            health_check_urls=health_urls,
+            delayed_start=delayed,
            # Must stay False: command[0] is "bash", so True would kill every
            # bash process system-wide.  Stale cleanup relies on stragglers list
            # and process-group termination in __exit__ instead.

--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -1175,6 +1175,29 @@ class SGLangMetricsPayload(MetricsPayload):
        return checks


+@dataclass
+class SGLangDisaggMetricsPayload(SGLangMetricsPayload):
+    """Metrics validation for SGLang disaggregated workers.
+
+    Disagg workers (prefill/decode) expose fewer sglang:* metrics than
+    aggregated workers because each only runs half the scheduler pipeline.
+    Observed: ~14 unique sglang:* metrics vs ~25 for aggregated.
+    """
+
+    def _get_backend_specific_checks(self) -> list[MetricCheck]:
+        """Return the parent's checks with the ``sglang:*`` check relaxed.
+
+        Only the check named ``sglang:*`` is modified: its validator and
+        messages are replaced so that 10 unique metrics are sufficient.
+        All other inherited checks are returned untouched.
+        """
+        checks = super()._get_backend_specific_checks()
+        for check in checks:
+            if check.name == "sglang:*":
+                # Count distinct entries only; duplicates in `value` are
+                # collapsed by set() before comparing against the minimum.
+                check.validator = lambda value: len(set(value)) >= 10
+                # NOTE(review): `name` is unused here; presumably the
+                # two-argument form is what the caller invokes — confirm
+                # against MetricCheck.
+                check.error_msg = lambda name, value: (
+                    f"Expected at least 10 unique sglang:* metrics, but found only {len(set(value))}"
+                )
+                check.success_msg = lambda name, value: (
+                    f"SUCCESS: Found {len(set(value))} unique sglang:* metrics (minimum required: 10)"
+                )
+        return checks
+
+
 @dataclass
 class TRTLLMMetricsPayload(MetricsPayload):
    """Metrics validation for TensorRT-LLM backend"""