fix: Add status file to prevent output-copier hang on failures (#5898)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

fix: Add status file to prevent output-copier hang on failures (#5898)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
0268aea4 · hhzhang16 · GitHub · 4f9a190c · 0268aea4 · 0268aea4
Unverified Commit 0268aea4 authored Feb 03, 2026 by hhzhang16 Committed by GitHub Feb 03, 2026
3 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -50,6 +50,10 @@ from benchmarks.profiler.utils.profile_prefill import (
    profile_prefill_aiconfigurator,
 )
 from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
+from benchmarks.profiler.utils.profiler_status import (
+    ProfilerStatus,
+    write_profiler_status,
+)
 from benchmarks.profiler.webui.select_config import (
    add_profiling_error,
    clear_profiling_errors,
@@ -142,6 +146,14 @@ async def run_profile(args):
    if not args.aic_backend:
        args.aic_backend = args.backend

+    # Write initial status for external jobs to monitor
+    os.makedirs(args.output_dir, exist_ok=True)
+    write_profiler_status(
+        args.output_dir,
+        status=ProfilerStatus.RUNNING,
+        message="Profiler job started",
+    )
+
    try:
        config_modifier = CONFIG_MODIFIERS[args.backend]

@@ -490,6 +502,12 @@ async def run_profile(args):
                error_msg = "No prefill results produced; skipping recommendations."
                logger.error(error_msg)
                add_profiling_error(error_msg)
+                write_profiler_status(
+                    args.output_dir,
+                    status=ProfilerStatus.FAILED,
+                    error=error_msg,
+                    message="Profiler failed: no prefill results produced",
+                )
                return

            if args.pick_with_webui:
@@ -527,6 +545,12 @@ async def run_profile(args):
                    error_msg = "No decode results produced; skipping recommendations."
                    logger.error(error_msg)
                    add_profiling_error(error_msg)
+                    write_profiler_status(
+                        args.output_dir,
+                        status=ProfilerStatus.FAILED,
+                        error=error_msg,
+                        message="Profiler failed: no decode results produced",
+                    )
                    return
                if min(decode_data.itl) > args.itl:
                    warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
@@ -759,8 +783,26 @@ async def run_profile(args):
            else:
                yaml.safe_dump(mocker_config, f, sort_keys=False)

+        # Write success status with output files
+        write_profiler_status(
+            args.output_dir,
+            status=ProfilerStatus.SUCCESS,
+            message="Profiler completed successfully",
+            outputs={
+                "config_with_planner": "config_with_planner.yaml",
+                "mocker_config_with_planner": "mocker_config_with_planner.yaml",
+                "disagg_config": "disagg_config.yaml",
+            },
+        )
+
    except Exception as e:
-        logger.error(f"Profile job failed with error: {e}")
+        logger.exception("Profile job failed with error")
+        write_profiler_status(
+            args.output_dir,
+            status=ProfilerStatus.FAILED,
+            error=str(e),
+            message=f"Profiler failed with exception: {type(e).__name__}",
+        )
        raise
    finally:
        # Always clean up any remaining deployments, even if the job failed

--- a/benchmarks/profiler/utils/profiler_status.py
+++ b/benchmarks/profiler/utils/profiler_status.py
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Profiler status file management.
+
+Provides utilities for writing profiler status files.
+"""
+
+import logging
+import os
+import time
+from enum import Enum
+
+import yaml
+
+logger = logging.getLogger(__name__)
+
+
+class ProfilerStatus(str, Enum):
+    """Profiler execution status.
+
+    Subclasses ``str`` so each member compares equal to its plain string
+    value; that string is what ends up under the ``status`` key of the
+    status file.
+    """
+
+    # Written by run_profile when the profiler job starts.
+    RUNNING = "running"
+    # Written by run_profile after the output config files have been dumped.
+    SUCCESS = "success"
+    # Written by run_profile when no results were produced or an exception
+    # was raised.
+    FAILED = "failed"
+
+
+STATUS_FILE_NAME = "profiler_status.yaml"
+
+
+def write_profiler_status(
+    output_dir: str,
+    status: ProfilerStatus,
+    message: str = "",
+    error: str = "",
+    outputs: dict | None = None,
+) -> None:
+    """
+    Write profiler status file.
+
+    Args:
+        output_dir: Output directory path
+        status: Status enum value
+        message: Optional status message
+        error: Optional error message (for failed status)
+        outputs: Optional dict of output files (for success status)
+    """
+    status_file = os.path.join(output_dir, STATUS_FILE_NAME)
+    status_data = {
+        "status": status.value,
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+    }
+    if message:
+        status_data["message"] = message
+    if error:
+        status_data["error"] = error
+    if outputs:
+        status_data["outputs"] = outputs
+
+    try:
+        with open(status_file, "w") as f:
+            yaml.safe_dump(status_data, f, sort_keys=False)
+    except Exception as e:
+        logger.warning("Failed to write profiler status file: %s", e)
--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -181,24 +181,79 @@ const (
 const sidecarScriptTemplate = `
 set -e
 set -o pipefail
-# Wait for the profiler container to complete, not just for the file to exist
-# This ensures we capture the final config, not intermediate results
+
+# Wait for profiler container to terminate (no timeout - profiling can take hours)
 echo "Waiting for profiler to complete..."
+START_TIME=$(date +%s)
+LAST_PROGRESS_LOG=$START_TIME
+PROGRESS_INTERVAL=300
+
 while true; do
-  # Check if profiler container has finished (either Completed or Error state)
-  # Use kubectl to check the pod's container status
-  STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
-  if echo "$STATUS" | grep -q "terminated"; then
-    echo "Profiler container has terminated"
+  CURRENT_TIME=$(date +%s)
+  ELAPSED=$((CURRENT_TIME - START_TIME))
+
+  # Log progress every 5 minutes
+  if [ $((CURRENT_TIME - LAST_PROGRESS_LOG)) -ge $PROGRESS_INTERVAL ]; then
+    echo "Still waiting... ($(($ELAPSED / 60)) minutes elapsed)"
+    LAST_PROGRESS_LOG=$CURRENT_TIME
+  fi
+
+  # Check if profiler container terminated
+  CONTAINER_STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
+  if echo "$CONTAINER_STATUS" | grep -q "terminated"; then
+    echo "Profiler terminated (ran for $(($ELAPSED / 60)) minutes)"
    break
  fi
  sleep 5
 done

-# Now wait for the output file to exist
-echo "Waiting for output file {{.OutputPath}}/{{.OutputFile}}..."
-while [ ! -f {{.OutputPath}}/{{.OutputFile}} ]; do sleep 2; done
-echo "Output file found, creating ConfigMap..."
+# Check profiler status file (2 minute timeout)
+echo "Checking profiler status..."
+STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
+TIMEOUT=120
+CHECK_START=$(date +%s)
+
+# Wait for status file to exist
+while [ ! -f "$STATUS_FILE" ]; do
+  ELAPSED=$(($(date +%s) - CHECK_START))
+  if [ $ELAPSED -ge $TIMEOUT ]; then
+    echo "ERROR: Status file not found after ${TIMEOUT}s"
+    exit 1
+  fi
+  sleep 2
+done
+
+# Read and parse status from YAML file
+STATUS=$(grep "^status:" "$STATUS_FILE" | awk '{print $2}' | tr -d '"' | tr -d "'")
+
+if [ -z "$STATUS" ]; then
+  echo "ERROR: Invalid status file format"
+  exit 1
+fi
+
+# Check status value
+case "$STATUS" in
+  success)
+    MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
+    echo "Profiler succeeded: $MESSAGE"
+    ;;
+  failed)
+    ERROR=$(grep "^error:" "$STATUS_FILE" | sed 's/^error: *//' | tr -d '"' | tr -d "'")
+    MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
+    echo "ERROR: Profiler failed: ${ERROR:-$MESSAGE}"
+    exit 1
+    ;;
+  running)
+    echo "ERROR: Profiler still running (unexpected)"
+    exit 1
+    ;;
+  *)
+    echo "ERROR: Unknown status: $STATUS"
+    exit 1
+    ;;
+esac
+
+echo "Creating ConfigMap..."

 # Start building ConfigMap YAML with DGD spec
 cat >/tmp/cm.yaml <<EOF
@@ -222,6 +277,12 @@ if [ -f {{.OutputPath}}/{{.MockerOutputFile}} ]; then
  echo "Added mocker config to ConfigMap"
 fi

+# Add profiler status file for debugging
+if [ -f {{.OutputPath}}/profiler_status.yaml ]; then
+  echo "  profiler_status.yaml: |" >> /tmp/cm.yaml
+  sed 's/^/    /' {{.OutputPath}}/profiler_status.yaml >> /tmp/cm.yaml
+fi
+
 # Note: Profiling data (raw_data.npz converted to JSON) is included in the
 # generated DGD YAML as a separate ConfigMap by the profiler, no need to add it here