Unverified Commit 0268aea4 authored by hhzhang16, committed by GitHub
Browse files

fix: Add status file to prevent output-copier hang on failures (#5898)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 4f9a190c
......@@ -50,6 +50,10 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill_aiconfigurator,
)
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.utils.profiler_status import (
ProfilerStatus,
write_profiler_status,
)
from benchmarks.profiler.webui.select_config import (
add_profiling_error,
clear_profiling_errors,
......@@ -142,6 +146,14 @@ async def run_profile(args):
if not args.aic_backend:
args.aic_backend = args.backend
# Write initial status for external jobs to monitor
os.makedirs(args.output_dir, exist_ok=True)
write_profiler_status(
args.output_dir,
status=ProfilerStatus.RUNNING,
message="Profiler job started",
)
try:
config_modifier = CONFIG_MODIFIERS[args.backend]
......@@ -490,6 +502,12 @@ async def run_profile(args):
error_msg = "No prefill results produced; skipping recommendations."
logger.error(error_msg)
add_profiling_error(error_msg)
write_profiler_status(
args.output_dir,
status=ProfilerStatus.FAILED,
error=error_msg,
message="Profiler failed: no prefill results produced",
)
return
if args.pick_with_webui:
......@@ -527,6 +545,12 @@ async def run_profile(args):
error_msg = "No decode results produced; skipping recommendations."
logger.error(error_msg)
add_profiling_error(error_msg)
write_profiler_status(
args.output_dir,
status=ProfilerStatus.FAILED,
error=error_msg,
message="Profiler failed: no decode results produced",
)
return
if min(decode_data.itl) > args.itl:
warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
......@@ -759,8 +783,26 @@ async def run_profile(args):
else:
yaml.safe_dump(mocker_config, f, sort_keys=False)
# Write success status with output files
write_profiler_status(
args.output_dir,
status=ProfilerStatus.SUCCESS,
message="Profiler completed successfully",
outputs={
"config_with_planner": "config_with_planner.yaml",
"mocker_config_with_planner": "mocker_config_with_planner.yaml",
"disagg_config": "disagg_config.yaml",
},
)
except Exception as e:
logger.error(f"Profile job failed with error: {e}")
logger.exception("Profile job failed with error")
write_profiler_status(
args.output_dir,
status=ProfilerStatus.FAILED,
error=str(e),
message=f"Profiler failed with exception: {type(e).__name__}",
)
raise
finally:
# Always clean up any remaining deployments, even if the job failed
......
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Profiler status file management.
Provides utilities for writing profiler status files.
"""
import logging
import os
import time
from enum import Enum
import yaml
# Module-level logger; used below to warn when the status file cannot be written.
logger = logging.getLogger(__name__)
class ProfilerStatus(str, Enum):
    """Lifecycle states recorded in the profiler status file.

    The enum mixes in ``str``, so each member compares equal to its
    lowercase string value and ``.value`` can be written out directly.
    """

    # The profiler job has started and has not reported a result yet.
    RUNNING = "running"

    # The profiler finished and produced its outputs.
    SUCCESS = "success"

    # The profiler stopped because of an error.
    FAILED = "failed"
# File name of the status file that write_profiler_status() creates inside the output directory.
STATUS_FILE_NAME = "profiler_status.yaml"
def write_profiler_status(
    output_dir: str,
    status: ProfilerStatus,
    message: str = "",
    error: str = "",
    outputs: dict | None = None,
) -> None:
    """
    Write the profiler status file into ``output_dir``.

    The file is a small YAML mapping that always contains ``status`` and a
    UTC ``timestamp``; ``message``, ``error`` and ``outputs`` are included
    only when they are non-empty.

    The write is best-effort: any failure is logged as a warning and never
    propagated to the caller. The file is replaced atomically (written to a
    temporary sibling and renamed into place), so a reader never observes a
    truncated or half-written status and a failed write leaves the previous
    status file intact.

    Args:
        output_dir: Output directory path (expected to exist already)
        status: Status enum value
        message: Optional status message
        error: Optional error message (for failed status)
        outputs: Optional dict of output files (for success status)
    """
    status_file = os.path.join(output_dir, STATUS_FILE_NAME)
    # The temp file lives in the same directory so os.replace() stays on one
    # filesystem, which is what makes the final rename atomic.
    tmp_file = f"{status_file}.tmp"

    status_data = {
        "status": status.value,
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    if message:
        status_data["message"] = message
    if error:
        status_data["error"] = error
    if outputs:
        status_data["outputs"] = outputs

    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            yaml.safe_dump(status_data, f, sort_keys=False)
        # Publish the fully written file in one step.
        os.replace(tmp_file, status_file)
    except Exception as e:
        # Deliberately broad: status reporting must never crash the profiler.
        logger.warning("Failed to write profiler status file: %s", e)
        try:
            os.remove(tmp_file)
        except OSError:
            # Nothing to clean up (temp file was never created) or the
            # directory is unusable; either way there is nothing more to do.
            pass
......@@ -181,24 +181,79 @@ const (
const sidecarScriptTemplate = `
set -e
set -o pipefail
# Wait for the profiler container to complete, not just for the file to exist
# This ensures we capture the final config, not intermediate results
# Wait for profiler container to terminate (no timeout - profiling can take hours)
echo "Waiting for profiler to complete..."
START_TIME=$(date +%s)
LAST_PROGRESS_LOG=$START_TIME
PROGRESS_INTERVAL=300
while true; do
# Check if profiler container has finished (either Completed or Error state)
# Use kubectl to check the pod's container status
STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
if echo "$STATUS" | grep -q "terminated"; then
echo "Profiler container has terminated"
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# Log progress every 5 minutes
if [ $((CURRENT_TIME - LAST_PROGRESS_LOG)) -ge $PROGRESS_INTERVAL ]; then
echo "Still waiting... ($(($ELAPSED / 60)) minutes elapsed)"
LAST_PROGRESS_LOG=$CURRENT_TIME
fi
# Check if profiler container terminated
CONTAINER_STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
if echo "$CONTAINER_STATUS" | grep -q "terminated"; then
echo "Profiler terminated (ran for $(($ELAPSED / 60)) minutes)"
break
fi
sleep 5
done
# Now wait for the output file to exist
echo "Waiting for output file {{.OutputPath}}/{{.OutputFile}}..."
while [ ! -f {{.OutputPath}}/{{.OutputFile}} ]; do sleep 2; done
echo "Output file found, creating ConfigMap..."
# Check profiler status file (2 minute timeout)
echo "Checking profiler status..."
STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
TIMEOUT=120
CHECK_START=$(date +%s)
# Wait for status file to exist
while [ ! -f "$STATUS_FILE" ]; do
ELAPSED=$(($(date +%s) - CHECK_START))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "ERROR: Status file not found after ${TIMEOUT}s"
exit 1
fi
sleep 2
done
# Read and parse status from YAML file
STATUS=$(grep "^status:" "$STATUS_FILE" | awk '{print $2}' | tr -d '"' | tr -d "'")
if [ -z "$STATUS" ]; then
echo "ERROR: Invalid status file format"
exit 1
fi
# Check status value
case "$STATUS" in
success)
MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
echo "Profiler succeeded: $MESSAGE"
;;
failed)
ERROR=$(grep "^error:" "$STATUS_FILE" | sed 's/^error: *//' | tr -d '"' | tr -d "'")
MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
echo "ERROR: Profiler failed: ${ERROR:-$MESSAGE}"
exit 1
;;
running)
echo "ERROR: Profiler still running (unexpected)"
exit 1
;;
*)
echo "ERROR: Unknown status: $STATUS"
exit 1
;;
esac
echo "Creating ConfigMap..."
# Start building ConfigMap YAML with DGD spec
cat >/tmp/cm.yaml <<EOF
......@@ -222,6 +277,12 @@ if [ -f {{.OutputPath}}/{{.MockerOutputFile}} ]; then
echo "Added mocker config to ConfigMap"
fi
# Add profiler status file for debugging
if [ -f {{.OutputPath}}/profiler_status.yaml ]; then
echo " profiler_status.yaml: |" >> /tmp/cm.yaml
sed 's/^/ /' {{.OutputPath}}/profiler_status.yaml >> /tmp/cm.yaml
fi
# Note: Profiling data (raw_data.npz converted to JSON) is included in the
# generated DGD YAML as a separate ConfigMap by the profiler, no need to add it here
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment