Unverified Commit 0268aea4 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

fix: Add status file to prevent output-copier hang on failures (#5898)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 4f9a190c
...@@ -50,6 +50,10 @@ from benchmarks.profiler.utils.profile_prefill import ( ...@@ -50,6 +50,10 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill_aiconfigurator, profile_prefill_aiconfigurator,
) )
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.utils.profiler_status import (
ProfilerStatus,
write_profiler_status,
)
from benchmarks.profiler.webui.select_config import ( from benchmarks.profiler.webui.select_config import (
add_profiling_error, add_profiling_error,
clear_profiling_errors, clear_profiling_errors,
...@@ -142,6 +146,14 @@ async def run_profile(args): ...@@ -142,6 +146,14 @@ async def run_profile(args):
if not args.aic_backend: if not args.aic_backend:
args.aic_backend = args.backend args.aic_backend = args.backend
# Write initial status for external jobs to monitor
os.makedirs(args.output_dir, exist_ok=True)
write_profiler_status(
args.output_dir,
status=ProfilerStatus.RUNNING,
message="Profiler job started",
)
try: try:
config_modifier = CONFIG_MODIFIERS[args.backend] config_modifier = CONFIG_MODIFIERS[args.backend]
...@@ -490,6 +502,12 @@ async def run_profile(args): ...@@ -490,6 +502,12 @@ async def run_profile(args):
error_msg = "No prefill results produced; skipping recommendations." error_msg = "No prefill results produced; skipping recommendations."
logger.error(error_msg) logger.error(error_msg)
add_profiling_error(error_msg) add_profiling_error(error_msg)
write_profiler_status(
args.output_dir,
status=ProfilerStatus.FAILED,
error=error_msg,
message="Profiler failed: no prefill results produced",
)
return return
if args.pick_with_webui: if args.pick_with_webui:
...@@ -527,6 +545,12 @@ async def run_profile(args): ...@@ -527,6 +545,12 @@ async def run_profile(args):
error_msg = "No decode results produced; skipping recommendations." error_msg = "No decode results produced; skipping recommendations."
logger.error(error_msg) logger.error(error_msg)
add_profiling_error(error_msg) add_profiling_error(error_msg)
write_profiler_status(
args.output_dir,
status=ProfilerStatus.FAILED,
error=error_msg,
message="Profiler failed: no decode results produced",
)
return return
if min(decode_data.itl) > args.itl: if min(decode_data.itl) > args.itl:
warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware" warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
...@@ -759,8 +783,26 @@ async def run_profile(args): ...@@ -759,8 +783,26 @@ async def run_profile(args):
else: else:
yaml.safe_dump(mocker_config, f, sort_keys=False) yaml.safe_dump(mocker_config, f, sort_keys=False)
# Write success status with output files
write_profiler_status(
args.output_dir,
status=ProfilerStatus.SUCCESS,
message="Profiler completed successfully",
outputs={
"config_with_planner": "config_with_planner.yaml",
"mocker_config_with_planner": "mocker_config_with_planner.yaml",
"disagg_config": "disagg_config.yaml",
},
)
except Exception as e: except Exception as e:
logger.error(f"Profile job failed with error: {e}") logger.exception("Profile job failed with error")
write_profiler_status(
args.output_dir,
status=ProfilerStatus.FAILED,
error=str(e),
message=f"Profiler failed with exception: {type(e).__name__}",
)
raise raise
finally: finally:
# Always clean up any remaining deployments, even if the job failed # Always clean up any remaining deployments, even if the job failed
......
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Profiler status file management.
Provides utilities for writing profiler status files.
"""
import logging
import os
import time
from enum import Enum
import yaml
logger = logging.getLogger(__name__)
class ProfilerStatus(str, Enum):
    """Profiler execution status.

    The ``str`` mix-in makes every member compare equal to its plain string
    value. The status writer stores ``status.value`` in the status file, so
    these lowercase strings are exactly what external readers of the file see.
    """

    # Written at job start, before any profiling work is done.
    RUNNING = "running"
    # Written after the final configs have been dumped.
    SUCCESS = "success"
    # Written when no prefill/decode results were produced or an exception
    # escaped the profiling run.
    FAILED = "failed"
# Name of the status file created inside the profiler output directory.
# NOTE: the output-copier sidecar script hard-codes this same file name
# ("profiler_status.yaml"), so the two must be changed together.
STATUS_FILE_NAME = "profiler_status.yaml"
def write_profiler_status(
    output_dir: str,
    status: ProfilerStatus,
    message: str = "",
    error: str = "",
    outputs: dict | None = None,
) -> None:
    """
    Write the profiler status file into ``output_dir``.

    The file is a small YAML mapping with the keys ``status`` and
    ``timestamp`` (UTC, ``YYYY-MM-DDTHH:MM:SSZ``), plus ``message``,
    ``error`` and ``outputs`` when the corresponding argument is non-empty.

    The document is first written to a sibling temporary file and then moved
    over the final path with ``os.replace``. A reader polling the status file
    therefore sees either the previous complete document or the new one,
    never a truncated or half-written file.

    Writing is best-effort: any failure is logged as a warning and swallowed
    so that status reporting can never abort the profiling job itself.

    Args:
        output_dir: Output directory path (must already exist)
        status: Status enum value
        message: Optional status message
        error: Optional error message (for failed status)
        outputs: Optional dict of output files (for success status)
    """
    status_file = os.path.join(output_dir, STATUS_FILE_NAME)
    # Stage next to the target so the final rename stays on one filesystem.
    tmp_file = f"{status_file}.tmp"

    status_data = {
        "status": status.value,
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    # Optional fields are omitted entirely when empty to keep the file minimal.
    if message:
        status_data["message"] = message
    if error:
        status_data["error"] = error
    if outputs:
        status_data["outputs"] = outputs

    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            yaml.safe_dump(status_data, f, sort_keys=False)
        # Atomically publish the fully written document.
        os.replace(tmp_file, status_file)
    except Exception as e:
        # Deliberately broad: status reporting must never fail the job.
        logger.warning("Failed to write profiler status file: %s", e)
        try:
            os.remove(tmp_file)
        except OSError:
            # Temp file was never created or is already gone; nothing to do.
            pass
...@@ -181,24 +181,79 @@ const ( ...@@ -181,24 +181,79 @@ const (
const sidecarScriptTemplate = ` const sidecarScriptTemplate = `
set -e set -e
set -o pipefail set -o pipefail
# Wait for the profiler container to complete, not just for the file to exist
# This ensures we capture the final config, not intermediate results # Wait for profiler container to terminate (no timeout - profiling can take hours)
echo "Waiting for profiler to complete..." echo "Waiting for profiler to complete..."
START_TIME=$(date +%s)
LAST_PROGRESS_LOG=$START_TIME
PROGRESS_INTERVAL=300
while true; do while true; do
# Check if profiler container has finished (either Completed or Error state) CURRENT_TIME=$(date +%s)
# Use kubectl to check the pod's container status ELAPSED=$((CURRENT_TIME - START_TIME))
STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
if echo "$STATUS" | grep -q "terminated"; then # Log progress every 5 minutes
echo "Profiler container has terminated" if [ $((CURRENT_TIME - LAST_PROGRESS_LOG)) -ge $PROGRESS_INTERVAL ]; then
echo "Still waiting... ($(($ELAPSED / 60)) minutes elapsed)"
LAST_PROGRESS_LOG=$CURRENT_TIME
fi
# Check if profiler container terminated
CONTAINER_STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
if echo "$CONTAINER_STATUS" | grep -q "terminated"; then
echo "Profiler terminated (ran for $(($ELAPSED / 60)) minutes)"
break break
fi fi
sleep 5 sleep 5
done done
# Now wait for the output file to exist # Check profiler status file (2 minute timeout)
echo "Waiting for output file {{.OutputPath}}/{{.OutputFile}}..." echo "Checking profiler status..."
while [ ! -f {{.OutputPath}}/{{.OutputFile}} ]; do sleep 2; done STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
echo "Output file found, creating ConfigMap..." TIMEOUT=120
CHECK_START=$(date +%s)
# Wait for status file to exist
while [ ! -f "$STATUS_FILE" ]; do
ELAPSED=$(($(date +%s) - CHECK_START))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "ERROR: Status file not found after ${TIMEOUT}s"
exit 1
fi
sleep 2
done
# Read and parse status from YAML file
STATUS=$(grep "^status:" "$STATUS_FILE" | awk '{print $2}' | tr -d '"' | tr -d "'")
if [ -z "$STATUS" ]; then
echo "ERROR: Invalid status file format"
exit 1
fi
# Check status value
case "$STATUS" in
success)
MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
echo "Profiler succeeded: $MESSAGE"
;;
failed)
ERROR=$(grep "^error:" "$STATUS_FILE" | sed 's/^error: *//' | tr -d '"' | tr -d "'")
MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
echo "ERROR: Profiler failed: ${ERROR:-$MESSAGE}"
exit 1
;;
running)
echo "ERROR: Profiler still running (unexpected)"
exit 1
;;
*)
echo "ERROR: Unknown status: $STATUS"
exit 1
;;
esac
echo "Creating ConfigMap..."
# Start building ConfigMap YAML with DGD spec # Start building ConfigMap YAML with DGD spec
cat >/tmp/cm.yaml <<EOF cat >/tmp/cm.yaml <<EOF
...@@ -222,6 +277,12 @@ if [ -f {{.OutputPath}}/{{.MockerOutputFile}} ]; then ...@@ -222,6 +277,12 @@ if [ -f {{.OutputPath}}/{{.MockerOutputFile}} ]; then
echo "Added mocker config to ConfigMap" echo "Added mocker config to ConfigMap"
fi fi
# Add profiler status file for debugging
if [ -f {{.OutputPath}}/profiler_status.yaml ]; then
echo " profiler_status.yaml: |" >> /tmp/cm.yaml
sed 's/^/ /' {{.OutputPath}}/profiler_status.yaml >> /tmp/cm.yaml
fi
# Note: Profiling data (raw_data.npz converted to JSON) is included in the # Note: Profiling data (raw_data.npz converted to JSON) is included in the
# generated DGD YAML as a separate ConfigMap by the profiler, no need to add it here # generated DGD YAML as a separate ConfigMap by the profiler, no need to add it here
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment