Unverified Commit 546d6a1e authored by Ashna Mehrotra's avatar Ashna Mehrotra Committed by GitHub
Browse files

fix: Improve failure handling for profiling failures (#7728)


Signed-off-by: default avatarashnamehrotra <ashnamehrotra@gmail.com>
parent f79b3d62
......@@ -28,6 +28,7 @@ import asyncio
import json
import logging
import os
import sys
from pathlib import Path
import yaml
......@@ -43,10 +44,27 @@ from .utils.profile_common import (
DEFAULT_PREFILL_INTERPOLATION_GRANULARITY,
ProfilerOperationalConfig,
)
from .utils.profiler_status import ProfilerStatus, write_profiler_status
logger = logging.getLogger(__name__)
def _resolve_output_dir() -> str:
"""Best-effort extraction of ``--output-dir`` from ``sys.argv``.
Falls back to :data:`DEFAULT_OUTPUT_DIR` when the flag is absent or
unparseable. This is intentionally independent of full argument parsing so
it can be called even when ``_parse_args()`` itself has failed.
"""
argv = sys.argv[1:]
for i, arg in enumerate(argv):
if arg == "--output-dir" and i + 1 < len(argv):
return argv[i + 1]
if arg.startswith("--output-dir="):
return arg.split("=", 1)[1]
return DEFAULT_OUTPUT_DIR
def _parse_dgdr_spec(config_arg: str) -> DynamoGraphDeploymentRequestSpec:
"""Parse a DGDR spec from a CLI ``--config`` argument.
......@@ -136,8 +154,32 @@ def main() -> None:
try:
dgdr, ops = _parse_args()
except SystemExit as e:
# argparse calls sys.exit(2) on invalid arguments; catch it so we
# can write a status file before the container terminates.
if e.code != 0:
output_dir = _resolve_output_dir()
os.makedirs(output_dir, exist_ok=True)
write_profiler_status(
output_dir,
ProfilerStatus.FAILED,
message="Config validation failed",
error=f"Argument parsing failed (exit code {e.code})",
)
raise
except (ValueError, Exception) as e:
logger.error("Failed to parse profiler config: %s", e)
# Resolve output dir so the sidecar can find profiler_status.yaml
# immediately instead of waiting for the profiler container to
# time out (~8-9 minutes).
output_dir = _resolve_output_dir()
os.makedirs(output_dir, exist_ok=True)
write_profiler_status(
output_dir,
ProfilerStatus.FAILED,
message="Config validation failed",
error=str(e),
)
raise SystemExit(1) from e
os.makedirs(ops.output_dir, exist_ok=True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment