Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
0268aea4
Unverified
Commit
0268aea4
authored
Feb 03, 2026
by
hhzhang16
Committed by
GitHub
Feb 03, 2026
Browse files
fix: Add status file to prevent output-copier hang on failures (#5898)
Signed-off-by:
Hannah Zhang
<
hannahz@nvidia.com
>
parent
4f9a190c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
180 additions
and
12 deletions
+180
-12
benchmarks/profiler/profile_sla.py
benchmarks/profiler/profile_sla.py
+43
-1
benchmarks/profiler/utils/profiler_status.py
benchmarks/profiler/utils/profiler_status.py
+65
-0
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
...nal/controller/dynamographdeploymentrequest_controller.go
+72
-11
No files found.
benchmarks/profiler/profile_sla.py
View file @
0268aea4
...
...
@@ -50,6 +50,10 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill_aiconfigurator
,
)
from
benchmarks.profiler.utils.profiler_argparse
import
create_profiler_parser
from
benchmarks.profiler.utils.profiler_status
import
(
ProfilerStatus
,
write_profiler_status
,
)
from
benchmarks.profiler.webui.select_config
import
(
add_profiling_error
,
clear_profiling_errors
,
...
...
@@ -142,6 +146,14 @@ async def run_profile(args):
if
not
args
.
aic_backend
:
args
.
aic_backend
=
args
.
backend
# Write initial status for external jobs to monitor
os
.
makedirs
(
args
.
output_dir
,
exist_ok
=
True
)
write_profiler_status
(
args
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Profiler job started"
,
)
try
:
config_modifier
=
CONFIG_MODIFIERS
[
args
.
backend
]
...
...
@@ -490,6 +502,12 @@ async def run_profile(args):
error_msg
=
"No prefill results produced; skipping recommendations."
logger
.
error
(
error_msg
)
add_profiling_error
(
error_msg
)
write_profiler_status
(
args
.
output_dir
,
status
=
ProfilerStatus
.
FAILED
,
error
=
error_msg
,
message
=
"Profiler failed: no prefill results produced"
,
)
return
if
args
.
pick_with_webui
:
...
...
@@ -527,6 +545,12 @@ async def run_profile(args):
error_msg
=
"No decode results produced; skipping recommendations."
logger
.
error
(
error_msg
)
add_profiling_error
(
error_msg
)
write_profiler_status
(
args
.
output_dir
,
status
=
ProfilerStatus
.
FAILED
,
error
=
error_msg
,
message
=
"Profiler failed: no decode results produced"
,
)
return
if
min
(
decode_data
.
itl
)
>
args
.
itl
:
warning_msg
=
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
...
...
@@ -759,8 +783,26 @@ async def run_profile(args):
else
:
yaml
.
safe_dump
(
mocker_config
,
f
,
sort_keys
=
False
)
# Write success status with output files
write_profiler_status
(
args
.
output_dir
,
status
=
ProfilerStatus
.
SUCCESS
,
message
=
"Profiler completed successfully"
,
outputs
=
{
"config_with_planner"
:
"config_with_planner.yaml"
,
"mocker_config_with_planner"
:
"mocker_config_with_planner.yaml"
,
"disagg_config"
:
"disagg_config.yaml"
,
},
)
except
Exception
as
e
:
logger
.
error
(
f
"Profile job failed with error:
{
e
}
"
)
logger
.
exception
(
"Profile job failed with error"
)
write_profiler_status
(
args
.
output_dir
,
status
=
ProfilerStatus
.
FAILED
,
error
=
str
(
e
),
message
=
f
"Profiler failed with exception:
{
type
(
e
).
__name__
}
"
,
)
raise
finally
:
# Always clean up any remaining deployments, even if the job failed
...
...
benchmarks/profiler/utils/profiler_status.py
0 → 100755
View file @
0268aea4
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Profiler status file management.
Provides utilities for writing profiler status files.
"""
import
logging
import
os
import
time
from
enum
import
Enum
import
yaml
logger
=
logging
.
getLogger
(
__name__
)
class ProfilerStatus(str, Enum):
    """Lifecycle state of a profiler run, as recorded in the status file.

    Members also subclass ``str``, so each one compares equal to its
    plain-string value (e.g. ``ProfilerStatus.FAILED == "failed"``).
    """

    # The run has started and has not yet reported an outcome.
    RUNNING = "running"
    # The run finished and produced its outputs.
    SUCCESS = "success"
    # The run stopped because of an error.
    FAILED = "failed"
# Name of the status file created inside the profiler output directory.
STATUS_FILE_NAME = "profiler_status.yaml"


def write_profiler_status(
    output_dir: str,
    status: "ProfilerStatus | str",
    message: str = "",
    error: str = "",
    outputs: dict | None = None,
) -> None:
    """
    Write the profiler status file into ``output_dir``.

    The file is a small YAML mapping with the keys ``status`` and
    ``timestamp`` (UTC, ``YYYY-MM-DDTHH:MM:SSZ``), plus ``message``,
    ``error`` and ``outputs`` when those arguments are non-empty.

    The data is first written to a temporary file in the same directory and
    then moved over the final path with ``os.replace``, so a process polling
    for the status file never observes an empty or half-written file.

    Writing is best-effort: any failure while creating the directory or
    writing the file is logged as a warning and swallowed, so that a status
    update issued from an error handler cannot mask the original error.

    Args:
        output_dir: Output directory path. Created if it does not exist.
        status: Status enum value, or its plain-string value
            (``"running"``, ``"success"``, ``"failed"``).
        message: Optional status message.
        error: Optional error message (for failed status).
        outputs: Optional dict of output files (for success status).

    Raises:
        ValueError: If ``status`` is not a valid ``ProfilerStatus`` value.
    """
    status_file = os.path.join(output_dir, STATUS_FILE_NAME)
    # Per-process temp name in the same directory, so os.replace stays on
    # one filesystem and concurrent writers do not share a temp file.
    tmp_file = f"{status_file}.{os.getpid()}.tmp"

    status_data = {
        # ProfilerStatus(...) accepts both enum members and their string values.
        "status": ProfilerStatus(status).value,
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    # Optional fields are omitted entirely when empty to keep the file minimal.
    if message:
        status_data["message"] = message
    if error:
        status_data["error"] = error
    if outputs:
        status_data["outputs"] = outputs

    try:
        # An empty output_dir means "current directory"; nothing to create.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(tmp_file, "w", encoding="utf-8") as f:
            yaml.safe_dump(status_data, f, sort_keys=False)
        os.replace(tmp_file, status_file)
    except Exception as e:
        # Deliberately broad: status reporting must never crash the profiler
        # or replace the exception that is currently being handled.
        logger.warning("Failed to write profiler status file: %s", e)
        try:
            os.remove(tmp_file)
        except OSError as cleanup_error:
            # The temp file may never have been created; nothing more to do.
            logger.debug(
                "Could not remove temporary status file %s: %s",
                tmp_file,
                cleanup_error,
            )
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
View file @
0268aea4
...
...
@@ -181,24 +181,79 @@ const (
const
sidecarScriptTemplate
=
`
set -e
set -o pipefail
# Wait for the profiler container to complete, not just for the file to exist
# This ensures we capture the final config, not intermediate results
# Wait for profiler container to terminate (no timeout - profiling can take hours)
echo "Waiting for profiler to complete..."
START_TIME=$(date +%s)
LAST_PROGRESS_LOG=$START_TIME
PROGRESS_INTERVAL=300
while true; do
# Check if profiler container has finished (either Completed or Error state)
# Use kubectl to check the pod's container status
STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
if echo "$STATUS" | grep -q "terminated"; then
echo "Profiler container has terminated"
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# Log progress every 5 minutes
if [ $((CURRENT_TIME - LAST_PROGRESS_LOG)) -ge $PROGRESS_INTERVAL ]; then
echo "Still waiting... ($(($ELAPSED / 60)) minutes elapsed)"
LAST_PROGRESS_LOG=$CURRENT_TIME
fi
# Check if profiler container terminated
CONTAINER_STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
if echo "$CONTAINER_STATUS" | grep -q "terminated"; then
echo "Profiler terminated (ran for $(($ELAPSED / 60)) minutes)"
break
fi
sleep 5
done
# Now wait for the output file to exist
echo "Waiting for output file {{.OutputPath}}/{{.OutputFile}}..."
while [ ! -f {{.OutputPath}}/{{.OutputFile}} ]; do sleep 2; done
echo "Output file found, creating ConfigMap..."
# Check profiler status file (2 minute timeout)
echo "Checking profiler status..."
STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
TIMEOUT=120
CHECK_START=$(date +%s)
# Wait for status file to exist
while [ ! -f "$STATUS_FILE" ]; do
ELAPSED=$(($(date +%s) - CHECK_START))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "ERROR: Status file not found after ${TIMEOUT}s"
exit 1
fi
sleep 2
done
# Read and parse status from YAML file
STATUS=$(grep "^status:" "$STATUS_FILE" | awk '{print $2}' | tr -d '"' | tr -d "'")
if [ -z "$STATUS" ]; then
echo "ERROR: Invalid status file format"
exit 1
fi
# Check status value
case "$STATUS" in
success)
MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
echo "Profiler succeeded: $MESSAGE"
;;
failed)
ERROR=$(grep "^error:" "$STATUS_FILE" | sed 's/^error: *//' | tr -d '"' | tr -d "'")
MESSAGE=$(grep "^message:" "$STATUS_FILE" | sed 's/^message: *//' | tr -d '"' | tr -d "'")
echo "ERROR: Profiler failed: ${ERROR:-$MESSAGE}"
exit 1
;;
running)
echo "ERROR: Profiler still running (unexpected)"
exit 1
;;
*)
echo "ERROR: Unknown status: $STATUS"
exit 1
;;
esac
echo "Creating ConfigMap..."
# Start building ConfigMap YAML with DGD spec
cat >/tmp/cm.yaml <<EOF
...
...
@@ -222,6 +277,12 @@ if [ -f {{.OutputPath}}/{{.MockerOutputFile}} ]; then
echo "Added mocker config to ConfigMap"
fi
# Add profiler status file for debugging
if [ -f {{.OutputPath}}/profiler_status.yaml ]; then
echo " profiler_status.yaml: |" >> /tmp/cm.yaml
sed 's/^/ /' {{.OutputPath}}/profiler_status.yaml >> /tmp/cm.yaml
fi
# Note: Profiling data (raw_data.npz converted to JSON) is included in the
# generated DGD YAML as a separate ConfigMap by the profiler, no need to add it here
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment