Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
740130eb
Unverified
Commit
740130eb
authored
Mar 20, 2026
by
Jonathan Tong
Committed by
GitHub
Mar 20, 2026
Browse files
feat: surface status and errors for profiler in DGDR (#6855)
Signed-off-by:
Jont828
<
jt572@cornell.edu
>
parent
1114004e
Changes
11
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
893 additions
and
40 deletions
+893
-40
components/src/dynamo/profiler/profile_sla.py
components/src/dynamo/profiler/profile_sla.py
+35
-0
components/src/dynamo/profiler/thorough.py
components/src/dynamo/profiler/thorough.py
+42
-2
components/src/dynamo/profiler/utils/profile_common.py
components/src/dynamo/profiler/utils/profile_common.py
+6
-2
components/src/dynamo/profiler/utils/profiler_status.py
components/src/dynamo/profiler/utils/profiler_status.py
+7
-0
deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
...erator/crds/nvidia.com_dynamographdeploymentrequests.yaml
+0
-2
deploy/operator/.gitignore
deploy/operator/.gitignore
+4
-0
deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
...perator/api/v1beta1/dynamographdeploymentrequest_types.go
+2
-2
deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
...g/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+0
-2
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
...nal/controller/dynamographdeploymentrequest_controller.go
+227
-15
deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
...ontroller/dynamographdeploymentrequest_controller_test.go
+558
-7
deploy/utils/dynamo_deployment.py
deploy/utils/dynamo_deployment.py
+12
-8
No files found.
components/src/dynamo/profiler/profile_sla.py
View file @
740130eb
...
@@ -36,6 +36,7 @@ from dynamo.profiler.utils.dgd_generation import assemble_final_config
...
@@ -36,6 +36,7 @@ from dynamo.profiler.utils.dgd_generation import assemble_final_config
from
dynamo.profiler.utils.dgdr_v1beta1_types
import
(
from
dynamo.profiler.utils.dgdr_v1beta1_types
import
(
BackendType
,
BackendType
,
DynamoGraphDeploymentRequestSpec
,
DynamoGraphDeploymentRequestSpec
,
ProfilingPhase
,
)
)
from
dynamo.profiler.utils.dgdr_validate
import
(
from
dynamo.profiler.utils.dgdr_validate
import
(
valid_dgdr_spec
,
valid_dgdr_spec
,
...
@@ -183,6 +184,14 @@ async def _execute_strategy(
...
@@ -183,6 +184,14 @@ async def _execute_strategy(
deployment_clients
,
deployment_clients
,
)
)
ops
.
current_phase
=
ProfilingPhase
.
SelectingConfig
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Filtering results and selecting cost-efficient configuration"
,
phase
=
ProfilingPhase
.
SelectingConfig
,
)
best_config_df
=
pick_result
[
"best_config_df"
]
best_config_df
=
pick_result
[
"best_config_df"
]
best_latencies
=
pick_result
[
"best_latencies"
]
best_latencies
=
pick_result
[
"best_latencies"
]
...
@@ -244,6 +253,7 @@ def _write_final_output(ops: ProfilerOperationalConfig, final_config: Any) -> bo
...
@@ -244,6 +253,7 @@ def _write_final_output(ops: ProfilerOperationalConfig, final_config: Any) -> bo
status
=
ProfilerStatus
.
FAILED
,
status
=
ProfilerStatus
.
FAILED
,
error
=
error_msg
,
error
=
error_msg
,
message
=
error_msg
,
message
=
error_msg
,
phase
=
ProfilingPhase
.
GeneratingDGD
,
)
)
return
False
return
False
else
:
else
:
...
@@ -261,6 +271,7 @@ def _write_final_output(ops: ProfilerOperationalConfig, final_config: Any) -> bo
...
@@ -261,6 +271,7 @@ def _write_final_output(ops: ProfilerOperationalConfig, final_config: Any) -> bo
outputs
=
{
outputs
=
{
"final_config"
:
"final_config.yaml"
,
"final_config"
:
"final_config.yaml"
,
},
},
phase
=
ProfilingPhase
.
Done
,
)
)
return
True
return
True
...
@@ -287,6 +298,7 @@ async def run_profile(
...
@@ -287,6 +298,7 @@ async def run_profile(
ops
.
output_dir
,
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Profiler job started"
,
message
=
"Profiler job started"
,
phase
=
ProfilingPhase
.
Initializing
,
)
)
try
:
try
:
...
@@ -312,6 +324,14 @@ async def run_profile(
...
@@ -312,6 +324,14 @@ async def run_profile(
# then validate DGDR features based on AIC support
# then validate DGDR features based on AIC support
validate_dgdr_dynamo_features
(
dgdr
,
aic_supported
)
validate_dgdr_dynamo_features
(
dgdr
,
aic_supported
)
ops
.
current_phase
=
ProfilingPhase
.
SweepingPrefill
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Sweeping parallelization strategies"
,
phase
=
ops
.
current_phase
,
)
(
(
pick_result
,
pick_result
,
best_prefill_config
,
best_prefill_config
,
...
@@ -357,6 +377,13 @@ async def run_profile(
...
@@ -357,6 +377,13 @@ async def run_profile(
chosen_exp
=
pick_result
.
get
(
"chosen_exp"
,
""
)
chosen_exp
=
pick_result
.
get
(
"chosen_exp"
,
""
)
is_disagg_config
=
chosen_exp
not
in
(
"agg"
,)
and
bool
(
chosen_exp
)
is_disagg_config
=
chosen_exp
not
in
(
"agg"
,)
and
bool
(
chosen_exp
)
if
not
ops
.
dry_run
and
dgd_config
and
needs_profile_data
(
dgdr
):
if
not
ops
.
dry_run
and
dgd_config
and
needs_profile_data
(
dgdr
):
ops
.
current_phase
=
ProfilingPhase
.
BuildingCurves
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Building interpolation curves for planner integration"
,
phase
=
ops
.
current_phase
,
)
if
not
is_disagg_config
:
if
not
is_disagg_config
:
logger
.
info
(
logger
.
info
(
"Picked config is aggregated (chosen_exp=%r) — "
"Picked config is aggregated (chosen_exp=%r) — "
...
@@ -396,6 +423,13 @@ async def run_profile(
...
@@ -396,6 +423,13 @@ async def run_profile(
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Final DGD assembly
# Final DGD assembly
# ---------------------------------------------------------------
# ---------------------------------------------------------------
ops
.
current_phase
=
ProfilingPhase
.
GeneratingDGD
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Packaging data and generating final DGD YAML"
,
phase
=
ops
.
current_phase
,
)
final_config
=
assemble_final_config
(
final_config
=
assemble_final_config
(
dgdr
,
ops
,
dgd_config
,
best_prefill_config
,
best_decode_config
dgdr
,
ops
,
dgd_config
,
best_prefill_config
,
best_decode_config
)
)
...
@@ -431,6 +465,7 @@ async def run_profile(
...
@@ -431,6 +465,7 @@ async def run_profile(
status
=
ProfilerStatus
.
FAILED
,
status
=
ProfilerStatus
.
FAILED
,
error
=
str
(
e
),
error
=
str
(
e
),
message
=
f
"Profiler failed with exception:
{
type
(
e
).
__name__
}
"
,
message
=
f
"Profiler failed with exception:
{
type
(
e
).
__name__
}
"
,
phase
=
ops
.
current_phase
,
)
)
raise
raise
finally
:
finally
:
...
...
components/src/dynamo/profiler/thorough.py
View file @
740130eb
...
@@ -43,6 +43,7 @@ from dynamo.profiler.utils.config_modifiers.protocol import apply_dgd_overrides
...
@@ -43,6 +43,7 @@ from dynamo.profiler.utils.config_modifiers.protocol import apply_dgd_overrides
from
dynamo.profiler.utils.dgdr_v1beta1_types
import
(
from
dynamo.profiler.utils.dgdr_v1beta1_types
import
(
DynamoGraphDeploymentRequestSpec
,
DynamoGraphDeploymentRequestSpec
,
ModelCacheSpec
,
ModelCacheSpec
,
ProfilingPhase
,
)
)
from
dynamo.profiler.utils.profile_common
import
(
from
dynamo.profiler.utils.profile_common
import
(
ProfilerOperationalConfig
,
ProfilerOperationalConfig
,
...
@@ -51,6 +52,7 @@ from dynamo.profiler.utils.profile_common import (
...
@@ -51,6 +52,7 @@ from dynamo.profiler.utils.profile_common import (
inject_tolerations_into_dgd
,
inject_tolerations_into_dgd
,
)
)
from
dynamo.profiler.utils.profile_decode
import
get_num_request_range
from
dynamo.profiler.utils.profile_decode
import
get_num_request_range
from
dynamo.profiler.utils.profiler_status
import
ProfilerStatus
,
write_profiler_status
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -68,7 +70,8 @@ async def _benchmark_prefill_candidates(
...
@@ -68,7 +70,8 @@ async def _benchmark_prefill_candidates(
)
->
pd
.
DataFrame
:
)
->
pd
.
DataFrame
:
"""Deploy each prefill candidate, measure TTFT, return prefill_df."""
"""Deploy each prefill candidate, measure TTFT, return prefill_df."""
prefill_rows
:
list
[
dict
]
=
[]
prefill_rows
:
list
[
dict
]
=
[]
for
candidate
in
prefill_candidates
:
total_prefill
=
len
(
prefill_candidates
)
for
idx
,
candidate
in
enumerate
(
prefill_candidates
,
1
):
num_gpus
=
candidate
.
num_gpus
num_gpus
=
candidate
.
num_gpus
label
=
make_parallel_label
(
label
=
make_parallel_label
(
candidate
.
tp
,
candidate
.
tp
,
...
@@ -88,7 +91,18 @@ async def _benchmark_prefill_candidates(
...
@@ -88,7 +91,18 @@ async def _benchmark_prefill_candidates(
model_name
,
model_path
=
config_modifier
.
get_model_name
(
candidate
.
dgd_config
)
model_name
,
model_path
=
config_modifier
.
get_model_name
(
candidate
.
dgd_config
)
frontend_port
=
config_modifier
.
get_port
(
candidate
.
dgd_config
)
frontend_port
=
config_modifier
.
get_port
(
candidate
.
dgd_config
)
progress_msg
=
(
f
"Benchmarking prefill candidate
{
idx
}
/
{
total_prefill
}
: "
f
"
{
label
}
(
{
num_gpus
}
GPUs)"
)
logger
.
info
(
"Profiling prefill candidate %s with %d GPUs..."
,
label
,
num_gpus
)
logger
.
info
(
"Profiling prefill candidate %s with %d GPUs..."
,
label
,
num_gpus
)
ops
.
current_phase
=
ProfilingPhase
.
SweepingPrefill
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
progress_msg
,
phase
=
ProfilingPhase
.
SweepingPrefill
,
)
client
=
DynamoDeploymentClient
(
client
=
DynamoDeploymentClient
(
namespace
=
ops
.
k8s_namespace
,
namespace
=
ops
.
k8s_namespace
,
...
@@ -158,7 +172,8 @@ async def _benchmark_decode_candidates(
...
@@ -158,7 +172,8 @@ async def _benchmark_decode_candidates(
)
->
pd
.
DataFrame
:
)
->
pd
.
DataFrame
:
"""Deploy each decode candidate, sweep num_request, return decode_df."""
"""Deploy each decode candidate, sweep num_request, return decode_df."""
decode_rows
:
list
[
dict
]
=
[]
decode_rows
:
list
[
dict
]
=
[]
for
candidate
in
decode_candidates
:
total_decode
=
len
(
decode_candidates
)
for
idx
,
candidate
in
enumerate
(
decode_candidates
,
1
):
num_gpus
=
candidate
.
num_gpus
num_gpus
=
candidate
.
num_gpus
label
=
make_parallel_label
(
label
=
make_parallel_label
(
candidate
.
tp
,
candidate
.
tp
,
...
@@ -178,7 +193,18 @@ async def _benchmark_decode_candidates(
...
@@ -178,7 +193,18 @@ async def _benchmark_decode_candidates(
model_name
,
model_path
=
config_modifier
.
get_model_name
(
candidate
.
dgd_config
)
model_name
,
model_path
=
config_modifier
.
get_model_name
(
candidate
.
dgd_config
)
frontend_port
=
config_modifier
.
get_port
(
candidate
.
dgd_config
)
frontend_port
=
config_modifier
.
get_port
(
candidate
.
dgd_config
)
progress_msg
=
(
f
"Benchmarking decode candidate
{
idx
}
/
{
total_decode
}
: "
f
"
{
label
}
(
{
num_gpus
}
GPUs)"
)
logger
.
info
(
"Profiling decode candidate %s with %d GPUs..."
,
label
,
num_gpus
)
logger
.
info
(
"Profiling decode candidate %s with %d GPUs..."
,
label
,
num_gpus
)
ops
.
current_phase
=
ProfilingPhase
.
SweepingDecode
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
progress_msg
,
phase
=
ProfilingPhase
.
SweepingDecode
,
)
client
=
DynamoDeploymentClient
(
client
=
DynamoDeploymentClient
(
namespace
=
ops
.
k8s_namespace
,
namespace
=
ops
.
k8s_namespace
,
...
@@ -377,6 +403,13 @@ async def run_thorough(
...
@@ -377,6 +403,13 @@ async def run_thorough(
config_modifier
=
CONFIG_MODIFIERS
[
backend
]
config_modifier
=
CONFIG_MODIFIERS
[
backend
]
# --- Stage 2: Benchmarking ---
# --- Stage 2: Benchmarking ---
ops
.
current_phase
=
ProfilingPhase
.
SweepingPrefill
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Sweeping parallelization strategies for prefill, measuring TTFT"
,
phase
=
ops
.
current_phase
,
)
prefill_df
=
await
_benchmark_prefill_candidates
(
prefill_df
=
await
_benchmark_prefill_candidates
(
prefill_candidates
,
prefill_candidates
,
ops
,
ops
,
...
@@ -388,6 +421,13 @@ async def run_thorough(
...
@@ -388,6 +421,13 @@ async def run_thorough(
deployment_clients
,
deployment_clients
,
config_modifier
,
config_modifier
,
)
)
ops
.
current_phase
=
ProfilingPhase
.
SweepingDecode
write_profiler_status
(
ops
.
output_dir
,
status
=
ProfilerStatus
.
RUNNING
,
message
=
"Sweeping parallelization strategies for decode, measuring ITL"
,
phase
=
ops
.
current_phase
,
)
decode_df
=
await
_benchmark_decode_candidates
(
decode_df
=
await
_benchmark_decode_candidates
(
decode_candidates
,
decode_candidates
,
ops
,
ops
,
...
...
components/src/dynamo/profiler/utils/profile_common.py
View file @
740130eb
...
@@ -18,14 +18,17 @@
...
@@ -18,14 +18,17 @@
import
copy
import
copy
import
logging
import
logging
import
os
import
os
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
,
field
import
pandas
as
pd
import
pandas
as
pd
from
dynamo.profiler.utils.config_modifiers.parallelization_mapping
import
(
from
dynamo.profiler.utils.config_modifiers.parallelization_mapping
import
(
PickedParallelConfig
,
PickedParallelConfig
,
)
)
from
dynamo.profiler.utils.dgdr_v1beta1_types
import
DynamoGraphDeploymentRequestSpec
from
dynamo.profiler.utils.dgdr_v1beta1_types
import
(
DynamoGraphDeploymentRequestSpec
,
ProfilingPhase
,
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -111,6 +114,7 @@ class ProfilerOperationalConfig:
...
@@ -111,6 +114,7 @@ class ProfilerOperationalConfig:
prefill_interpolation_granularity
:
int
=
DEFAULT_PREFILL_INTERPOLATION_GRANULARITY
prefill_interpolation_granularity
:
int
=
DEFAULT_PREFILL_INTERPOLATION_GRANULARITY
decode_interpolation_granularity
:
int
=
DEFAULT_DECODE_INTERPOLATION_GRANULARITY
decode_interpolation_granularity
:
int
=
DEFAULT_DECODE_INTERPOLATION_GRANULARITY
dry_run
:
bool
=
DEFAULT_DRY_RUN
dry_run
:
bool
=
DEFAULT_DRY_RUN
current_phase
:
ProfilingPhase
=
field
(
default
=
ProfilingPhase
.
Initializing
)
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
...
...
components/src/dynamo/profiler/utils/profiler_status.py
View file @
740130eb
...
@@ -16,6 +16,8 @@ from typing import Any
...
@@ -16,6 +16,8 @@ from typing import Any
import
yaml
import
yaml
from
dynamo.profiler.utils.dgdr_v1beta1_types
import
ProfilingPhase
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -36,6 +38,7 @@ def write_profiler_status(
...
@@ -36,6 +38,7 @@ def write_profiler_status(
message
:
str
=
""
,
message
:
str
=
""
,
error
:
str
=
""
,
error
:
str
=
""
,
outputs
:
dict
|
None
=
None
,
outputs
:
dict
|
None
=
None
,
phase
:
ProfilingPhase
|
None
=
None
,
)
->
None
:
)
->
None
:
"""
"""
Write profiler status file.
Write profiler status file.
...
@@ -46,6 +49,8 @@ def write_profiler_status(
...
@@ -46,6 +49,8 @@ def write_profiler_status(
message: Optional status message
message: Optional status message
error: Optional error message (for failed status)
error: Optional error message (for failed status)
outputs: Optional dict of output files (for success status)
outputs: Optional dict of output files (for success status)
phase: Optional profiling sub-phase (e.g. ProfilingPhase value).
Relayed by the sidecar to the controller for kubectl visibility.
"""
"""
status_file
=
os
.
path
.
join
(
output_dir
,
STATUS_FILE_NAME
)
status_file
=
os
.
path
.
join
(
output_dir
,
STATUS_FILE_NAME
)
status_data
:
dict
[
str
,
Any
]
=
{
status_data
:
dict
[
str
,
Any
]
=
{
...
@@ -58,6 +63,8 @@ def write_profiler_status(
...
@@ -58,6 +63,8 @@ def write_profiler_status(
status_data
[
"error"
]
=
error
status_data
[
"error"
]
=
error
if
outputs
:
if
outputs
:
status_data
[
"outputs"
]
=
outputs
status_data
[
"outputs"
]
=
outputs
if
phase
:
status_data
[
"phase"
]
=
phase
.
value
try
:
try
:
with
open
(
status_file
,
"w"
)
as
f
:
with
open
(
status_file
,
"w"
)
as
f
:
...
...
deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
View file @
740130eb
...
@@ -480,11 +480,9 @@ spec:
...
@@ -480,11 +480,9 @@ spec:
type: string
type: string
- jsonPath: .status.conditions[?(@.type=="Succeeded")].reason
- jsonPath: .status.conditions[?(@.type=="Succeeded")].reason
name: Reason
name: Reason
priority: 1
type: string
type: string
- jsonPath: .status.conditions[?(@.type=="Succeeded")].message
- jsonPath: .status.conditions[?(@.type=="Succeeded")].message
name: Message
name: Message
priority: 1
type: string
type: string
- jsonPath: .status.dgdName
- jsonPath: .status.dgdName
name: DGD
name: DGD
...
...
deploy/operator/.gitignore
View file @
740130eb
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
*.so
*.so
*.dylib
*.dylib
bin/*
bin/*
tilt_bin/
Dockerfile.cross
Dockerfile.cross
#temp files
#temp files
...
@@ -28,6 +29,9 @@ go.work
...
@@ -28,6 +29,9 @@ go.work
.idea
.idea
.vscode
.vscode
*.swp
*.swp
# Tilt local settings (personal overrides)
tilt-settings.local.yaml
*.swo
*.swo
*~
*~
...
...
deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
View file @
740130eb
...
@@ -499,8 +499,8 @@ type DynamoGraphDeploymentRequestStatus struct {
...
@@ -499,8 +499,8 @@ type DynamoGraphDeploymentRequestStatus struct {
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Profiling",type=string,JSONPath=`.status.profilingPhase`
// +kubebuilder:printcolumn:name="Profiling",type=string,JSONPath=`.status.profilingPhase`
// +kubebuilder:printcolumn:name="Reason",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].reason`
,priority=1
// +kubebuilder:printcolumn:name="Reason",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].reason`
// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].message`
,priority=1
// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].message`
// +kubebuilder:printcolumn:name="DGD",type=string,JSONPath=`.status.dgdName`
// +kubebuilder:printcolumn:name="DGD",type=string,JSONPath=`.status.dgdName`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
type
DynamoGraphDeploymentRequest
struct
{
type
DynamoGraphDeploymentRequest
struct
{
...
...
deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
View file @
740130eb
...
@@ -480,11 +480,9 @@ spec:
...
@@ -480,11 +480,9 @@ spec:
type: string
type: string
- jsonPath: .status.conditions[?(@.type=="Succeeded")].reason
- jsonPath: .status.conditions[?(@.type=="Succeeded")].reason
name: Reason
name: Reason
priority: 1
type: string
type: string
- jsonPath: .status.conditions[?(@.type=="Succeeded")].message
- jsonPath: .status.conditions[?(@.type=="Succeeded")].message
name: Message
name: Message
priority: 1
type: string
type: string
- jsonPath: .status.dgdName
- jsonPath: .status.dgdName
name: DGD
name: DGD
...
...
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
View file @
740130eb
...
@@ -130,21 +130,61 @@ const (
...
@@ -130,21 +130,61 @@ const (
MessageModelCachePVCNotFound
=
"model cache PVC %s not found in namespace %s"
MessageModelCachePVCNotFound
=
"model cache PVC %s not found in namespace %s"
)
)
// shell script template for the output copier sidecar
// shell script template for the output copier sidecar.
//
// The sidecar is a continuous poller that:
// 1. During profiling: polls profiler_status.yaml every 10s, relays phase+message
// to the output ConfigMap so the controller can track sub-phase progress.
// 2. After profiler terminates: writes the final profiling output (final_config.yaml
// + profiler_status.yaml) to the same ConfigMap, preserving the phase+message keys.
const
sidecarScriptTemplate
=
`
const
sidecarScriptTemplate
=
`
set -e
set -e
set -o pipefail
set -o pipefail
# Wait for profiler container to terminate (no timeout - profiling can take hours)
STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
echo "Waiting for profiler to complete...
"
LAST_PHASE="
"
START_TIME=$(date +%s)
START_TIME=$(date +%s)
LAST_PROGRESS_LOG=$START_TIME
LAST_PROGRESS_LOG=$START_TIME
PROGRESS_INTERVAL=300
PROGRESS_INTERVAL=300
# relay_phase: read phase+message from profiler_status.yaml and write to ConfigMap.
# Only writes when the phase changes (debounce).
relay_phase() {
if [ ! -f "$STATUS_FILE" ]; then
return
fi
PHASE=$(grep "^phase:" "$STATUS_FILE" 2>/dev/null | awk '{print $2}' | tr -d '"' | tr -d "'" || true)
MESSAGE=$(grep "^message:" "$STATUS_FILE" 2>/dev/null | sed 's/^message: *//' | tr -d '"' | tr -d "'" || true)
if [ -z "$PHASE" ] || [ "$PHASE" = "$LAST_PHASE" ]; then
return
fi
echo "Phase update: $PHASE - $MESSAGE"
cat >/tmp/progress.yaml <<PEOF
apiVersion: v1
kind: ConfigMap
metadata:
name: {{.ConfigMapName}}
namespace: {{.Namespace}}
labels:
dgdr.nvidia.com/name: {{.DGDRName}}
dgdr.nvidia.com/namespace: {{.Namespace}}
nvidia.com/managed-by: dynamo-operator
data:
phase: "$PHASE"
message: "$MESSAGE"
PEOF
kubectl apply -f /tmp/progress.yaml 2>/dev/null && LAST_PHASE="$PHASE" || echo "Warning: failed to update progress ConfigMap"
}
# Main loop: poll profiler_status.yaml and wait for profiler to terminate
echo "Waiting for profiler to complete..."
while true; do
while true; do
CURRENT_TIME=$(date +%s)
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
ELAPSED=$((CURRENT_TIME - START_TIME))
# Relay phase updates to ConfigMap
relay_phase
# Log progress every 5 minutes
# Log progress every 5 minutes
if [ $((CURRENT_TIME - LAST_PROGRESS_LOG)) -ge $PROGRESS_INTERVAL ]; then
if [ $((CURRENT_TIME - LAST_PROGRESS_LOG)) -ge $PROGRESS_INTERVAL ]; then
echo "Still waiting... ($(($ELAPSED / 60)) minutes elapsed)"
echo "Still waiting... ($(($ELAPSED / 60)) minutes elapsed)"
...
@@ -157,12 +197,14 @@ while true; do
...
@@ -157,12 +197,14 @@ while true; do
echo "Profiler terminated (ran for $(($ELAPSED / 60)) minutes)"
echo "Profiler terminated (ran for $(($ELAPSED / 60)) minutes)"
break
break
fi
fi
sleep
5
sleep
10
done
done
# Final relay: pick up any last phase change written just before termination
relay_phase
# Check profiler status file (2 minute timeout)
# Check profiler status file (2 minute timeout)
echo "Checking profiler status..."
echo "Checking profiler status..."
STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
TIMEOUT=120
TIMEOUT=120
CHECK_START=$(date +%s)
CHECK_START=$(date +%s)
...
@@ -206,9 +248,13 @@ case "$STATUS" in
...
@@ -206,9 +248,13 @@ case "$STATUS" in
;;
;;
esac
esac
echo "Creating ConfigMap..."
echo "Writing profiling output to ConfigMap..."
# Read final phase+message to preserve them alongside the profiling output
FINAL_PHASE=$(grep "^phase:" "$STATUS_FILE" 2>/dev/null | awk '{print $2}' | tr -d '"' | tr -d "'" || true)
FINAL_MESSAGE=$(grep "^message:" "$STATUS_FILE" 2>/dev/null | sed 's/^message: *//' | tr -d '"' | tr -d "'" || true)
# Start building ConfigMap YAML with DGD spec
# Start building ConfigMap YAML with DGD spec + preserved phase/message
cat >/tmp/cm.yaml <<EOF
cat >/tmp/cm.yaml <<EOF
apiVersion: v1
apiVersion: v1
kind: ConfigMap
kind: ConfigMap
...
@@ -217,8 +263,11 @@ metadata:
...
@@ -217,8 +263,11 @@ metadata:
namespace: {{.Namespace}}
namespace: {{.Namespace}}
labels:
labels:
dgdr.nvidia.com/name: {{.DGDRName}}
dgdr.nvidia.com/name: {{.DGDRName}}
dgdr.nvidia.com/namespace: {{.Namespace}}
nvidia.com/managed-by: dynamo-operator
nvidia.com/managed-by: dynamo-operator
data:
data:
phase: "$FINAL_PHASE"
message: "$FINAL_MESSAGE"
{{.OutputFile}}: |
{{.OutputFile}}: |
EOF
EOF
sed 's/^/ /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml
sed 's/^/ /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml
...
@@ -242,6 +291,44 @@ kubectl apply -f /tmp/cm.yaml
...
@@ -242,6 +291,44 @@ kubectl apply -f /tmp/cm.yaml
echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
`
`
// profilingPhaseReason returns the condition Reason for a profiling sub-phase.
// By design, the ProfilingPhase string values are identical to the Reason values
// (e.g., ProfilingPhaseSweepingDecode = "SweepingDecode" = ProfilingReasonSweepingDecode).
func
profilingPhaseReason
(
phase
nvidiacomv1beta1
.
ProfilingPhase
)
string
{
if
phase
==
nvidiacomv1beta1
.
ProfilingPhaseDone
{
return
nvidiacomv1beta1
.
ProfilingReasonCompleted
}
return
string
(
phase
)
}
// profilingPhaseFailureReason returns the condition Reason for a failed profiling sub-phase.
// By convention, failure reasons are "<Phase>Failed" (e.g., "SweepingDecodeFailed").
// An empty phase yields the generic "ProfilingFailed".
func
profilingPhaseFailureReason
(
phase
nvidiacomv1beta1
.
ProfilingPhase
)
string
{
if
phase
==
""
{
return
"ProfilingFailed"
}
return
string
(
phase
)
+
"Failed"
}
// validProfilingPhases is the set of phases the profiler sidecar may report.
var
validProfilingPhases
=
map
[
nvidiacomv1beta1
.
ProfilingPhase
]
struct
{}{
nvidiacomv1beta1
.
ProfilingPhaseInitializing
:
{},
nvidiacomv1beta1
.
ProfilingPhaseSweepingPrefill
:
{},
nvidiacomv1beta1
.
ProfilingPhaseSweepingDecode
:
{},
nvidiacomv1beta1
.
ProfilingPhaseSelectingConfig
:
{},
nvidiacomv1beta1
.
ProfilingPhaseBuildingCurves
:
{},
nvidiacomv1beta1
.
ProfilingPhaseGeneratingDGD
:
{},
nvidiacomv1beta1
.
ProfilingPhaseDone
:
{},
}
// isValidProfilingPhase returns true if phase is a recognized ProfilingPhase value.
func
isValidProfilingPhase
(
phase
string
)
bool
{
_
,
ok
:=
validProfilingPhases
[
nvidiacomv1beta1
.
ProfilingPhase
(
phase
)]
return
ok
}
// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
type
DynamoGraphDeploymentRequestReconciler
struct
{
type
DynamoGraphDeploymentRequestReconciler
struct
{
client
.
Client
client
.
Client
...
@@ -389,9 +476,66 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.
...
@@ -389,9 +476,66 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeNormal
,
nvidiacomv1beta1
.
EventReasonProfilingJobCreated
,
MessageAICProfilingJobCreated
)
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeNormal
,
nvidiacomv1beta1
.
EventReasonProfilingJobCreated
,
MessageAICProfilingJobCreated
)
}
}
// Update to Profiling phase — show DiscoveringHardware until the job is confirmed running.
// Update to Profiling phase — use Initializing reason to indicate the profiler is loading.
dgdr
.
SetProfilingPhase
(
nvidiacomv1beta1
.
ProfilingPhaseInitializing
)
dgdr
.
SetProfilingPhase
(
nvidiacomv1beta1
.
ProfilingPhaseInitializing
)
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseProfiling
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"DiscoveringHardware"
,
MessageDiscoveringHardware
)
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseProfiling
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
nvidiacomv1beta1
.
ProfilingReasonInitializing
,
MessageDiscoveringHardware
)
}
// updateProfilingSubPhase reads the output ConfigMap and updates status.profilingPhase
// and the Profiling/Succeeded conditions. The sidecar continuously polls profiler_status.yaml
// and writes phase+message to the output ConfigMap (dgdr-output-<name>). This function
// reads those keys and copies them verbatim into the DGDR status.
func
(
r
*
DynamoGraphDeploymentRequestReconciler
)
updateProfilingSubPhase
(
ctx
context
.
Context
,
dgdr
*
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
,
)
error
{
logger
:=
log
.
FromContext
(
ctx
)
outputCMName
:=
getOutputConfigMapName
(
dgdr
)
cm
:=
&
corev1
.
ConfigMap
{}
if
err
:=
r
.
Get
(
ctx
,
types
.
NamespacedName
{
Name
:
outputCMName
,
Namespace
:
dgdr
.
Namespace
,
},
cm
);
err
!=
nil
{
return
nil
// No output ConfigMap yet — skip
}
phase
,
exists
:=
cm
.
Data
[
"phase"
]
if
!
exists
||
phase
==
""
{
return
nil
}
if
!
isValidProfilingPhase
(
phase
)
{
return
fmt
.
Errorf
(
"invalid profiling phase %q in ConfigMap %s"
,
phase
,
outputCMName
)
}
profilingPhase
:=
nvidiacomv1beta1
.
ProfilingPhase
(
phase
)
if
dgdr
.
Status
.
ProfilingPhase
==
profilingPhase
{
return
nil
// No change
}
logger
.
Info
(
"Profiling sub-phase updated"
,
"phase"
,
phase
)
dgdr
.
SetProfilingPhase
(
profilingPhase
)
// Reason is derived from phase; message comes from the profiler via ConfigMap.
reason
:=
profilingPhaseReason
(
profilingPhase
)
message
:=
cm
.
Data
[
"message"
]
// written by profiler, relayed by sidecar
meta
.
SetStatusCondition
(
&
dgdr
.
Status
.
Conditions
,
metav1
.
Condition
{
Type
:
nvidiacomv1beta1
.
ConditionTypeProfiling
,
Status
:
metav1
.
ConditionFalse
,
ObservedGeneration
:
dgdr
.
Generation
,
Reason
:
reason
,
Message
:
message
,
})
meta
.
SetStatusCondition
(
&
dgdr
.
Status
.
Conditions
,
metav1
.
Condition
{
Type
:
nvidiacomv1beta1
.
ConditionTypeSucceeded
,
Status
:
metav1
.
ConditionFalse
,
ObservedGeneration
:
dgdr
.
Generation
,
Reason
:
reason
,
Message
:
message
,
})
return
r
.
Status
()
.
Update
(
ctx
,
dgdr
)
}
}
// handleProfilingPhase monitors profiling progress and generates spec when complete
// handleProfilingPhase monitors profiling progress and generates spec when complete
...
@@ -399,21 +543,54 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex
...
@@ -399,21 +543,54 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex
logger
:=
log
.
FromContext
(
ctx
)
logger
:=
log
.
FromContext
(
ctx
)
logger
.
Info
(
"Handling profiling phase"
,
"name"
,
dgdr
.
Name
)
logger
.
Info
(
"Handling profiling phase"
,
"name"
,
dgdr
.
Name
)
// Check for sub-phase updates from output ConfigMap (populated by sidecar poller)
if
err
:=
r
.
updateProfilingSubPhase
(
ctx
,
dgdr
);
err
!=
nil
{
return
ctrl
.
Result
{},
err
}
// Check profiling job status (both online and offline/AIC run as Jobs)
// Check profiling job status (both online and offline/AIC run as Jobs)
// Note: We watch the Job via Owns(), so we'll be triggered automatically on Job changes
// Note: We watch the Job via Owns(), so we'll be triggered automatically on Job changes
completed
,
err
:=
r
.
checkProfilingJobStatus
(
ctx
,
dgdr
)
completed
,
err
:=
r
.
checkProfilingJobStatus
(
ctx
,
dgdr
)
if
err
!=
nil
{
if
err
!=
nil
{
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeWarning
,
MessageProfilingCheckFailed
,
err
.
Error
())
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeWarning
,
MessageProfilingCheckFailed
,
err
.
Error
())
// Job failed - clear profiling sub-phase and transition to Failed
// Job failed - keep profilingPhase set so users can see where it died.
dgdr
.
ClearProfilingPhase
()
// profilingPhase is already current: set to Initializing on entry,
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseFailed
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"ProfilingFailed"
,
err
.
Error
())
// then updated by updateProfilingSubPhase() above (reads output ConfigMap).
failureReason
:=
"ProfilingFailed"
failureMessage
:=
err
.
Error
()
if
dgdr
.
Status
.
ProfilingPhase
!=
""
{
failureReason
=
profilingPhaseFailureReason
(
dgdr
.
Status
.
ProfilingPhase
)
}
// Set phase and conditions directly so we can use sub-phase-specific failure
// reason on both Profiling and Succeeded conditions. (updatePhaseWithCondition
// would hardcode Succeeded reason to generic "Failed".)
dgdr
.
Status
.
Phase
=
nvidiacomv1beta1
.
DGDRPhaseFailed
meta
.
SetStatusCondition
(
&
dgdr
.
Status
.
Conditions
,
metav1
.
Condition
{
Type
:
nvidiacomv1beta1
.
ConditionTypeSucceeded
,
Status
:
metav1
.
ConditionFalse
,
ObservedGeneration
:
dgdr
.
Generation
,
Reason
:
failureReason
,
Message
:
failureMessage
,
})
dgdr
.
AddStatusCondition
(
metav1
.
Condition
{
Type
:
nvidiacomv1beta1
.
ConditionTypeProfiling
,
Status
:
metav1
.
ConditionFalse
,
ObservedGeneration
:
dgdr
.
Generation
,
Reason
:
failureReason
,
Message
:
failureMessage
,
})
if
err
:=
r
.
Status
()
.
Update
(
ctx
,
dgdr
);
err
!=
nil
{
return
ctrl
.
Result
{},
err
}
return
ctrl
.
Result
{
Requeue
:
true
},
nil
}
}
if
!
completed
{
if
!
completed
{
logger
.
Info
(
"Profiling job still running"
,
"name"
,
dgdr
.
Name
)
logger
.
Info
(
"Profiling job still running"
,
"name"
,
dgdr
.
Name
)
// Transition from
DiscoveringHardware
to ProfilingRunning once the job is confirmed active.
// Transition from
Initializing
to ProfilingRunning once the job is confirmed active.
cond
:=
meta
.
FindStatusCondition
(
dgdr
.
Status
.
Conditions
,
nvidiacomv1beta1
.
ConditionTypeProfiling
)
cond
:=
meta
.
FindStatusCondition
(
dgdr
.
Status
.
Conditions
,
nvidiacomv1beta1
.
ConditionTypeProfiling
)
if
cond
!=
nil
&&
cond
.
Reason
==
"DiscoveringHardware"
{
if
cond
!=
nil
&&
cond
.
Reason
==
nvidiacomv1beta1
.
ProfilingReasonInitializing
{
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseProfiling
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"ProfilingRunning"
,
MessageProfilingInProgress
)
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseProfiling
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"ProfilingRunning"
,
MessageProfilingInProgress
)
}
}
// Don't requeue - we'll be triggered when the Job completes/fails
// Don't requeue - we'll be triggered when the Job completes/fails
...
@@ -1784,6 +1961,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
...
@@ -1784,6 +1961,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
UpdateFunc
:
func
(
de
event
.
UpdateEvent
)
bool
{
return
true
},
UpdateFunc
:
func
(
de
event
.
UpdateEvent
)
bool
{
return
true
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
}))
.
// Watch Jobs created by this controller (via ownerReference)
}))
.
// Watch Jobs created by this controller (via ownerReference)
// Watch DGDs created by this controller (via label)
Watches
(
Watches
(
&
dgdv1alpha1
.
DynamoGraphDeployment
{},
&
dgdv1alpha1
.
DynamoGraphDeployment
{},
handler
.
EnqueueRequestsFromMapFunc
(
func
(
ctx
context
.
Context
,
obj
client
.
Object
)
[]
ctrl
.
Request
{
handler
.
EnqueueRequestsFromMapFunc
(
func
(
ctx
context
.
Context
,
obj
client
.
Object
)
[]
ctrl
.
Request
{
...
@@ -1809,7 +1987,41 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
...
@@ -1809,7 +1987,41 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
}),
}),
)
.
)
.
// Watch DGDs created by this controller (via label)
// Watch output ConfigMaps for profiling sub-phase updates (via label)
Watches
(
&
corev1
.
ConfigMap
{},
handler
.
EnqueueRequestsFromMapFunc
(
func
(
ctx
context
.
Context
,
obj
client
.
Object
)
[]
ctrl
.
Request
{
// Only trigger for ConfigMaps with DGDR labels (written by the sidecar)
cm
:=
obj
.
(
*
corev1
.
ConfigMap
)
dgdrName
,
hasName
:=
cm
.
Labels
[
nvidiacomv1beta1
.
LabelDGDRName
]
dgdrNamespace
,
hasNamespace
:=
cm
.
Labels
[
nvidiacomv1beta1
.
LabelDGDRNamespace
]
if
!
hasName
||
!
hasNamespace
{
return
nil
}
return
[]
ctrl
.
Request
{{
NamespacedName
:
types
.
NamespacedName
{
Name
:
dgdrName
,
Namespace
:
dgdrNamespace
,
},
}}
}),
builder
.
WithPredicates
(
predicate
.
Funcs
{
CreateFunc
:
func
(
ce
event
.
CreateEvent
)
bool
{
labels
:=
ce
.
Object
.
GetLabels
()
_
,
hasName
:=
labels
[
nvidiacomv1beta1
.
LabelDGDRName
]
_
,
hasNamespace
:=
labels
[
nvidiacomv1beta1
.
LabelDGDRNamespace
]
return
hasName
&&
hasNamespace
},
UpdateFunc
:
func
(
ue
event
.
UpdateEvent
)
bool
{
labels
:=
ue
.
ObjectNew
.
GetLabels
()
_
,
hasName
:=
labels
[
nvidiacomv1beta1
.
LabelDGDRName
]
_
,
hasNamespace
:=
labels
[
nvidiacomv1beta1
.
LabelDGDRNamespace
]
return
hasName
&&
hasNamespace
},
DeleteFunc
:
func
(
de
event
.
DeleteEvent
)
bool
{
return
false
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
false
},
}),
)
.
// Set the event filter to ignore resources handled by other controllers in namespace-restricted mode
// Set the event filter to ignore resources handled by other controllers in namespace-restricted mode
WithEventFilter
(
commonController
.
EphemeralDeploymentEventFilter
(
r
.
Config
,
r
.
RuntimeConfig
))
.
WithEventFilter
(
commonController
.
EphemeralDeploymentEventFilter
(
r
.
Config
,
r
.
RuntimeConfig
))
.
Complete
(
observability
.
NewObservedReconciler
(
r
,
consts
.
ResourceTypeDynamoGraphDeploymentRequest
))
Complete
(
observability
.
NewObservedReconciler
(
r
,
consts
.
ResourceTypeDynamoGraphDeploymentRequest
))
...
...
deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
View file @
740130eb
This diff is collapsed.
Click to expand it.
deploy/utils/dynamo_deployment.py
View file @
740130eb
...
@@ -240,10 +240,11 @@ class DynamoDeploymentClient:
...
@@ -240,10 +240,11 @@ class DynamoDeploymentClient:
self
.
deployment_spec
is
not
None
self
.
deployment_spec
is
not
None
),
"Failed to load deployment specification"
),
"Failed to load deployment specification"
# Extract component names
# Extract component names (original case for label queries, lowercase for directories)
self
.
components
=
[
self
.
_original_components
=
list
(
svc
.
lower
()
for
svc
in
self
.
deployment_spec
[
"spec"
][
"services"
].
keys
()
self
.
deployment_spec
[
"spec"
][
"services"
].
keys
()
]
)
self
.
components
=
[
svc
.
lower
()
for
svc
in
self
.
_original_components
]
# Ensure name and namespace are set correctly
# Ensure name and namespace are set correctly
self
.
deployment_spec
[
"metadata"
][
"name"
]
=
self
.
deployment_name
self
.
deployment_spec
[
"metadata"
][
"name"
]
=
self
.
deployment_name
...
@@ -450,14 +451,17 @@ class DynamoDeploymentClient:
...
@@ -450,14 +451,17 @@ class DynamoDeploymentClient:
base_dir
=
self
.
base_log_dir
/
self
.
deployment_name
base_dir
=
self
.
base_log_dir
/
self
.
deployment_name
base_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
base_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
for
component
in
self
.
components
:
for
component
,
original_name
in
zip
(
self
.
components
,
self
.
_original_components
)
:
component_dir
=
base_dir
/
component
component_dir
=
base_dir
/
component
component_dir
.
mkdir
(
exist_ok
=
True
)
component_dir
.
mkdir
(
exist_ok
=
True
)
# List pods for this component using the selector label
# Use DGD name + component name labels which are consistent across
# nvidia.com/selector: deployment-name-component
# both Grove (PodCliqueSet) and non-Grove (DCD) deployment pathways.
# The previous nvidia.com/selector label includes a worker hash suffix
# on the DCD pathway, causing a mismatch with the expected base name.
label_selector
=
(
label_selector
=
(
f
"nvidia.com/selector=
{
self
.
deployment_name
}
-
{
component
.
lower
()
}
"
f
"nvidia.com/dynamo-graph-deployment-name=
{
self
.
deployment_name
}
,"
f
"nvidia.com/dynamo-component=
{
original_name
}
"
)
)
pods
=
await
self
.
core_api
.
list_namespaced_pod
(
pods
=
await
self
.
core_api
.
list_namespaced_pod
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment