Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
b97fde10
Unverified
Commit
b97fde10
authored
Mar 05, 2026
by
hhzhang16
Committed by
GitHub
Mar 06, 2026
Browse files
fix: propagate tolerations and cap auto-discovered GPUs (#6947)
Signed-off-by:
Hannah Zhang
<
hannahz@nvidia.com
>
parent
503d042a
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
98 additions
and
4 deletions
+98
-4
components/src/dynamo/profiler/profile_sla.py
components/src/dynamo/profiler/profile_sla.py
+25
-0
components/src/dynamo/profiler/thorough.py
components/src/dynamo/profiler/thorough.py
+20
-0
components/src/dynamo/profiler/utils/profile_common.py
components/src/dynamo/profiler/utils/profile_common.py
+33
-0
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
...nal/controller/dynamographdeploymentrequest_controller.go
+20
-4
No files found.
components/src/dynamo/profiler/profile_sla.py
View file @
b97fde10
...
@@ -44,6 +44,8 @@ from dynamo.profiler.utils.dgdr_validate import (
...
@@ -44,6 +44,8 @@ from dynamo.profiler.utils.dgdr_validate import (
from
dynamo.profiler.utils.profile_common
import
(
from
dynamo.profiler.utils.profile_common
import
(
ProfilerOperationalConfig
,
ProfilerOperationalConfig
,
determine_picking_mode
,
determine_picking_mode
,
get_profiling_job_tolerations
,
inject_tolerations_into_dgd
,
needs_profile_data
,
needs_profile_data
,
picked_config_from_row
,
picked_config_from_row
,
resolve_model_path
,
resolve_model_path
,
...
@@ -57,6 +59,17 @@ logger = logging.getLogger(__name__)
...
@@ -57,6 +59,17 @@ logger = logging.getLogger(__name__)
_CONCRETE_BACKENDS
=
[
"trtllm"
,
"sglang"
,
"vllm"
]
_CONCRETE_BACKENDS
=
[
"trtllm"
,
"sglang"
,
"vllm"
]
def
_apply_tolerations_to_final_config
(
final_config
:
Any
,
tolerations
:
list
)
->
Any
:
"""Apply tolerations to a final DGD config (dict or multi-doc list)."""
if
not
tolerations
or
not
final_config
:
return
final_config
if
isinstance
(
final_config
,
list
):
result
=
list
(
final_config
)
result
[
-
1
]
=
inject_tolerations_into_dgd
(
result
[
-
1
],
tolerations
)
return
result
return
inject_tolerations_into_dgd
(
final_config
,
tolerations
)
def
_check_auto_backend_support
(
model
:
str
,
system
:
str
)
->
bool
:
def
_check_auto_backend_support
(
model
:
str
,
system
:
str
)
->
bool
:
"""
"""
Return True if *any* concrete backend is AIC-supported for this model/system.
Return True if *any* concrete backend is AIC-supported for this model/system.
...
@@ -370,6 +383,18 @@ async def run_profile(
...
@@ -370,6 +383,18 @@ async def run_profile(
elif
isinstance
(
final_config
,
dict
):
elif
isinstance
(
final_config
,
dict
):
final_config
=
apply_dgd_overrides
(
final_config
,
dgdr
.
overrides
.
dgd
)
final_config
=
apply_dgd_overrides
(
final_config
,
dgdr
.
overrides
.
dgd
)
logger
.
info
(
"Applied DGD overrides to the final config."
)
logger
.
info
(
"Applied DGD overrides to the final config."
)
# Propagate profiling-job tolerations to the final DGD
job_tolerations
=
get_profiling_job_tolerations
(
dgdr
)
if
job_tolerations
and
final_config
:
final_config
=
_apply_tolerations_to_final_config
(
final_config
,
job_tolerations
)
logger
.
debug
(
"Propagated %d profiling-job toleration(s) to the final DGD config."
,
len
(
job_tolerations
),
)
if
not
_write_final_output
(
ops
,
final_config
):
if
not
_write_final_output
(
ops
,
final_config
):
return
return
...
...
components/src/dynamo/profiler/thorough.py
View file @
b97fde10
...
@@ -47,6 +47,8 @@ from dynamo.profiler.utils.dgdr_v1beta1_types import (
...
@@ -47,6 +47,8 @@ from dynamo.profiler.utils.dgdr_v1beta1_types import (
from
dynamo.profiler.utils.profile_common
import
(
from
dynamo.profiler.utils.profile_common
import
(
ProfilerOperationalConfig
,
ProfilerOperationalConfig
,
derive_backend_image
,
derive_backend_image
,
get_profiling_job_tolerations
,
inject_tolerations_into_dgd
,
)
)
from
dynamo.profiler.utils.profile_decode
import
get_num_request_range
from
dynamo.profiler.utils.profile_decode
import
get_num_request_range
...
@@ -354,6 +356,24 @@ async def run_thorough(
...
@@ -354,6 +356,24 @@ async def run_thorough(
len
(
decode_candidates
),
len
(
decode_candidates
),
)
)
# Propagate profiling-job tolerations to candidate DGDs
job_tolerations
=
get_profiling_job_tolerations
(
dgdr
)
if
job_tolerations
:
for
candidate
in
prefill_candidates
:
candidate
.
dgd_config
=
inject_tolerations_into_dgd
(
candidate
.
dgd_config
,
job_tolerations
)
for
candidate
in
decode_candidates
:
candidate
.
dgd_config
=
inject_tolerations_into_dgd
(
candidate
.
dgd_config
,
job_tolerations
)
logger
.
debug
(
"Propagated %d profiling-job toleration(s) to %d prefill + %d decode candidates."
,
len
(
job_tolerations
),
len
(
prefill_candidates
),
len
(
decode_candidates
),
)
config_modifier
=
CONFIG_MODIFIERS
[
backend
]
config_modifier
=
CONFIG_MODIFIERS
[
backend
]
# --- Stage 2: Benchmarking ---
# --- Stage 2: Benchmarking ---
...
...
components/src/dynamo/profiler/utils/profile_common.py
View file @
b97fde10
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
"""Shared helpers and configuration for the profiler pipeline."""
"""Shared helpers and configuration for the profiler pipeline."""
import
copy
import
logging
import
logging
import
os
import
os
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
...
@@ -231,3 +232,35 @@ def warn_gpu_shortage(
...
@@ -231,3 +232,35 @@ def warn_gpu_shortage(
gpus_needed
,
gpus_needed
,
total_gpus
,
total_gpus
,
)
)
def
get_profiling_job_tolerations
(
dgdr
:
DynamoGraphDeploymentRequestSpec
)
->
list
:
"""Return tolerations from overrides.profilingJob.template.spec.tolerations."""
try
:
if
dgdr
.
overrides
is
None
or
dgdr
.
overrides
.
profilingJob
is
None
:
return
[]
return
(
dgdr
.
overrides
.
profilingJob
.
get
(
"template"
,
{})
.
get
(
"spec"
,
{})
.
get
(
"tolerations"
,
[])
)
except
(
AttributeError
,
KeyError
):
return
[]
def
inject_tolerations_into_dgd
(
dgd_config
:
dict
,
tolerations
:
list
)
->
dict
:
"""Add tolerations to every service's extraPodSpec in a DGD config dict.
Tolerations already present in a service are preserved; only new entries
(by identity) are appended. Returns a deep copy with tolerations applied.
"""
result
=
copy
.
deepcopy
(
dgd_config
)
for
_svc_name
,
svc
in
result
.
get
(
"spec"
,
{}).
get
(
"services"
,
{}).
items
():
if
not
isinstance
(
svc
,
dict
):
continue
eps
=
svc
.
setdefault
(
"extraPodSpec"
,
{})
existing
=
eps
.
get
(
"tolerations"
,
[])
new_entries
=
[
t
for
t
in
tolerations
if
t
not
in
existing
]
if
new_entries
:
eps
[
"tolerations"
]
=
list
(
existing
)
+
new_entries
return
result
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
View file @
b97fde10
...
@@ -106,6 +106,7 @@ const (
...
@@ -106,6 +106,7 @@ const (
// Messages
// Messages
MessageInitialized
=
"DGDR initialized successfully"
MessageInitialized
=
"DGDR initialized successfully"
MessageDiscoveringHardware
=
"Discovering GPU hardware and preparing profiling job"
MessageProfilingJobCreated
=
"Profiling job created"
MessageProfilingJobCreated
=
"Profiling job created"
MessageAICProfilingJobCreated
=
"AIC profiling job created"
MessageAICProfilingJobCreated
=
"AIC profiling job created"
MessageProfilingInProgress
=
"Profiling is in progress"
MessageProfilingInProgress
=
"Profiling is in progress"
...
@@ -358,9 +359,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.
...
@@ -358,9 +359,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.
// Set observedGeneration to track the spec we're processing
// Set observedGeneration to track the spec we're processing
dgdr
.
Status
.
ObservedGeneration
=
dgdr
.
Generation
dgdr
.
Status
.
ObservedGeneration
=
dgdr
.
Generation
// Initialize status
// Initialize status
— next reconcile will discover hardware and create the profiling job.
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeNormal
,
nvidiacomv1beta1
.
EventReasonInitialized
,
MessageInitialized
)
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeNormal
,
nvidiacomv1beta1
.
EventReasonInitialized
,
MessageInitialized
)
return
r
.
updatePhaseAndRequeue
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhasePending
,
MessageInitialized
)
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhasePending
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"DiscoveringHardware"
,
MessageDiscoveringHardware
)
}
}
logger
.
Info
(
"Handling pending phase"
,
"name"
,
dgdr
.
Name
)
logger
.
Info
(
"Handling pending phase"
,
"name"
,
dgdr
.
Name
)
...
@@ -378,9 +381,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.
...
@@ -378,9 +381,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeNormal
,
nvidiacomv1beta1
.
EventReasonProfilingJobCreated
,
MessageAICProfilingJobCreated
)
r
.
Recorder
.
Event
(
dgdr
,
corev1
.
EventTypeNormal
,
nvidiacomv1beta1
.
EventReasonProfilingJobCreated
,
MessageAICProfilingJobCreated
)
}
}
// Update to Profiling phase
with Running status
// Update to Profiling phase
— show DiscoveringHardware until the job is confirmed running.
dgdr
.
SetProfilingPhase
(
nvidiacomv1beta1
.
ProfilingPhaseInitializing
)
dgdr
.
SetProfilingPhase
(
nvidiacomv1beta1
.
ProfilingPhaseInitializing
)
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseProfiling
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"
ProfilingRunning"
,
MessageProfilingInProgress
)
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseProfiling
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"
DiscoveringHardware"
,
MessageDiscoveringHardware
)
}
}
// handleProfilingPhase monitors profiling progress and generates spec when complete
// handleProfilingPhase monitors profiling progress and generates spec when complete
...
@@ -400,6 +403,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex
...
@@ -400,6 +403,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex
if
!
completed
{
if
!
completed
{
logger
.
Info
(
"Profiling job still running"
,
"name"
,
dgdr
.
Name
)
logger
.
Info
(
"Profiling job still running"
,
"name"
,
dgdr
.
Name
)
// Transition from DiscoveringHardware to ProfilingRunning once the job is confirmed active.
cond
:=
meta
.
FindStatusCondition
(
dgdr
.
Status
.
Conditions
,
nvidiacomv1beta1
.
ConditionTypeProfiling
)
if
cond
!=
nil
&&
cond
.
Reason
==
"DiscoveringHardware"
{
return
r
.
updatePhaseWithCondition
(
ctx
,
dgdr
,
nvidiacomv1beta1
.
DGDRPhaseProfiling
,
nvidiacomv1beta1
.
ConditionTypeProfiling
,
metav1
.
ConditionFalse
,
"ProfilingRunning"
,
MessageProfilingInProgress
)
}
// Don't requeue - we'll be triggered when the Job completes/fails
// Don't requeue - we'll be triggered when the Job completes/fails
return
ctrl
.
Result
{},
nil
return
ctrl
.
Result
{},
nil
}
}
...
@@ -1197,7 +1205,15 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
...
@@ -1197,7 +1205,15 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
hw
.
NumGPUsPerNode
=
&
n
hw
.
NumGPUsPerNode
=
&
n
}
}
if
hw
.
TotalGPUs
==
nil
{
if
hw
.
TotalGPUs
==
nil
{
// TODO: This is a temporary limit to prevent the profiler from using too many GPUs.
// Will be removed once a fix is in the Profiler/AIC.
const
defaultMaxAutoGPUs
=
int32
(
32
)
total
:=
int32
(
gpuInfo
.
GPUsPerNode
*
gpuInfo
.
NodesWithGPUs
)
total
:=
int32
(
gpuInfo
.
GPUsPerNode
*
gpuInfo
.
NodesWithGPUs
)
if
total
>
defaultMaxAutoGPUs
{
logger
.
Info
(
"Capping auto-discovered TotalGPUs at default limit; set hardware.totalGpus to override"
,
"discovered"
,
total
,
"cap"
,
defaultMaxAutoGPUs
)
total
=
defaultMaxAutoGPUs
}
hw
.
TotalGPUs
=
&
total
hw
.
TotalGPUs
=
&
total
}
}
return
nil
return
nil
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment