Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
135bce43
Unverified
Commit
135bce43
authored
Dec 02, 2025
by
Hongkuan Zhou
Committed by
GitHub
Dec 02, 2025
Browse files
chore: clearer prometheus port handling in planner (#4707)
Signed-off-by:
hongkuanz
<
hongkuanz@nvidia.com
>
parent
7e499b5c
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
74 additions
and
50 deletions
+74
-50
benchmarks/profiler/utils/dgd_generation.py
benchmarks/profiler/utils/dgd_generation.py
+2
-5
benchmarks/profiler/utils/planner_utils.py
benchmarks/profiler/utils/planner_utils.py
+50
-5
components/src/dynamo/planner/defaults.py
components/src/dynamo/planner/defaults.py
+9
-33
components/src/dynamo/planner/utils/planner_argparse.py
components/src/dynamo/planner/utils/planner_argparse.py
+10
-4
components/src/dynamo/planner/utils/planner_core.py
components/src/dynamo/planner/utils/planner_core.py
+3
-3
No files found.
benchmarks/profiler/utils/dgd_generation.py
View file @
135bce43
...
...
@@ -139,6 +139,8 @@ def generate_dgd_config_with_planner(
# Override profiling-specific arguments with results from profiling
# Remove and re-add to ensure correct values from profiling context
# Note: --namespace is NOT added here; planner gets it from DYN_NAMESPACE env var
# which is automatically injected by the operator based on dynamoNamespace
planner_args
=
[
arg
for
arg
in
planner_args
...
...
@@ -154,11 +156,9 @@ def generate_dgd_config_with_planner(
]
# Add arguments determined by profiling results
frontend_namespace
=
getattr
(
config
.
spec
.
services
[
"Frontend"
],
"dynamoNamespace"
,
"dynamo"
)
# type: ignore[attr-defined]
cm_mount_path
=
f
"
{
get_workspace_dir
()
}
/profiling_results"
planner_args
.
extend
(
[
f
"--namespace=
{
frontend_namespace
}
"
,
f
"--prefill-engine-num-gpu=
{
best_prefill_mapping
.
get_num_gpus
()
}
"
,
f
"--decode-engine-num-gpu=
{
best_decode_mapping
.
get_num_gpus
()
}
"
,
f
"--profile-results-dir=
{
cm_mount_path
}
"
,
...
...
@@ -375,7 +375,6 @@ def _generate_mocker_config_with_planner(
# Update planner's dynamoNamespace to match mocker's namespace
mocker_planner_dict
[
"dynamoNamespace"
]
=
mocker_namespace
# Override --backend to mocker and --namespace to match mocker's dynamoNamespace
# Planner args use --key=value format, so we need to find and replace
planner_main_container
=
mocker_planner_dict
.
get
(
"extraPodSpec"
,
{}).
get
(
"mainContainer"
,
{}
...
...
@@ -385,8 +384,6 @@ def _generate_mocker_config_with_planner(
for
arg
in
planner_args
:
if
arg
.
startswith
(
"--backend="
):
updated_planner_args
.
append
(
"--backend=mocker"
)
elif
arg
.
startswith
(
"--namespace="
):
updated_planner_args
.
append
(
f
"--namespace=
{
mocker_namespace
}
"
)
else
:
updated_planner_args
.
append
(
arg
)
planner_main_container
[
"args"
]
=
updated_planner_args
...
...
benchmarks/profiler/utils/planner_utils.py
View file @
135bce43
...
...
@@ -14,6 +14,7 @@
# limitations under the License.
import
argparse
from
typing
import
Any
from
dynamo.planner.utils.planner_argparse
import
create_sla_planner_parser
...
...
@@ -79,13 +80,35 @@ def _build_action_kwargs(
return
kwargs
def
_format_arg_for_command_line
(
arg_name
:
str
,
value
)
->
list
[
str
]:
def
_get_planner_defaults
()
->
dict
[
str
,
Any
]:
"""
Get default values for all planner arguments from the planner parser.
Returns:
Dictionary mapping argument names (with dashes) to their default values
"""
planner_parser
=
create_sla_planner_parser
()
defaults
=
{}
for
action
in
planner_parser
.
_actions
:
if
action
.
dest
==
"help"
or
not
action
.
option_strings
:
continue
# Convert dest (underscores) to arg name (dashes)
arg_name
=
action
.
dest
.
replace
(
"_"
,
"-"
)
defaults
[
arg_name
]
=
action
.
default
return
defaults
def
_format_arg_for_command_line
(
arg_name
:
str
,
value
,
defaults
:
dict
[
str
,
Any
]
|
None
=
None
)
->
list
[
str
]:
"""
Format an argument name and value for command line usage.
Args:
arg_name: The argument name (without dashes)
value: The argument value
defaults: Optional dict of default values. If provided and value matches
the default, the arg is skipped (allows operator env vars to take effect)
Returns:
List of command-line argument strings (empty list if value is None or False bool)
...
...
@@ -93,6 +116,13 @@ def _format_arg_for_command_line(arg_name: str, value) -> list[str]:
if
value
is
None
:
return
[]
# Skip args that match their default values
# This allows the operator's injected env vars to take effect
# (e.g., PLANNER_PROMETHEUS_PORT=9085 won't be overridden by --prometheus-port=0)
if
defaults
is
not
None
and
arg_name
in
defaults
:
if
value
==
defaults
[
arg_name
]:
return
[]
if
isinstance
(
value
,
bool
):
# For boolean flags, only add if True
if
value
:
...
...
@@ -104,7 +134,10 @@ def _format_arg_for_command_line(arg_name: str, value) -> list[str]:
def
_collect_args_from_namespace
(
args
:
argparse
.
Namespace
,
arg_names
:
list
[
str
],
prefix_to_strip
:
str
=
""
args
:
argparse
.
Namespace
,
arg_names
:
list
[
str
],
prefix_to_strip
:
str
=
""
,
defaults
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
str
]:
"""
Collect and format command-line arguments from a namespace for given attribute names.
...
...
@@ -113,6 +146,7 @@ def _collect_args_from_namespace(
args: The argparse Namespace containing parsed arguments
arg_names: List of attribute names to collect from the namespace
prefix_to_strip: Optional prefix to remove from attribute names before formatting
defaults: Optional dict of default values. Args matching defaults are skipped.
Returns:
List of formatted command-line argument strings
...
...
@@ -125,7 +159,7 @@ def _collect_args_from_namespace(
arg_name
=
attr_name
[
len
(
prefix_to_strip
)
:].
replace
(
"_"
,
"-"
)
else
:
arg_name
=
attr_name
.
replace
(
"_"
,
"-"
)
result
.
extend
(
_format_arg_for_command_line
(
arg_name
,
value
))
result
.
extend
(
_format_arg_for_command_line
(
arg_name
,
value
,
defaults
))
return
result
...
...
@@ -182,6 +216,9 @@ def build_planner_args_from_namespace(
Automatically detects shared arguments between profile_sla and planner,
and uses profile_sla values for those.
Args that match their default values are skipped, allowing the operator's
injected environment variables to take effect (e.g., PLANNER_PROMETHEUS_PORT).
Args:
args: Parsed arguments namespace
prefix: Prefix used for planner arguments
...
...
@@ -191,6 +228,10 @@ def build_planner_args_from_namespace(
"""
planner_args
=
[]
# Get default values to skip args that match defaults
# This allows operator-injected env vars to take effect
defaults
=
_get_planner_defaults
()
# Auto-detect shared arguments by comparing planner parser with args namespace
planner_parser
=
create_sla_planner_parser
()
planner_arg_dests
=
{
...
...
@@ -204,12 +245,16 @@ def build_planner_args_from_namespace(
shared_arg_dests
=
{
dest
for
dest
in
planner_arg_dests
if
hasattr
(
args
,
dest
)}
# Add shared arguments from profile_sla (without prefix)
planner_args
.
extend
(
_collect_args_from_namespace
(
args
,
list
(
shared_arg_dests
)))
planner_args
.
extend
(
_collect_args_from_namespace
(
args
,
list
(
shared_arg_dests
),
defaults
=
defaults
)
)
# Get all planner-prefixed attributes from args (planner-specific only)
prefixed_attrs
=
[
attr
for
attr
in
dir
(
args
)
if
attr
.
startswith
(
prefix
)]
planner_args
.
extend
(
_collect_args_from_namespace
(
args
,
prefixed_attrs
,
prefix_to_strip
=
prefix
)
_collect_args_from_namespace
(
args
,
prefixed_attrs
,
prefix_to_strip
=
prefix
,
defaults
=
defaults
)
)
return
planner_args
components/src/dynamo/planner/defaults.py
View file @
135bce43
...
...
@@ -21,7 +21,6 @@ from typing import Optional
from
pydantic
import
BaseModel
from
dynamo.planner.kube
import
get_current_k8s_namespace
from
dynamo.planner.utils.exceptions
import
(
DuplicateSubComponentError
,
SubComponentNotFoundError
,
...
...
@@ -32,17 +31,10 @@ configure_dynamo_logging()
logger
=
logging
.
getLogger
(
__name__
)
def
_get_prometheus_port_from_env
():
"""
Get prometheus port from environment variables if set.
Otherwise, return 0, which means not reporting metrics using prometheus.
"""
return
os
.
environ
.
get
(
"PLANNER_PROMETHEUS_PORT"
,
0
)
# Source of truth for planner defaults
class
BasePlannerDefaults
:
namespace
=
"dynamo"
# Namespace from DYN_NAMESPACE env var (injected by operator as "{k8s_namespace}-{dgd_name}")
namespace
=
os
.
environ
.
get
(
"DYN_NAMESPACE"
,
"dynamo"
)
environment
=
"kubernetes"
backend
=
"vllm"
no_operation
=
False
...
...
@@ -52,7 +44,8 @@ class BasePlannerDefaults:
min_endpoint
=
1
# applies to both decode and prefill
decode_engine_num_gpu
=
1
prefill_engine_num_gpu
=
1
prometheus_port
=
_get_prometheus_port_from_env
()
# Port for exposing planner's own metrics (0 means disabled)
metric_reporting_prometheus_port
=
int
(
os
.
environ
.
get
(
"PLANNER_PROMETHEUS_PORT"
,
0
))
class
LoadPlannerDefaults
(
BasePlannerDefaults
):
...
...
@@ -63,29 +56,12 @@ class LoadPlannerDefaults(BasePlannerDefaults):
prefill_queue_scale_down_threshold
=
0.2
def
_get_default_prometheus_endpoint
(
port
:
str
,
namespace
:
str
):
"""Compute default prometheus endpoint using environment variables and Kubernetes service discovery"""
prometheus_endpoint
=
os
.
environ
.
get
(
"PROMETHEUS_ENDPOINT"
,
""
).
strip
()
if
prometheus_endpoint
:
logger
.
debug
(
"Using PROMETHEUS_ENDPOINT override: %s"
,
prometheus_endpoint
)
return
prometheus_endpoint
k8s_namespace
=
get_current_k8s_namespace
()
if
k8s_namespace
and
k8s_namespace
!=
"default"
:
prometheus_service
=
f
"
{
namespace
}
-prometheus"
return
f
"http://
{
prometheus_service
}
.
{
k8s_namespace
}
.svc.cluster.local:
{
port
}
"
else
:
logger
.
warning
(
f
"Cannot determine Prometheus endpoint. Running in namespace '
{
k8s_namespace
}
'. "
"Ensure the planner is deployed in a Kubernetes cluster with proper namespace configuration."
)
return
f
"
{
namespace
}
-prometheus"
class
SLAPlannerDefaults
(
BasePlannerDefaults
):
port
=
os
.
environ
.
get
(
"PROMETHEUS_PORT"
,
"9090"
)
namespace
=
os
.
environ
.
get
(
"DYN_NAMESPACE"
,
"vllm-disagg-planner"
)
prometheus_endpoint
=
_get_default_prometheus_endpoint
(
port
,
namespace
)
# Prometheus endpoint URL for pulling/querying metrics
metric_pulling_prometheus_endpoint
=
os
.
environ
.
get
(
"PROMETHEUS_ENDPOINT"
,
"http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090"
,
)
profile_results_dir
=
"profiling_results"
isl
=
3000
# in number of tokens
osl
=
150
# in number of tokens
...
...
components/src/dynamo/planner/utils/planner_argparse.py
View file @
135bce43
...
...
@@ -34,7 +34,7 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
parser
.
add_argument
(
"--namespace"
,
default
=
SLAPlannerDefaults
.
namespace
,
help
=
"
N
amespace"
,
help
=
"
Dynamo n
amespace"
,
)
parser
.
add_argument
(
"--backend"
,
...
...
@@ -110,10 +110,16 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
help
=
"Load prediction window size"
,
)
parser
.
add_argument
(
"--prometheus-port"
,
"--metric-pulling-prometheus-endpoint"
,
type
=
str
,
default
=
SLAPlannerDefaults
.
metric_pulling_prometheus_endpoint
,
help
=
"Prometheus endpoint URL for pulling dynamo deployment metrics"
,
)
parser
.
add_argument
(
"--metric-reporting-prometheus-port"
,
type
=
int
,
default
=
SLAPlannerDefaults
.
prometheus_port
,
help
=
"Prometheus
port
"
,
default
=
SLAPlannerDefaults
.
metric_reporting_
prometheus_port
,
help
=
"P
ort for exposing planner's own metrics to P
rometheus"
,
)
parser
.
add_argument
(
"--no-correction"
,
...
...
components/src/dynamo/planner/utils/planner_core.py
View file @
135bce43
...
...
@@ -17,7 +17,7 @@ from dynamo.planner import (
TargetReplica
,
VirtualConnector
,
)
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
,
SLAPlannerDefaults
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
from
dynamo.planner.utils.load_predictor
import
LOAD_PREDICTORS
from
dynamo.planner.utils.perf_interpolation
import
(
DecodeInterpolator
,
...
...
@@ -90,7 +90,7 @@ class Planner:
raise
ValueError
(
f
"Invalid environment:
{
args
.
environment
}
"
)
self
.
prometheus_api_client
=
PrometheusAPIClient
(
SLAPlannerDefaults
.
prometheus_endpoint
,
args
.
metric_pulling_
prometheus_endpoint
,
args
.
namespace
,
)
...
...
@@ -150,7 +150,7 @@ class Planner:
self
.
last_adjustment_time
=
time
.
time
()
self
.
last_metrics
=
Metrics
()
self
.
prometheus_port
=
args
.
prometheus_port
self
.
prometheus_port
=
args
.
metric_reporting_
prometheus_port
# Initialize Prometheus metrics
# TODO: use proper naming
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment