Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
e49834c9
Unverified
Commit
e49834c9
authored
Dec 31, 2025
by
Hongkuan Zhou
Committed by
GitHub
Dec 31, 2025
Browse files
refactor: preview config in profiler webui use planner dgd generation logic (#4940)
Signed-off-by:
hongkuanz
<
hongkuanz@nvidia.com
>
parent
cceeb8e3
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
438 additions
and
278 deletions
+438
-278
benchmarks/profiler/profile_sla.py
benchmarks/profiler/profile_sla.py
+12
-9
benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
...rofiler/utils/config_modifiers/parallelization_mapping.py
+25
-7
benchmarks/profiler/utils/config_modifiers/protocol.py
benchmarks/profiler/utils/config_modifiers/protocol.py
+5
-1
benchmarks/profiler/utils/config_modifiers/sglang.py
benchmarks/profiler/utils/config_modifiers/sglang.py
+6
-2
benchmarks/profiler/utils/config_modifiers/trtllm.py
benchmarks/profiler/utils/config_modifiers/trtllm.py
+6
-2
benchmarks/profiler/utils/config_modifiers/vllm.py
benchmarks/profiler/utils/config_modifiers/vllm.py
+6
-2
benchmarks/profiler/utils/dgd_generation.py
benchmarks/profiler/utils/dgd_generation.py
+261
-114
benchmarks/profiler/webui/utils.py
benchmarks/profiler/webui/utils.py
+117
-141
No files found.
benchmarks/profiler/profile_sla.py
View file @
e49834c9
...
...
@@ -59,7 +59,7 @@ from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient
,
cleanup_remaining_deployments
,
)
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
,
SubComponentType
@
dataclass
...
...
@@ -239,7 +239,7 @@ async def run_profile(args):
prefill_config
=
apply_parallel_mapping_to_config
(
base_prefill_config
,
mapping
,
Engine
Type
.
PREFILL
,
SubComponent
Type
.
PREFILL
,
config_modifier
,
args
.
num_gpus_per_node
,
)
...
...
@@ -344,7 +344,7 @@ async def run_profile(args):
decode_config
=
apply_parallel_mapping_to_config
(
base_decode_config
,
mapping
,
Engine
Type
.
DECODE
,
SubComponent
Type
.
DECODE
,
config_modifier
,
args
.
num_gpus_per_node
,
)
...
...
@@ -493,6 +493,9 @@ async def run_profile(args):
selected_prefill_idx
,
selected_decode_idx
=
pick_config_with_webui
(
prefill_data
,
decode_data
,
args
)
# update TTFT/ITL SLA based on selected config
args
.
ttft
=
prefill_data
.
ttft
[
selected_prefill_idx
]
args
.
itl
=
decode_data
.
itl
[
selected_decode_idx
]
else
:
# automatically select P/D config within SLA with the highest throughput/GPU
# select best parallel mapping for prefill
...
...
@@ -563,7 +566,7 @@ async def run_profile(args):
prefill_config
=
apply_parallel_mapping_to_config
(
prefill_config
,
best_prefill_mapping
,
Engine
Type
.
PREFILL
,
SubComponent
Type
.
PREFILL
,
config_modifier
,
args
.
num_gpus_per_node
,
)
...
...
@@ -647,7 +650,7 @@ async def run_profile(args):
decode_config
=
apply_parallel_mapping_to_config
(
decode_config
,
best_decode_mapping
,
Engine
Type
.
DECODE
,
SubComponent
Type
.
DECODE
,
config_modifier
,
args
.
num_gpus_per_node
,
)
...
...
@@ -738,17 +741,17 @@ async def run_profile(args):
# save DGD config with planner; support multi-document output when a ConfigMap is included
with
open
(
f
"
{
args
.
output_dir
}
/config_with_planner.yaml"
,
"w"
)
as
f
:
if
isinstance
(
config
,
list
):
yaml
.
dump_all
(
config
,
f
)
yaml
.
safe_
dump_all
(
config
,
f
,
sort_keys
=
False
)
else
:
yaml
.
dump
(
config
,
f
)
yaml
.
safe_
dump
(
config
,
f
,
sort_keys
=
False
)
# save mocker config with planner for testing purposes
logger
.
debug
(
f
"Mocker config with planner:
{
mocker_config
}
"
)
with
open
(
f
"
{
args
.
output_dir
}
/mocker_config_with_planner.yaml"
,
"w"
)
as
f
:
if
isinstance
(
mocker_config
,
list
):
yaml
.
dump_all
(
mocker_config
,
f
)
yaml
.
safe_
dump_all
(
mocker_config
,
f
,
sort_keys
=
False
)
else
:
yaml
.
dump
(
mocker_config
,
f
)
yaml
.
safe_
dump
(
mocker_config
,
f
,
sort_keys
=
False
)
except
Exception
as
e
:
logger
.
error
(
f
"Profile job failed with error:
{
e
}
"
)
...
...
benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
View file @
e49834c9
...
...
@@ -6,11 +6,12 @@ import logging
from
dataclasses
import
dataclass
from
enum
import
Enum
from
benchmarks.profiler.utils.defaults
import
PREFILL_MAX_NUM_TOKENS
,
EngineType
from
benchmarks.profiler.utils.defaults
import
PREFILL_MAX_NUM_TOKENS
from
benchmarks.profiler.utils.model_info
import
(
MOE_ADDITIONAL_TP_ARCHITECTURES
,
ModelInfo
,
)
from
dynamo.planner.defaults
import
SubComponentType
logger
=
logging
.
getLogger
(
__name__
)
logger
.
setLevel
(
logging
.
INFO
)
...
...
@@ -222,27 +223,44 @@ def get_candidate_parallel_mappings(
def
apply_parallel_mapping_to_config
(
base_config
:
dict
,
mapping
:
ParallelizationMapping
,
phase
:
str
,
phase
:
SubComponentType
,
config_modifier
,
num_gpus_per_node
:
int
|
None
,
is_aggregated_config
:
bool
=
True
,
)
->
dict
:
cfg
=
copy
.
deepcopy
(
base_config
)
# In aggregated configs (used for profiling individual phases), the worker service we mutate
# is always the decode worker (prefill is converted to decode in convert_config()).
# In disaggregated configs (final DGD), we mutate the service matching the requested phase.
component_type
=
SubComponentType
.
DECODE
if
is_aggregated_config
else
phase
if
mapping
.
tp
is
not
None
:
cfg
=
config_modifier
.
set_config_tp_size
(
cfg
,
mapping
.
tp
)
cfg
=
config_modifier
.
set_config_tp_size
(
cfg
,
mapping
.
tp
,
component_type
)
elif
mapping
.
tep
is
not
None
:
cfg
=
config_modifier
.
set_config_tep_size
(
cfg
,
mapping
.
tep
,
num_gpus_per_node
)
cfg
=
config_modifier
.
set_config_tep_size
(
cfg
,
mapping
.
tep
,
num_gpus_per_node
,
component_type
)
elif
mapping
.
dep
is
not
None
:
cfg
=
config_modifier
.
set_config_dep_size
(
cfg
,
mapping
.
dep
,
num_gpus_per_node
)
cfg
=
config_modifier
.
set_config_dep_size
(
cfg
,
mapping
.
dep
,
num_gpus_per_node
,
component_type
)
else
:
raise
ValueError
(
f
"Invalid mapping:
{
mapping
.
label
()
}
"
)
#
f
or prefill,set batch size to attention_dp_size
#
F
or prefill,
set batch size to attention_dp_size
# (this assume prompt is long enough to saturate the GPU, which is usually valid in disagg)
if
phase
==
EngineType
.
PREFILL
:
if
phase
==
SubComponentType
.
PREFILL
:
prefill_component_type
=
(
SubComponentType
.
DECODE
if
is_aggregated_config
else
SubComponentType
.
PREFILL
)
cfg
=
config_modifier
.
set_prefill_config
(
cfg
,
max_batch_size
=
mapping
.
get_attn_dp_size
(),
# max num tokens is shared by all attention dp ranks
max_num_tokens
=
PREFILL_MAX_NUM_TOKENS
*
mapping
.
get_attn_dp_size
(),
component_type
=
prefill_component_type
,
)
return
cfg
benchmarks/profiler/utils/config_modifiers/protocol.py
View file @
e49834c9
...
...
@@ -64,7 +64,11 @@ class ConfigModifierProtocol(Protocol):
@
classmethod
def
set_prefill_config
(
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
,
component_type
:
SubComponentType
=
SubComponentType
.
DECODE
,
)
->
dict
:
...
...
...
benchmarks/profiler/utils/config_modifiers/sglang.py
View file @
e49834c9
...
...
@@ -379,7 +379,11 @@ class SGLangConfigModifier:
@
classmethod
def
set_prefill_config
(
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
,
component_type
:
SubComponentType
=
SubComponentType
.
DECODE
,
)
->
dict
:
"""
Configure prefill-related limits for aggregated prefill runs.
...
...
@@ -388,7 +392,7 @@ class SGLangConfigModifier:
"""
cfg
=
Config
.
model_validate
(
config
)
worker_service
=
get_worker_service_from_config
(
cfg
,
backend
=
"sglang"
,
sub_component_type
=
SubC
omponent
T
ype
.
DECODE
cfg
,
backend
=
"sglang"
,
sub_component_type
=
c
omponent
_t
ype
)
args
=
validate_and_get_worker_args
(
worker_service
,
backend
=
"sglang"
)
args
=
break_arguments
(
args
)
...
...
benchmarks/profiler/utils/config_modifiers/trtllm.py
View file @
e49834c9
...
...
@@ -350,7 +350,11 @@ class TrtllmConfigModifier:
@
classmethod
def
set_prefill_config
(
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
,
component_type
:
SubComponentType
=
SubComponentType
.
DECODE
,
)
->
dict
:
"""
Configure prefill-related limits for aggregated prefill runs.
...
...
@@ -360,7 +364,7 @@ class TrtllmConfigModifier:
"""
cfg
=
Config
.
model_validate
(
config
)
worker_service
=
get_worker_service_from_config
(
cfg
,
backend
=
"trtllm"
,
sub_component_type
=
SubC
omponent
T
ype
.
DECODE
cfg
,
backend
=
"trtllm"
,
sub_component_type
=
c
omponent
_t
ype
)
args
=
validate_and_get_worker_args
(
worker_service
,
backend
=
"trtllm"
)
args
=
break_arguments
(
args
)
...
...
benchmarks/profiler/utils/config_modifiers/vllm.py
View file @
e49834c9
...
...
@@ -307,7 +307,11 @@ class VllmV1ConfigModifier:
@
classmethod
def
set_prefill_config
(
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
cls
,
config
:
dict
,
max_batch_size
:
int
,
max_num_tokens
:
int
,
component_type
:
SubComponentType
=
SubComponentType
.
DECODE
,
)
->
dict
:
"""
Configure prefill-related limits for aggregated prefill runs.
...
...
@@ -316,7 +320,7 @@ class VllmV1ConfigModifier:
"""
cfg
=
Config
.
model_validate
(
config
)
worker_service
=
get_worker_service_from_config
(
cfg
,
backend
=
"vllm"
,
sub_component_type
=
SubC
omponent
T
ype
.
DECODE
cfg
,
backend
=
"vllm"
,
sub_component_type
=
c
omponent
_t
ype
)
args
=
validate_and_get_worker_args
(
worker_service
,
backend
=
"vllm"
)
args
=
break_arguments
(
args
)
...
...
benchmarks/profiler/utils/dgd_generation.py
View file @
e49834c9
...
...
@@ -26,8 +26,10 @@ from benchmarks.profiler.utils.config import (
DgdPlannerServiceConfig
,
set_argument_value
,
)
from
benchmarks.profiler.utils.config_modifiers
import
CONFIG_MODIFIERS
from
benchmarks.profiler.utils.config_modifiers.parallelization_mapping
import
(
ParallelizationMapping
,
apply_parallel_mapping_to_config
,
)
from
benchmarks.profiler.utils.planner_utils
import
build_planner_args_from_namespace
from
dynamo.common.utils.paths
import
get_workspace_dir
...
...
@@ -37,14 +39,196 @@ from dynamo.planner.defaults import MockerComponentName, SubComponentType
MOCKER_DISAGG_CONFIG_PATH
=
"examples/backends/mocker/deploy/disagg.yaml"
def
generate_dgd_config_with_planner
(
def _get_config_modifier_from_args(args):
    """Instantiate the config modifier registered for ``args.backend``.

    Args:
        args: Any object exposing a ``backend`` attribute; that value is used
            as the lookup key into ``CONFIG_MODIFIERS``.

    Returns:
        The result of calling the registered entry with no arguments.
    """
    # Look the backend up and call the registered entry in a single expression;
    # a backend that is not registered propagates the lookup error unchanged.
    return CONFIG_MODIFIERS[args.backend]()
def
_find_service_name_for_subcomponent
(
config
:
Config
,
subcomponent
:
SubComponentType
)
->
str
:
"""Find the service name in a DGD config for a given subComponentType."""
for
service_name
,
service_cfg
in
config
.
spec
.
services
.
items
():
if
getattr
(
service_cfg
,
"subComponentType"
,
None
)
==
subcomponent
:
return
service_name
raise
KeyError
(
f
"Could not find service with subComponentType=
{
subcomponent
!
r
}
"
)
def _load_and_apply_mappings(
    *,
    config_path: str,
    args,
    config_modifier,
    output_dir: str | None = None,
    best_prefill_mapping: ParallelizationMapping | None,
    best_decode_mapping: ParallelizationMapping | None,
    num_gpus_per_node: int,
) -> Config:
    """Load a DGD config file and apply optional prefill/decode parallel mappings.

    This is the single source of truth shared by the final DGD generation and
    the WebUI config previews.

    Args:
        config_path: Path of the YAML DGD config to read.
        args: Namespace-like object; a truthy ``dgd_image`` attribute (if
            present) overrides the container images in the loaded config.
        config_modifier: Backend config modifier; must provide ``update_image``
            and whatever ``apply_parallel_mapping_to_config`` calls on it.
        output_dir: Unused by this function. Optional because no caller in
            this module supplies it; accepted only so that a caller passing
            it keeps working.
        best_prefill_mapping: Mapping applied to the prefill service, or
            ``None`` to leave the prefill service untouched.
        best_decode_mapping: Mapping applied to the decode service, or
            ``None`` to leave the decode service untouched.
        num_gpus_per_node: Forwarded to ``apply_parallel_mapping_to_config``.

    Returns:
        The resulting config validated through ``Config.model_validate``.
    """
    with open(config_path, "r") as f:
        raw = yaml.safe_load(f)

    # Update container image if provided (overrides config file images).
    # getattr keeps this usable with namespaces that never define dgd_image.
    if getattr(args, "dgd_image", None):
        raw = config_modifier.update_image(raw, args.dgd_image)

    # Prefill is applied before decode, and each mapping targets the service
    # of its own phase (is_aggregated_config=False).
    phase_mappings = (
        (best_prefill_mapping, SubComponentType.PREFILL),
        (best_decode_mapping, SubComponentType.DECODE),
    )
    for mapping, phase in phase_mappings:
        if mapping is None:
            continue
        raw = apply_parallel_mapping_to_config(
            raw,
            mapping,
            phase,
            config_modifier,
            num_gpus_per_node,
            is_aggregated_config=False,
        )

    return Config.model_validate(raw)
def build_prefill_service_config(
    *,
    config_path: str,
    args,
    best_prefill_mapping: ParallelizationMapping,
    num_gpus_per_node: int = 8,
) -> tuple[str, dict]:
    """Build the prefill worker service after applying ``best_prefill_mapping``.

    Thin wrapper that delegates to ``_build_single_worker_service_config`` with
    ``SubComponentType.PREFILL``.

    Returns:
        ``(service_name, service_dict)`` for the prefill worker.
    """
    prefill_result = _build_single_worker_service_config(
        config_path=config_path,
        args=args,
        mapping=best_prefill_mapping,
        subcomponent=SubComponentType.PREFILL,
        num_gpus_per_node=num_gpus_per_node,
    )
    return prefill_result
def build_decode_service_config(
    *,
    config_path: str,
    args,
    best_decode_mapping: ParallelizationMapping,
    num_gpus_per_node: int = 8,
) -> tuple[str, dict]:
    """Build the decode worker service after applying ``best_decode_mapping``.

    Thin wrapper that delegates to ``_build_single_worker_service_config`` with
    ``SubComponentType.DECODE``.

    Returns:
        ``(service_name, service_dict)`` for the decode worker.
    """
    decode_result = _build_single_worker_service_config(
        config_path=config_path,
        args=args,
        mapping=best_decode_mapping,
        subcomponent=SubComponentType.DECODE,
        num_gpus_per_node=num_gpus_per_node,
    )
    return decode_result
def _build_single_worker_service_config(
    *,
    config_path: str,
    args,
    mapping: ParallelizationMapping,
    subcomponent: SubComponentType,
    num_gpus_per_node: int,
) -> tuple[str, dict]:
    """Build one worker service dict (prefill or decode) from a DGD config file.

    The mapping is applied only to the phase named by ``subcomponent``; the
    other phase is passed as ``None`` so it is left as loaded from the file.

    Returns:
        ``(service_name, service_dict)`` where ``service_dict`` is the entry
        for that service taken from the dumped config.
    """
    modifier = _get_config_modifier_from_args(args)

    # Route the mapping to exactly the phase that was requested.
    prefill_mapping = None
    decode_mapping = None
    if subcomponent == SubComponentType.PREFILL:
        prefill_mapping = mapping
    if subcomponent == SubComponentType.DECODE:
        decode_mapping = mapping

    loaded = _load_and_apply_mappings(
        config_path=config_path,
        args=args,
        config_modifier=modifier,
        best_prefill_mapping=prefill_mapping,
        best_decode_mapping=decode_mapping,
        num_gpus_per_node=num_gpus_per_node,
    )

    name = _find_service_name_for_subcomponent(loaded, subcomponent)
    dumped_services = loaded.model_dump(exclude_unset=False)["spec"]["services"]
    return name, dumped_services[name]
def generate_prefill_service_config_preview(
    *,
    config_path: str,
    args,
    best_prefill_mapping: ParallelizationMapping,
    num_gpus_per_node: int = 8,
) -> dict:
    """Build the prefill-only service config shown by the WebUI 'Show Config'.

    Returns:
        A one-entry dict mapping the prefill service name to its service dict.
    """
    name, service = build_prefill_service_config(
        config_path=config_path,
        args=args,
        best_prefill_mapping=best_prefill_mapping,
        num_gpus_per_node=num_gpus_per_node,
    )
    preview = {name: service}
    return preview
def generate_decode_service_config_preview(
    *,
    config_path: str,
    args,
    best_decode_mapping: ParallelizationMapping,
    num_gpus_per_node: int = 8,
) -> dict:
    """Build the decode-only service config shown by the WebUI 'Show Config'.

    Returns:
        A one-entry dict mapping the decode service name to its service dict.
    """
    name, service = build_decode_service_config(
        config_path=config_path,
        args=args,
        best_decode_mapping=best_decode_mapping,
        num_gpus_per_node=num_gpus_per_node,
    )
    preview = {name: service}
    return preview
def generate_prefill_decode_services_config_preview(
    *,
    config_path: str,
    args,
    best_prefill_mapping: ParallelizationMapping,
    best_decode_mapping: ParallelizationMapping,
    num_gpus_per_node: int = 8,
) -> dict[str, dict]:
    """Build the (prefill + decode)-only services config for WebUI 'Show Config'.

    Both mappings are applied to a single loaded config, then only the prefill
    and decode service entries are extracted from its dump.

    Returns:
        Dict keyed by service name, prefill entry inserted first and decode
        entry second.
    """
    loaded = _load_and_apply_mappings(
        config_path=config_path,
        args=args,
        config_modifier=_get_config_modifier_from_args(args),
        best_prefill_mapping=best_prefill_mapping,
        best_decode_mapping=best_decode_mapping,
        num_gpus_per_node=num_gpus_per_node,
    )

    # Resolve both names before dumping, prefill first, so a missing service
    # fails before any serialization work is done.
    wanted_names = [
        _find_service_name_for_subcomponent(loaded, kind)
        for kind in (SubComponentType.PREFILL, SubComponentType.DECODE)
    ]
    dumped_services = loaded.model_dump(exclude_unset=False)["spec"]["services"]
    return {name: dumped_services[name] for name in wanted_names}
def
generate_dgd_config_with_planner
(
config_path
:
str
,
config_modifier
,
output_dir
:
str
|
None
,
args
,
best_prefill_mapping
:
ParallelizationMapping
|
None
,
best_decode_mapping
:
ParallelizationMapping
|
None
,
num_gpus_per_node
:
int
=
8
,
)
->
tuple
[
list
[
dict
]
|
dict
,
list
[
dict
]
|
dict
]:
"""Generate DGD config with planner based on profiling results.
...
...
@@ -65,62 +249,14 @@ def generate_dgd_config_with_planner(
If a ConfigMap is generated, returns [ConfigMap, DGD]; otherwise returns a single DGD dict.
"""
# Load config from file
with
open
(
config_path
,
"r"
)
as
f
:
config
=
yaml
.
safe_load
(
f
)
# Update container image if provided
# This overrides the default image in the config file for all DGD components
if
args
.
dgd_image
:
config
=
config_modifier
.
update_image
(
config
,
args
.
dgd_image
)
# Apply prefill parallelization based on the actual mapping used in profiling
if
best_prefill_mapping
.
tp
is
not
None
:
# Dense model or TP for prefill
config
=
config_modifier
.
set_config_tp_size
(
config
,
best_prefill_mapping
.
tp
,
SubComponentType
.
PREFILL
)
elif
best_prefill_mapping
.
tep
is
not
None
:
# MoE model with TEP for prefill
config
=
config_modifier
.
set_config_tep_size
(
config
,
best_prefill_mapping
.
tep
,
num_gpus_per_node
,
SubComponentType
.
PREFILL
,
)
elif
best_prefill_mapping
.
dep
is
not
None
:
# MoE model with DEP for prefill
config
=
config_modifier
.
set_config_dep_size
(
config
,
best_prefill_mapping
.
dep
,
num_gpus_per_node
,
SubComponentType
.
PREFILL
,
)
# Apply decode parallelization based on the actual mapping used in profiling
if
best_decode_mapping
.
tp
is
not
None
:
# Dense model or TP for decode
config
=
config_modifier
.
set_config_tp_size
(
config
,
best_decode_mapping
.
tp
,
SubComponentType
.
DECODE
)
elif
best_decode_mapping
.
tep
is
not
None
:
# MoE model with TEP for decode
config
=
config_modifier
.
set_config_tep_size
(
config
,
best_decode_mapping
.
tep
,
num_gpus_per_node
,
SubComponentType
.
DECODE
,
)
elif
best_decode_mapping
.
dep
is
not
None
:
# MoE model with DEP for decode
config
=
config_modifier
.
set_config_dep_size
(
config
,
best_decode_mapping
.
dep
,
num_gpus_per_node
,
SubComponentType
.
DECODE
,
)
config
=
Config
.
model_validate
(
config
)
config
=
_load_and_apply_mappings
(
config_path
=
config_path
,
args
=
args
,
config_modifier
=
config_modifier
,
best_prefill_mapping
=
best_prefill_mapping
,
best_decode_mapping
=
best_decode_mapping
,
num_gpus_per_node
=
num_gpus_per_node
,
)
# add the planner service
planner_config
=
DgdPlannerServiceConfig
()
...
...
@@ -157,66 +293,69 @@ def generate_dgd_config_with_planner(
# Add arguments determined by profiling results
cm_mount_path
=
f
"
{
get_workspace_dir
()
}
/profiling_results"
planner_args
.
extend
(
[
f
"--prefill-engine-num-gpu=
{
best_prefill_mapping
.
get_num_gpus
()
}
"
,
f
"--decode-engine-num-gpu=
{
best_decode_mapping
.
get_num_gpus
()
}
"
,
f
"--profile-results-dir=
{
cm_mount_path
}
"
,
]
)
if
best_prefill_mapping
is
not
None
:
planner_args
.
append
(
f
"--prefill-engine-num-gpu=
{
best_prefill_mapping
.
get_num_gpus
()
}
"
)
if
best_decode_mapping
is
not
None
:
planner_args
.
append
(
f
"--decode-engine-num-gpu=
{
best_decode_mapping
.
get_num_gpus
()
}
"
)
if
(
planner_config
.
extraPodSpec
.
mainContainer
and
planner_config
.
extraPodSpec
.
mainContainer
.
args
is
not
None
):
planner_config
.
extraPodSpec
.
mainContainer
.
args
.
extend
(
planner_args
)
# Convert planner config to dict first, then the entire config to dict
# Work with plain dicts for PodSpec/Container extras (e.g. volumes, volumeMounts)
# because those fields are stored as "extra" and aren't exposed as pydantic attributes.
planner_dict
=
planner_config
.
model_dump
(
exclude_unset
=
False
)
config_dict
=
config
.
model_dump
(
exclude_unset
=
False
)
# Build a ConfigMap from NPZ profiling outputs and mount it into the Planner
# We store data as plain JSON (lists/float/int) to avoid binary artifacts.
prefill_npz
=
f
"
{
output_dir
}
/selected_prefill_interpolation/raw_data.npz"
decode_npz
=
f
"
{
output_dir
}
/selected_decode_interpolation/raw_data.npz"
config_map_obj
:
Optional
[
dict
]
=
None
try
:
with
np
.
load
(
prefill_npz
)
as
p_raw
:
prefill_json
=
{
"prefill_isl"
:
p_raw
[
"prefill_isl"
].
tolist
(),
"prefill_ttft"
:
p_raw
[
"prefill_ttft"
].
tolist
(),
"prefill_thpt_per_gpu"
:
p_raw
[
"prefill_thpt_per_gpu"
].
tolist
(),
}
except
FileNotFoundError
:
prefill_json
=
None
try
:
with
np
.
load
(
decode_npz
)
as
d_raw
:
# max_kv_tokens saved as array; convert to int
max_kv_tokens
=
d_raw
[
"max_kv_tokens"
]
if
hasattr
(
max_kv_tokens
,
"tolist"
):
max_kv_tokens_val
=
max_kv_tokens
.
tolist
()
# Handle [value] vs value
if
isinstance
(
max_kv_tokens_val
,
list
):
max_kv_tokens_val
=
(
int
(
max_kv_tokens_val
[
0
])
if
max_kv_tokens_val
else
0
)
prefill_json
=
None
decode_json
=
None
if
output_dir
is
not
None
:
# Build a ConfigMap from NPZ profiling outputs and mount it into the Planner
# We store data as plain JSON (lists/float/int) to avoid binary artifacts.
prefill_npz
=
f
"
{
output_dir
}
/selected_prefill_interpolation/raw_data.npz"
decode_npz
=
f
"
{
output_dir
}
/selected_decode_interpolation/raw_data.npz"
try
:
with
np
.
load
(
prefill_npz
)
as
p_raw
:
prefill_json
=
{
"prefill_isl"
:
p_raw
[
"prefill_isl"
].
tolist
(),
"prefill_ttft"
:
p_raw
[
"prefill_ttft"
].
tolist
(),
"prefill_thpt_per_gpu"
:
p_raw
[
"prefill_thpt_per_gpu"
].
tolist
(),
}
except
FileNotFoundError
:
prefill_json
=
None
try
:
with
np
.
load
(
decode_npz
)
as
d_raw
:
# max_kv_tokens saved as array; convert to int
max_kv_tokens
=
d_raw
[
"max_kv_tokens"
]
if
hasattr
(
max_kv_tokens
,
"tolist"
):
max_kv_tokens_val
=
max_kv_tokens
.
tolist
()
# Handle [value] vs value
if
isinstance
(
max_kv_tokens_val
,
list
):
max_kv_tokens_val
=
(
int
(
max_kv_tokens_val
[
0
])
if
max_kv_tokens_val
else
0
)
else
:
max_kv_tokens_val
=
int
(
max_kv_tokens_val
)
else
:
max_kv_tokens_val
=
int
(
max_kv_tokens_val
)
else
:
max_kv_tokens_val
=
int
(
max_kv_tokens
)
decode_json
=
{
"x_kv_usage"
:
d_raw
[
"x_kv_usage"
].
tolist
(),
"y_context_length"
:
d_raw
[
"y_context_length"
].
tolist
(),
"z_itl"
:
d_raw
[
"z_itl"
].
tolist
(),
"z_thpt_per_gpu"
:
d_raw
[
"z_thpt_per_gpu"
].
tolist
(),
"max_kv_tokens"
:
max_kv_tokens_val
,
}
except
FileNotFoundError
:
decode_json
=
None
max_kv_tokens_val
=
int
(
max_kv_tokens
)
decode_json
=
{
"x_kv_usage"
:
d_raw
[
"x_kv_usage"
].
tolist
(),
"y_context_length"
:
d_raw
[
"y_context_length"
].
tolist
(),
"z_itl"
:
d_raw
[
"z_itl"
].
tolist
(),
"z_thpt_per_gpu"
:
d_raw
[
"z_thpt_per_gpu"
].
tolist
(),
"max_kv_tokens"
:
max_kv_tokens_val
,
}
except
FileNotFoundError
:
decode_json
=
None
if
prefill_json
is
not
None
and
decode_json
is
not
None
:
# Only override planner profile directory when we actually have data to mount.
planner_args
.
append
(
f
"--profile-results-dir=
{
cm_mount_path
}
"
)
config_map_obj
=
{
"apiVersion"
:
"v1"
,
"kind"
:
"ConfigMap"
,
...
...
@@ -249,6 +388,13 @@ def generate_dgd_config_with_planner(
}
)
# Attach planner args (always)
mc_dict
=
planner_dict
.
setdefault
(
"extraPodSpec"
,
{}).
setdefault
(
"mainContainer"
,
{}
)
mc_args
=
mc_dict
.
setdefault
(
"args"
,
[])
mc_args
.
extend
(
planner_args
)
# Finalize DGD services
config_dict
[
"spec"
][
"services"
][
"Planner"
]
=
planner_dict
...
...
@@ -310,7 +456,7 @@ def _generate_mocker_config_with_planner(
"image"
]
=
args
.
dgd_image
# Update worker args: --planner-profile-data, --model-path, --model-name
# Update worker args: --planner-profile-data
(if available)
, --model-path, --model-name
mocker_worker_names
=
[
MockerComponentName
.
prefill_worker_k8s_name
,
MockerComponentName
.
decode_worker_k8s_name
,
...
...
@@ -324,9 +470,10 @@ def _generate_mocker_config_with_planner(
"mainContainer"
,
{}
)
args_list
=
main_container
.
get
(
"args"
,
[])
args_list
=
set_argument_value
(
args_list
,
"--planner-profile-data"
,
cm_mount_path
)
if
config_map_obj
is
not
None
:
args_list
=
set_argument_value
(
args_list
,
"--planner-profile-data"
,
cm_mount_path
)
# Update model path and name if available in args
args_list
=
set_argument_value
(
args_list
,
"--model-path"
,
args
.
model
)
args_list
=
set_argument_value
(
args_list
,
"--model-name"
,
args
.
model
)
...
...
benchmarks/profiler/webui/utils.py
View file @
e49834c9
...
...
@@ -20,6 +20,11 @@ from aiconfigurator.webapp.components.profiling import (
load_profiling_javascript
,
)
from
benchmarks.profiler.utils.dgd_generation
import
(
generate_decode_service_config_preview
,
generate_prefill_decode_services_config_preview
,
generate_prefill_service_config_preview
,
)
from
benchmarks.profiler.utils.pareto
import
compute_pareto
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -56,149 +61,87 @@ def clear_profiling_errors() -> None:
_profiling_errors
.
clear
()
def
generate_dgd_worker_config_yaml
(
parallel_mapping
,
engine_type
:
str
,
model
:
str
|
None
=
None
,
backend
:
str
|
None
=
None
,
ttft_or_itl
:
float
|
None
=
None
,
thpt_per_gpu
:
float
|
None
=
None
,
)
->
str
:
"""
Generate a DGD worker service config snippet for display in the WebUI.
def dump_yaml_with_header(header_lines: list[str], obj: dict) -> str:
    """Render ``obj`` as YAML preceded by a comment header.

    Used for the WebUI config previews. A bare ``#`` line is appended after
    ``header_lines`` to separate the header from the YAML body.

    Args:
        header_lines: Comment lines to emit first, one per output line.
        obj: Object serialized with ``yaml.safe_dump`` (key order preserved).

    Returns:
        The header, a newline, then the YAML text.
    """
    comment_block = "\n".join(header_lines + ["#"])
    yaml_text = yaml.safe_dump(obj, sort_keys=False)
    return "\n".join((comment_block, yaml_text))
Uses ParallelizationMapping.label() for display and shows the service structure
that would be used in the final DynamoGraphDeployment.
Args
:
parallel_mapping: ParallelizationMapping instance
engine_type: "prefill" or "decode"
model:
Model name/path
backend: Backend name (sglang, vllm, trtllm
)
ttft_or_itl: TTFT (prefill) or ITL (decode) in ms
thpt_per_gpu: Throughput per GPU in tokens/s/GPU
def
_maybe_add_model_backend_header_lines
(
header_lines
:
list
[
str
],
args
)
->
None
:
model
=
getattr
(
args
,
"model"
,
None
)
backend
=
getattr
(
args
,
"backend"
,
None
)
if
model
:
header_lines
.
append
(
f
"# Model:
{
model
}
"
)
if
backend
:
header_lines
.
append
(
f
"# Backend:
{
backend
}
"
)
Returns:
YAML string representation of the DGD worker config
"""
num_gpus
=
parallel_mapping
.
get_num_gpus
()
# Build the worker config in DGD style
# Note: Actual args vary by backend; this shows the structure
worker_config
=
{
"componentType"
:
"worker"
,
"subComponentType"
:
engine_type
,
"replicas"
:
1
,
"resources"
:
{
"limits"
:
{
"gpu"
:
str
(
num_gpus
),
}
},
}
# Build header comments with profiling metadata
def
build_single_service_preview_header_lines
(
*
,
service_name
:
str
,
engine_type
:
str
,
mapping
,
ttft_or_itl_ms
:
float
|
None
,
thpt_per_gpu
:
float
|
None
,
args
,
)
->
list
[
str
]:
header_lines
=
[
"# DynamoGraphDeployment Worker Config"
,
"# DynamoGraphDeployment Service Config Preview"
,
f
"# Service:
{
service_name
}
"
,
f
"# Engine:
{
engine_type
}
"
,
f
"# Num GPUs:
{
num_gpus
}
"
,
f
"# Parallelization:
{
parallel_
mapping
.
label
()
}
"
,
f
"# Num GPUs:
{
mapping
.
get_
num_gpus
()
}
"
,
f
"# Parallelization:
{
mapping
.
label
()
}
"
,
]
if
engine_type
==
"prefill"
and
ttft_or_itl
is
not
None
:
header_lines
.
append
(
f
"# Profiled TTFT:
{
round
(
ttft_or_itl
,
2
)
}
ms"
)
elif
engine_type
==
"decode"
and
ttft_or_itl
is
not
None
:
header_lines
.
append
(
f
"# Profiled ITL:
{
round
(
ttft_or_itl
,
2
)
}
ms"
)
if
engine_type
==
"prefill"
and
ttft_or_itl_ms
is
not
None
:
header_lines
.
append
(
f
"# Profiled TTFT:
{
round
(
ttft_or_itl_ms
,
2
)
}
ms"
)
if
engine_type
==
"decode"
and
ttft_or_itl_ms
is
not
None
:
header_lines
.
append
(
f
"# Profiled ITL:
{
round
(
ttft_or_itl_ms
,
2
)
}
ms"
)
if
thpt_per_gpu
is
not
None
:
header_lines
.
append
(
f
"# Profiled Throughput:
{
round
(
thpt_per_gpu
,
2
)
}
tokens/s/GPU"
)
if
model
:
header_lines
.
append
(
f
"# Model:
{
model
}
"
)
if
backend
:
header_lines
.
append
(
f
"# Backend:
{
backend
}
"
)
header_lines
.
append
(
"#"
)
header_lines
.
append
(
"# Note: Final config generated after selection includes"
)
header_lines
.
append
(
"# backend-specific args and planner configuration."
)
# Add the actual config
service_name
=
f
"
{
engine_type
.
capitalize
()
}
Worker"
body
=
yaml
.
dump
(
{
service_name
:
worker_config
},
default_flow_style
=
False
,
sort_keys
=
False
_maybe_add_model_backend_header_lines
(
header_lines
,
args
)
header_lines
.
append
(
"# Note: This is a service-only preview. Final config includes planner."
)
return
"
\n
"
.
join
(
header_lines
)
+
"
\n
"
+
body
return
header_lines
def
generate_dgd_config_yaml_for_display
(
def
build_two_service_preview_header_lines
(
*
,
prefill_service_name
:
str
,
decode_service_name
:
str
,
prefill_mapping
,
decode_mapping
,
model
:
str
|
None
=
None
,
backend
:
str
|
None
=
None
,
)
->
str
:
"""
Generate a DGD config snippet for display in the WebUI.
This shows the combined prefill + decode DynamoGraphDeployment structure.
Uses ParallelizationMapping.label() for parallelization info.
Args:
prefill_mapping: ParallelizationMapping for prefill
decode_mapping: ParallelizationMapping for decode
model: Model name/path
backend: Backend name
Returns:
YAML string representation of the DGD configuration
"""
prefill_gpus
=
prefill_mapping
.
get_num_gpus
()
decode_gpus
=
decode_mapping
.
get_num_gpus
()
# Build DGD-style config showing the service structure
config
=
{
"apiVersion"
:
"nvidia.com/v1alpha1"
,
"kind"
:
"DynamoGraphDeployment"
,
"spec"
:
{
"services"
:
{
"PrefillWorker"
:
{
"componentType"
:
"worker"
,
"subComponentType"
:
"prefill"
,
"replicas"
:
1
,
"resources"
:
{
"limits"
:
{
"gpu"
:
str
(
prefill_gpus
)},
},
},
"DecodeWorker"
:
{
"componentType"
:
"worker"
,
"subComponentType"
:
"decode"
,
"replicas"
:
1
,
"resources"
:
{
"limits"
:
{
"gpu"
:
str
(
decode_gpus
)},
},
},
}
},
}
# Build header comments with parallelization and model info
prefill_ttft_ms
:
float
|
None
,
prefill_thpt_per_gpu
:
float
|
None
,
decode_itl_ms
:
float
|
None
,
decode_thpt_per_gpu
:
float
|
None
,
args
,
)
->
list
[
str
]:
header_lines
=
[
"# DynamoGraphDeployment
Configuration
Preview"
,
f
"# Prefill:
{
prefill_gpus
}
GPU(s),
{
prefill_mapping
.
label
()
}
"
,
f
"# Decode:
{
decode_gpus
}
GPU(s),
{
decode_mapping
.
label
()
}
"
,
"# DynamoGraphDeployment
Services Config
Preview"
,
f
"# Prefill
service
:
{
prefill_
service_name
}
(
{
prefill_mapping
.
get_num_
gpus
()
}
GPU(s),
{
prefill_mapping
.
label
()
}
)
"
,
f
"# Decode
service
:
{
decode_
service_name
}
(
{
decode_mapping
.
get_num_
gpus
()
}
GPU(s),
{
decode_mapping
.
label
()
}
)
"
,
]
if
model
:
header_lines
.
append
(
f
"# Model:
{
model
}
"
)
if
backend
:
header_lines
.
append
(
f
"# Backend:
{
backend
}
"
)
header_lines
.
append
(
"#"
)
header_lines
.
append
(
"# Full config with planner saved to: config_with_planner.yaml"
)
header
=
"
\n
"
.
join
(
header_lines
)
body
=
yaml
.
dump
(
config
,
default_flow_style
=
False
,
sort_keys
=
False
)
return
f
"
{
header
}
\n
{
body
}
"
if
prefill_ttft_ms
is
not
None
:
header_lines
.
append
(
f
"# Profiled TTFT:
{
round
(
prefill_ttft_ms
,
2
)
}
ms"
)
if
decode_itl_ms
is
not
None
:
header_lines
.
append
(
f
"# Profiled ITL:
{
round
(
decode_itl_ms
,
2
)
}
ms"
)
if
prefill_thpt_per_gpu
is
not
None
:
header_lines
.
append
(
f
"# Profiled Prefill Throughput:
{
round
(
prefill_thpt_per_gpu
,
2
)
}
tokens/s/GPU"
)
if
decode_thpt_per_gpu
is
not
None
:
header_lines
.
append
(
f
"# Profiled Decode Throughput:
{
round
(
decode_thpt_per_gpu
,
2
)
}
tokens/s/GPU"
)
_maybe_add_model_backend_header_lines
(
header_lines
,
args
)
header_lines
.
append
(
"# Note: This is a services-only preview. Final config includes planner."
)
return
header_lines
class
PlotType
(
str
,
Enum
):
...
...
@@ -326,15 +269,22 @@ def populate_prefill_data(data, prefill_data, args):
prefill_data
.
parallel_mappings
,
)
):
# Generate DGD worker config YAML for display
config_yaml
=
generate_dgd_worker_config_yaml
(
parallel_mapping
=
mapping
,
config_obj
=
generate_prefill_service_config_preview
(
config_path
=
args
.
config
,
args
=
args
,
best_prefill_mapping
=
mapping
,
num_gpus_per_node
=
getattr
(
args
,
"num_gpus_per_node"
,
8
),
)
service_name
=
next
(
iter
(
config_obj
.
keys
()))
header_lines
=
build_single_service_preview_header_lines
(
service_name
=
service_name
,
engine_type
=
"prefill"
,
model
=
getattr
(
args
,
"model"
,
None
),
backend
=
getattr
(
args
,
"backend"
,
None
),
ttft_or_itl
=
ttft
,
mapping
=
mapping
,
ttft_or_itl_ms
=
ttft
,
thpt_per_gpu
=
thpt
,
args
=
args
,
)
config_yaml
=
dump_yaml_with_header
(
header_lines
,
config_obj
)
table_data
.
append
([
gpu
,
round
(
ttft
,
2
),
round
(
thpt
,
2
),
config_yaml
])
data
[
PlotType
.
PREFILL
][
"table"
][
"data"
]
=
table_data
...
...
@@ -383,15 +333,22 @@ def populate_decode_data(data, decode_data, args):
decode_data
.
parallel_mappings
,
)
):
# Generate DGD worker config YAML for display
config_yaml
=
generate_dgd_worker_config_yaml
(
parallel_mapping
=
mapping
,
config_obj
=
generate_decode_service_config_preview
(
config_path
=
args
.
config
,
args
=
args
,
best_decode_mapping
=
mapping
,
num_gpus_per_node
=
getattr
(
args
,
"num_gpus_per_node"
,
8
),
)
service_name
=
next
(
iter
(
config_obj
.
keys
()))
header_lines
=
build_single_service_preview_header_lines
(
service_name
=
service_name
,
engine_type
=
"decode"
,
model
=
getattr
(
args
,
"model"
,
None
),
backend
=
getattr
(
args
,
"backend"
,
None
),
ttft_or_itl
=
itl
,
mapping
=
mapping
,
ttft_or_itl_ms
=
itl
,
thpt_per_gpu
=
thpt
,
args
=
args
,
)
config_yaml
=
dump_yaml_with_header
(
header_lines
,
config_obj
)
table_data
.
append
([
gpu
,
round
(
itl
,
2
),
round
(
thpt
,
2
),
config_yaml
])
data
[
PlotType
.
DECODE
][
"table"
][
"data"
]
=
table_data
...
...
@@ -468,13 +425,32 @@ def populate_cost_data(
# Store mapping from cost table row to original indices
cost_index_mapping
[
table_idx
]
=
(
orig_prefill_idx
,
orig_decode_idx
)
# Generate DGD config YAML for display
config_yaml
=
generate_dgd_config_yaml_for_display
(
services_obj
=
generate_prefill_decode_services_config_preview
(
config_path
=
args
.
config
,
args
=
args
,
best_prefill_mapping
=
prefill_mapping
,
best_decode_mapping
=
decode_mapping
,
num_gpus_per_node
=
getattr
(
args
,
"num_gpus_per_node"
,
8
),
)
# Determine service names (backend-dependent)
service_names
=
list
(
services_obj
.
keys
())
# Prefer stable names by picking based on subComponentType if present; fallback to insertion order.
prefill_service_name
=
service_names
[
0
]
decode_service_name
=
(
service_names
[
1
]
if
len
(
service_names
)
>
1
else
service_names
[
0
]
)
header_lines
=
build_two_service_preview_header_lines
(
prefill_service_name
=
prefill_service_name
,
decode_service_name
=
decode_service_name
,
prefill_mapping
=
prefill_mapping
,
decode_mapping
=
decode_mapping
,
model
=
getattr
(
args
,
"model"
,
None
),
backend
=
getattr
(
args
,
"backend"
,
None
),
prefill_ttft_ms
=
float
(
_p_ttft
),
prefill_thpt_per_gpu
=
float
(
_p_thpt
),
decode_itl_ms
=
float
(
_d_itl
),
decode_thpt_per_gpu
=
float
(
_d_thpt
),
args
=
args
,
)
config_yaml
=
dump_yaml_with_header
(
header_lines
,
services_obj
)
# Add to table data (GPU hours, not cost - frontend handles cost conversion)
table_data
.
append
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment