Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
273252e6
Unverified
Commit
273252e6
authored
Mar 31, 2026
by
Biswa Panda
Committed by
GitHub
Mar 31, 2026
Browse files
feat(frontend): three-layer frontend perf sweep with local and k8s support (#7700)
parent
023a299c
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1594 additions
and
608 deletions
+1594
-608
benchmarks/frontend/scripts/sweep_executors/local.py
benchmarks/frontend/scripts/sweep_executors/local.py
+221
-0
benchmarks/frontend/scripts/sweep_k8s/__init__.py
benchmarks/frontend/scripts/sweep_k8s/__init__.py
+3
-0
benchmarks/frontend/scripts/sweep_k8s/aiperf.py
benchmarks/frontend/scripts/sweep_k8s/aiperf.py
+481
-0
benchmarks/frontend/scripts/sweep_k8s/dgd.py
benchmarks/frontend/scripts/sweep_k8s/dgd.py
+355
-0
benchmarks/frontend/scripts/sweep_k8s/kubectl.py
benchmarks/frontend/scripts/sweep_k8s/kubectl.py
+207
-0
benchmarks/frontend/scripts/sweep_k8s/metrics.py
benchmarks/frontend/scripts/sweep_k8s/metrics.py
+152
-0
benchmarks/frontend/scripts/sweep_k8s/template.py
benchmarks/frontend/scripts/sweep_k8s/template.py
+115
-0
benchmarks/frontend/scripts/sweep_runner.py
benchmarks/frontend/scripts/sweep_runner.py
+60
-608
No files found.
benchmarks/frontend/scripts/sweep_executors/local.py
0 → 100644
View file @
273252e6
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
LocalExecutor -- wraps run_perf.sh for local sweep execution.
This executor delegates each run to run_perf.sh, which handles service
lifecycle (mocker + frontend), observability captures, and aiperf load.
"""
from
__future__
import
annotations
import
json
import
os
import
signal
import
socket
import
subprocess
import
time
from
pathlib
import
Path
from
typing
import
Optional
from
sweep_core.models
import
DeployDimension
,
RunResult
,
RunSpec
,
SweepConfig
# Directory one level above the directory containing this module; run_perf.sh
# is looked up directly inside it.
SCRIPT_DIR = Path(__file__).resolve().parent.parent
class LocalExecutor:
    """Executor that delegates runs to run_perf.sh.

    Each run launches run_perf.sh in its own session (and therefore its own
    process group) so a timed-out run can be torn down together with the
    processes it spawned.
    """

    # Wall-clock budget for a single run_perf.sh invocation, in seconds.
    RUN_TIMEOUT_SEC = 600

    def __init__(self) -> None:
        # Set by prepare(); execute_run() refuses to run while this is None.
        self._config: Optional[SweepConfig] = None
        self._frontend_port: int = 8000

    def prepare(self, config: SweepConfig) -> None:
        """Store config for use during runs."""
        self._config = config
        self._frontend_port = 8000  # local mode always uses 8000

    def apply_deploy(
        self,
        deploy: DeployDimension,
        prev: Optional[DeployDimension],
    ) -> None:
        """Wait for the frontend port to be released by the previous run.

        In local mode run_perf.sh handles its own service lifecycle, so the
        deploy dimensions themselves are not acted on here.
        """
        _wait_port_free(self._frontend_port)

    def execute_run(self, run_spec: RunSpec, run_dir: Path) -> RunResult:
        """Execute a single run via run_perf.sh.

        Args:
            run_spec: Deploy and aiperf dimensions for this run.
            run_dir: Directory handed to run_perf.sh as --output-dir and later
                scanned for aiperf results.

        Returns:
            A RunResult whose status is "ok" or "fail", with whatever aiperf
            metrics could be parsed from run_dir.

        Raises:
            RuntimeError: If prepare() has not been called.
        """
        if self._config is None:
            raise RuntimeError("prepare() must be called before execute_run()")

        result = RunResult(run_spec=run_spec, run_dir=str(run_dir))
        cmd = self._build_command(self._config, run_spec, run_dir)
        print(f" cmd: {' '.join(cmd[:6])}...")

        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                # Own session => own process group, so killpg() reaches children.
                start_new_session=True,
            )
            stdout, _ = proc.communicate(timeout=self.RUN_TIMEOUT_SEC)
            if proc.returncode == 0:
                result.status = "ok"
            else:
                result.status = "fail"
                print(f" run_perf.sh failed (rc={proc.returncode})")
                # Show only the tail of the combined output for diagnosis.
                for line in (stdout or "").strip().split("\n")[-5:]:
                    print(f"    {line}")
        except subprocess.TimeoutExpired:
            result.status = "fail"
            print(f" TIMEOUT after {self.RUN_TIMEOUT_SEC}s")
            self._kill_process_group(proc)
        except Exception as e:
            result.status = "fail"
            print(f" ERROR: {e}")

        # Parse aiperf results (partial results may exist even on failure).
        _parse_aiperf_into_result(result, run_dir)
        return result

    def cleanup(self) -> None:
        """No persistent state to clean up in local mode."""
        pass

    def _build_command(
        self,
        config: SweepConfig,
        run_spec: RunSpec,
        run_dir: Path,
    ) -> list:
        """Assemble the run_perf.sh argv for one run."""
        deploy = run_spec.deploy
        aiperf = run_spec.aiperf

        cmd = [
            str(SCRIPT_DIR / "run_perf.sh"),
            "--model", config.model,
            "--isl", str(aiperf.isl),
            "--osl", str(aiperf.osl),
            "--concurrency", str(aiperf.concurrency),
            "--workers", str(deploy.workers),
            "--speedup-ratio", str(config.speedup_ratio),
            "--num-models", str(deploy.num_models),
            "--aiperf-targets", config.aiperf_targets,
            "--output-dir", str(run_dir),
        ]

        # Optional load-control flags are only passed when set (truthy).
        if aiperf.benchmark_duration:
            cmd.extend(["--benchmark-duration", str(aiperf.benchmark_duration)])
        if aiperf.num_requests:
            cmd.extend(["--num-requests", str(aiperf.num_requests)])
        if aiperf.request_rate:
            cmd.extend(["--request-rate", str(aiperf.request_rate)])

        if deploy.tokenizer in ("fast", "fastokens"):
            cmd.append("--fast-tokens")

        # TODO: when run_perf.sh gains --backend vllm support, pass it here
        if deploy.backend == "vllm":
            print(" WARNING: vllm backend not yet supported by run_perf.sh; using mocker")

        # Passthrough args (e.g., --skip-bpf --skip-nsys)
        cmd.extend(config.passthrough_args)
        return cmd

    @staticmethod
    def _kill_process_group(proc: subprocess.Popen) -> None:
        """Terminate proc's process group, then reap the child.

        communicate(timeout=...) does not kill the child on expiry; without
        the final communicate() the child would stay a zombie and its stdout
        pipe would remain open for the rest of the sweep.
        """
        try:
            pgid = os.getpgid(proc.pid)
            os.killpg(pgid, signal.SIGTERM)
            time.sleep(2)  # grace period before the hard kill
            os.killpg(pgid, signal.SIGKILL)
        except ProcessLookupError:
            # The group already exited between the timeout and the signal.
            pass

        try:
            # Bounded: a descendant that left the group could keep the pipe open.
            proc.communicate(timeout=10)
        except subprocess.TimeoutExpired:
            print(" WARNING: run_perf.sh output pipe still open after kill")
# ── Helpers ──────────────────────────────────────────────────────────────────
def
_parse_aiperf_json
(
json_path
:
Path
)
->
dict
:
"""Parse aiperf profile_export_aiperf.json."""
if
not
json_path
.
exists
():
return
{}
try
:
data
=
json
.
loads
(
json_path
.
read_text
())
result
=
{}
rt
=
data
.
get
(
"request_throughput"
,
{})
result
[
"req_per_sec"
]
=
rt
.
get
(
"avg"
,
0
)
ot
=
data
.
get
(
"output_token_throughput"
,
{})
result
[
"output_tok_per_sec"
]
=
ot
.
get
(
"avg"
,
0
)
ttft
=
data
.
get
(
"time_to_first_token"
,
data
.
get
(
"ttft"
,
{}))
if
isinstance
(
ttft
,
dict
):
result
[
"ttft_p50_ms"
]
=
ttft
.
get
(
"p50"
,
0
)
or
0
result
[
"ttft_p99_ms"
]
=
ttft
.
get
(
"p99"
,
0
)
or
0
itl
=
data
.
get
(
"inter_token_latency"
,
data
.
get
(
"itl"
,
{}))
if
isinstance
(
itl
,
dict
):
result
[
"itl_p50_ms"
]
=
itl
.
get
(
"p50"
,
0
)
or
0
result
[
"itl_p99_ms"
]
=
itl
.
get
(
"p99"
,
0
)
or
0
bd
=
data
.
get
(
"benchmark_duration"
,
0
)
result
[
"duration_sec"
]
=
bd
.
get
(
"avg"
,
0
)
if
isinstance
(
bd
,
dict
)
else
(
bd
or
0
)
return
result
except
(
json
.
JSONDecodeError
,
KeyError
,
TypeError
):
return
{}
def _parse_aiperf_into_result(result: RunResult, run_dir: Path) -> None:
    """Copy aiperf metrics found under run_dir onto result.

    Looks for run_dir/aiperf/profile_export_aiperf.json first; if absent,
    falls back to the first (sorted) aiperf/<model-name>/ export, which is
    where multi-model runs put their results. result is left untouched when
    no metrics could be parsed.
    """
    aiperf_root = run_dir / "aiperf"
    export_path = aiperf_root / "profile_export_aiperf.json"
    if not export_path.exists():
        # Multi-model: results are in aiperf/<model-name>/
        per_model = sorted(aiperf_root.glob("*/profile_export_aiperf.json"))
        if per_model:
            export_path = per_model[0]

    parsed = _parse_aiperf_json(export_path)
    if not parsed:
        return

    metric_fields = (
        "req_per_sec",
        "output_tok_per_sec",
        "ttft_p50_ms",
        "ttft_p99_ms",
        "itl_p50_ms",
        "itl_p99_ms",
        "duration_sec",
    )
    for field_name in metric_fields:
        setattr(result, field_name, parsed.get(field_name, 0))
def
_port_free
(
port
:
int
)
->
bool
:
"""Check if a port is free."""
with
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
as
s
:
return
s
.
connect_ex
((
"127.0.0.1"
,
port
))
!=
0
def _kill_port(port: int) -> None:
    """Kill any process holding a TCP port (best effort).

    Sends SIGTERM through ``fuser -k``, waits two seconds, then sends SIGKILL.
    fuser is invoked with an argv list rather than through a shell, and a
    missing or hanging fuser is reported instead of aborting the sweep.
    """
    target = f"{port}/tcp"
    for signal_flag, pause_sec in (("-TERM", 2), ("-KILL", 0)):
        try:
            subprocess.run(
                ["fuser", "-k", signal_flag, target],
                capture_output=True,
                timeout=5,
                check=False,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            print(f" WARNING: fuser {signal_flag} on port {port} failed: {exc}")
        if pause_sec:
            # Give the holder time to exit on SIGTERM before escalating.
            time.sleep(pause_sec)
def _wait_port_free(port: int, timeout: int = 30) -> None:
    """Block until port is free, force-killing its holder after timeout tries.

    The port is probed once per second. If it is still busy after ``timeout``
    probes, the holder is killed and a further two seconds are allowed for
    the socket to be released.
    """
    attempt = 0
    while attempt < timeout:
        if _port_free(port):
            return
        if attempt == 0:
            # Announce the wait only once.
            print(f" Waiting for port {port} to free...")
        time.sleep(1)
        attempt += 1

    print(f" Forcing port {port} release...")
    _kill_port(port)
    time.sleep(2)
benchmarks/frontend/scripts/sweep_k8s/__init__.py
0 → 100644
View file @
273252e6
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""sweep_k8s -- Kubernetes subprocess helpers for DGD-based sweeps."""
benchmarks/frontend/scripts/sweep_k8s/aiperf.py
0 → 100644
View file @
273252e6
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
K8s aiperf Job launcher.
Runs aiperf as a k8s Job inside the same namespace as the DGD, using the
in-cluster service DNS endpoint. Uses python:3.12-slim with pip-installed
aiperf (same pattern as recipes/qwen3-235b-a22b-fp8/trtllm/agg/perf.yaml).
Artifacts are written inside the pod, then copied back to the local host
via kubectl cp.
"""
from
__future__
import
annotations
import
json
import
subprocess
import
time
from
pathlib
import
Path
from
typing
import
Optional
from
sweep_k8s.kubectl
import
run_kubectl
# Default name of the Secret whose HF_TOKEN key is injected into the aiperf Job.
DEFAULT_HF_TOKEN_SECRET_NAME = "hf-token-secret"
def
_build_aiperf_script
(
model_name
:
str
,
endpoint
:
str
,
concurrency
:
int
,
isl
:
int
,
osl
:
int
=
256
,
benchmark_duration
:
Optional
[
int
]
=
None
,
num_requests
:
Optional
[
int
]
=
None
,
request_rate
:
Optional
[
int
]
=
None
,
warmup_duration
:
Optional
[
int
]
=
None
,
warmup_count
:
Optional
[
int
]
=
None
,
export_level
:
str
=
"summary"
,
)
->
str
:
"""Build the shell script that runs inside the Job container."""
# Build load-control args
load_args
=
""
if
benchmark_duration
:
load_args
+=
f
" --benchmark-duration
{
benchmark_duration
}
"
if
num_requests
:
load_args
+=
f
" --request-count
{
num_requests
}
"
if
request_rate
:
load_args
+=
f
" --request-rate
{
request_rate
}
"
if
not
load_args
.
strip
():
auto_count
=
max
(
concurrency
*
20
,
640
)
load_args
=
f
" --request-count
{
auto_count
}
"
# Warmup args
warmup_args
=
""
if
warmup_duration
:
warmup_args
=
f
" --warmup-duration
{
warmup_duration
}
"
elif
warmup_count
:
warmup_args
=
f
" --warmup-request-count
{
warmup_count
}
"
else
:
warmup_args
=
f
" --warmup-request-count
{
concurrency
}
"
return
f
"""set -e
apt-get update -qq && apt-get install -y -qq curl jq git procps 2>/dev/null
pip install --quiet git+https://github.com/ai-dynamo/aiperf.git@54cd6dc820bff8bfebc875da104e59d745e14f75
echo "aiperf installed"
# Wait for model
echo "Waiting for model '
{
model_name
}
' at http://
{
endpoint
}
/v1/models..."
while ! curl -sf "http://
{
endpoint
}
/v1/models" 2>/dev/null |
\\
jq -e --arg m "
{
model_name
}
" '.data[]? | select(.id == $m)' >/dev/null 2>&1; do
echo " Model not ready, sleeping 5s..."
sleep 5
done
echo "Model ready!"
# Write artifacts to PVC so they persist after pod completion
ARTIFACT_DIR="${{ARTIFACT_PVC_DIR:-/model-cache/perf/${{JOB_NAME}}}}"
mkdir -p "$ARTIFACT_DIR"
echo "Running aiperf: c=
{
concurrency
}
isl=
{
isl
}
osl=
{
osl
}
"
echo "Artifact dir: $ARTIFACT_DIR"
aiperf profile
\\
--artifact-dir "$ARTIFACT_DIR"
\\
--model "
{
model_name
}
"
\\
--tokenizer "
{
model_name
}
"
\\
--endpoint-type chat
\\
--endpoint /v1/chat/completions
\\
--streaming
\\
--url "http://
{
endpoint
}
"
\\
--synthetic-input-tokens-mean
{
isl
}
\\
--synthetic-input-tokens-stddev 0
\\
--output-tokens-mean
{
osl
}
\\
--output-tokens-stddev 0
\\
--extra-inputs "max_tokens:
{
osl
}
"
\\
--extra-inputs "min_tokens:
{
osl
}
"
\\
--extra-inputs "ignore_eos:true"
\\
--extra-inputs "repetition_penalty:1.0"
\\
--extra-inputs "temperature:0.0"
\\
--concurrency
{
concurrency
}
\\
{
load_args
.
strip
()
}
\\
{
warmup_args
.
strip
()
}
\\
--num-dataset-entries 12800
\\
--random-seed 100
\\
--workers-max
{
concurrency
}
\\
--record-processors 32
\\
--export-level
{
export_level
}
\\
--ui simple
echo "aiperf done. Artifacts:"
ls -la "$ARTIFACT_DIR"/
"""
def
_indent
(
text
:
str
,
spaces
:
int
)
->
str
:
"""Indent each line of text by N spaces."""
prefix
=
" "
*
spaces
return
"
\n
"
.
join
(
prefix
+
line
for
line
in
text
.
split
(
"
\n
"
))
def _build_job_yaml(
    job_name: str,
    namespace: str,
    script: str,
    image_pull_secret: str = "",
    hf_token_secret_name: str = DEFAULT_HF_TOKEN_SECRET_NAME,
) -> str:
    """Build the aiperf k8s Job YAML.

    Uses python:3.12-slim with pip-installed aiperf (same pattern as
    recipes/qwen3-235b-a22b-fp8/trtllm/agg/perf.yaml).

    Args:
        job_name: Job name; also used for the job-name label, the JOB_NAME
            env var and the ARTIFACT_PVC_DIR path.
        namespace: Namespace written into the Job metadata.
        script: Shell script run via ``/bin/bash -c``; embedded as a YAML
            block scalar.
        image_pull_secret: When non-empty, an imagePullSecrets entry with
            this name is added to the pod spec.
        hf_token_secret_name: Secret providing the HF_TOKEN key.

    Returns:
        The Job manifest as a YAML string.
    """
    # Rendered only when a pull secret is configured; otherwise the
    # placeholder in the template expands to an empty line.
    image_pull_secret_block = ""
    if image_pull_secret:
        image_pull_secret_block = f"""
      imagePullSecrets:
        - name: {image_pull_secret}
"""
    # The script is indented by 14 spaces so it sits inside the "- |" block
    # scalar of the container command.
    return f"""apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
  namespace: {namespace}
  labels:
    app: sweep-aiperf
spec:
  backoffLimit: 0
  completions: 1
  parallelism: 1
  ttlSecondsAfterFinished: 600
  template:
    metadata:
      labels:
        app: sweep-aiperf
        job-name: {job_name}
    spec:
      restartPolicy: Never
{image_pull_secret_block}
      securityContext:
        sysctls:
          - name: net.ipv4.ip_local_port_range
            value: "1024 65000"
      containers:
        - name: aiperf
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
          command:
            - /bin/bash
            - -c
            - |
{_indent(script, 14)}
          env:
            - name: HF_HOME
              value: /model-cache
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: {hf_token_secret_name}
                  key: HF_TOKEN
            - name: PYTHONUNBUFFERED
              value: "1"
            - name: AIPERF_HTTP_CONNECTION_LIMIT
              value: "512"
            - name: JOB_NAME
              value: {job_name}
            - name: ARTIFACT_PVC_DIR
              value: /model-cache/perf/{job_name}
          volumeMounts:
            - name: model-cache
              mountPath: /model-cache
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
"""
def _wait_for_job(
    job_name: str,
    namespace: str,
    timeout: int = 600,
) -> bool:
    """Poll a Job every five seconds until it completes, fails or times out.

    Returns:
        True when the Job reports a Complete condition; False when it reports
        Failed or when ``timeout`` seconds of polling sleep have elapsed. Pod
        logs are printed in both False cases.
    """
    poll_sec = 5
    elapsed = 0
    while elapsed < timeout:
        try:
            probe = run_kubectl(
                ["get", "job", job_name, "-o", "json"],
                namespace=namespace,
                check=False,
            )
            if probe.returncode != 0:
                # Job not visible (yet); back off and retry.
                time.sleep(poll_sec)
                elapsed += poll_sec
                continue

            job_status = json.loads(probe.stdout).get("status", {})
            for cond in job_status.get("conditions", []):
                cond_type = cond.get("type")
                is_set = cond.get("status") == "True"
                if cond_type == "Complete" and is_set:
                    print(f" aiperf Job completed (waited {elapsed}s)")
                    return True
                if cond_type == "Failed" and is_set:
                    print(f" aiperf Job FAILED (waited {elapsed}s)")
                    _print_job_logs(job_name, namespace)
                    return False
        except (json.JSONDecodeError, subprocess.SubprocessError, OSError) as e:
            print(f" Transient error polling job {job_name} in {namespace}: {e}")

        time.sleep(poll_sec)
        elapsed += poll_sec
        if elapsed % 30 == 0:
            print(f" aiperf Job running ({elapsed}s / {timeout}s)...")

    print(f" aiperf Job timed out after {timeout}s")
    _print_job_logs(job_name, namespace)
    return False
def _print_job_logs(job_name: str, namespace: str, tail: int = 20) -> None:
    """Print the last ``tail`` lines of the Job's pod logs, if there are any."""
    log_text = run_kubectl(
        ["logs", f"job/{job_name}", f"--tail={tail}"],
        namespace=namespace,
        check=False,
    ).stdout
    if not log_text:
        return

    print(f" --- Last {tail} lines of aiperf logs ---")
    for entry in log_text.strip().split("\n"):
        print(f"    {entry}")
def _get_job_pod_name(job_name: str, namespace: str) -> Optional[str]:
    """Return the name of the first pod labelled with the Job, or None."""
    lookup = run_kubectl(
        [
            "get",
            "pods",
            "-l",
            f"job-name={job_name}",
            "-o",
            "jsonpath={.items[0].metadata.name}",
        ],
        namespace=namespace,
        check=False,
    )
    # Empty output (no pod, or kubectl failed) maps to None.
    return lookup.stdout.strip() or None
def _copy_artifacts_from_pvc(
    job_name: str,
    namespace: str,
    local_dir: Path,
) -> bool:
    """Copy aiperf artifacts from the model-cache PVC to the local filesystem.

    Spins up a temporary busybox pod that mounts the PVC, uses kubectl cp
    to extract the artifacts, then deletes the pod.

    Args:
        job_name: Job whose artifacts live under /model-cache/perf/<job_name>.
        namespace: Namespace in which the helper pod is created.
        local_dir: Destination directory; created if it does not exist.

    Returns True if artifacts were successfully copied and the expected
    profile_export_aiperf.json exists, False otherwise.
    """
    local_dir.mkdir(parents=True, exist_ok=True)
    artifacts_ok = False
    # Only the last 20 characters of the job name go into the helper pod name.
    helper_name = f"copy-{job_name[-20:]}"
    pvc_path = f"/model-cache/perf/{job_name}"

    try:
        # Create a helper pod to access the PVC
        helper_yaml = f"""apiVersion: v1
kind: Pod
metadata:
  name: {helper_name}
  namespace: {namespace}
spec:
  restartPolicy: Never
  containers:
    - name: copy
      image: busybox:latest
      command: ["sh", "-c", "echo ready && sleep 300"]
      volumeMounts:
        - name: model-cache
          mountPath: /model-cache
          readOnly: true
  volumes:
    - name: model-cache
      persistentVolumeClaim:
        claimName: model-cache
"""
        run_kubectl(["apply", "-f", "-"], namespace=namespace, input_data=helper_yaml)

        # Wait for helper pod to be ready
        # (up to 30 probes, 2s apart; the code proceeds even if the pod never
        # reports Running, in which case the copy below is expected to fail).
        for _ in range(30):
            result = run_kubectl(
                ["get", "pod", helper_name, "-o", "jsonpath={.status.phase}"],
                namespace=namespace,
                check=False,
            )
            if result.stdout.strip() == "Running":
                break
            time.sleep(2)

        # List what's on the PVC
        result = run_kubectl(
            ["exec", helper_name, "--", "ls", "-la", pvc_path],
            namespace=namespace,
            check=False,
        )
        if result.stdout:
            print(f" PVC artifacts ({pvc_path}):")
            for line in result.stdout.strip().split("\n")[:6]:
                print(f"    {line}")

        # Copy artifacts locally
        # (check=True: a failed copy raises and is reported by the handler below).
        subprocess.run(
            [
                "kubectl",
                "cp",
                f"{namespace}/{helper_name}:{pvc_path}/",
                str(local_dir) + "/",
            ],
            capture_output=True,
            text=True,
            check=True,
            timeout=120,
        )

        files = list(local_dir.glob("*"))
        print(f" Copied {len(files)} artifact files to local")
        for f in sorted(files)[:5]:
            print(f"    {f.name} ({f.stat().st_size} bytes)")

        # Success requires the summary export to be present and non-empty.
        expected = local_dir / "profile_export_aiperf.json"
        if expected.exists() and expected.stat().st_size > 0:
            artifacts_ok = True
        else:
            print(f" WARNING: expected artifact missing or empty: {expected.name}")
    except Exception as e:
        print(f" WARNING: artifact copy failed: {e}")
    finally:
        # Cleanup helper pod
        run_kubectl(
            ["delete", "pod", helper_name, "--ignore-not-found", "--grace-period=0"],
            namespace=namespace,
            check=False,
        )
    return artifacts_ok
def run_aiperf(
    artifact_dir: Path,
    endpoint: str,
    model_name: str,
    concurrency: int,
    isl: int,
    namespace: str,
    image: str,
    run_id: str,
    osl: int = 256,
    benchmark_duration: Optional[int] = None,
    num_requests: Optional[int] = None,
    request_rate: Optional[int] = None,
    warmup_duration: Optional[int] = None,
    warmup_count: Optional[int] = None,
    export_level: str = "summary",
    image_pull_secret: str = "",
    hf_token_secret_name: str = DEFAULT_HF_TOKEN_SECRET_NAME,
    timeout: int = 600,
) -> bool:
    """Run aiperf as a k8s Job inside the namespace.

    Creates a Job with python:3.12-slim, installs aiperf via pip, runs the
    benchmark against the in-cluster service endpoint, then copies artifacts
    back to the local filesystem.

    Args:
        artifact_dir: Local directory for aiperf artifacts.
        endpoint: In-cluster frontend endpoint (service:port).
        model_name: Model name for aiperf --model.
        concurrency: Concurrency level.
        isl: Input sequence length.
        namespace: K8s namespace.
        image: Container image (unused -- uses python:3.12-slim).
        run_id: Unique run identifier (used in Job name).
        osl: Output sequence length.
        benchmark_duration: Optional benchmark duration in seconds.
        num_requests: Optional request count.
        request_rate: Optional request rate limit.
        warmup_duration: Optional warmup duration in seconds.
        warmup_count: Optional warmup request count.
        export_level: aiperf export level (summary, records, raw).
        image_pull_secret: Optional image pull secret for the Job pod.
        hf_token_secret_name: Secret name that stores HF_TOKEN.
        timeout: Job timeout in seconds.

    Returns:
        True if aiperf succeeded, False otherwise.
    """
    # Sanitize run_id for k8s naming: lowercase, and every character outside
    # [a-z0-9-] becomes "-" (not just underscores), so ids containing dots,
    # slashes, "=" or spaces still yield a valid Job name and label value.
    # 40 chars + "aiperf-" prefix + "-<6 digits>" stays below the 63 limit.
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789-"
    safe_id = "".join(
        ch if ch in allowed else "-" for ch in run_id.lower()
    )[:40].strip("-")
    ts = str(int(time.time()))[-6:]
    job_name = f"aiperf-{safe_id}-{ts}"

    print(f" Creating aiperf Job: {job_name} (c={concurrency} isl={isl})")

    script = _build_aiperf_script(
        model_name=model_name,
        endpoint=endpoint,
        concurrency=concurrency,
        isl=isl,
        osl=osl,
        benchmark_duration=benchmark_duration,
        num_requests=num_requests,
        request_rate=request_rate,
        warmup_duration=warmup_duration,
        warmup_count=warmup_count,
        export_level=export_level,
    )
    job_yaml = _build_job_yaml(
        job_name=job_name,
        namespace=namespace,
        script=script,
        image_pull_secret=image_pull_secret,
        hf_token_secret_name=hf_token_secret_name,
    )

    # Create the Job
    try:
        run_kubectl(
            ["apply", "-f", "-"],
            namespace=namespace,
            input_data=job_yaml,
        )
    except Exception as e:
        print(f" ERROR: Failed to create aiperf Job: {e}")
        return False

    try:
        # Wait for completion
        success = _wait_for_job(job_name, namespace, timeout=timeout)

        # Copy artifacts from PVC regardless of success (partial results may exist)
        artifacts_ok = _copy_artifacts_from_pvc(job_name, namespace, artifact_dir)
        if success and not artifacts_ok:
            print(" Job succeeded but artifacts missing -- marking as failure")
            success = False

        # Print logs on failure
        if not success:
            _print_job_logs(job_name, namespace, tail=30)
    finally:
        # Clean up the Job even when waiting or copying raised, so a failed
        # run does not leave the Job behind in the namespace.
        run_kubectl(
            ["delete", "job", job_name, "--ignore-not-found"],
            namespace=namespace,
            check=False,
        )
    return success
benchmarks/frontend/scripts/sweep_k8s/dgd.py
0 → 100644
View file @
273252e6
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
DynamoGraphDeployment helpers -- backend switch, restart, readiness.
Ported from sweep.sh functions: dgd_switch_backend, dgd_restart_frontend,
dgd_restart_graph, dgd_wait_all_ready.
"""
from
__future__
import
annotations
import
json
import
random
import
subprocess
import
time
import
urllib.error
import
urllib.request
from
sweep_k8s.kubectl
import
(
delete_pod
,
get_json
,
get_pod_name
,
patch_json
,
patch_merge
,
run_kubectl
,
wait_for_pod_deletion
,
wait_pod
,
)
# Tokenizer backend name mapping for DGD env vars: sweep-level tokenizer names
# (keys) are translated to the value written into DYN_TOKENIZER_BACKEND.
# Names not listed here are passed through unchanged by dgd_switch_backend.
TOKENIZER_BACKEND_MAP = {
    "hf": "default",
    "default": "default",
    "fast": "fast",
    "fastokens": "fast",
}
def dgd_label_selector(dgd_name: str, component_type: str) -> str:
    """Return the kubectl label selector for one component of a DGD."""
    labels = {
        "nvidia.com/dynamo-graph-deployment-name": dgd_name,
        "nvidia.com/dynamo-component-type": component_type,
    }
    return ",".join(f"{key}={value}" for key, value in labels.items())
def wait_model_ready(
    endpoint: str,
    model_name: str,
    max_wait: int = 300,
    namespace: str = "",
) -> None:
    """Block until model_name is listed by the frontend's /v1/models endpoint.

    Each round first tries a direct HTTP GET; when that does not confirm the
    model and a namespace is given, it falls back to a check run from inside
    the cluster via kubectl. Rounds are five seconds apart.

    Raises:
        TimeoutError: When the model is still missing after ``max_wait``
            seconds of accumulated sleep.
    """
    print(f" Waiting for model '{model_name}' at http://{endpoint}/v1/models...")
    elapsed = 0
    while True:
        # Try direct HTTP (works if endpoint is port-forwarded or localhost)
        try:
            request = urllib.request.Request(
                f"http://{endpoint}/v1/models",
                headers={"Accept": "application/json"},
            )
            with urllib.request.urlopen(request, timeout=10) as resp:
                payload = json.loads(resp.read().decode())
                for entry in payload.get("data", []):
                    if entry.get("id") == model_name:
                        print(f" Model ready (waited {elapsed}s)")
                        return
        except (urllib.error.URLError, json.JSONDecodeError, OSError, ValueError):
            pass

        # Fallback: kubectl-based check for in-cluster endpoints
        if namespace and _check_model_via_kubectl(endpoint, model_name, namespace):
            print(f" Model ready via kubectl (waited {elapsed}s)")
            return

        time.sleep(5)
        elapsed += 5
        if elapsed >= max_wait:
            print(f"ERROR: Model not ready after {max_wait}s")
            raise TimeoutError(f"Model '{model_name}' not ready after {max_wait}s")
        if elapsed % 15 == 0:
            print(f" Still waiting ({elapsed}s / {max_wait}s)...")
def
_check_model_via_kubectl
(
endpoint
:
str
,
model_name
:
str
,
namespace
:
str
,
)
->
bool
:
"""Check model readiness by running curl from inside the cluster."""
pod_name
=
f
"model-check-
{
int
(
time
.
time
())
}
-
{
random
.
randint
(
0
,
9999
)
}
"
try
:
result
=
subprocess
.
run
(
[
"kubectl"
,
"run"
,
pod_name
,
"--rm"
,
"-i"
,
"--restart=Never"
,
"-n"
,
namespace
,
"--quiet"
,
"--image=curlimages/curl:latest"
,
"--"
,
"-sf"
,
f
"http://
{
endpoint
}
/v1/models"
,
],
capture_output
=
True
,
text
=
True
,
timeout
=
20
,
)
if
result
.
returncode
==
0
and
result
.
stdout
.
strip
():
data
=
json
.
loads
(
result
.
stdout
)
models
=
data
.
get
(
"data"
,
[])
return
any
(
m
.
get
(
"id"
)
==
model_name
for
m
in
models
)
except
(
subprocess
.
SubprocessError
,
json
.
JSONDecodeError
,
OSError
):
pass
return
False
def dgd_wait_all_ready(
    dgd_name: str,
    namespace: str,
    endpoint: str,
    model_name: str,
    max_wait: int = 300,
) -> None:
    """Wait for the DGD worker pods to be Ready, then for the model endpoint.

    A failing ``kubectl wait`` is retried (three attempts in total, five
    seconds apart); a subprocess timeout is not retried and propagates.

    Raises:
        RuntimeError: When kubectl still fails on the last attempt.
    """
    print(" Waiting for all worker pods to be Ready...")
    max_attempts = 3
    attempt = 1
    while True:
        try:
            wait_pod(
                dgd_label_selector(dgd_name, "worker"),
                namespace,
                timeout=max_wait,
            )
        except subprocess.CalledProcessError as err:
            if attempt >= max_attempts:
                raise RuntimeError(
                    f"Worker pods not ready after {max_attempts} retries: {err}"
                ) from err
            print(f" kubectl error (attempt {attempt}/{max_attempts}), retrying...")
            time.sleep(5)
            attempt += 1
        else:
            break

    wait_model_ready(endpoint, model_name, max_wait, namespace=namespace)
def dgd_switch_backend(
    dgd_name: str,
    namespace: str,
    endpoint: str,
    model_name: str,
    backend: str,
) -> None:
    """Switch tokenizer backend on a DynamoGraphDeployment.

    Patches the DGD spec to set DYN_TOKENIZER_BACKEND; the Grove operator
    recreates the frontend pod automatically.

    Args:
        dgd_name: Name of the DynamoGraphDeployment.
        namespace: Namespace of the DGD.
        endpoint: Frontend endpoint used for the readiness check.
        model_name: Model expected to be served once the switch is done.
        backend: Tokenizer name; translated through TOKENIZER_BACKEND_MAP and
            passed through unchanged when it is not listed there.
    """
    mapped_backend = TOKENIZER_BACKEND_MAP.get(backend, backend)
    print(
        f"\n--- Switching DGD tokenizer backend -> {mapped_backend} (dgd={dgd_name}) ---"
    )

    env_path = "/spec/services/Frontend/extraPodSpec/mainContainer/env"
    env_entry = {"name": "DYN_TOKENIZER_BACKEND", "value": mapped_backend}

    # Find the index of DYN_TOKENIZER_BACKEND in the Frontend env array
    idx = None
    env_array_present = True  # assumed unless inspection proves otherwise
    try:
        dgd_json = get_json("dgd", dgd_name, namespace)
        main_container = (
            dgd_json.get("spec", {})
            .get("services", {})
            .get("Frontend", {})
            .get("extraPodSpec", {})
            .get("mainContainer", {})
        )
        env_list = main_container.get("env")
        if env_list is None:
            env_array_present = False
        else:
            for i, env_var in enumerate(env_list):
                if env_var.get("name") == "DYN_TOKENIZER_BACKEND":
                    idx = i
                    break
    except Exception as exc:
        # Keep the previous fallback (append), but say why it was taken.
        print(f" WARNING: could not inspect DGD env ({exc}); appending env var")
        idx = None
        env_array_present = True

    # Capture the current frontend pod name BEFORE patching so we track
    # the right pod for deletion (avoids racing with the operator).
    old_pod = get_pod_name(
        dgd_label_selector(dgd_name, "frontend"),
        namespace,
    )

    if idx is not None:
        operation = {
            "op": "replace",
            "path": f"{env_path}/{idx}/value",
            "value": mapped_backend,
        }
    elif env_array_present:
        operation = {"op": "add", "path": f"{env_path}/-", "value": env_entry}
    else:
        # A JSON-patch "add" to ".../env/-" fails when the array itself does
        # not exist, so create the array with the single entry instead.
        operation = {"op": "add", "path": env_path, "value": [env_entry]}
    patch_json("dgd", dgd_name, namespace, [operation])

    print(" DGD patched -- waiting for frontend pod replacement...")
    if old_pod:
        print(f" Waiting for old pod {old_pod} to terminate...")
        wait_for_pod_deletion(old_pod, namespace, timeout=120)

    # Wait for new frontend pod to be Ready
    print(" Waiting for new frontend pod to be Ready...")
    wait_pod(
        dgd_label_selector(dgd_name, "frontend"),
        namespace,
        timeout=300,
    )
    dgd_wait_all_ready(dgd_name, namespace, endpoint, model_name)
def dgd_restart_frontend(
    dgd_name: str,
    namespace: str,
    endpoint: str,
    model_name: str,
) -> None:
    """Restart only the frontend component to reset metrics counters.

    Deletes the current frontend pod (if one is found), waits for it to go
    away, then waits for the replacement pod and the whole deployment to be
    ready again.
    """
    print(" Restarting frontend pod to reset metrics counters...")
    old_pod = get_pod_name(
        dgd_label_selector(dgd_name, "frontend"),
        namespace,
    )
    if old_pod:
        delete_pod(old_pod, namespace, grace_period=5)
        print(f" Waiting for old pod {old_pod} to terminate...")
        # Wait for delete. The subprocess timeout must exceed kubectl's own
        # --timeout; with the 60s default the 90s wait was always cut short.
        try:
            run_kubectl(
                ["wait", "pod", old_pod, "--for=delete", "--timeout=90s"],
                namespace=namespace,
                check=False,
                timeout=100,
            )
        except (subprocess.SubprocessError, OSError) as exc:
            # Best effort: the readiness waits below still gate progress.
            print(f" WARNING: waiting for pod deletion failed: {exc}")

    print(" Waiting for new frontend pod to be Ready...")
    wait_pod(
        dgd_label_selector(dgd_name, "frontend"),
        namespace,
        timeout=300,
    )
    dgd_wait_all_ready(dgd_name, namespace, endpoint, model_name)
def dgd_restart_graph(
    dgd_name: str,
    namespace: str,
    endpoint: str,
    model_name: str,
) -> None:
    """Restart the whole DGD via spec.restart and wait for it to finish.

    A fresh restart id is written together with a Sequential strategy that
    restarts every non-frontend service before the frontend. The DGD status
    is then polled every five seconds until it reports this id as Completed.

    Raises:
        RuntimeError: When the restart ends in phase Failed or Superseded.
        TimeoutError: When the restart has not completed after 600 seconds
            of accumulated sleep.
    """
    stamp = time.strftime('%Y%m%d-%H%M%S')
    restart_id = f"bench-{stamp}-{random.randint(0, 9999)}"
    print(f" Restarting full DGD deployment (id={restart_id})...")

    # Discover service names from the DGD spec so the restart order is correct
    # for any backend (mocker, vllm, trtllm, etc.)
    try:
        spec_json = get_json("dgd", dgd_name, namespace, timeout=60)
        service_names = list(spec_json.get("spec", {}).get("services", {}).keys())
        # Stable sort: workers (False) keep their order ahead of frontend (True).
        restart_order = sorted(
            service_names, key=lambda svc: svc.lower() == "frontend"
        )
    except Exception:
        restart_order = ["Frontend"]
    print(f" Restart order: {restart_order}")

    restart_spec = {
        "id": restart_id,
        "strategy": {"type": "Sequential", "order": restart_order},
    }
    patch_merge("dgd", dgd_name, namespace, {"spec": {"restart": restart_spec}})

    elapsed = 0
    phase = "pending"
    while True:
        try:
            current = get_json("dgd", dgd_name, namespace, timeout=60)
            restart_state = current.get("status", {}).get("restart", {})
            observed_id = restart_state.get("observedID", "")
            phase = restart_state.get("phase", "")
            # Only trust the phase once the operator has seen our restart id.
            if observed_id == restart_id and phase == "Completed":
                print(f" DGD restart completed (waited {elapsed}s)")
                break
            if observed_id == restart_id and phase in ("Failed", "Superseded"):
                raise RuntimeError(
                    f"DGD restart {restart_id} ended with phase={phase}"
                )
        except (KeyError, TypeError):
            pass
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            # Transient kubectl timeout -- retry
            print(f" kubectl transient error, retrying... ({e.__class__.__name__})")

        time.sleep(5)
        elapsed += 5
        if elapsed >= 600:
            raise TimeoutError(f"Timed out waiting for DGD restart {restart_id}")
        print(f" Waiting for DGD restart ({elapsed}s / 600s)... phase={phase}")

    dgd_wait_all_ready(dgd_name, namespace, endpoint, model_name)
benchmarks/frontend/scripts/sweep_k8s/kubectl.py
0 → 100644
View file @
273252e6
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Safe kubectl subprocess helpers.
All k8s interactions go through this module for consistent error handling
and namespace scoping.
"""
from
__future__
import
annotations
import
json
import
subprocess
import
time
from
typing
import
Any
,
Dict
,
List
,
Optional
def run_kubectl(
    args: List[str],
    namespace: Optional[str] = None,
    capture: bool = True,
    check: bool = True,
    timeout: int = 60,
    input_data: Optional[str] = None,
) -> subprocess.CompletedProcess:
    """Invoke kubectl, optionally scoped to a namespace.

    Args:
        args: kubectl arguments (e.g., ["get", "pods"]).
        namespace: K8s namespace; when set, ``-n <namespace>`` is placed
            ahead of args.
        capture: Whether to capture stdout/stderr.
        check: Whether to raise on non-zero exit.
        timeout: Command timeout in seconds.
        input_data: Optional text fed to kubectl's stdin.

    Returns:
        The CompletedProcess of the kubectl call.

    Raises:
        subprocess.CalledProcessError: On non-zero exit when check is True;
            a short diagnostic is printed first.
    """
    scope = ["-n", namespace] if namespace else []
    cmd = ["kubectl", *scope, *args]

    result = subprocess.run(
        cmd,
        capture_output=capture,
        text=True,
        check=False,  # failures are reported below before raising
        timeout=timeout,
        input=input_data,
    )
    if result.returncode == 0 or not check:
        return result

    # stderr is None when output was not captured.
    stderr = (result.stderr or "").strip()
    print(f" kubectl error (rc={result.returncode}): {' '.join(args[:4])}")
    if stderr:
        print(f"    {stderr}")
    result.check_returncode()
    return result
def get_json(
    resource: str,
    name: str,
    namespace: str,
    timeout: int = 30,
) -> Dict[str, Any]:
    """Fetch one resource via ``kubectl get -o json`` and return it decoded.

    Args:
        resource: Resource kind passed to ``kubectl get``.
        name: Resource name.
        namespace: Namespace the lookup is scoped to.
        timeout: kubectl timeout in seconds.

    Returns:
        The parsed JSON document printed by kubectl.
    """
    get_args = ["get", resource, name, "-o", "json"]
    completed = run_kubectl(get_args, namespace=namespace, timeout=timeout)
    payload = completed.stdout
    return json.loads(payload)
def patch_json(
    resource: str,
    name: str,
    namespace: str,
    patch: List[Dict[str, Any]],
    timeout: int = 30,
) -> None:
    """Send a JSON patch (``--type=json``) to a k8s resource.

    Args:
        resource: Resource kind passed to ``kubectl patch``.
        name: Resource name.
        namespace: Namespace the patch is scoped to.
        patch: List of patch operations; serialized with ``json.dumps``.
        timeout: kubectl timeout in seconds.
    """
    serialized = json.dumps(patch)
    patch_args = ["patch", resource, name, "--type=json", f"-p={serialized}"]
    run_kubectl(patch_args, namespace=namespace, timeout=timeout)
def patch_merge(
    resource: str,
    name: str,
    namespace: str,
    patch: Dict[str, Any],
    timeout: int = 30,
) -> None:
    """Apply a JSON merge patch (``kubectl patch --type=merge``) to a k8s resource.

    Note: ``--type=merge`` selects a JSON merge patch, not a strategic merge
    patch (which would be ``--type=strategic``).

    Args:
        resource: Resource kind passed to ``kubectl patch``.
        name: Resource name.
        namespace: Namespace the patch is scoped to.
        patch: Merge-patch document; serialized with ``json.dumps``.
        timeout: kubectl timeout in seconds.
    """
    serialized = json.dumps(patch)
    patch_args = ["patch", resource, name, "--type=merge", f"-p={serialized}"]
    run_kubectl(patch_args, namespace=namespace, timeout=timeout)
def wait_pod(
    label_selector: str,
    namespace: str,
    condition: str = "Ready",
    timeout: int = 300,
) -> None:
    """Block on ``kubectl wait`` until pods matching a selector meet a condition.

    Args:
        label_selector: Selector passed to ``-l``.
        namespace: Namespace the wait is scoped to.
        condition: Pod condition name used in ``--for=condition=...``.
        timeout: Seconds given to ``kubectl wait --timeout``.
    """
    wait_args = ["wait", "pod", "-l", label_selector]
    wait_args.append(f"--for=condition={condition}")
    wait_args.append(f"--timeout={timeout}s")
    # The subprocess timeout is 10s longer than the --timeout given to kubectl.
    subprocess_timeout = timeout + 10
    run_kubectl(wait_args, namespace=namespace, timeout=subprocess_timeout)
def delete_pod(
    name: str,
    namespace: str,
    grace_period: int = 5,
) -> None:
    """Issue ``kubectl delete pod`` for a single pod.

    The call is made with ``check=False``, so a non-zero kubectl exit does
    not raise.

    Args:
        name: Pod name.
        namespace: Namespace the pod lives in.
        grace_period: Value for ``--grace-period`` (seconds).
    """
    delete_args = ["delete", "pod", name]
    delete_args.append(f"--grace-period={grace_period}")
    run_kubectl(delete_args, namespace=namespace, check=False)
def get_pod_name(
    label_selector: str,
    namespace: str,
) -> Optional[str]:
    """Return the name of the first pod matching a label selector.

    Args:
        label_selector: Selector passed to ``-l``.
        namespace: Namespace to search.

    Returns:
        The pod name, or None when kubectl printed nothing on stdout.
    """
    first_item_name = "jsonpath={.items[0].metadata.name}"
    lookup = run_kubectl(
        ["get", "pod", "-l", label_selector, "-o", first_item_name],
        namespace=namespace,
        check=False,  # a failed lookup yields empty stdout instead of raising
    )
    found = lookup.stdout.strip()
    if found:
        return found
    return None
def pod_exists(name: str, namespace: str) -> bool:
    """Report whether ``kubectl get pod <name>`` exits with status 0."""
    probe = run_kubectl(["get", "pod", name], namespace=namespace, check=False)
    exit_code = probe.returncode
    return exit_code == 0
def apply_yaml(yaml_content: str, namespace: str) -> None:
    """Pipe a manifest to ``kubectl apply -f -`` in the given namespace."""
    apply_from_stdin = ["apply", "-f", "-"]
    run_kubectl(apply_from_stdin, namespace=namespace, input_data=yaml_content)
def apply_secret_literal(name: str, namespace: str, key: str, value: str) -> None:
    """Create or update an opaque Secret from a literal value.

    The manifest is built as a dict and serialized with ``json.dumps`` rather
    than interpolated into a YAML template, so ``name``, ``key`` and ``value``
    are always quoted correctly (JSON manifests are accepted by
    ``kubectl apply -f -``).

    Args:
        name: Secret name.
        namespace: Namespace to apply the Secret in.
        key: Key under ``stringData``.
        value: Literal value stored under ``key``.
    """
    manifest = {
        "apiVersion": "v1",
        "kind": "Secret",
        "metadata": {"name": name},
        "type": "Opaque",
        "stringData": {key: value},
    }
    apply_yaml(json.dumps(manifest), namespace)
def wait_for_pod_deletion(
    name: str,
    namespace: str,
    timeout: int = 120,
) -> None:
    """Poll every 5 seconds until a pod no longer exists.

    Gives up with a printed warning (no exception) once the accumulated
    sleep time reaches ``timeout``.

    Args:
        name: Pod name.
        namespace: Namespace the pod lives in.
        timeout: Maximum accumulated sleep time in seconds.
    """
    poll_seconds = 5
    elapsed = 0
    while True:
        if not pod_exists(name, namespace):
            return
        time.sleep(poll_seconds)
        elapsed += poll_seconds
        if elapsed >= timeout:
            print(f" WARNING: pod {name} still present after {timeout}s")
            return
benchmarks/frontend/scripts/sweep_k8s/metrics.py
0 → 100644
View file @
273252e6
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Prometheus metrics capture for k8s sweeps.
Captures pre/post frontend /metrics snapshots for delta analysis.
Supports both direct HTTP (when endpoint is reachable) and kubectl-exec
(when only in-cluster DNS is available).
"""
from
__future__
import
annotations
import
shlex
import
subprocess
import
time
import
urllib.request
from
pathlib
import
Path
from
typing
import
Optional
def capture_metrics(
    endpoint: str,
    dest: Path,
    namespace: Optional[str] = None,
    pod_label: Optional[str] = None,
) -> None:
    """Write a snapshot of the frontend's /metrics output to ``dest``.

    Sources are tried in order until one returns a body: direct HTTP,
    ``kubectl exec`` into a labelled pod (needs namespace and pod_label),
    then a one-shot ``kubectl run`` pod (needs namespace). When every source
    fails, a one-line failure marker is written instead.

    Args:
        endpoint: Frontend endpoint (host:port) -- may be in-cluster DNS.
        dest: Destination file path; parent directories are created.
        namespace: K8s namespace enabling the kubectl fallbacks.
        pod_label: Pod label selector enabling the kubectl exec fallback.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)

    # Direct HTTP works when port-forwarded or on the same network.
    text = _try_http(endpoint)
    if text is None and namespace and pod_label:
        text = _try_kubectl_exec(endpoint, namespace, pod_label)
    if text is None and namespace:
        text = _try_kubectl_run(endpoint, namespace)

    if not (text and text.strip()):
        stamp = time.strftime('%Y-%m-%dT%H:%M:%S')
        dest.write_text(f"# metrics capture failed at {stamp}\n")
        print(f" WARNING: could not capture metrics from {endpoint}")
        return

    dest.write_text(text)
    line_count = text.strip().count("\n") + 1
    print(f" Metrics captured -> {dest.name} ({line_count} lines)")
def
_try_http
(
endpoint
:
str
)
->
Optional
[
str
]:
"""Try fetching metrics via direct HTTP."""
try
:
req
=
urllib
.
request
.
Request
(
f
"http://
{
endpoint
}
/metrics"
)
with
urllib
.
request
.
urlopen
(
req
,
timeout
=
10
)
as
resp
:
return
resp
.
read
().
decode
()
except
Exception
:
return
None
def _try_kubectl_exec(
    endpoint: str,
    namespace: str,
    pod_label: str,
) -> Optional[str]:
    """Fetch metrics from inside the first pod matching ``pod_label``.

    The in-pod command tries curl, then wget, then python3, since any of
    them may be missing from the image.

    Returns:
        The command's stdout when it exits 0 with non-blank output;
        otherwise None (including on any exception).
    """
    kubectl = ["kubectl", "-n", namespace]
    try:
        # Resolve the label selector to a concrete pod name.
        lookup = subprocess.run(
            kubectl
            + ["get", "pod", "-l", pod_label, "-o", "jsonpath={.items[0].metadata.name}"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        target_pod = lookup.stdout.strip()
        if not target_pod:
            return None

        # The endpoint is shell-quoted because it is spliced into `sh -c`.
        url = f"http://{shlex.quote(endpoint)}/metrics"
        fetch_script = (
            f"curl -sf {url} 2>/dev/null || "
            f"wget -qO- {url} 2>/dev/null || "
            f'python3 -c "import urllib.request,sys; print(urllib.request.urlopen(sys.argv[1]).read().decode())" {url} 2>/dev/null'
        )
        fetched = subprocess.run(
            kubectl + ["exec", target_pod, "--", "sh", "-c", fetch_script],
            capture_output=True,
            text=True,
            timeout=15,
        )
        if fetched.returncode == 0 and fetched.stdout.strip():
            return fetched.stdout
    except Exception:
        # Best-effort: the caller has a further fallback.
        pass
    return None
def _try_kubectl_run(endpoint: str, namespace: str) -> Optional[str]:
    """Fetch metrics via a one-shot ``kubectl run --rm`` curl pod.

    Each call uses a unique pod name so a pod left behind by an earlier,
    interrupted call cannot make this one fail with a name collision. If the
    kubectl client itself times out (so ``--rm`` never ran), a best-effort
    delete of the pod is attempted.

    Args:
        endpoint: Frontend endpoint (host:port) to fetch /metrics from.
        namespace: Namespace to run the temporary pod in.

    Returns:
        kubectl's stdout when it exits 0 with non-blank output; otherwise None.
    """
    pod_name = f"metrics-fetch-{time.time_ns():x}"
    try:
        result = subprocess.run(
            [
                "kubectl",
                "run",
                pod_name,
                "--rm",
                "-i",
                "--restart=Never",
                "-n",
                namespace,
                "--image=curlimages/curl:latest",
                "--",
                "-sf",
                f"http://{endpoint}/metrics",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        # kubectl was killed before --rm could clean up; remove the pod ourselves.
        try:
            subprocess.run(
                [
                    "kubectl",
                    "-n",
                    namespace,
                    "delete",
                    "pod",
                    pod_name,
                    "--ignore-not-found",
                    "--wait=false",
                ],
                capture_output=True,
                text=True,
                timeout=15,
            )
        except Exception:
            pass  # cleanup is best-effort only
        return None
    except Exception:
        # Best-effort: e.g. kubectl is not installed.
        return None

    if result.returncode == 0 and result.stdout.strip():
        return result.stdout
    return None
benchmarks/frontend/scripts/sweep_k8s/template.py
0 → 100644
View file @
273252e6
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Deploy YAML template rendering and application.
Supports ${VARIABLE} placeholders using Python's string.Template.
This enables arbitrary backend deployments (mocker, vLLM, TensorRT-LLM, etc.)
without hardcoding DGD structures.
"""
from
__future__
import
annotations
import
string
from
pathlib
import
Path
from
typing
import
Dict
from
sweep_core.models
import
DeployDimension
,
SweepConfig
from
sweep_k8s.kubectl
import
apply_yaml
# Tokenizer backend mapping for template substitution
TOKENIZER_TEMPLATE_MAP
=
{
"hf"
:
"default"
,
"default"
:
"default"
,
"fast"
:
"fast"
,
"fastokens"
:
"fast"
,
}
DEFAULT_HF_TOKEN_SECRET_NAME
=
"hf-token-secret"
def
_indent_block
(
text
:
str
,
spaces
:
int
)
->
str
:
prefix
=
" "
*
spaces
return
"
\n
"
.
join
(
f
"
{
prefix
}{
line
}
"
if
line
else
""
for
line
in
text
.
splitlines
())
def
_build_image_pull_secrets_block
(
image_pull_secret
:
str
)
->
str
:
if
not
image_pull_secret
:
return
""
return
_indent_block
(
f
"""imagePullSecrets:
- name:
{
image_pull_secret
}
"""
,
8
,
)
def build_substitution_dict(
    deploy: DeployDimension,
    config: SweepConfig,
) -> Dict[str, str]:
    """Assemble the flat variable map used for template rendering.

    Values come from the deploy dimension, the sweep config and its k8s
    section. Entries in ``deploy.env_overrides`` are applied last and so
    replace any built-in variable of the same name.
    """
    k8s = config.k8s
    # The same pull-secret block is used for frontend and worker.
    pull_secrets_block = _build_image_pull_secrets_block(k8s.image_pull_secret)

    deploy_vars = {
        # Unmapped tokenizer names pass through unchanged.
        "DYN_TOKENIZER_BACKEND": TOKENIZER_TEMPLATE_MAP.get(
            deploy.tokenizer, deploy.tokenizer
        ),
        "NUM_WORKERS": str(deploy.workers),
    }
    model_vars = {
        "MODEL": config.model,
        "MODEL_NAME": config.model_name,
        "MODEL_PATH": config.model,
    }
    cluster_vars = {
        "IMAGE": k8s.image,
        "NAMESPACE": k8s.namespace,
        "DGD_NAME": k8s.dgd_name,
        "FRONTEND_PORT": str(k8s.frontend_port),
        "WORKER_REPLICAS": str(k8s.worker_replicas),
        "FRONTEND_REPLICAS": str(k8s.frontend_replicas),
        "SPEEDUP_RATIO": str(config.speedup_ratio),
        "REQUEST_PLANE": k8s.request_plane,
        "EVENT_PLANE": k8s.event_plane,
        "ROUTER_MODE": k8s.router_mode,
        "HF_TOKEN_SECRET_NAME": DEFAULT_HF_TOKEN_SECRET_NAME,
        "FRONTEND_IMAGE_PULL_SECRETS_BLOCK": pull_secrets_block,
        "WORKER_IMAGE_PULL_SECRETS_BLOCK": pull_secrets_block,
    }

    variables: Dict[str, str] = {**deploy_vars, **model_vars, **cluster_vars}
    variables.update(deploy.env_overrides)
    return variables
def render_template(template_path: Path, variables: Dict[str, str]) -> str:
    """Load a deploy YAML template and fill in its ${VAR} placeholders.

    ``string.Template.safe_substitute`` is used, so placeholders without a
    matching variable are left in the output instead of raising KeyError.
    That matters because DGD templates may contain ${VARIABLE} references
    that are resolved by the k8s operator at runtime.
    """
    template = string.Template(template_path.read_text())
    return template.safe_substitute(variables)
def apply_rendered_template(
    template_path: Path,
    deploy: DeployDimension,
    config: SweepConfig,
) -> None:
    """Render a deploy template for one deploy dimension and apply it.

    The rendered manifest is applied in ``config.k8s.namespace``.
    """
    manifest = render_template(
        template_path,
        build_substitution_dict(deploy, config),
    )
    print(f" Applying rendered template: {template_path.name}")
    target_namespace = config.k8s.namespace
    apply_yaml(manifest, target_namespace)
benchmarks/frontend/scripts/sweep_runner.py
View file @
273252e6
...
...
@@ -4,10 +4,12 @@
"""
Frontend performance sweep runner.
Standalone Python script that orchestrates performance sweeps by delegating
each run to run_perf.sh. Combines the sweep grid logic of sweep.sh with
the saturation analysis of tasks/sweep.py, and the Prometheus/report
integration of the analysis scripts.
Thin CLI entry point that delegates to sweep_core (pure logic), sweep_executors
(how runs execute), and sweep_k8s (k8s helpers).
Supports two execution modes:
- local: delegates each run to run_perf.sh (mocker + frontend per run)
- k8s: DGD-based execution with aiperf against a k8s-deployed frontend
Sweep dimensions (all configurable):
- tokenizers (hf, fastokens)
...
...
@@ -15,644 +17,94 @@ Sweep dimensions (all configurable):
- ISL values
- worker counts
Backends:
- mocker (default): fast synthetic backend, no real inference
- vllm: real vLLM inference server (produces TTFT/ITL metrics)
Each (tokenizer, concurrency, ISL) point is a separate run_perf.sh invocation.
Results are collected into CSV + summary.md + per-run reports.
Usage:
#
S
moke test (2 runs)
#
Local s
moke test (2 runs)
python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32 --isl 512
\\
--benchmark-duration 30 --speedup-ratio 0
--benchmark-duration 30 --speedup-ratio
100000
0
# Full sweep with mocker
# Full
local
sweep with mocker
python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32,64 --isl 512,1024,2048
# vLLM backend (real inference)
python3 sweep_runner.py --backend vllm --tokenizers hf --concurrency 128 --isl 1024
# K8s sweep with DGD
python3 sweep_runner.py --mode k8s --dgd-name dynamo-bench-mocker
\\
--tokenizers hf,fastokens --concurrency 50,100 --isl 512
# K8s with custom deploy template
python3 sweep_runner.py --mode k8s --deploy-template dgd/templates/vllm.yaml
\\
--tokenizers hf --concurrency 128 --isl 1024
# Transport saturation sweep
(tasks/sweep.py style)
# Transport saturation sweep
python3 sweep_runner.py --tokenizers hf --concurrency 4096
\\
--num-requests 16384,32768 --workers 1,2,4,8 --speedup-ratio 0
--num-requests 16384,32768 --workers 1,2,4,8 --speedup-ratio
100000
0
# Dry run
python3 sweep_runner.py --dry-run --tokenizers hf,fastokens --concurrency 32,64 --isl 512,1024
# Emit plan as JSON (for Argo or MCP)
python3 sweep_runner.py --emit-plan --tokenizers hf --concurrency 50 --isl 512
"""
import
argparse
import
csv
import
json
import
os
import
signal
import
subprocess
import
sys
import
time
from
dataclasses
import
dataclass
from
pathlib
import
Path
from
typing
import
Optional
# Ensure the scripts directory is on the path for package imports
SCRIPT_DIR
=
Path
(
__file__
).
resolve
().
parent
REPO_ROOT
=
SCRIPT_DIR
.
parent
.
parent
.
parent
ANALYSIS_DIR
=
SCRIPT_DIR
/
"analysis"
# ── Defaults ─────────────────────────────────────────────────────────────────
DEFAULT_MODEL
=
"Qwen/Qwen3-0.6B"
DEFAULT_OSL
=
256
DEFAULT_SPEEDUP
=
1.0
DEFAULT_BENCHMARK_DURATION
=
60
DEFAULT_MAX_CONSECUTIVE_FAILS
=
2
DEFAULT_COOLDOWN
=
3
TOKENIZER_MAP
=
{
"fast"
:
"fastokens"
,
"fastokens"
:
"fastokens"
,
"hf"
:
"default"
,
"default"
:
"default"
,
}
# ── Data ─────────────────────────────────────────────────────────────────────
@dataclass
class RunConfig:
    """Parameters describing one point of the sweep grid."""

    backend: str  # "mocker" or "vllm"
    tokenizer: str  # "hf" or "fastokens"
    concurrency: int
    isl: int
    osl: int
    workers: int
    num_models: int
    aiperf_targets: str  # "first" or "all"
    speedup_ratio: float
    model: str
    benchmark_duration: Optional[int]
    num_requests: Optional[int]
    request_rate: Optional[int]

    @property
    def run_id(self) -> str:
        """Identifier built from tokenizer, concurrency, ISL and workers.

        ``_m<N>`` is appended when more than one model is used and
        ``_rps<R>`` when a request rate is set.
        """
        parts = [
            f"{self.tokenizer}_c{self.concurrency}_isl{self.isl}_w{self.workers}"
        ]
        if self.num_models > 1:
            parts.append(f"_m{self.num_models}")
        if self.request_rate:
            parts.append(f"_rps{self.request_rate}")
        return "".join(parts)
@
dataclass
class
RunResult
:
"""Result from a single sweep point."""
if
str
(
SCRIPT_DIR
)
not
in
sys
.
path
:
sys
.
path
.
insert
(
0
,
str
(
SCRIPT_DIR
))
config
:
RunConfig
status
:
str
=
"pending"
# ok, fail, skipped
req_per_sec
:
float
=
0.0
output_tok_per_sec
:
float
=
0.0
ttft_p50_ms
:
float
=
0.0
ttft_p99_ms
:
float
=
0.0
itl_p50_ms
:
float
=
0.0
itl_p99_ms
:
float
=
0.0
duration_sec
:
float
=
0.0
run_dir
:
str
=
""
# ── Helpers ──────────────────────────────────────────────────────────────────
def _kill_port(port: int):
    """Kill any process holding a TCP port (SIGTERM first, then SIGKILL).

    ``fuser`` is invoked with an argument list (no shell). This helper is
    best-effort: a missing ``fuser`` binary or a hung invocation is ignored
    rather than raised.

    Args:
        port: TCP port whose holders should be killed.
    """
    for signal_name in ("TERM", "KILL"):
        try:
            subprocess.run(
                ["fuser", "-k", f"-{signal_name}", f"{port}/tcp"],
                capture_output=True,
                timeout=5,
            )
        except (OSError, subprocess.TimeoutExpired):
            # fuser unavailable or stuck; nothing more we can do for this signal.
            pass
        if signal_name == "TERM":
            # Give processes a moment to exit cleanly before escalating.
            time.sleep(2)
def
_port_free
(
port
:
int
)
->
bool
:
import
socket
s
=
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
try
:
return
s
.
connect_ex
((
"127.0.0.1"
,
port
))
!=
0
finally
:
s
.
close
()
def _wait_port_free(port: int, timeout: int = 30):
    """Wait for a port to free up, force-killing its holder as a last resort.

    The port is checked once per second for up to ``timeout`` checks. If it
    is still busy afterwards, ``_kill_port`` is called, followed by a 2s pause.
    """
    remaining = timeout
    announced = False
    while remaining > 0:
        if _port_free(port):
            return
        if not announced:
            # Only mention the wait once, on the first busy check.
            print(f" Waiting for port {port} to free...")
            announced = True
        time.sleep(1)
        remaining -= 1

    print(f" Forcing port {port} release...")
    _kill_port(port)
    time.sleep(2)
def
_parse_aiperf_json
(
json_path
:
Path
)
->
dict
:
"""Parse aiperf profile_export_aiperf.json."""
if
not
json_path
.
exists
():
return
{}
try
:
data
=
json
.
loads
(
json_path
.
read_text
())
result
=
{}
# Request throughput
rt
=
data
.
get
(
"request_throughput"
,
{})
result
[
"req_per_sec"
]
=
rt
.
get
(
"avg"
,
0
)
# Output token throughput
ot
=
data
.
get
(
"output_token_throughput"
,
{})
result
[
"output_tok_per_sec"
]
=
ot
.
get
(
"avg"
,
0
)
# TTFT (aiperf exports in ms already)
ttft
=
data
.
get
(
"time_to_first_token"
,
data
.
get
(
"ttft"
,
{}))
if
isinstance
(
ttft
,
dict
):
result
[
"ttft_p50_ms"
]
=
ttft
.
get
(
"p50"
,
0
)
or
0
result
[
"ttft_p99_ms"
]
=
ttft
.
get
(
"p99"
,
0
)
or
0
# ITL
itl
=
data
.
get
(
"inter_token_latency"
,
data
.
get
(
"itl"
,
{}))
if
isinstance
(
itl
,
dict
):
result
[
"itl_p50_ms"
]
=
itl
.
get
(
"p50"
,
0
)
or
0
result
[
"itl_p99_ms"
]
=
itl
.
get
(
"p99"
,
0
)
or
0
# Duration (can be dict with .avg or raw float)
bd
=
data
.
get
(
"benchmark_duration"
,
0
)
result
[
"duration_sec"
]
=
bd
.
get
(
"avg"
,
0
)
if
isinstance
(
bd
,
dict
)
else
(
bd
or
0
)
return
result
except
(
json
.
JSONDecodeError
,
KeyError
,
TypeError
):
return
{}
def _run_single(
    cfg: RunConfig,
    run_dir: Path,
    passthrough_args: list[str],
) -> RunResult:
    """Execute a single run_perf.sh invocation and collect its metrics.

    The script runs in its own session with a 600s limit. On timeout the
    whole process group is signalled (SIGTERM, then SIGKILL) and the child
    is then reaped so that no zombie process or open pipe is left behind.

    Args:
        cfg: Sweep point to run.
        run_dir: Output directory handed to run_perf.sh via --output-dir.
        passthrough_args: Extra arguments appended verbatim to the command.

    Returns:
        RunResult with status "ok" or "fail" and whatever metrics could be
        parsed from the aiperf export under ``run_dir``.
    """
    result = RunResult(config=cfg, run_dir=str(run_dir))

    cmd = [
        str(SCRIPT_DIR / "run_perf.sh"),
        "--model",
        cfg.model,
        "--isl",
        str(cfg.isl),
        "--osl",
        str(cfg.osl),
        "--concurrency",
        str(cfg.concurrency),
        "--workers",
        str(cfg.workers),
        "--speedup-ratio",
        str(cfg.speedup_ratio),
        "--num-models",
        str(cfg.num_models),
        "--aiperf-targets",
        cfg.aiperf_targets,
        "--output-dir",
        str(run_dir),
    ]
    if cfg.benchmark_duration:
        cmd.extend(["--benchmark-duration", str(cfg.benchmark_duration)])
    if cfg.num_requests:
        cmd.extend(["--num-requests", str(cfg.num_requests)])
    if cfg.request_rate:
        cmd.extend(["--request-rate", str(cfg.request_rate)])
    if cfg.tokenizer in ("fast", "fastokens"):
        cmd.append("--fast-tokens")
    # TODO: when run_perf.sh gains --backend vllm support, pass it here
    if cfg.backend == "vllm":
        print(" WARNING: vllm backend not yet supported by run_perf.sh; using mocker")
    cmd.extend(passthrough_args)

    print(f" cmd: {' '.join(cmd[:6])}...")

    proc = None
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            start_new_session=True,  # own process group, so killpg reaches children
        )
        stdout, _ = proc.communicate(timeout=600)
        if proc.returncode == 0:
            result.status = "ok"
        else:
            result.status = "fail"
            print(f" run_perf.sh failed (rc={proc.returncode})")
            # Print last few lines of output for debugging
            lines = (stdout or "").strip().split("\n")
            for line in lines[-5:]:
                print(f" {line}")
    except subprocess.TimeoutExpired:
        result.status = "fail"
        print(" TIMEOUT after 600s")
        try:
            pgid = os.getpgid(proc.pid)
            os.killpg(pgid, signal.SIGTERM)
            time.sleep(2)
            os.killpg(pgid, signal.SIGKILL)
        except ProcessLookupError:
            pass  # already exited
        # communicate() does not kill or reap on timeout; finish it here.
        try:
            proc.communicate(timeout=10)
        except Exception:
            pass  # reaping is best-effort
    except Exception as e:
        result.status = "fail"
        print(f" ERROR: {e}")

    # Parse aiperf results -- check both flat and multi-model layouts
    aiperf_json = run_dir / "aiperf" / "profile_export_aiperf.json"
    if not aiperf_json.exists():
        # Multi-model: results are in aiperf/<model-name>/
        # Use the first model's results for the summary row
        candidates = sorted((run_dir / "aiperf").glob("*/profile_export_aiperf.json"))
        if candidates:
            aiperf_json = candidates[0]

    metrics = _parse_aiperf_json(aiperf_json)
    if metrics:
        result.req_per_sec = metrics.get("req_per_sec", 0)
        result.output_tok_per_sec = metrics.get("output_tok_per_sec", 0)
        result.ttft_p50_ms = metrics.get("ttft_p50_ms", 0)
        result.ttft_p99_ms = metrics.get("ttft_p99_ms", 0)
        result.itl_p50_ms = metrics.get("itl_p50_ms", 0)
        result.itl_p99_ms = metrics.get("itl_p99_ms", 0)
        result.duration_sec = metrics.get("duration_sec", 0)

    return result
def _generate_report(run_dir: Path):
    """Run create_report.py on a single run directory.

    Writes the analysis to ``<run_dir>/report.md``. Any failure is printed
    and swallowed so that report problems never abort the sweep.

    Args:
        run_dir: Directory of the run to analyse.
    """
    try:
        # Register the analysis directory once; this runs for every sweep
        # point and must not keep growing sys.path with duplicates.
        analysis_path = str(ANALYSIS_DIR)
        if analysis_path not in sys.path:
            sys.path.insert(0, analysis_path)
        from create_report import run_analysis

        report = run_analysis(run_dir)
        (run_dir / "report.md").write_text(report)
    except Exception as e:
        print(f" Report generation failed: {e}")
# ── Output ───────────────────────────────────────────────────────────────────
def
_write_csv
(
results
:
list
[
RunResult
],
csv_path
:
Path
):
"""Write incremental CSV (called after each run)."""
fieldnames
=
[
"run_id"
,
"backend"
,
"tokenizer"
,
"concurrency"
,
"isl"
,
"osl"
,
"workers"
,
"speedup_ratio"
,
"status"
,
"req_per_sec"
,
"output_tok_per_sec"
,
"ttft_p50_ms"
,
"ttft_p99_ms"
,
"itl_p50_ms"
,
"itl_p99_ms"
,
"duration_sec"
,
"run_dir"
,
]
with
open
(
csv_path
,
"w"
,
newline
=
""
)
as
f
:
writer
=
csv
.
DictWriter
(
f
,
fieldnames
=
fieldnames
,
extrasaction
=
"ignore"
)
writer
.
writeheader
()
for
r
in
results
:
row
=
{
"run_id"
:
r
.
config
.
run_id
,
"backend"
:
r
.
config
.
backend
,
"tokenizer"
:
r
.
config
.
tokenizer
,
"concurrency"
:
r
.
config
.
concurrency
,
"isl"
:
r
.
config
.
isl
,
"osl"
:
r
.
config
.
osl
,
"workers"
:
r
.
config
.
workers
,
"speedup_ratio"
:
r
.
config
.
speedup_ratio
,
"status"
:
r
.
status
,
"req_per_sec"
:
f
"
{
r
.
req_per_sec
:.
2
f
}
"
if
r
.
req_per_sec
else
""
,
"output_tok_per_sec"
:
f
"
{
r
.
output_tok_per_sec
:.
1
f
}
"
if
r
.
output_tok_per_sec
else
""
,
"ttft_p50_ms"
:
f
"
{
r
.
ttft_p50_ms
:.
1
f
}
"
if
r
.
ttft_p50_ms
else
""
,
"ttft_p99_ms"
:
f
"
{
r
.
ttft_p99_ms
:.
1
f
}
"
if
r
.
ttft_p99_ms
else
""
,
"itl_p50_ms"
:
f
"
{
r
.
itl_p50_ms
:.
1
f
}
"
if
r
.
itl_p50_ms
else
""
,
"itl_p99_ms"
:
f
"
{
r
.
itl_p99_ms
:.
1
f
}
"
if
r
.
itl_p99_ms
else
""
,
"duration_sec"
:
f
"
{
r
.
duration_sec
:.
1
f
}
"
if
r
.
duration_sec
else
""
,
"run_dir"
:
r
.
run_dir
,
}
writer
.
writerow
(
row
)
def
_write_summary
(
results
:
list
[
RunResult
],
summary_path
:
Path
):
"""Write markdown summary table."""
lines
=
[
"# Sweep Summary
\n
"
]
lines
.
append
(
f
"**Generated:**
{
time
.
strftime
(
'%Y-%m-%d %H:%M:%S'
)
}
\n
"
)
lines
.
append
(
"| Run ID | Req/s | Tok/s | TTFT p50 | TTFT p99 | ITL p50 | Duration | Status |"
)
lines
.
append
(
"|--------|------:|------:|---------:|---------:|--------:|---------:|--------|"
)
for
r
in
results
:
rps
=
f
"
{
r
.
req_per_sec
:.
1
f
}
"
if
r
.
req_per_sec
else
"-"
tps
=
f
"
{
r
.
output_tok_per_sec
:.
0
f
}
"
if
r
.
output_tok_per_sec
else
"-"
tp50
=
f
"
{
r
.
ttft_p50_ms
:.
1
f
}
ms"
if
r
.
ttft_p50_ms
else
"-"
tp99
=
f
"
{
r
.
ttft_p99_ms
:.
1
f
}
ms"
if
r
.
ttft_p99_ms
else
"-"
ip50
=
f
"
{
r
.
itl_p50_ms
:.
1
f
}
ms"
if
r
.
itl_p50_ms
else
"-"
dur
=
f
"
{
r
.
duration_sec
:.
0
f
}
s"
if
r
.
duration_sec
else
"-"
lines
.
append
(
f
"|
{
r
.
config
.
run_id
}
|
{
rps
}
|
{
tps
}
|
{
tp50
}
|
{
tp99
}
|
{
ip50
}
|
{
dur
}
|
{
r
.
status
}
|"
)
lines
.
append
(
""
)
ok
=
sum
(
1
for
r
in
results
if
r
.
status
==
"ok"
)
fail
=
sum
(
1
for
r
in
results
if
r
.
status
==
"fail"
)
skip
=
sum
(
1
for
r
in
results
if
r
.
status
==
"skipped"
)
lines
.
append
(
f
"**Totals:**
{
ok
}
passed,
{
fail
}
failed,
{
skip
}
skipped out of
{
len
(
results
)
}
"
)
summary_path
.
write_text
(
"
\n
"
.
join
(
lines
)
+
"
\n
"
)
def
_print_results_table
(
results
:
list
[
RunResult
]):
"""Print a compact results table to stdout."""
print
(
f
"
\n
{
'='
*
90
}
"
)
print
(
f
"
{
'Run ID'
:
<
30
}
{
'Req/s'
:
>
8
}
{
'Tok/s'
:
>
8
}
{
'TTFT p50'
:
>
10
}
{
'TTFT p99'
:
>
10
}
{
'Status'
:
>
8
}
"
)
print
(
f
"
{
'-'
*
30
}
{
'-'
*
8
}
{
'-'
*
8
}
{
'-'
*
10
}
{
'-'
*
10
}
{
'-'
*
8
}
"
)
for
r
in
results
:
rps
=
f
"
{
r
.
req_per_sec
:.
1
f
}
"
if
r
.
req_per_sec
else
"N/A"
tps
=
f
"
{
r
.
output_tok_per_sec
:.
0
f
}
"
if
r
.
output_tok_per_sec
else
"N/A"
tp50
=
f
"
{
r
.
ttft_p50_ms
:.
1
f
}
ms"
if
r
.
ttft_p50_ms
else
"N/A"
tp99
=
f
"
{
r
.
ttft_p99_ms
:.
1
f
}
ms"
if
r
.
ttft_p99_ms
else
"N/A"
print
(
f
"
{
r
.
config
.
run_id
:
<
30
}
{
rps
:
>
8
}
{
tps
:
>
8
}
{
tp50
:
>
10
}
{
tp99
:
>
10
}
{
r
.
status
:
>
8
}
"
)
print
(
f
"
{
'='
*
90
}
"
)
# ── Main ─────────────────────────────────────────────────────────────────────
from
sweep_core.config
import
build_argument_parser
,
config_from_args
# noqa: E402
from
sweep_core.orchestrator
import
run
as
run_sweep
# noqa: E402
from
sweep_core.planner
import
build_plan
,
print_plan
# noqa: E402
def
main
():
parser
=
argparse
.
ArgumentParser
(
description
=
"Frontend performance sweep runner"
,
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
epilog
=
"""Examples:
# Smoke test
python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32 --isl 512
\\
--benchmark-duration 30 --speedup-ratio 0
parser
=
build_argument_parser
()
# Full tokenizer comparison
python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32,64 --isl 512,1024,2048
# vLLM backend (real inference)
python3 sweep_runner.py --backend vllm --tokenizers hf --concurrency 128 --isl 1024
# Transport saturation (high concurrency, vary workers)
python3 sweep_runner.py --tokenizers hf --concurrency 4096
\\
--num-requests 16384,32768 --workers 1,2,4,8 --speedup-ratio 0
# With profilers (needs sudo for BPF)
sudo -E python3 sweep_runner.py --tokenizers hf --concurrency 64 --isl 1024
\\
-- --with-nsys --with-perf --with-bpf
"""
,
)
parser
.
add_argument
(
"--model"
,
default
=
DEFAULT_MODEL
,
help
=
"HF model path"
)
parser
.
add_argument
(
"--backend"
,
choices
=
[
"mocker"
,
"vllm"
],
default
=
"mocker"
,
help
=
"Engine backend: mocker (synthetic) or vllm (real inference)"
,
)
parser
.
add_argument
(
"--tokenizers"
,
default
=
"hf,fastokens"
,
help
=
"Comma-separated tokenizer backends (hf, fastokens)"
,
)
parser
.
add_argument
(
"--concurrency"
,
default
=
"50,100,200"
,
help
=
"Comma-separated concurrency levels"
)
parser
.
add_argument
(
"--isl"
,
default
=
"512,1024,2048"
,
help
=
"Comma-separated ISL values"
)
parser
.
add_argument
(
"--osl"
,
type
=
int
,
default
=
DEFAULT_OSL
,
help
=
"Output sequence length"
)
parser
.
add_argument
(
"--workers"
,
default
=
"2"
,
help
=
"Comma-separated worker counts per model"
)
parser
.
add_argument
(
"--num-models"
,
type
=
int
,
default
=
1
,
help
=
"Number of model instances (each gets --workers workers, named model-1, model-2, ...)"
,
)
parser
.
add_argument
(
"--aiperf-targets"
,
choices
=
[
"first"
,
"all"
],
default
=
"first"
,
help
=
"'first': aiperf targets model-1 only (default). 'all': run aiperf for each model."
,
)
parser
.
add_argument
(
"--speedup-ratio"
,
type
=
float
,
default
=
DEFAULT_SPEEDUP
,
help
=
"Mocker speedup (0=infinite)"
,
)
parser
.
add_argument
(
"--benchmark-duration"
,
type
=
int
,
default
=
DEFAULT_BENCHMARK_DURATION
,
help
=
"aiperf duration (seconds)"
,
)
parser
.
add_argument
(
"--num-requests"
,
default
=
None
,
help
=
"Comma-separated request counts (overrides --benchmark-duration)"
,
)
parser
.
add_argument
(
"--rps"
,
default
=
None
,
help
=
"Comma-separated target request rates (req/s). Sweep dimension when multiple values given."
,
)
parser
.
add_argument
(
"--output-dir"
,
default
=
None
,
help
=
"Output directory (default: auto timestamped)"
,
)
parser
.
add_argument
(
"--max-consecutive-fails"
,
type
=
int
,
default
=
DEFAULT_MAX_CONSECUTIVE_FAILS
)
# Add CLI-only flags that don't belong in SweepConfig
parser
.
add_argument
(
"--cooldown"
,
type
=
int
,
default
=
DEFAULT_COOLDOWN
,
help
=
"Seconds between runs"
)
parser
.
add_argument
(
"--dry-run"
,
action
=
"store_true"
,
help
=
"Print plan without executing"
)
parser
.
add_argument
(
"--no-report"
,
action
=
"store_true"
,
help
=
"Skip per-run report generation"
)
parser
.
add_argument
(
"passthrough"
,
nargs
=
"*"
,
help
=
"Extra args passed to run_perf.sh (after --)"
"--emit-plan"
,
action
=
"store_true"
,
help
=
"Print the sweep plan as JSON and exit (no execution)"
,
)
args
=
parser
.
parse_args
()
# Parse lists
tokenizers
=
[
t
.
strip
()
for
t
in
args
.
tokenizers
.
split
(
","
)]
concurrencies
=
[
int
(
c
)
for
c
in
args
.
concurrency
.
split
(
","
)]
isls
=
[
int
(
i
)
for
i
in
args
.
isl
.
split
(
","
)]
worker_counts
=
[
int
(
w
)
for
w
in
args
.
workers
.
split
(
","
)]
num_requests_list
=
(
[
int
(
n
)
for
n
in
args
.
num_requests
.
split
(
","
)]
if
args
.
num_requests
else
[
None
]
)
rps_list
=
[
int
(
r
)
for
r
in
args
.
rps
.
split
(
","
)]
if
args
.
rps
else
[
None
]
# Build sweep grid
configs
:
list
[
RunConfig
]
=
[]
for
tokenizer
in
tokenizers
:
for
workers
in
worker_counts
:
for
concurrency
in
concurrencies
:
for
isl
in
isls
:
for
nr
in
num_requests_list
:
for
rps
in
rps_list
:
configs
.
append
(
RunConfig
(
backend
=
args
.
backend
,
tokenizer
=
tokenizer
,
concurrency
=
concurrency
,
isl
=
isl
,
osl
=
args
.
osl
,
workers
=
workers
,
num_models
=
args
.
num_models
,
aiperf_targets
=
args
.
aiperf_targets
,
speedup_ratio
=
args
.
speedup_ratio
,
model
=
args
.
model
,
benchmark_duration
=
args
.
benchmark_duration
if
nr
is
None
else
None
,
num_requests
=
nr
,
request_rate
=
rps
,
)
)
# Build typed config from args
config
=
config_from_args
(
args
)
# Output directory
if
args
.
output_dir
:
output_root
=
Path
(
args
.
output_dir
)
else
:
ts
=
time
.
strftime
(
"%Y%m%d_%H%M%S"
)
output_root
=
REPO_ROOT
/
"artifacts"
/
f
"sweep_
{
ts
}
"
total
=
len
(
configs
)
print
(
f
"Sweep plan:
{
total
}
runs"
)
print
(
f
" Model:
{
args
.
model
}
"
)
print
(
f
" Backend:
{
args
.
backend
}
"
)
print
(
f
" Tokenizers:
{
tokenizers
}
"
)
print
(
f
" Concurrencies:
{
concurrencies
}
"
)
print
(
f
" ISLs:
{
isls
}
"
)
print
(
f
" Workers/model:
{
worker_counts
}
"
)
print
(
f
" Models:
{
args
.
num_models
}
"
)
print
(
f
" Benchmark dur:
{
args
.
benchmark_duration
}
s"
)
if
args
.
num_requests
:
print
(
f
" Num requests:
{
[
int
(
n
)
for
n
in
args
.
num_requests
.
split
(
','
)]
}
"
)
if
args
.
rps
:
print
(
f
" Request rates:
{
[
int
(
r
)
for
r
in
args
.
rps
.
split
(
','
)]
}
req/s"
)
print
(
f
" Output:
{
output_root
}
"
)
print
()
# Build plan
plan
=
build_plan
(
config
)
print_plan
(
plan
)
if
args
.
dry_run
:
for
i
,
cfg
in
enumerate
(
configs
,
1
)
:
print
(
f
" [
{
i
}
/
{
total
}
]
{
cfg
.
run_id
}
"
)
# Emit plan JSON mode
if
args
.
emit_plan
:
print
(
plan
.
to_json
()
)
return
output_root
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
csv_path
=
output_root
/
"results.csv"
summary_path
=
output_root
/
"summary.md"
# Passthrough args for run_perf.sh (e.g., --skip-bpf --skip-nsys)
passthrough
=
args
.
passthrough
or
[]
results
:
list
[
RunResult
]
=
[]
consecutive_fails
:
dict
[
tuple
,
int
]
=
{}
# (backend, concurrency, workers) -> count
try
:
for
i
,
cfg
in
enumerate
(
configs
,
1
):
key
=
(
cfg
.
backend
,
cfg
.
concurrency
,
cfg
.
workers
)
run_dir
=
output_root
/
cfg
.
run_id
# Skip after consecutive failures
if
consecutive_fails
.
get
(
key
,
0
)
>=
args
.
max_consecutive_fails
:
result
=
RunResult
(
config
=
cfg
,
status
=
"skipped"
,
run_dir
=
str
(
run_dir
))
results
.
append
(
result
)
print
(
f
"
\n
[
{
i
}
/
{
total
}
] SKIPPED
{
cfg
.
run_id
}
(
{
args
.
max_consecutive_fails
}
consecutive failures)"
)
continue
print
(
f
"
\n
{
'='
*
60
}
"
)
print
(
f
" [
{
i
}
/
{
total
}
]
{
cfg
.
run_id
}
"
)
print
(
f
"
{
'='
*
60
}
"
)
# Wait for port from previous run
_wait_port_free
(
8000
)
# Run
result
=
_run_single
(
cfg
,
run_dir
,
passthrough
)
results
.
append
(
result
)
# Update consecutive failure tracking
if
result
.
status
==
"ok"
:
consecutive_fails
[
key
]
=
0
rps
=
f
"
{
result
.
req_per_sec
:.
1
f
}
"
if
result
.
req_per_sec
else
"N/A"
tp50
=
f
"
{
result
.
ttft_p50_ms
:.
1
f
}
ms"
if
result
.
ttft_p50_ms
else
"N/A"
print
(
f
" OK:
{
rps
}
req/s, TTFT p50=
{
tp50
}
"
)
else
:
consecutive_fails
[
key
]
=
consecutive_fails
.
get
(
key
,
0
)
+
1
print
(
f
" FAIL (consecutive:
{
consecutive_fails
[
key
]
}
/
{
args
.
max_consecutive_fails
}
)"
)
# Generate per-run report
if
not
args
.
no_report
and
result
.
status
==
"ok"
:
_generate_report
(
run_dir
)
# Dry run mode
if
config
.
dry_run
:
for
i
,
run_spec
in
enumerate
(
plan
.
runs
,
1
):
print
(
f
" [
{
i
}
/
{
plan
.
total_runs
}
]
{
run_spec
.
run_id
}
"
)
return
# Write incremental CSV + summary
_write_csv
(
results
,
csv_path
)
_write_summary
(
results
,
summary_path
)
# Select executor based on mode
if
config
.
mode
==
"local"
:
from
sweep_executors.local
import
LocalExecutor
# Cooldown
if
i
<
total
:
time
.
sleep
(
args
.
cooldown
)
executor
=
LocalExecutor
()
elif
config
.
mode
==
"k8s"
:
from
sweep_executors.k8s_dgd
import
K8sDgdExecutor
except
KeyboardInterrupt
:
print
(
"
\n\n
Interrupted! Partial results saved."
)
finally
:
_write_csv
(
results
,
csv_path
)
_write_summary
(
results
,
summary_path
)
executor
=
K8sDgdExecutor
()
else
:
print
(
f
"ERROR: Unknown mode '
{
config
.
mode
}
'. Use 'local' or 'k8s'."
,
file
=
sys
.
stderr
,
)
sys
.
exit
(
1
)
# Final output
_print_results_table
(
results
)
print
(
f
"
\n
Results:
{
csv_path
}
"
)
print
(
f
"Summary:
{
summary_path
}
"
)
print
(
f
"Per-run:
{
output_root
}
/<run_id>/report.md"
)
# Run the sweep
run_sweep
(
plan
,
executor
)
if
__name__
==
"__main__"
:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment