Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f8219b12
"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "fc1243604856497feb2f4a2b1678df4a9bfd295c"
Unverified
Commit
f8219b12
authored
Nov 12, 2025
by
ishandhanani
Committed by
GitHub
Nov 12, 2025
Browse files
feat: agg support + solidfy fp8 configs (#3886)
parent
2c910b2e
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1362 additions
and
304 deletions
+1362
-304
examples/backends/sglang/slurm_jobs/job_script_template_agg.j2
...les/backends/sglang/slurm_jobs/job_script_template_agg.j2
+298
-0
examples/backends/sglang/slurm_jobs/job_script_template_disagg.j2
.../backends/sglang/slurm_jobs/job_script_template_disagg.j2
+42
-11
examples/backends/sglang/slurm_jobs/parse.py
examples/backends/sglang/slurm_jobs/parse.py
+212
-0
examples/backends/sglang/slurm_jobs/scripts/aiperf/bench.sh
examples/backends/sglang/slurm_jobs/scripts/aiperf/bench.sh
+6
-4
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/default.sh
...nds/sglang/slurm_jobs/scripts/gb200-fp4/disagg/default.sh
+29
-6
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
...ckends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
+94
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1p_4d.sh
...kends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1p_4d.sh
+187
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/default.sh
...nds/sglang/slurm_jobs/scripts/gb200-fp8/disagg/default.sh
+22
-6
examples/backends/sglang/slurm_jobs/scripts/h100.sh
examples/backends/sglang/slurm_jobs/scripts/h100.sh
+0
-181
examples/backends/sglang/slurm_jobs/scripts/sglang/bench.sh
examples/backends/sglang/slurm_jobs/scripts/sglang/bench.sh
+7
-5
examples/backends/sglang/slurm_jobs/scripts/vllm/backend_request_func.py
...ds/sglang/slurm_jobs/scripts/vllm/backend_request_func.py
+7
-1
examples/backends/sglang/slurm_jobs/scripts/vllm/bench.sh
examples/backends/sglang/slurm_jobs/scripts/vllm/bench.sh
+37
-26
examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
+159
-20
examples/backends/sglang/slurm_jobs/submit_disagg.sh
examples/backends/sglang/slurm_jobs/submit_disagg.sh
+9
-1
examples/backends/sglang/slurm_jobs/submit_job_script.py
examples/backends/sglang/slurm_jobs/submit_job_script.py
+253
-43
No files found.
examples/backends/sglang/slurm_jobs/job_script_template_agg.j2
0 → 100755
View file @
f8219b12
#!/bin/bash
#SBATCH --job-name={{ job_name }}
#SBATCH --nodes={{ total_nodes }}
#SBATCH --ntasks={{ total_nodes }}
#SBATCH --ntasks-per-node=1
#SBATCH --account={{ account }}
#SBATCH --time={{ time_limit }}
#SBATCH --output=logs/%j_{{ agg_workers }}A_{{ timestamp }}/log.out
#SBATCH --error=logs/%j_{{ agg_workers }}A_{{ timestamp }}/log.err
#SBATCH --partition={{ partition }}
# Constants
set
-x
AGG_NODES
={{
agg_nodes
}}
AGG_WORKERS
={{
agg_workers
}}
TOTAL_NODES
={{
total_nodes
}}
GPUS_PER_NODE
={{
gpus_per_node
}}
TOTAL_GPUS
=
$((
AGG_NODES
*
GPUS_PER_NODE
))
PREFILL_GPUS
=
0
DECODE_GPUS
=
$TOTAL_GPUS
AGG_NODES_PER_WORKER
=
$((
AGG_NODES
/
AGG_WORKERS
))
LOG_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/logs/
${
SLURM_JOB_ID
}
_{{ agg_workers }}A_{{ timestamp }}"
SCRIPT_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/scripts"
OUTPUT_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/outputs"
MODEL_DIR
=
"{{ model_dir }}"
CONFIG_DIR
=
"{{ config_dir }}"
CONTAINER_IMAGE
=
"{{ container_image }}"
NETWORK_INTERFACE
=
"{{ network_interface }}"
GPU_TYPE
=
"{{ gpu_type | default('h100') }}"
set
+x
{
% raw %
}
mkdir
-p
"
${
OUTPUT_DIR
}
"
"
${
LOG_DIR
}
"
nodes
=(
$(
scontrol show hostnames
$SLURM_NODELIST
)
)
if
[
${#
nodes
[@]
}
-ne
$TOTAL_NODES
]
;
then
echo
"Error: Expected
$TOTAL_NODES
nodes but got
${#
nodes
[@]
}
nodes"
exit
1
fi
# Print node information
for
i
in
"
${
!nodes[@]
}
"
;
do
echo
"Node
$i
:
${
nodes
[
$i
]
}
"
done
{
% endraw %
}
{
%
if
enable_multiple_frontends %
}
{
% raw %
}
# Multiple frontend architecture
# Node 0: nginx + aggregated worker shard
# Node 1: NATS/ETCD + first frontend
# Node 2+: aggregated workers + optional additional frontends
NGINX_NODE
=
${
nodes
[0]
}
MASTER_NODE
=
${
nodes
[1]
}
MASTER_IP
=
$(
srun
--nodes
=
1
--ntasks
=
1
--nodelist
=
$MASTER_NODE
ip addr show
$NETWORK_INTERFACE
|
grep
'inet '
|
awk
'{print $2}'
|
cut
-d
'/'
-f1
)
if
[
-z
"
$MASTER_IP
"
]
;
then
echo
"Error: Could not retrieve IP address for master host
$MASTER_NODE
on interface
$NETWORK_INTERFACE
"
exit
1
fi
echo
"Master IP address (node 1):
$MASTER_IP
"
echo
"Nginx node (node 0):
$NGINX_NODE
"
# Generate frontend IP list for nginx config
frontend_hosts
=()
frontend_ips
=()
# Node 1 always has a frontend (with NATS/ETCD)
frontend_hosts+
=(
"
$MASTER_NODE
"
)
frontend_ips+
=(
"
$MASTER_IP
"
)
# Add additional frontends based on num_additional_frontends
{
% endraw %
}
ADDITIONAL_FRONTENDS
={{
num_additional_frontends
}}{
% raw %
}
if
[
"
$ADDITIONAL_FRONTENDS
"
-gt
0
]
;
then
# Calculate which nodes get additional frontends
# We have AGG_NODES aggregated worker nodes, distribute additional frontends across them
nodes_per_frontend
=
$((
(
AGG_NODES
-
1
+
ADDITIONAL_FRONTENDS
-
1
)
/
ADDITIONAL_FRONTENDS
))
# ceil division
frontend_node_idx
=
2
# Start from node 2 (node 1 already has frontend)
for
i
in
$(
seq
1
$ADDITIONAL_FRONTENDS
)
;
do
if
[
$frontend_node_idx
-lt
$TOTAL_NODES
]
;
then
node_name
=
${
nodes
[
$frontend_node_idx
]
}
node_ip
=
$(
srun
--nodes
=
1
--ntasks
=
1
--nodelist
=
$node_name
ip addr show
$NETWORK_INTERFACE
|
grep
'inet '
|
awk
'{print $2}'
|
cut
-d
'/'
-f1
)
frontend_hosts+
=(
"
$node_name
"
)
frontend_ips+
=(
"
$node_ip
"
)
echo
"Additional frontend
$i
on node
$frontend_node_idx
:
$node_name
(
$node_ip
)"
frontend_node_idx
=
$((
frontend_node_idx
+
nodes_per_frontend
))
fi
done
fi
echo
"Frontend hosts:
${
frontend_hosts
[@]
}
"
echo
"Frontend IPs:
${
frontend_ips
[@]
}
"
# Generate nginx configuration
# Build a Python list literal of frontend hosts from the bash array
FRONTEND_LIST
=
$(
printf
"'%s',"
"
${
frontend_ips
[@]
}
"
)
FRONTEND_LIST
=
"[
${
FRONTEND_LIST
%,
}
]"
export
FRONTEND_LIST SCRIPT_DIR LOG_DIR
python3 -
<<
'
PY
'
import os
from jinja2 import Template
template_path = os.path.join(os.environ['SCRIPT_DIR'], 'nginx.conf.j2')
output_path = os.path.join(os.environ['LOG_DIR'], 'nginx.conf')
with open(template_path, 'r') as f:
tmpl = Template(f.read())
frontend_hosts = eval(os.environ['FRONTEND_LIST'])
config = tmpl.render(frontend_hosts=frontend_hosts)
with open(output_path, 'w') as f:
f.write(config)
PY
{
% endraw %
}
{
%
else
%
}
{
% raw %
}
# Traditional architecture - first aggregated worker node handles everything
MASTER_IP
=
$(
srun
--nodes
=
1
--ntasks
=
1
--nodelist
=
${
nodes
[0]
}
ip addr show
$NETWORK_INTERFACE
|
grep
'inet '
|
awk
'{print $2}'
|
cut
-d
'/'
-f1
)
if
[
-z
"
$MASTER_IP
"
]
;
then
echo
"Error: Could not retrieve IP address for master host
${
nodes
[0]
}
on interface
$NETWORK_INTERFACE
"
exit
1
fi
echo
"Master IP address:
$MASTER_IP
"
{
% endraw %
}
{
% endif %
}
{
% raw %
}
# Compute leader nodes for each aggregated worker
{
% endraw %
}
{
%
if
enable_multiple_frontends %
}
{
% raw %
}
# With multiple frontends: keep offset 0; nginx coexists on node 0
WORKER_NODE_OFFSET
=
0
{
% endraw %
}
{
%
else
%
}
{
% raw %
}
# Traditional: workers start from node 0
WORKER_NODE_OFFSET
=
0
{
% endraw %
}
{
% endif %
}
{
% raw %
}
agg_leaders
=()
for
i
in
$(
seq
0
$((
AGG_WORKERS
-
1
))
)
;
do
leader_idx
=
$((
WORKER_NODE_OFFSET
+
i
*
AGG_NODES_PER_WORKER
))
agg_leaders[
$i
]=
$leader_idx
done
echo
"Aggregated worker leaders:
${
agg_leaders
[@]
}
"
# Prepare enroot arguments to pass to srun commands
ENROOT_ARGS
=
"
\
--container-image=
${
CONTAINER_IMAGE
}
\
--no-container-entrypoint
\
--no-container-mount-home
\
--container-mounts=
${
MODEL_DIR
}
:/model/,
${
CONFIG_DIR
}
:/configs/,
${
SCRIPT_DIR
}
:/scripts/,
${
OUTPUT_DIR
}
:/outputs/,
${
LOG_DIR
}
:/logs/
\
"
# Build common worker arguments
{
% endraw %
}
SCRIPT_VARIANT
=
"{{ script_variant | default('default') }}"
{
% raw %
}
WORKER_ARGS
=
"--gpu_type
${
GPU_TYPE
}
--script-variant
${
SCRIPT_VARIANT
}
--gpus_per_node
${
GPUS_PER_NODE
}
--master_ip
${
MASTER_IP
}
"
{
% endraw %
}
{
%
if
enable_multiple_frontends %
}
{
% raw %
}
# Add multiple frontends flag for worker setup
WORKER_ARGS
=
"
$WORKER_ARGS
--multiple-frontends-enabled"
{
% endraw %
}
{
% endif %
}
{
%
if
run_in_ci %
}
{
% raw %
}
# Add CI mode flag for worker setup
WORKER_ARGS
=
"
$WORKER_ARGS
--run-in-ci"
{
% endraw %
}
{
% endif %
}
{
% raw %
}
{
% endraw %
}
{
%
if
enable_multiple_frontends %
}
{
% raw %
}
# Launch nginx on node 0
echo
"Launching nginx on
${
NGINX_NODE
}
"
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$NGINX_NODE
--output=
${
LOG_DIR
}
/
${
NGINX_NODE
}
_nginx.out --error=
${
LOG_DIR
}
/
${
NGINX_NODE
}
_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf
${
WORKER_ARGS
}
"
echo
"
$cmd
"
$cmd
&
# Launch frontend on master node (node 1) - this will also start NATS/ETCD
echo
"Launching frontend + NATS/ETCD on master node
${
MASTER_NODE
}
"
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$MASTER_NODE
--output=
${
LOG_DIR
}
/
${
MASTER_NODE
}
_frontend_0.out --error=
${
LOG_DIR
}
/
${
MASTER_NODE
}
_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0
${
WORKER_ARGS
}
"
echo
"
$cmd
"
$cmd
&
# Launch additional frontends on designated nodes
if
[
"
$ADDITIONAL_FRONTENDS
"
-gt
0
]
;
then
frontend_idx
=
1
# Start from 1 since node 1 is frontend 0
nodes_per_frontend
=
$((
(
TOTAL_NODES
-
2
+
ADDITIONAL_FRONTENDS
-
1
)
/
ADDITIONAL_FRONTENDS
))
frontend_node_idx
=
2
for
i
in
$(
seq
1
$ADDITIONAL_FRONTENDS
)
;
do
if
[
$frontend_node_idx
-lt
$TOTAL_NODES
]
;
then
node
=
${
nodes
[
$frontend_node_idx
]
}
echo
"Launching additional frontend
$frontend_idx
on node
$frontend_node_idx
:
$node
"
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$node
--output=
${
LOG_DIR
}
/
${
node
}
_frontend_
${
frontend_idx
}
.out --error=
${
LOG_DIR
}
/
${
node
}
_frontend_
${
frontend_idx
}
.err python /scripts/worker_setup.py --worker_type frontend --worker_idx
${
frontend_idx
}
${
WORKER_ARGS
}
"
echo
"
$cmd
"
$cmd
&
frontend_idx
=
$((
frontend_idx
+
1
))
frontend_node_idx
=
$((
frontend_node_idx
+
nodes_per_frontend
))
fi
done
fi
{
% endraw %
}
{
%
else
%
}
{
% raw %
}
# Traditional: first aggregated worker node also runs frontend + NATS/ETCD
# This is handled in setup_aggregated_worker when worker_idx=0 and local_rank=0
{
% endraw %
}
{
% endif %
}
{
% raw %
}
# Launch aggregated workers
for
worker_idx
in
$(
seq
0
$((
AGG_WORKERS
-
1
))
)
;
do
leader_idx
=
${
agg_leaders
[
$worker_idx
]
}
leader_node
=
${
nodes
[
$leader_idx
]
}
# Get leader IP for this worker group
LEADER_IP
=
$(
srun
--nodes
=
1
--ntasks
=
1
--nodelist
=
$leader_node
ip addr show
$NETWORK_INTERFACE
|
grep
'inet '
|
awk
'{print $2}'
|
cut
-d
'/'
-f1
)
echo
"Aggregated worker
$worker_idx
leader:
$leader_node
(
$LEADER_IP
)"
# Launch all nodes for this worker
for
node_idx
in
$(
seq
0
$((
AGG_NODES_PER_WORKER
-
1
))
)
;
do
global_node_idx
=
$((
leader_idx
+
node_idx
))
node
=
${
nodes
[
$global_node_idx
]
}
local_rank
=
$node_idx
echo
"Launching aggregated worker
$worker_idx
, node
$global_node_idx
(local_rank
$local_rank
):
$node
"
{
% endraw %
}
{
%
if
enable_config_dump %
}
{
% raw %
}
CONFIG_DUMP_ARG
=
"--dump-config-path /logs/
${
node
}
_config.json"
{
% endraw %
}
{
%
else
%
}
{
% raw %
}
CONFIG_DUMP_ARG
=
""
{
% endraw %
}
{
% endif %
}
{
% raw %
}
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$node
--output=
${
LOG_DIR
}
/
${
node
}
_agg_w
${
worker_idx
}
.out --error=
${
LOG_DIR
}
/
${
node
}
_agg_w
${
worker_idx
}
.err python /scripts/worker_setup.py --leader_ip
${
LEADER_IP
}
--worker_idx
${
worker_idx
}
--local_rank
${
local_rank
}
--nodes_per_worker
${
AGG_NODES_PER_WORKER
}
--worker_type aggregated --gpu_utilization_log /logs/
${
node
}
_agg_w
${
worker_idx
}
_gpu_utilization.log
${
CONFIG_DUMP_ARG
}
${
WORKER_ARGS
}
"
echo
"
$cmd
"
$cmd
&
done
done
echo
""
{
% endraw %
}
{
%
if
enable_multiple_frontends %
}
{
% raw %
}
echo
"Frontend available at: http://
${
NGINX_NODE
}
:8000"
echo
"To connect to the nginx node:"
echo
"srun
$ENROOT_ARGS
--jobid
$SLURM_JOB_ID
-w
${
NGINX_NODE
}
--overlap --pty bash"
echo
"To connect to the master node (NATS/ETCD):"
echo
"srun
$ENROOT_ARGS
--jobid
$SLURM_JOB_ID
-w
${
MASTER_NODE
}
--overlap --pty bash"
{
% endraw %
}
{
%
else
%
}
{
% raw %
}
echo
"To connect to the master node:"
echo
"srun
$ENROOT_ARGS
--jobid
$SLURM_JOB_ID
-w
${
nodes
[0]
}
--overlap --pty bash"
{
% endraw %
}
{
% endif %
}
{
% raw %
}
echo
""
echo
"Make sure to cancel the job at the end:"
echo
"scancel
$SLURM_JOB_ID
"
# Instead of waiting for all tasks to complete, wait for profile.sh to complete and then exit.
{
% endraw %
}
PROFILER_TYPE
={{
profiler_type
}}
PROFILER_ARGS
=
"{{ profiler_arg }}"
{
%
if
do_profile %
}
{
% raw %
}
srun
--nodes
=
1
--ntasks
=
1
$ENROOT_ARGS
--jobid
$SLURM_JOB_ID
-w
${
nodes
[0]
}
--output
=
${
LOG_DIR
}
/profile.out
--error
=
${
LOG_DIR
}
/profile.err
--overlap
bash /scripts/
${
PROFILER_TYPE
}
/bench.sh 0
$AGG_WORKERS
$PREFILL_GPUS
$DECODE_GPUS
$TOTAL_GPUS
${
PROFILER_ARGS
}
&
{
% endraw %
}
{
% endif %
}
{
% raw %
}
wait
-n
first_exit_code
=
$?
echo
"Script finished at
$(
date
)
with exit code
${
first_exit_code
}
"
exit
$first_exit_code
{
% endraw %
}
examples/backends/sglang/slurm_jobs/job_script_template.j2
→
examples/backends/sglang/slurm_jobs/job_script_template
_disagg
.j2
View file @
f8219b12
...
@@ -5,8 +5,8 @@
...
@@ -5,8 +5,8 @@
#SBATCH --ntasks-per-node=1
#SBATCH --ntasks-per-node=1
#SBATCH --account={{ account }}
#SBATCH --account={{ account }}
#SBATCH --time={{ time_limit }}
#SBATCH --time={{ time_limit }}
#SBATCH --output=logs/%j/log.out
#SBATCH --output=logs/%j
_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}
/log.out
#SBATCH --error=logs/%j/log.err
#SBATCH --error=logs/%j
_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}
/log.err
#SBATCH --partition={{ partition }}
#SBATCH --partition={{ partition }}
# Constants
# Constants
...
@@ -18,9 +18,11 @@ DECODE_WORKERS={{ decode_workers }}
...
@@ -18,9 +18,11 @@ DECODE_WORKERS={{ decode_workers }}
TOTAL_NODES
=
$((
PREFILL_NODES
+
DECODE_NODES
))
TOTAL_NODES
=
$((
PREFILL_NODES
+
DECODE_NODES
))
GPUS_PER_NODE
={{
gpus_per_node
}}
GPUS_PER_NODE
={{
gpus_per_node
}}
TOTAL_GPUS
=
$((
TOTAL_NODES
*
GPUS_PER_NODE
))
TOTAL_GPUS
=
$((
TOTAL_NODES
*
GPUS_PER_NODE
))
PREFILL_GPUS
=
$((
PREFILL_NODES
*
GPUS_PER_NODE
))
DECODE_GPUS
=
$((
DECODE_NODES
*
GPUS_PER_NODE
))
PREFILL_NODES_PER_WORKER
=
$((
PREFILL_NODES
/
PREFILL_WORKERS
))
PREFILL_NODES_PER_WORKER
=
$((
PREFILL_NODES
/
PREFILL_WORKERS
))
DECODE_NODES_PER_WORKER
=
$((
DECODE_NODES
/
DECODE_WORKERS
))
DECODE_NODES_PER_WORKER
=
$((
DECODE_NODES
/
DECODE_WORKERS
))
LOG_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/logs/
${
SLURM_JOB_ID
}
"
LOG_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/logs/
${
SLURM_JOB_ID
}
_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}
"
SCRIPT_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/scripts"
SCRIPT_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/scripts"
OUTPUT_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/outputs"
OUTPUT_DIR
=
"
${
SLURM_SUBMIT_DIR
}
/outputs"
MODEL_DIR
=
"{{ model_dir }}"
MODEL_DIR
=
"{{ model_dir }}"
...
@@ -49,8 +51,8 @@ done
...
@@ -49,8 +51,8 @@ done
{
%
if
enable_multiple_frontends %
}
{
%
if
enable_multiple_frontends %
}
{
% raw %
}
{
% raw %
}
# Multiple frontend architecture
# Multiple frontend architecture
# Node 0: nginx only
# Node 0: nginx only
+ prefill shard
# Node 1: NATS/ETCD + first frontend + prefill
worker
# Node 1: NATS/ETCD + first frontend + prefill
shard
# Node 2+: prefill/decode workers + optional additional frontends
# Node 2+: prefill/decode workers + optional additional frontends
NGINX_NODE
=
${
nodes
[0]
}
NGINX_NODE
=
${
nodes
[0]
}
...
@@ -168,7 +170,10 @@ ENROOT_ARGS="\
...
@@ -168,7 +170,10 @@ ENROOT_ARGS="\
"
"
# Build common worker arguments
# Build common worker arguments
WORKER_ARGS
=
"--gpu_type
${
GPU_TYPE
}
--gpus_per_node
${
GPUS_PER_NODE
}
--master_ip
${
MASTER_IP
}
"
{
% endraw %
}
SCRIPT_VARIANT
=
"{{ script_variant | default('default') }}"
{
% raw %
}
WORKER_ARGS
=
"--gpu_type
${
GPU_TYPE
}
--script-variant
${
SCRIPT_VARIANT
}
--gpus_per_node
${
GPUS_PER_NODE
}
--master_ip
${
MASTER_IP
}
"
{
% endraw %
}
{
% endraw %
}
{
%
if
enable_multiple_frontends %
}
{
%
if
enable_multiple_frontends %
}
{
% raw %
}
{
% raw %
}
...
@@ -182,6 +187,12 @@ WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
...
@@ -182,6 +187,12 @@ WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
WORKER_ARGS
=
"
$WORKER_ARGS
--use_init_locations"
WORKER_ARGS
=
"
$WORKER_ARGS
--use_init_locations"
{
% endraw %
}
{
% endraw %
}
{
% endif %
}
{
% endif %
}
{
%
if
run_in_ci %
}
{
% raw %
}
# Add CI mode flag for worker setup
WORKER_ARGS
=
"
$WORKER_ARGS
--run-in-ci"
{
% endraw %
}
{
% endif %
}
{
% raw %
}
{
% raw %
}
{
% endraw %
}
{
% endraw %
}
...
@@ -237,8 +248,18 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
...
@@ -237,8 +248,18 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
local_rank
=
$node_idx
local_rank
=
$node_idx
echo
"Launching prefill worker
$worker_idx
, node
$global_node_idx
(local_rank
$local_rank
):
$node
"
echo
"Launching prefill worker
$worker_idx
, node
$global_node_idx
(local_rank
$local_rank
):
$node
"
{
% endraw %
}
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$node
--output=
${
LOG_DIR
}
/
${
node
}
_prefill_w
${
worker_idx
}
.out --error=
${
LOG_DIR
}
/
${
node
}
_prefill_w
${
worker_idx
}
.err python /scripts/worker_setup.py --leader_ip
${
LEADER_IP
}
--worker_idx
${
worker_idx
}
--local_rank
${
local_rank
}
--nodes_per_worker
${
PREFILL_NODES_PER_WORKER
}
--worker_type prefill --gpu_utilization_log /logs/
${
node
}
_prefill_w
${
worker_idx
}
_gpu_utilization.log
${
WORKER_ARGS
}
"
{
%
if
enable_config_dump %
}
{
% raw %
}
CONFIG_DUMP_ARG
=
"--dump-config-path /logs/
${
node
}
_config.json"
{
% endraw %
}
{
%
else
%
}
{
% raw %
}
CONFIG_DUMP_ARG
=
""
{
% endraw %
}
{
% endif %
}
{
% raw %
}
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$node
--output=
${
LOG_DIR
}
/
${
node
}
_prefill_w
${
worker_idx
}
.out --error=
${
LOG_DIR
}
/
${
node
}
_prefill_w
${
worker_idx
}
.err python /scripts/worker_setup.py --leader_ip
${
LEADER_IP
}
--worker_idx
${
worker_idx
}
--local_rank
${
local_rank
}
--nodes_per_worker
${
PREFILL_NODES_PER_WORKER
}
--worker_type prefill --gpu_utilization_log /logs/
${
node
}
_prefill_w
${
worker_idx
}
_gpu_utilization.log
${
WORKER_ARGS
}
${
CONFIG_DUMP_ARG
}
"
echo
"
$cmd
"
echo
"
$cmd
"
$cmd
&
$cmd
&
done
done
...
@@ -260,8 +281,18 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do
...
@@ -260,8 +281,18 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do
local_rank
=
$node_idx
local_rank
=
$node_idx
echo
"Launching decode worker
$worker_idx
, node
$global_node_idx
(local_rank
$local_rank
):
$node
"
echo
"Launching decode worker
$worker_idx
, node
$global_node_idx
(local_rank
$local_rank
):
$node
"
{
% endraw %
}
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$node
--output=
${
LOG_DIR
}
/
${
node
}
_decode_w
${
worker_idx
}
.out --error=
${
LOG_DIR
}
/
${
node
}
_decode_w
${
worker_idx
}
.err python /scripts/worker_setup.py --leader_ip
${
LEADER_IP
}
--worker_idx
${
worker_idx
}
--local_rank
${
local_rank
}
--nodes_per_worker
${
DECODE_NODES_PER_WORKER
}
--worker_type decode --gpu_utilization_log /logs/
${
node
}
_decode_w
${
worker_idx
}
_gpu_utilization.log
${
WORKER_ARGS
}
"
{
%
if
enable_config_dump %
}
{
% raw %
}
CONFIG_DUMP_ARG
=
"--dump-config-path /logs/
${
node
}
_config.json"
{
% endraw %
}
{
%
else
%
}
{
% raw %
}
CONFIG_DUMP_ARG
=
""
{
% endraw %
}
{
% endif %
}
{
% raw %
}
cmd
=
"srun --overlap
$ENROOT_ARGS
--nodes=1 --ntasks=1 --nodelist=
$node
--output=
${
LOG_DIR
}
/
${
node
}
_decode_w
${
worker_idx
}
.out --error=
${
LOG_DIR
}
/
${
node
}
_decode_w
${
worker_idx
}
.err python /scripts/worker_setup.py --leader_ip
${
LEADER_IP
}
--worker_idx
${
worker_idx
}
--local_rank
${
local_rank
}
--nodes_per_worker
${
DECODE_NODES_PER_WORKER
}
--worker_type decode --gpu_utilization_log /logs/
${
node
}
_decode_w
${
worker_idx
}
_gpu_utilization.log
${
CONFIG_DUMP_ARG
}
${
WORKER_ARGS
}
"
echo
"
$cmd
"
echo
"
$cmd
"
$cmd
&
$cmd
&
done
done
...
@@ -298,7 +329,7 @@ PROFILER_ARGS="{{ profiler_arg }}"
...
@@ -298,7 +329,7 @@ PROFILER_ARGS="{{ profiler_arg }}"
{
%
if
do_profile %
}
{
%
if
do_profile %
}
{
% raw %
}
{
% raw %
}
srun
--nodes
=
1
--ntasks
=
1
$ENROOT_ARGS
--jobid
$SLURM_JOB_ID
-w
${
nodes
[0]
}
--output
=
${
LOG_DIR
}
/profile.out
--error
=
${
LOG_DIR
}
/profile.err
--overlap
bash /scripts/
${
PROFILER_TYPE
}
/bench.sh
$PREFILL_WORKERS
$DECODE_WORKERS
$
TOTAL
_GPUS
${
PROFILER_ARGS
}
&
srun
--nodes
=
1
--ntasks
=
1
$ENROOT_ARGS
--jobid
$SLURM_JOB_ID
-w
${
nodes
[0]
}
--output
=
${
LOG_DIR
}
/profile.out
--error
=
${
LOG_DIR
}
/profile.err
--overlap
bash /scripts/
${
PROFILER_TYPE
}
/bench.sh
$PREFILL_WORKERS
$DECODE_WORKERS
$
PREFILL_GPUS
$DECODE
_GPUS
${
PROFILER_ARGS
}
&
{
% endraw %
}
{
% endraw %
}
{
% endif %
}
{
% endif %
}
...
...
examples/backends/sglang/slurm_jobs/parse.py
0 → 100644
View file @
f8219b12
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# pylint: skip-file
import
json
import
os
import
re
### Slurm configs
SLURM_JOB_ID
=
"slurm id"
### Model Deployment configurations
PREFILL_TP
=
"Prefill TP"
PREFILL_DP
=
"Prefill DP"
DECODE_TP
=
"Decode TP"
DECODE_DP
=
"Decode DP"
FRONTENDS
=
"Frontends"
### Profiler configs
PROFILER_TYPE
=
"Profiler type"
ISL
=
"ISL"
OSL
=
"OSL"
REQUEST_RATE
=
"Request rate"
CONCURRENCIES
=
"Concurrencies"
OUTPUT_TPS
=
"Output TPS"
OUTPUT_TPS_PER_USER
=
"Output TPS/User"
ITL
=
"Mean ITL (ms)"
TTFT
=
"Mean TTFT (ms)"
TPOT
=
"Mean TPOT (ms)"
### FORMAT PRINT ORDERS
KEY_PRINT_ORDER
=
[
SLURM_JOB_ID
,
PREFILL_TP
,
PREFILL_DP
,
DECODE_TP
,
DECODE_DP
,
FRONTENDS
,
PROFILER_TYPE
,
ISL
,
OSL
,
REQUEST_RATE
,
CONCURRENCIES
,
OUTPUT_TPS
,
OUTPUT_TPS_PER_USER
,
ITL
,
TTFT
,
TPOT
,
]
def
format_key_order
():
report
=
"================
\n
The following log will be reported according to this order:
\n
----
\n
"
for
key
in
KEY_PRINT_ORDER
:
report
+=
f
"
{
key
}
\n
"
print
(
report
[:
-
1
])
def
format_print
(
result
):
report
=
"================
\n
"
for
key
in
KEY_PRINT_ORDER
:
report
+=
f
"
{
result
.
get
(
key
,
''
)
}
\n
"
print
(
report
[:
-
1
])
def
analyze_sgl_out
(
folder
):
result
=
[]
for
file
in
os
.
listdir
(
folder
):
with
open
(
f
"
{
folder
}
/
{
file
}
"
,
"r"
)
as
f
:
content
=
json
.
load
(
f
)
res
=
[
content
[
"max_concurrency"
],
content
[
"output_throughput"
],
content
[
"mean_itl_ms"
],
content
[
"mean_ttft_ms"
],
content
[
"request_rate"
],
]
if
"mean_tpot_ms"
in
content
:
res
.
append
(
content
[
"mean_tpot_ms"
])
result
.
append
(
res
)
out
=
{
REQUEST_RATE
:
[],
CONCURRENCIES
:
[],
OUTPUT_TPS
:
[],
ITL
:
[],
TTFT
:
[],
TPOT
:
[],
}
for
data
in
sorted
(
result
,
key
=
lambda
x
:
x
[
0
]):
con
,
tps
,
itl
,
ttft
,
req_rate
=
data
[
0
:
5
]
out
[
CONCURRENCIES
].
append
(
con
)
out
[
OUTPUT_TPS
].
append
(
tps
)
out
[
ITL
].
append
(
itl
)
out
[
TTFT
].
append
(
ttft
)
out
[
REQUEST_RATE
].
append
(
req_rate
)
if
len
(
data
)
>=
6
:
if
TPOT
not
in
out
:
out
[
TPOT
]
=
[]
out
[
TPOT
].
append
(
data
[
5
])
return
out
def
analyze_gap_out
(
folder
):
result
=
[]
for
file
in
os
.
listdir
(
folder
):
with
open
(
f
"
{
folder
}
/
{
file
}
"
,
"r"
)
as
f
:
content
=
json
.
load
(
f
)
result
.
append
(
(
content
[
"input_config"
][
"perf_analyzer"
][
"stimulus"
][
"concurrency"
],
content
[
"output_token_throughput_per_user"
][
"avg"
],
content
[
"output_token_throughput"
][
"avg"
],
)
)
out
=
{
CONCURRENCIES
:
[],
OUTPUT_TPS
:
[],
OUTPUT_TPS_PER_USER
:
[]}
for
con
,
tpspuser
,
tps
in
sorted
(
result
,
key
=
lambda
x
:
x
[
0
]):
out
[
CONCURRENCIES
].
append
(
con
)
out
[
OUTPUT_TPS
].
append
(
tps
)
out
[
OUTPUT_TPS_PER_USER
].
append
(
tpspuser
)
return
out
def
analyze
(
p
):
files
=
os
.
listdir
(
p
)
prefill_nodes
=
{}
decode_nodes
=
{}
frontends
=
[]
profile_result
=
{}
for
file
in
files
:
p_re
=
re
.
search
(
"([-_A-Za-z0-9]+)_(prefill|decode|nginx|frontend)_([a-zA-Z0-9]+).out"
,
file
)
if
p_re
is
not
None
:
_
,
node_type
,
number
=
p_re
.
groups
()
if
node_type
==
"prefill"
:
if
number
not
in
prefill_nodes
:
prefill_nodes
[
number
]
=
[]
prefill_nodes
[
number
].
append
(
file
)
elif
node_type
==
"decode"
:
if
number
not
in
decode_nodes
:
decode_nodes
[
number
]
=
[]
decode_nodes
[
number
].
append
(
file
)
elif
node_type
==
"frontend"
:
frontends
.
append
(
file
)
profiler_match
=
re
.
match
(
"(sglang|vllm|gap)_isl_([0-9]+)_osl_([0-9]+)"
,
file
)
if
profiler_match
:
profiler
,
isl
,
osl
=
profiler_match
.
groups
()
if
profiler
==
"gap"
:
profile_result
=
analyze_gap_out
(
f
"
{
p
}
/
{
file
}
"
)
else
:
profile_result
=
analyze_sgl_out
(
f
"
{
p
}
/
{
file
}
"
)
profile_result
[
PROFILER_TYPE
]
=
profiler
profile_result
[
ISL
]
=
isl
profile_result
[
OSL
]
=
osl
config
=
{
SLURM_JOB_ID
:
p
}
if
len
(
prefill_nodes
.
values
())
!=
0
:
config
[
PREFILL_TP
]
=
f
"
{
len
(
list
(
prefill_nodes
.
values
())[
0
])
*
4
}
"
config
[
PREFILL_DP
]
=
f
"
{
len
(
prefill_nodes
.
keys
())
}
"
if
len
(
decode_nodes
.
values
())
!=
0
:
config
[
DECODE_TP
]
=
f
"
{
len
(
list
(
decode_nodes
.
values
())[
0
])
*
4
}
"
config
[
DECODE_DP
]
=
f
"
{
len
(
decode_nodes
.
keys
())
}
"
if
len
(
frontends
)
!=
0
:
config
[
FRONTENDS
]
=
f
"
{
len
(
frontends
)
}
"
result
=
{
**
config
}
for
key
,
value
in
profile_result
.
items
():
result
[
key
]
=
(
value
if
type
(
value
)
!=
list
else
", "
.
join
([
str
(
x
)
for
x
in
value
])
# ignore:
)
return
result
paths
=
[
x
for
x
in
os
.
listdir
(
"."
)
if
".py"
not
in
x
and
os
.
path
.
isdir
(
x
)]
format_key_order
()
def
extract_job_id
(
dirname
):
"""Extract job ID from directory name for sorting.
Handles formats like:
- 12345_3P_1D_20250104_123456 (disaggregated)
- 12345_4A_20250104_123456 (aggregated)
- 12345 (legacy format)
"""
try
:
return
int
(
dirname
.
split
(
"_"
)[
0
])
except
(
ValueError
,
IndexError
):
# If directory name doesn't match expected format, return -1
return
-
1
for
path
in
sorted
(
paths
,
key
=
extract_job_id
,
reverse
=
True
):
result
=
analyze
(
path
)
if
OUTPUT_TPS
not
in
result
:
pass
else
:
format_print
(
result
)
examples/backends/sglang/slurm_jobs/scripts/
gap
/bench.sh
→
examples/backends/sglang/slurm_jobs/scripts/
aiperf
/bench.sh
View file @
f8219b12
...
@@ -4,11 +4,13 @@
...
@@ -4,11 +4,13 @@
prefill_workers
=
$1
prefill_workers
=
$1
decode_workers
=
$2
decode_workers
=
$2
total_gpus
=
$3
prefill_gpus
=
$3
decode_gpus
=
$4
total_gpus
=
$((
prefill_gpus+decode_gpus
))
chosen_isl
=
$
4
chosen_isl
=
$
5
chosen_osl
=
$
5
chosen_osl
=
$
6
chosen_concurrencies
=
$
6
chosen_concurrencies
=
$
7
echo
"Profiling for model with PrefillDP=
${
prefill_workers
}
, DecodeDP=
${
decode_workers
}
"
echo
"Profiling for model with PrefillDP=
${
prefill_workers
}
, DecodeDP=
${
decode_workers
}
"
...
...
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4.sh
→
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4
/disagg/default
.sh
View file @
f8219b12
...
@@ -67,10 +67,18 @@ fi
...
@@ -67,10 +67,18 @@ fi
# Construct command based on mode
# Construct command based on mode
if
[
"
$mode
"
=
"prefill"
]
;
then
if
[
"
$mode
"
=
"prefill"
]
;
then
set
-x
set
-x
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
# no expert locations collected for fp4 yet
# no expert locations collected for fp4 yet
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
" "
;
fi
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
" "
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
# we have to install pre-release cutedsl for a integer overflow fix
python3
-m
pip
install
--no-cache-dir
--upgrade
--pre
nvidia-cutlass-dsl
# set your own cache variables here
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
export
FLASHINFER_WORKSPACE_BASE
=
"/configs/flashinfer-cache"
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN
=
1
\
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN
=
1
\
...
@@ -112,12 +120,12 @@ if [ "$mode" = "prefill" ]; then
...
@@ -112,12 +120,12 @@ if [ "$mode" = "prefill" ]; then
--max-prefill-tokens
16384
\
--max-prefill-tokens
16384
\
--load-balance-method
round_robin
\
--load-balance-method
round_robin
\
--quantization
modelopt_fp4
\
--quantization
modelopt_fp4
\
--enable-ep-moe
\
--moe-runner-backend
flashinfer_cutlass
\
--moe-runner-backend
flashinfer_cutlass
\
--dist-init-addr
"
$HOST_IP_MACHINE
:
$PORT
"
\
--dist-init-addr
"
$HOST_IP_MACHINE
:
$PORT
"
\
--disaggregation-bootstrap-port
30001
\
--disaggregation-bootstrap-port
30001
\
--nnodes
"
$TOTAL_NODES
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--node-rank
"
$RANK
"
\
--ep-size
"
$TOTAL_GPUS
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--enable-dp-attention
\
...
@@ -125,12 +133,27 @@ if [ "$mode" = "prefill" ]; then
...
@@ -125,12 +133,27 @@ if [ "$mode" = "prefill" ]; then
--stream-interval
50
\
--stream-interval
50
\
--log-level
debug
${
command_suffix
}
--log-level
debug
${
command_suffix
}
# For now we must keep SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK and cuda-graph-bs at 1024 until
# DeepEP merges in https://github.com/deepseek-ai/DeepEP/pull/440
# the nvidia-cutlass-dsl install fixes https://github.com/flashinfer-ai/flashinfer/issues/1830#issuecomment-3380074018
# which was previously limiting us to DISPATCH_TOKENS and cuda-graph-bs == 384
# For now use 12 nodes for fp4 since flashinfer_cutedsl requires experts per gpu < 8
# We have 288 (256 + 32 redundant) => 288/48 = 6
elif
[
"
$mode
"
=
"decode"
]
;
then
elif
[
"
$mode
"
=
"decode"
]
;
then
set
-x
set
-x
# no expert locations collected for fp4 yet
command_suffix
=
""
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
" "
;
fi
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
" "
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
# set your own cache variables here
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
export
FLASHINFER_WORKSPACE_BASE
=
"/configs/flashinfer-cache"
# we have to install pre-release cutedsl for a integer overflow fix
python3
-m
pip
install
--no-cache-dir
--upgrade
--pre
nvidia-cutlass-dsl
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN
=
1
\
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN
=
1
\
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2
=
1
\
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2
=
1
\
...
@@ -141,7 +164,7 @@ elif [ "$mode" = "decode" ]; then
...
@@ -141,7 +164,7 @@ elif [ "$mode" = "decode" ]; then
SGLANG_DISAGGREGATION_WAITING_TIMEOUT
=
100000
\
SGLANG_DISAGGREGATION_WAITING_TIMEOUT
=
100000
\
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
=
1
\
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
=
1
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
1408
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
384
\
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH
=
1
\
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH
=
1
\
SGLANG_FP4_GEMM_BACKEND
=
cutlass
\
SGLANG_FP4_GEMM_BACKEND
=
cutlass
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
...
@@ -165,11 +188,10 @@ elif [ "$mode" = "decode" ]; then
...
@@ -165,11 +188,10 @@ elif [ "$mode" = "decode" ]; then
--enable-dp-attention
\
--enable-dp-attention
\
--chunked-prefill-size
786432
\
--chunked-prefill-size
786432
\
--mem-fraction-static
0.83
\
--mem-fraction-static
0.83
\
--enable-ep-moe
\
--moe-a2a-backend
deepep
\
--moe-a2a-backend
deepep
\
--deepep-mode
low_latency
\
--deepep-mode
low_latency
\
--ep-dispatch-algorithm
static
\
--ep-dispatch-algorithm
static
\
--cuda-graph-bs
1408
\
--cuda-graph-bs
384
\
--num-reserved-decode-tokens
112
\
--num-reserved-decode-tokens
112
\
--ep-num-redundant-experts
32
\
--ep-num-redundant-experts
32
\
--eplb-algorithm
deepseek
\
--eplb-algorithm
deepseek
\
...
@@ -184,6 +206,7 @@ elif [ "$mode" = "decode" ]; then
...
@@ -184,6 +206,7 @@ elif [ "$mode" = "decode" ]; then
--nnodes
"
$TOTAL_NODES
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--node-rank
"
$RANK
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--ep-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-single-batch-overlap
\
--enable-single-batch-overlap
\
--enable-dp-attention
\
--enable-dp-attention
\
...
...
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
0 → 100755
View file @
f8219b12
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Simple agg script (not an optimized config)
print_usage
()
{
echo
"Usage:
$0
"
echo
""
echo
"This script runs aggregated mode (single dynamo.sglang instance)"
exit
1
}
echo
"Mode: aggregated"
echo
"Command: dynamo"
# Check if required environment variables are set
if
[
-z
"
$HOST_IP_MACHINE
"
]
;
then
echo
"Error: HOST_IP_MACHINE environment variable is not set"
exit
1
fi
if
[
-z
"
$PORT
"
]
;
then
echo
"Error: PORT environment variable is not set"
exit
1
fi
if
[
-z
"
$TOTAL_GPUS
"
]
;
then
echo
"Error: TOTAL_GPUS environment variable is not set"
exit
1
fi
if
[
-z
"
$RANK
"
]
;
then
echo
"Error: RANK environment variable is not set"
exit
1
fi
if
[
-z
"
$TOTAL_NODES
"
]
;
then
echo
"Error: TOTAL_NODES environment variable is not set"
exit
1
fi
# Construct command suffix for config dump
command_suffix
=
""
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
set
-x
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
export
FLASHINFER_WORKSPACE_BASE
=
"/configs/flashinfer-cache"
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
MC_TE_METRIC
=
true
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
MC_FORCE_MNNVL
=
1
\
NCCL_MNNVL_ENABLE
=
1
\
NCCL_CUMEM_ENABLE
=
1
\
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
=
1
\
PYTHONUNBUFFERED
=
1
\
python3
-m
dynamo.sglang
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--model-path
/model/
\
--skip-tokenizer-init
\
--trust-remote-code
\
--dist-init-addr
"
$HOST_IP_MACHINE
:
$PORT
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--host
0.0.0.0
\
--max-running-requests
30000
\
--context-length
2200
\
--disable-radix-cache
\
--moe-a2a-backend
deepep
\
--load-balance-method
round_robin
\
--deepep-mode
normal
\
--ep-dispatch-algorithm
dynamic
\
--moe-dense-tp-size
1
\
--enable-dp-lm-head
\
--disable-shared-experts-fusion
\
--ep-num-redundant-experts
32
\
--eplb-algorithm
deepseek
\
--attention-backend
trtllm_mla
\
--kv-cache-dtype
fp8_e4m3
\
--watchdog-timeout
1000000
\
--disable-cuda-graph
\
--chunked-prefill-size
131072
\
--max-total-tokens
524288
\
--deepep-config
/configs/deepep_config.json
\
--stream-interval
50
\
--mem-fraction-static
0.75
${
command_suffix
}
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1p_4d.sh
0 → 100755
View file @
f8219b12
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Low Latency Config
# Function to print usage
print_usage
()
{
echo
"Usage:
$0
<mode>"
echo
" mode: prefill or decode"
echo
""
echo
"Examples:"
echo
"
$0
prefill"
echo
"
$0
decode"
exit
1
}
# Check if correct number of arguments provided
if
[
$#
-ne
1
]
;
then
echo
"Error: Expected 1 argument, got $#"
print_usage
fi
# Parse arguments
mode
=
$1
# Validate mode argument
if
[
"
$mode
"
!=
"prefill"
]
&&
[
"
$mode
"
!=
"decode"
]
;
then
echo
"Error: mode must be 'prefill' or 'decode', got '
$mode
'"
print_usage
fi
echo
"Mode:
$mode
"
echo
"Command: dynamo"
# Check if required environment variables are set
if
[
-z
"
$HOST_IP_MACHINE
"
]
;
then
echo
"Error: HOST_IP_MACHINE environment variable is not set"
exit
1
fi
if
[
-z
"
$PORT
"
]
;
then
echo
"Error: PORT environment variable is not set"
exit
1
fi
if
[
-z
"
$TOTAL_GPUS
"
]
;
then
echo
"Error: TOTAL_GPUS environment variable is not set"
exit
1
fi
if
[
-z
"
$RANK
"
]
;
then
echo
"Error: RANK environment variable is not set"
exit
1
fi
if
[
-z
"
$TOTAL_NODES
"
]
;
then
echo
"Error: TOTAL_NODES environment variable is not set"
exit
1
fi
if
[
-z
"
$USE_INIT_LOCATIONS
"
]
;
then
echo
"Error: USE_INIT_LOCATIONS environment variable is not set"
exit
1
fi
if
[
-z
"
$RUN_IN_CI
"
]
;
then
echo
"Error: RUN_IN_CI environment variable is not set"
exit
1
fi
# Construct command based on mode
if
[
"
$mode
"
=
"prefill"
]
;
then
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.6.1-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json"
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
MC_TE_METRIC
=
true
\
SGLANG_ENABLE_FLASHINFER_GEMM
=
1
\
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
=
100000
\
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
=
100000
\
SGLANG_DISAGGREGATION_WAITING_TIMEOUT
=
100000
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
MC_FORCE_MNNVL
=
1
\
NCCL_MNNVL_ENABLE
=
1
\
NCCL_CUMEM_ENABLE
=
1
\
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
=
0
\
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
=
1
\
PYTHONUNBUFFERED
=
1
\
python3
-m
dynamo.sglang
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--model-path
/model/
\
--trust-remote-code
\
--disable-radix-cache
\
--moe-dense-tp-size
1
\
--max-running-requests
512
\
--chunked-prefill-size
8192
\
--mem-fraction-static
0.95
\
--cuda-graph-max-bs
128
\
--context-length
2200
\
--kv-cache-dtype
fp8_e4m3
\
--quantization
fp8
\
--attention-backend
trtllm_mla
\
--stream-interval
10
\
--max-total-tokens
8192
\
--enable-flashinfer-allreduce-fusion
\
--moe-runner-backend
flashinfer_trtllm
\
--load-balance-method
round_robin
\
--scheduler-recv-interval
10
\
--enable-symm-mem
\
--dist-init-addr
"
$HOST_IP_MACHINE
:
$PORT
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--base-gpu-id
0
\
--disaggregation-mode
prefill
\
--host
0.0.0.0
\
--tensor-parallel-size
"
$TOTAL_GPUS
"
\
--data-parallel-size
1
\
--expert-parallel-size
1
${
command_suffix
}
elif
[
"
$mode
"
=
"decode"
]
;
then
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.6.1-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
MC_TE_METRIC
=
true
\
SGLANG_ENABLE_FLASHINFER_GEMM
=
1
\
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
=
100000
\
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
=
100000
\
SGLANG_DISAGGREGATION_WAITING_TIMEOUT
=
100000
\
SGLANG_DECODE_BOOTSTRAP_TIMEOUT
=
1000
\
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
=
1
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
MC_FORCE_MNNVL
=
1
\
NCCL_MNNVL_ENABLE
=
1
\
NCCL_CUMEM_ENABLE
=
1
\
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
=
0
\
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
=
1
\
PYTHONUNBUFFERED
=
1
\
python3
-m
dynamo.sglang
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--model-path
/model/
\
--trust-remote-code
\
--disable-radix-cache
\
--moe-dense-tp-size
1
\
--max-running-requests
512
\
--chunked-prefill-size
8192
\
--mem-fraction-static
0.95
\
--cuda-graph-max-bs
128
\
--context-length
2200
\
--kv-cache-dtype
fp8_e4m3
\
--quantization
fp8
\
--attention-backend
trtllm_mla
\
--stream-interval
10
\
--enable-flashinfer-allreduce-fusion
\
--moe-runner-backend
flashinfer_trtllm
\
--prefill-round-robin-balance
\
--scheduler-recv-interval
10
\
--enable-symm-mem
\
--dist-init-addr
"
$HOST_IP_MACHINE
:
$PORT
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--base-gpu-id
0
\
--disaggregation-mode
decode
\
--host
0.0.0.0
\
--tensor-parallel-size
"
$TOTAL_GPUS
"
\
--data-parallel-size
1
\
--expert-parallel-size
1
${
command_suffix
}
fi
\ No newline at end of file
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
→
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8
/disagg/default
.sh
View file @
f8219b12
...
@@ -62,14 +62,24 @@ if [ -z "$USE_INIT_LOCATIONS" ]; then
...
@@ -62,14 +62,24 @@ if [ -z "$USE_INIT_LOCATIONS" ]; then
exit
1
exit
1
fi
fi
if
[
-z
"
$RUN_IN_CI
"
]
;
then
echo
"Error: RUN_IN_CI environment variable is not set"
exit
1
fi
# Construct command based on mode
# Construct command based on mode
if
[
"
$mode
"
=
"prefill"
]
;
then
if
[
"
$mode
"
=
"prefill"
]
;
then
set
-x
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.6.1-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGL_DG_CACHE_DIR
=
"/configs/dg
cache/3p1dcache
"
export
SGL
ANG
_DG_CACHE_DIR
=
"/configs/dg
-10212025
"
command_suffix
=
""
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json"
;
fi
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json"
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
MC_TE_METRIC
=
true
\
MC_TE_METRIC
=
true
\
...
@@ -109,7 +119,8 @@ if [ "$mode" = "prefill" ]; then
...
@@ -109,7 +119,8 @@ if [ "$mode" = "prefill" ]; then
--disable-shared-experts-fusion
\
--disable-shared-experts-fusion
\
--ep-num-redundant-experts
32
\
--ep-num-redundant-experts
32
\
--eplb-algorithm
deepseek
\
--eplb-algorithm
deepseek
\
--attention-backend
cutlass_mla
\
--attention-backend
trtllm_mla
\
--kv-cache-dtype
fp8_e4m3
\
--watchdog-timeout
1000000
\
--watchdog-timeout
1000000
\
--disable-cuda-graph
\
--disable-cuda-graph
\
--chunked-prefill-size
131072
\
--chunked-prefill-size
131072
\
...
@@ -120,12 +131,16 @@ if [ "$mode" = "prefill" ]; then
...
@@ -120,12 +131,16 @@ if [ "$mode" = "prefill" ]; then
elif
[
"
$mode
"
=
"decode"
]
;
then
elif
[
"
$mode
"
=
"decode"
]
;
then
set
-x
set
-x
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.6.1-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGL_DG_CACHE_DIR
=
"/configs/dg
cache/3p1dcache
"
export
SGL
ANG
_DG_CACHE_DIR
=
"/configs/dg
-10212025
"
command_suffix
=
""
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"
;
fi
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
768
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
768
\
...
@@ -171,10 +186,11 @@ elif [ "$mode" = "decode" ]; then
...
@@ -171,10 +186,11 @@ elif [ "$mode" = "decode" ]; then
--ep-num-redundant-experts
32
\
--ep-num-redundant-experts
32
\
--ep-dispatch-algorithm
static
\
--ep-dispatch-algorithm
static
\
--eplb-algorithm
deepseek
\
--eplb-algorithm
deepseek
\
--attention-backend
cutlass_mla
\
--attention-backend
trtllm_mla
\
--kv-cache-dtype
fp8_e4m3
\
--watchdog-timeout
1000000
\
--watchdog-timeout
1000000
\
--chunked-prefill-size
36864
\
--chunked-prefill-size
36864
\
--stream-interval
50
\
--stream-interval
50
\
--deepep-config
/configs/deepep_config.json
\
--deepep-config
/configs/deepep_config.json
\
--mem-fraction-static
0.82
${
command_suffix
}
--mem-fraction-static
0.82
${
command_suffix
}
fi
fi
\ No newline at end of file
examples/backends/sglang/slurm_jobs/scripts/h100.sh
deleted
100755 → 0
View file @
2c910b2e
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Function to print usage
print_usage
()
{
echo
"Usage:
$0
<mode>"
echo
" mode: prefill or decode"
echo
""
echo
"Examples:"
echo
"
$0
prefill"
echo
"
$0
decode"
exit
1
}
# Check if correct number of arguments provided
if
[
$#
-ne
1
]
;
then
echo
"Error: Expected 1 argument, got $#"
print_usage
fi
# Parse arguments
mode
=
$1
# Validate mode argument
if
[
"
$mode
"
!=
"prefill"
]
&&
[
"
$mode
"
!=
"decode"
]
;
then
echo
"Error: mode must be 'prefill' or 'decode', got '
$mode
'"
print_usage
fi
echo
"Mode:
$mode
"
echo
"Command: dynamo"
# Check if required environment variables are set
if
[
-z
"
$HOST_IP
"
]
;
then
echo
"Error: HOST_IP environment variable is not set"
exit
1
fi
if
[
-z
"
$PORT
"
]
;
then
echo
"Error: PORT environment variable is not set"
exit
1
fi
if
[
-z
"
$TOTAL_GPUS
"
]
;
then
echo
"Error: TOTAL_GPUS environment variable is not set"
exit
1
fi
if
[
-z
"
$RANK
"
]
;
then
echo
"Error: RANK environment variable is not set"
exit
1
fi
if
[
-z
"
$TOTAL_NODES
"
]
;
then
echo
"Error: TOTAL_NODES environment variable is not set"
exit
1
fi
# Construct command based on mode
if
[
"
$mode
"
=
"prefill"
]
;
then
if
[
"
$cmd
"
=
"dynamo"
]
;
then
# H100 dynamo prefill command
python3
-m
dynamo.sglang
\
--model-path
/model/
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--skip-tokenizer-init
\
--disaggregation-mode
prefill
\
--disaggregation-transfer-backend
nixl
\
--disaggregation-bootstrap-port
30001
\
--dist-init-addr
"
$HOST_IP
:
$PORT
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--decode-log-interval
1
\
--enable-deepep-moe
\
--page-size
1
\
--trust-remote-code
\
--moe-dense-tp-size
1
\
--enable-dp-lm-head
\
--disable-radix-cache
\
--watchdog-timeout
1000000
\
--enable-two-batch-overlap
\
--deepep-mode
normal
\
--mem-fraction-static
0.85
\
--deepep-config
/configs/deepep.json
\
--ep-num-redundant-experts
32
\
--ep-dispatch-algorithm
dynamic
\
--eplb-algorithm
deepseek
elif
[
"
$cmd
"
=
"sglang"
]
;
then
# H100 sglang prefill command
python3
-m
sglang.launch_server
\
--model-path
/model/
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--disaggregation-transfer-backend
nixl
\
--disaggregation-mode
prefill
\
--dist-init-addr
"
$HOST_IP
:
$PORT
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--decode-log-interval
1
\
--enable-deepep-moe
\
--page-size
1
\
--host
0.0.0.0
\
--trust-remote-code
\
--moe-dense-tp-size
1
\
--enable-dp-lm-head
\
--disable-radix-cache
\
--watchdog-timeout
1000000
\
--enable-two-batch-overlap
\
--deepep-mode
normal
\
--mem-fraction-static
0.85
\
--ep-num-redundant-experts
32
\
--ep-dispatch-algorithm
dynamic
\
--eplb-algorithm
deepseek
\
--deepep-config
/configs/deepep.json
fi
elif
[
"
$mode
"
=
"decode"
]
;
then
if
[
"
$cmd
"
=
"dynamo"
]
;
then
# H100 dynamo decode command
python3
-m
dynamo.sglang
\
--model-path
/model/
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--skip-tokenizer-init
\
--disaggregation-mode
decode
\
--disaggregation-transfer-backend
nixl
\
--disaggregation-bootstrap-port
30001
\
--dist-init-addr
"
$HOST_IP
:
$PORT
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--decode-log-interval
1
\
--enable-deepep-moe
\
--page-size
1
\
--trust-remote-code
\
--moe-dense-tp-size
1
\
--enable-dp-lm-head
\
--disable-radix-cache
\
--watchdog-timeout
1000000
\
--enable-two-batch-overlap
\
--deepep-mode
low_latency
\
--mem-fraction-static
0.835
\
--ep-num-redundant-experts
32
\
--cuda-graph-bs
128
elif
[
"
$cmd
"
=
"sglang"
]
;
then
# H100 sglang decode command
python3
-m
sglang.launch_server
\
--model-path
/model/
\
--disaggregation-transfer-backend
nixl
\
--disaggregation-mode
decode
\
--dist-init-addr
"
$HOST_IP
:
$PORT
"
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--decode-log-interval
1
\
--enable-deepep-moe
\
--page-size
1
\
--host
0.0.0.0
\
--trust-remote-code
\
--moe-dense-tp-size
1
\
--enable-dp-lm-head
\
--disable-radix-cache
\
--watchdog-timeout
1000000
\
--enable-two-batch-overlap
\
--deepep-mode
low_latency
\
--mem-fraction-static
0.835
\
--ep-num-redundant-experts
32
\
--cuda-graph-bs
128
fi
fi
examples/backends/sglang/slurm_jobs/scripts/sglang/bench.sh
View file @
f8219b12
...
@@ -4,14 +4,16 @@
...
@@ -4,14 +4,16 @@
n_prefill
=
$1
n_prefill
=
$1
n_decode
=
$2
n_decode
=
$2
total_gpus
=
$3
prefill_gpus
=
$3
decode_gpus
=
$4
total_gpus
=
$((
prefill_gpus+decode_gpus
))
chosen_isl
=
$
4
chosen_isl
=
$
5
chosen_osl
=
$
5
chosen_osl
=
$
6
concurrency_list
=
$
6
concurrency_list
=
$
7
IFS
=
'x'
read
-r
-a
chosen_concurrencies
<<<
"
$concurrency_list
"
IFS
=
'x'
read
-r
-a
chosen_concurrencies
<<<
"
$concurrency_list
"
chosen_req_rate
=
$
7
chosen_req_rate
=
$
8
echo
"Config
${
chosen_isl
}
;
${
chosen_osl
}
;
${
chosen_concurrencies
[@]
}
;
${
chosen_req_rate
}
"
echo
"Config
${
chosen_isl
}
;
${
chosen_osl
}
;
${
chosen_concurrencies
[@]
}
;
${
chosen_req_rate
}
"
...
...
examples/backends/sglang/slurm_jobs/scripts/vllm/backend_request_func.py
View file @
f8219b12
...
@@ -380,7 +380,13 @@ async def async_request_dynamo_completions(
...
@@ -380,7 +380,13 @@ async def async_request_dynamo_completions(
if
not
chunk_bytes
:
if
not
chunk_bytes
:
continue
continue
chunk
=
chunk_bytes
.
decode
(
"utf-8"
).
removeprefix
(
"data: "
)
chunk
=
chunk_bytes
.
decode
(
"utf-8"
)
# Skip SSE event/comment lines (not data)
if
chunk
.
startswith
(
"event:"
)
or
chunk
.
startswith
(
":"
):
continue
chunk
=
chunk
.
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
if
chunk
!=
"[DONE]"
:
data
=
json
.
loads
(
chunk
)
data
=
json
.
loads
(
chunk
)
...
...
examples/backends/sglang/slurm_jobs/scripts/vllm/bench.sh
View file @
f8219b12
...
@@ -11,17 +11,19 @@ head_port=8000
...
@@ -11,17 +11,19 @@ head_port=8000
n_prefill
=
$1
n_prefill
=
$1
n_decode
=
$2
n_decode
=
$2
total_gpus
=
$3
prefill_gpus
=
$3
decode_gpus
=
$4
total_gpus
=
$((
prefill_gpus+decode_gpus
))
source
/scripts/benchmark_utils.sh
source
/scripts/benchmark_utils.sh
work_dir
=
"/scripts/vllm/"
work_dir
=
"/scripts/vllm/"
cd
$work_dir
cd
$work_dir
chosen_isl
=
$
4
chosen_isl
=
$
5
chosen_osl
=
$
5
chosen_osl
=
$
6
concurrency_list
=
$
6
concurrency_list
=
$
7
IFS
=
'x'
read
-r
-a
chosen_concurrencies
<<<
"
$concurrency_list
"
IFS
=
'x'
read
-r
-a
chosen_concurrencies
<<<
"
$concurrency_list
"
chosen_req_rate
=
$
7
chosen_req_rate
=
$
8
echo
"Config
${
chosen_isl
}
;
${
chosen_osl
}
;
${
chosen_concurrencies
[@]
}
;
${
chosen_req_rate
}
"
echo
"Config
${
chosen_isl
}
;
${
chosen_osl
}
;
${
chosen_concurrencies
[@]
}
;
${
chosen_req_rate
}
"
...
@@ -32,28 +34,35 @@ wait_for_model_report_interval=60 # wait_for_model report interval -> 60s
...
@@ -32,28 +34,35 @@ wait_for_model_report_interval=60 # wait_for_model report interval -> 60s
wait_for_model
$head_node
$head_port
$n_prefill
$n_decode
$wait_for_model_check_interval
$wait_for_model_timeout
$wait_for_model_report_interval
wait_for_model
$head_node
$head_port
$n_prefill
$n_decode
$wait_for_model_check_interval
$wait_for_model_timeout
$wait_for_model_report_interval
set
-e
set
-e
# Warmup the model
# Warmup the model
with a sweep of concurrencies
warmup_isl
=
$chosen_isl
warmup_isl
=
$chosen_isl
warmup_osl
=
$chosen_osl
warmup_osl
=
$chosen_osl
warmup_prompts
=
10000
warmup_concurrencies
=
10000
warmup_req_rate
=
250
warmup_req_rate
=
250
set
-x
warmup_concurrency_list
=(
1 4 8 32 64 128 256 512 1024 4096
)
python3 benchmark_serving.py
\
--model
${
model_name
}
--tokenizer
${
model_path
}
\
for
warmup_concurrency
in
"
${
warmup_concurrency_list
[@]
}
"
--host
$head_node
--port
$head_port
\
do
--backend
"dynamo"
--endpoint
/v1/completions
\
echo
"Warming up model with concurrency
$warmup_concurrency
"
--disable-tqdm
\
echo
"
$(
date
'+%Y-%m-%d %H:%M:%S'
)
"
--dataset-name
random
\
num_prompts
=
$((
warmup_concurrency
*
5
))
--num-prompts
"
$warmup_prompts
"
\
set
-x
--random-input-len
$warmup_isl
\
python3
-u
benchmark_serving.py
\
--random-output-len
$warmup_osl
\
--model
${
model_name
}
--tokenizer
${
model_path
}
\
--random-range-ratio
0.8
\
--host
$head_node
--port
$head_port
\
--ignore-eos
\
--backend
"dynamo"
--endpoint
/v1/completions
\
--request-rate
${
warmup_req_rate
}
\
--disable-tqdm
\
--percentile-metrics
ttft,tpot,itl,e2el
\
--dataset-name
random
\
--max-concurrency
"
$warmup_concurrencies
"
--num-prompts
"
$num_prompts
"
\
set
+x
--random-input-len
$warmup_isl
\
--random-output-len
$warmup_osl
\
--random-range-ratio
0.8
\
--ignore-eos
\
--request-rate
${
warmup_req_rate
}
\
--percentile-metrics
ttft,tpot,itl,e2el
\
--max-concurrency
"
$warmup_concurrency
"
set
+x
echo
"
$(
date
'+%Y-%m-%d %H:%M:%S'
)
"
done
set
+e
set
+e
result_dir
=
"/logs/vllm_isl_
${
chosen_isl
}
_osl_
${
chosen_osl
}
"
result_dir
=
"/logs/vllm_isl_
${
chosen_isl
}
_osl_
${
chosen_osl
}
"
...
@@ -64,10 +73,11 @@ for concurrency in "${chosen_concurrencies[@]}"
...
@@ -64,10 +73,11 @@ for concurrency in "${chosen_concurrencies[@]}"
do
do
num_prompts
=
$((
concurrency
*
5
))
num_prompts
=
$((
concurrency
*
5
))
echo
"Running benchmark with concurrency:
$concurrency
and num-prompts:
$num_prompts
, writing to file
${
result_dir
}
"
echo
"Running benchmark with concurrency:
$concurrency
and num-prompts:
$num_prompts
, writing to file
${
result_dir
}
"
result_filename
=
"isl_
${
chosen_isl
}
_osl_
${
chosen_osl
}
_concurrency_
${
concurrency
}
_req_rate_
${
chosen_req_rate
}
_
gpus
${
total
_gpus
}
.json"
result_filename
=
"isl_
${
chosen_isl
}
_osl_
${
chosen_osl
}
_concurrency_
${
concurrency
}
_req_rate_
${
chosen_req_rate
}
_
ctx
${
prefill_gpus
}
_gen
${
decode
_gpus
}
.json"
set
-x
set
-x
python3 benchmark_serving.py
\
echo
"
$(
date
'+%Y-%m-%d %H:%M:%S'
)
"
python3
-u
benchmark_serving.py
\
--model
${
model_name
}
--tokenizer
${
model_path
}
\
--model
${
model_name
}
--tokenizer
${
model_path
}
\
--host
$head_node
--port
$head_port
\
--host
$head_node
--port
$head_port
\
--backend
"dynamo"
--endpoint
/v1/completions
\
--backend
"dynamo"
--endpoint
/v1/completions
\
...
@@ -84,6 +94,7 @@ do
...
@@ -84,6 +94,7 @@ do
--save-result
--result-dir
$result_dir
--result-filename
$result_filename
--save-result
--result-dir
$result_dir
--result-filename
$result_filename
set
+x
set
+x
echo
"
$(
date
'+%Y-%m-%d %H:%M:%S'
)
"
echo
"Completed benchmark with concurrency:
$concurrency
"
echo
"Completed benchmark with concurrency:
$concurrency
"
echo
"-----------------------------------------"
echo
"-----------------------------------------"
done
done
...
...
examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
View file @
f8219b12
...
@@ -155,7 +155,7 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
...
@@ -155,7 +155,7 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--worker_type"
,
"--worker_type"
,
choices
=
[
"decode"
,
"prefill"
,
"frontend"
,
"nginx"
],
choices
=
[
"decode"
,
"prefill"
,
"frontend"
,
"nginx"
,
"aggregated"
],
required
=
True
,
required
=
True
,
help
=
"Type of worker to run"
,
help
=
"Type of worker to run"
,
)
)
...
@@ -175,9 +175,14 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
...
@@ -175,9 +175,14 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
parser
.
add_argument
(
parser
.
add_argument
(
"--gpu_type"
,
"--gpu_type"
,
type
=
str
,
type
=
str
,
choices
=
[
"gb200-fp8"
,
"gb200-fp4"
],
default
=
"gb200-fp8"
,
default
=
"gb200-fp8"
,
help
=
"Type of GPU to use. You can choose between gb200-fp8 and gb200-fp4."
,
help
=
"Type of GPU to use (script will be validated at runtime)"
,
)
parser
.
add_argument
(
"--script-variant"
,
type
=
str
,
default
=
"default"
,
help
=
"Script variant to use (e.g., 'default', 'optim', 'decode-optim'). Defaults to 'default'"
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
...
@@ -198,6 +203,19 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
...
@@ -198,6 +203,19 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
help
=
"Whether we add --init-expert-locations to launch commands"
,
help
=
"Whether we add --init-expert-locations to launch commands"
,
)
)
parser
.
add_argument
(
"--dump-config-path"
,
type
=
str
,
default
=
None
,
help
=
"Path to dump config file (e.g., /logs/node_config.json)"
,
)
parser
.
add_argument
(
"--run-in-ci"
,
action
=
"store_true"
,
help
=
"Run in CI mode - use binaries from /configs/ for nats/etcd"
,
)
return
parser
.
parse_args
(
args
)
return
parser
.
parse_args
(
args
)
...
@@ -236,6 +254,8 @@ def setup_env_vars_for_gpu_script(
...
@@ -236,6 +254,8 @@ def setup_env_vars_for_gpu_script(
total_nodes
:
int
,
total_nodes
:
int
,
port
:
int
=
DIST_INIT_PORT
,
port
:
int
=
DIST_INIT_PORT
,
use_init_locations
:
bool
=
True
,
use_init_locations
:
bool
=
True
,
dump_config_path
:
str
|
None
=
None
,
run_in_ci
:
bool
=
False
,
):
):
"""Setup environment variables required by GPU scripts (gb200-fp8.sh)"""
"""Setup environment variables required by GPU scripts (gb200-fp8.sh)"""
os
.
environ
[
"HOST_IP_MACHINE"
]
=
host_ip
os
.
environ
[
"HOST_IP_MACHINE"
]
=
host_ip
...
@@ -244,6 +264,11 @@ def setup_env_vars_for_gpu_script(
...
@@ -244,6 +264,11 @@ def setup_env_vars_for_gpu_script(
os
.
environ
[
"RANK"
]
=
str
(
local_rank
)
os
.
environ
[
"RANK"
]
=
str
(
local_rank
)
os
.
environ
[
"TOTAL_NODES"
]
=
str
(
total_nodes
)
os
.
environ
[
"TOTAL_NODES"
]
=
str
(
total_nodes
)
os
.
environ
[
"USE_INIT_LOCATIONS"
]
=
str
(
use_init_locations
)
os
.
environ
[
"USE_INIT_LOCATIONS"
]
=
str
(
use_init_locations
)
os
.
environ
[
"RUN_IN_CI"
]
=
str
(
run_in_ci
)
if
dump_config_path
:
os
.
environ
[
"DUMP_CONFIG_PATH"
]
=
dump_config_path
else
:
os
.
environ
.
pop
(
"DUMP_CONFIG_PATH"
,
None
)
logging
.
info
(
f
"Set HOST_IP:
{
host_ip
}
"
)
logging
.
info
(
f
"Set HOST_IP:
{
host_ip
}
"
)
logging
.
info
(
f
"Set PORT:
{
port
}
"
)
logging
.
info
(
f
"Set PORT:
{
port
}
"
)
...
@@ -251,30 +276,65 @@ def setup_env_vars_for_gpu_script(
...
@@ -251,30 +276,65 @@ def setup_env_vars_for_gpu_script(
logging
.
info
(
f
"Set RANK:
{
local_rank
}
"
)
logging
.
info
(
f
"Set RANK:
{
local_rank
}
"
)
logging
.
info
(
f
"Set TOTAL_NODES:
{
total_nodes
}
"
)
logging
.
info
(
f
"Set TOTAL_NODES:
{
total_nodes
}
"
)
logging
.
info
(
f
"Set USE_INIT_LOCATIONS:
{
use_init_locations
}
"
)
logging
.
info
(
f
"Set USE_INIT_LOCATIONS:
{
use_init_locations
}
"
)
logging
.
info
(
f
"Set RUN_IN_CI:
{
run_in_ci
}
"
)
if
dump_config_path
:
logging
.
info
(
f
"Set DUMP_CONFIG_PATH:
{
dump_config_path
}
"
)
def
get_gpu_command
(
worker_type
:
str
,
gpu_type
:
str
)
->
str
:
def
get_gpu_command
(
"""Generate command to run the appropriate GPU script"""
worker_type
:
str
,
gpu_type
:
str
,
script_variant
:
str
=
"default"
script_name
=
f
"
{
gpu_type
}
.sh"
)
->
str
:
script_path
=
Path
(
__file__
).
parent
/
script_name
"""Generate command to run the appropriate GPU script.
mode
=
worker_type
# "prefill" or "decode"
return
f
"bash
{
script_path
}
{
mode
}
"
Scripts are organized as: scripts/{gpu_type}/{agg,disagg}/{script_variant}.sh
"""
script_base
=
Path
(
__file__
).
parent
script_name
=
f
"
{
script_variant
}
.sh"
if
worker_type
==
"aggregated"
:
# Remove any -prefill or -decode suffix if present
base_gpu_type
=
gpu_type
.
replace
(
"-prefill"
,
""
).
replace
(
"-decode"
,
""
)
script_path
=
script_base
/
base_gpu_type
/
"agg"
/
script_name
if
not
script_path
.
exists
():
raise
ValueError
(
f
"Aggregated GPU script not found:
{
script_path
}
"
)
return
f
"bash
{
script_path
}
"
else
:
# Disaggregated mode: scripts/{gpu_type}/disagg/{script_variant}.sh {prefill|decode}
script_path
=
script_base
/
gpu_type
/
"disagg"
/
script_name
if
not
script_path
.
exists
():
raise
ValueError
(
f
"Disaggregated GPU script not found:
{
script_path
}
"
)
mode
=
worker_type
# "prefill" or "decode"
return
f
"bash
{
script_path
}
{
mode
}
"
def
setup_head_prefill_node
(
prefill_host_ip
:
str
)
->
None
:
def
setup_head_prefill_node
(
prefill_host_ip
:
str
,
run_in_ci
:
bool
=
False
)
->
None
:
"""
"""
Setup NATS, etcd, ingress, and http servers on the prefill host node.
Setup NATS, etcd, ingress, and http servers on the prefill host node.
"""
"""
logging
.
info
(
f
"Starting nats server on node
{
prefill_host_ip
}
"
)
if
run_in_ci
:
logging
.
info
(
f
"Starting nats server on node
{
prefill_host_ip
}
(CI mode - using /configs/nats-server)"
)
nats_cmd
=
"/configs/nats-server -js"
else
:
logging
.
info
(
f
"Starting nats server on node
{
prefill_host_ip
}
"
)
nats_cmd
=
"nats-server -js"
nats_process
=
run_command
(
"
nats
-server -js"
,
background
=
True
)
nats_process
=
run_command
(
nats
_cmd
,
background
=
True
)
if
not
nats_process
:
if
not
nats_process
:
raise
RuntimeError
(
"Failed to start nats-server"
)
raise
RuntimeError
(
"Failed to start nats-server"
)
logging
.
info
(
f
"Starting etcd server on node
{
prefill_host_ip
}
"
)
if
run_in_ci
:
logging
.
info
(
f
"Starting etcd server on node
{
prefill_host_ip
}
(CI mode - using /configs/etcd)"
)
etcd_binary
=
"/configs/etcd"
else
:
logging
.
info
(
f
"Starting etcd server on node
{
prefill_host_ip
}
"
)
etcd_binary
=
"etcd"
etcd_cmd
=
(
etcd_cmd
=
(
f
"etcd --listen-client-urls
{
ETCD_LISTEN_ADDR
}
:
{
ETCD_CLIENT_PORT
}
"
f
"
{
etcd
_binary
}
--listen-client-urls
{
ETCD_LISTEN_ADDR
}
:
{
ETCD_CLIENT_PORT
}
"
f
"--advertise-client-urls
{
ETCD_LISTEN_ADDR
}
:
{
ETCD_CLIENT_PORT
}
"
f
"--advertise-client-urls
{
ETCD_LISTEN_ADDR
}
:
{
ETCD_CLIENT_PORT
}
"
f
"--listen-peer-urls
{
ETCD_LISTEN_ADDR
}
:
{
ETCD_PEER_PORT
}
"
f
"--listen-peer-urls
{
ETCD_LISTEN_ADDR
}
:
{
ETCD_PEER_PORT
}
"
f
"--initial-cluster default=http://
{
prefill_host_ip
}
:
{
ETCD_PEER_PORT
}
"
f
"--initial-cluster default=http://
{
prefill_host_ip
}
:
{
ETCD_PEER_PORT
}
"
...
@@ -296,13 +356,15 @@ def setup_nginx_worker(master_ip: str, nginx_config: str) -> int:
...
@@ -296,13 +356,15 @@ def setup_nginx_worker(master_ip: str, nginx_config: str) -> int:
return
run_command
(
nginx_cmd
)
return
run_command
(
nginx_cmd
)
def
setup_frontend_worker
(
worker_idx
:
int
,
master_ip
:
str
)
->
int
:
def
setup_frontend_worker
(
worker_idx
:
int
,
master_ip
:
str
,
run_in_ci
:
bool
=
False
)
->
int
:
"""Setup a frontend worker"""
"""Setup a frontend worker"""
logging
.
info
(
f
"Setting up frontend worker
{
worker_idx
}
"
)
logging
.
info
(
f
"Setting up frontend worker
{
worker_idx
}
"
)
# First frontend (worker_idx 0) also sets up NATS/ETCD
# First frontend (worker_idx 0) also sets up NATS/ETCD
if
worker_idx
==
0
:
if
worker_idx
==
0
:
setup_head_prefill_node
(
master_ip
)
setup_head_prefill_node
(
master_ip
,
run_in_ci
)
else
:
else
:
logging
.
info
(
f
"Setting up additional frontend worker
{
worker_idx
}
"
)
logging
.
info
(
f
"Setting up additional frontend worker
{
worker_idx
}
"
)
if
not
wait_for_etcd
(
f
"http://
{
master_ip
}
:
{
ETCD_CLIENT_PORT
}
"
):
if
not
wait_for_etcd
(
f
"http://
{
master_ip
}
:
{
ETCD_CLIENT_PORT
}
"
):
...
@@ -310,6 +372,8 @@ def setup_frontend_worker(worker_idx: int, master_ip: str) -> int:
...
@@ -310,6 +372,8 @@ def setup_frontend_worker(worker_idx: int, master_ip: str) -> int:
# All frontends run the ingress server
# All frontends run the ingress server
frontend_cmd
=
"python3 -m dynamo.frontend --http-port=8000"
frontend_cmd
=
"python3 -m dynamo.frontend --http-port=8000"
if
run_in_ci
:
frontend_cmd
=
"python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
return
run_command
(
frontend_cmd
)
return
run_command
(
frontend_cmd
)
...
@@ -323,6 +387,9 @@ def setup_prefill_worker(
...
@@ -323,6 +387,9 @@ def setup_prefill_worker(
gpu_type
:
str
,
gpu_type
:
str
,
multiple_frontends_enabled
:
bool
=
False
,
multiple_frontends_enabled
:
bool
=
False
,
use_init_locations
:
bool
=
True
,
use_init_locations
:
bool
=
True
,
dump_config_path
:
str
|
None
=
None
,
script_variant
:
str
=
"default"
,
run_in_ci
:
bool
=
False
,
)
->
int
:
)
->
int
:
"""
"""
Setup the prefill worker.
Setup the prefill worker.
...
@@ -330,7 +397,7 @@ def setup_prefill_worker(
...
@@ -330,7 +397,7 @@ def setup_prefill_worker(
total_gpus
=
nodes_per_worker
*
gpus_per_node
total_gpus
=
nodes_per_worker
*
gpus_per_node
# Only setup infrastructure in traditional mode (not multiple frontends)
# Only setup infrastructure in traditional mode (not multiple frontends)
if
not
multiple_frontends_enabled
and
worker_idx
==
0
and
local_rank
==
0
:
if
not
multiple_frontends_enabled
and
worker_idx
==
0
and
local_rank
==
0
:
setup_head_prefill_node
(
master_ip
)
setup_head_prefill_node
(
master_ip
,
run_in_ci
)
else
:
else
:
logging
.
info
(
f
"Setting up prefill worker
{
worker_idx
}
, local rank
{
local_rank
}
"
)
logging
.
info
(
f
"Setting up prefill worker
{
worker_idx
}
, local rank
{
local_rank
}
"
)
if
not
wait_for_etcd
(
f
"http://
{
master_ip
}
:
{
ETCD_CLIENT_PORT
}
"
):
if
not
wait_for_etcd
(
f
"http://
{
master_ip
}
:
{
ETCD_CLIENT_PORT
}
"
):
...
@@ -343,10 +410,12 @@ def setup_prefill_worker(
...
@@ -343,10 +410,12 @@ def setup_prefill_worker(
total_gpus
,
total_gpus
,
nodes_per_worker
,
nodes_per_worker
,
use_init_locations
=
use_init_locations
,
use_init_locations
=
use_init_locations
,
dump_config_path
=
dump_config_path
,
run_in_ci
=
run_in_ci
,
)
)
# Use appropriate GPU script instead of generating command directly
# Use appropriate GPU script instead of generating command directly
cmd_to_run
=
get_gpu_command
(
"prefill"
,
gpu_type
)
cmd_to_run
=
get_gpu_command
(
"prefill"
,
gpu_type
,
script_variant
)
return
run_command
(
cmd_to_run
)
return
run_command
(
cmd_to_run
)
...
@@ -359,6 +428,9 @@ def setup_decode_worker(
...
@@ -359,6 +428,9 @@ def setup_decode_worker(
gpus_per_node
:
int
,
gpus_per_node
:
int
,
gpu_type
:
str
,
gpu_type
:
str
,
use_init_locations
:
bool
=
True
,
use_init_locations
:
bool
=
True
,
dump_config_path
:
str
|
None
=
None
,
script_variant
:
str
=
"default"
,
run_in_ci
:
bool
=
False
,
)
->
int
:
)
->
int
:
"""
"""
Setup the decode worker.
Setup the decode worker.
...
@@ -376,10 +448,56 @@ def setup_decode_worker(
...
@@ -376,10 +448,56 @@ def setup_decode_worker(
total_gpus
,
total_gpus
,
nodes_per_worker
,
nodes_per_worker
,
use_init_locations
=
use_init_locations
,
use_init_locations
=
use_init_locations
,
dump_config_path
=
dump_config_path
,
run_in_ci
=
run_in_ci
,
)
)
# Use appropriate GPU script instead of generating command directly
# Use appropriate GPU script instead of generating command directly
cmd_to_run
=
get_gpu_command
(
"decode"
,
gpu_type
)
cmd_to_run
=
get_gpu_command
(
"decode"
,
gpu_type
,
script_variant
)
return
run_command
(
cmd_to_run
)
def
setup_aggregated_worker
(
worker_idx
:
int
,
local_rank
:
int
,
leader_ip
:
str
,
master_ip
:
str
,
nodes_per_worker
:
int
,
gpus_per_node
:
int
,
gpu_type
:
str
,
multiple_frontends_enabled
:
bool
=
False
,
dump_config_path
:
str
|
None
=
None
,
script_variant
:
str
=
"default"
,
run_in_ci
:
bool
=
False
,
)
->
int
:
"""
Setup the aggregated worker.
"""
total_gpus
=
nodes_per_worker
*
gpus_per_node
# Only setup infrastructure in traditional mode (not multiple frontends) on first worker, first node
if
not
multiple_frontends_enabled
and
worker_idx
==
0
and
local_rank
==
0
:
setup_head_prefill_node
(
master_ip
,
run_in_ci
)
else
:
logging
.
info
(
f
"Setting up aggregated worker
{
worker_idx
}
, local rank
{
local_rank
}
"
)
if
not
wait_for_etcd
(
f
"http://
{
master_ip
}
:
{
ETCD_CLIENT_PORT
}
"
):
raise
RuntimeError
(
"Failed to connect to etcd"
)
# Setup environment variables for GPU script - use leader_ip as dist-init-addr
# Aggregated mode doesn't use init locations
setup_env_vars_for_gpu_script
(
leader_ip
,
local_rank
,
total_gpus
,
nodes_per_worker
,
use_init_locations
=
False
,
dump_config_path
=
dump_config_path
,
run_in_ci
=
run_in_ci
,
)
# Use appropriate aggregated GPU script
cmd_to_run
=
get_gpu_command
(
"aggregated"
,
gpu_type
,
script_variant
)
return
run_command
(
cmd_to_run
)
return
run_command
(
cmd_to_run
)
...
@@ -410,6 +528,7 @@ def main(input_args: list[str] | None = None):
...
@@ -410,6 +528,7 @@ def main(input_args: list[str] | None = None):
logging
.
info
(
f
"Leader IP:
{
args
.
leader_ip
}
"
)
logging
.
info
(
f
"Leader IP:
{
args
.
leader_ip
}
"
)
logging
.
info
(
f
"Master IP:
{
args
.
master_ip
}
"
)
logging
.
info
(
f
"Master IP:
{
args
.
master_ip
}
"
)
logging
.
info
(
f
"Nodes per worker:
{
args
.
nodes_per_worker
}
"
)
logging
.
info
(
f
"Nodes per worker:
{
args
.
nodes_per_worker
}
"
)
logging
.
info
(
f
"Run in CI mode?:
{
args
.
run_in_ci
}
"
)
logging
.
info
(
f
"Use init locations?:
{
args
.
use_init_locations
}
"
)
logging
.
info
(
f
"Use init locations?:
{
args
.
use_init_locations
}
"
)
setup_env
(
args
.
master_ip
)
setup_env
(
args
.
master_ip
)
...
@@ -419,7 +538,7 @@ def main(input_args: list[str] | None = None):
...
@@ -419,7 +538,7 @@ def main(input_args: list[str] | None = None):
raise
ValueError
(
"--nginx_config is required for nginx worker type"
)
raise
ValueError
(
"--nginx_config is required for nginx worker type"
)
setup_nginx_worker
(
args
.
master_ip
,
args
.
nginx_config
)
setup_nginx_worker
(
args
.
master_ip
,
args
.
nginx_config
)
elif
args
.
worker_type
==
"frontend"
:
elif
args
.
worker_type
==
"frontend"
:
setup_frontend_worker
(
args
.
worker_idx
,
args
.
master_ip
)
setup_frontend_worker
(
args
.
worker_idx
,
args
.
master_ip
,
args
.
run_in_ci
)
elif
args
.
worker_type
==
"prefill"
:
elif
args
.
worker_type
==
"prefill"
:
setup_prefill_worker
(
setup_prefill_worker
(
args
.
worker_idx
,
args
.
worker_idx
,
...
@@ -431,6 +550,9 @@ def main(input_args: list[str] | None = None):
...
@@ -431,6 +550,9 @@ def main(input_args: list[str] | None = None):
args
.
gpu_type
,
args
.
gpu_type
,
args
.
multiple_frontends_enabled
,
args
.
multiple_frontends_enabled
,
args
.
use_init_locations
,
args
.
use_init_locations
,
args
.
dump_config_path
,
args
.
script_variant
,
args
.
run_in_ci
,
)
)
elif
args
.
worker_type
==
"decode"
:
elif
args
.
worker_type
==
"decode"
:
setup_decode_worker
(
setup_decode_worker
(
...
@@ -442,6 +564,23 @@ def main(input_args: list[str] | None = None):
...
@@ -442,6 +564,23 @@ def main(input_args: list[str] | None = None):
args
.
gpus_per_node
,
args
.
gpus_per_node
,
args
.
gpu_type
,
args
.
gpu_type
,
args
.
use_init_locations
,
args
.
use_init_locations
,
args
.
dump_config_path
,
args
.
script_variant
,
args
.
run_in_ci
,
)
elif
args
.
worker_type
==
"aggregated"
:
setup_aggregated_worker
(
args
.
worker_idx
,
args
.
local_rank
,
args
.
leader_ip
,
args
.
master_ip
,
args
.
nodes_per_worker
,
args
.
gpus_per_node
,
args
.
gpu_type
,
args
.
multiple_frontends_enabled
,
args
.
dump_config_path
,
args
.
script_variant
,
args
.
run_in_ci
,
)
)
logging
.
info
(
f
"
{
args
.
worker_type
.
capitalize
()
}
worker setup complete"
)
logging
.
info
(
f
"
{
args
.
worker_type
.
capitalize
()
}
worker setup complete"
)
...
...
examples/backends/sglang/slurm_jobs/submit_disagg.sh
View file @
f8219b12
...
@@ -62,6 +62,7 @@ ISL=$6
...
@@ -62,6 +62,7 @@ ISL=$6
OSL
=
$7
OSL
=
$7
CONCURRENCIES
=
$8
CONCURRENCIES
=
$8
REQUEST_RATE
=
$9
REQUEST_RATE
=
$9
SCRIPT_VARIANT
=
${
10
}
RETRIES
=
1
# defaults to retry the job 1 time to avoid transient errors
RETRIES
=
1
# defaults to retry the job 1 time to avoid transient errors
...
@@ -74,10 +75,14 @@ if [[ $PREFILL_NODES -eq 6 ]] && [[ $PREFILL_WORKERS -eq 3 ]] && [[ $DECODE_NODE
...
@@ -74,10 +75,14 @@ if [[ $PREFILL_NODES -eq 6 ]] && [[ $PREFILL_WORKERS -eq 3 ]] && [[ $DECODE_NODE
USE_INIT_LOCATIONS
=(
--use-init-location
)
USE_INIT_LOCATIONS
=(
--use-init-location
)
fi
fi
SCRIPT_VARIANT_ARGS
=()
if
[[
-n
"
$SCRIPT_VARIANT
"
]]
;
then
SCRIPT_VARIANT_ARGS
=(
--script-variant
"
$SCRIPT_VARIANT
"
)
fi
command
=(
command
=(
python3 submit_job_script.py
python3 submit_job_script.py
--account
$SLURM_ACCOUNT
--partition
$SLURM_PARTITION
--time-limit
$TIME_LIMIT
--account
$SLURM_ACCOUNT
--partition
$SLURM_PARTITION
--time-limit
$TIME_LIMIT
--template
job_script_template.j2
--model-dir
$MODEL_PATH
--config-dir
$CONFIG_DIR
--model-dir
$MODEL_PATH
--config-dir
$CONFIG_DIR
--container-image
$CONTAINER_IMAGE
--container-image
$CONTAINER_IMAGE
...
@@ -90,6 +95,9 @@ command=(
...
@@ -90,6 +95,9 @@ command=(
--profiler
"
${
profiler_args
}
"
--profiler
"
${
profiler_args
}
"
--retries
$RETRIES
--retries
$RETRIES
--run-in-ci
${
SCRIPT_VARIANT_ARGS
[@]
}
)
)
"
${
command
[@]
}
"
"
${
command
[@]
}
"
examples/backends/sglang/slurm_jobs/submit_job_script.py
View file @
f8219b12
...
@@ -19,24 +19,27 @@ Script to generate SLURM job scripts from Jinja2 templates.
...
@@ -19,24 +19,27 @@ Script to generate SLURM job scripts from Jinja2 templates.
import
argparse
import
argparse
import
logging
import
logging
import
os
import
pathlib
import
subprocess
import
subprocess
import
tempfile
import
tempfile
from
datetime
import
datetime
from
jinja2
import
Template
from
jinja2
import
Template
def
print_welcome_message
(
job_ids
:
list
[
str
]):
def
print_welcome_message
(
job_ids
:
list
[
str
]
,
log_dir_name
:
str
):
"""Print a clean welcome message with job information."""
"""Print a clean welcome message with job information."""
job_id
=
f
"
<
{
', '
.
join
(
job_ids
)
}
>
"
_
=
f
"
{
', '
.
join
(
job_ids
)
}
"
print
(
print
(
f
"""
f
"""
🚀 Welcome! We hope you enjoy your time on our GB200 NVL72.
🚀 Welcome! We hope you enjoy your time on our GB200 NVL72.
Your logs for this submitted job will be available in logs/
{
job_id
}
Your logs for this submitted job will be available in logs/
{
log_dir_name
}
You can access them by running:
You can access them by running:
cd logs/
{
job_id
}
cd logs/
{
log_dir_name
}
You can view all of the prefill/decode worker logs by running:
You can view all of the prefill/decode worker logs by running:
...
@@ -71,7 +74,7 @@ def generate_job_script(template_path, output_path, **kwargs):
...
@@ -71,7 +74,7 @@ def generate_job_script(template_path, output_path, **kwargs):
with
open
(
output_path
,
"w"
)
as
f
:
with
open
(
output_path
,
"w"
)
as
f
:
f
.
write
(
rendered_script
)
f
.
write
(
rendered_script
)
return
output_path
return
output_path
,
rendered_script
def
submit_job
(
job_script_path
,
extra_slurm_args
=
[]):
def
submit_job
(
job_script_path
,
extra_slurm_args
=
[]):
...
@@ -105,13 +108,36 @@ def submit_job(job_script_path, extra_slurm_args=[]):
...
@@ -105,13 +108,36 @@ def submit_job(job_script_path, extra_slurm_args=[]):
raise
raise
def
_get_available_gpu_types
()
->
list
[
str
]:
"""Discover available GPU types by scanning scripts directory structure.
Looks for scripts in: scripts/{gpu_type}/{agg,disagg}/*.sh
"""
script_dir
=
pathlib
.
Path
(
__file__
).
parent
/
"scripts"
gpu_types
=
set
()
# Scan for GPU type directories (directories that contain agg/ or disagg/)
for
gpu_dir
in
script_dir
.
iterdir
():
if
not
gpu_dir
.
is_dir
():
continue
# Check if this directory has agg/ or disagg/ subdirectories
has_agg
=
(
gpu_dir
/
"agg"
).
is_dir
()
has_disagg
=
(
gpu_dir
/
"disagg"
).
is_dir
()
if
has_agg
or
has_disagg
:
gpu_types
.
add
(
gpu_dir
.
name
)
return
sorted
(
list
(
gpu_types
))
def
_parse_command_line_args
(
args
:
list
[
str
]
|
None
=
None
)
->
argparse
.
Namespace
:
def
_parse_command_line_args
(
args
:
list
[
str
]
|
None
=
None
)
->
argparse
.
Namespace
:
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
description
=
"Generate and submit SLURM job scripts"
description
=
"Generate and submit SLURM job scripts"
)
)
parser
.
add_argument
(
"--template"
,
required
=
True
,
help
=
"Path to Jinja2 template file"
# Get available GPU types dynamically
)
available_gpu_types
=
_get_available_gpu_types
(
)
# Template parameters
# Template parameters
parser
.
add_argument
(
"--job-name"
,
default
=
"dynamo_setup"
,
help
=
"SLURM job name"
)
parser
.
add_argument
(
"--job-name"
,
default
=
"dynamo_setup"
,
help
=
"SLURM job name"
)
...
@@ -123,16 +149,22 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
...
@@ -123,16 +149,22 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
"--time-limit"
,
default
=
"04:00:00"
,
help
=
"Time limit (HH:MM:SS)"
"--time-limit"
,
default
=
"04:00:00"
,
help
=
"Time limit (HH:MM:SS)"
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--prefill-nodes"
,
type
=
int
,
default
=
2
,
help
=
"Number of prefill nodes"
"--prefill-nodes"
,
type
=
int
,
default
=
None
,
help
=
"Number of prefill nodes"
)
parser
.
add_argument
(
"--decode-nodes"
,
type
=
int
,
default
=
None
,
help
=
"Number of decode nodes"
)
parser
.
add_argument
(
"--prefill-workers"
,
type
=
int
,
default
=
None
,
help
=
"Number of prefill workers"
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--decode-
node
s"
,
type
=
int
,
default
=
2
,
help
=
"Number of decode
node
s"
"--decode-
worker
s"
,
type
=
int
,
default
=
None
,
help
=
"Number of decode
worker
s"
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--
prefill-worker
s"
,
type
=
int
,
default
=
1
,
help
=
"Number of
prefill
workers"
"--
agg-node
s"
,
type
=
int
,
default
=
None
,
help
=
"Number of
aggregated
worker
node
s"
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--
decode
-workers"
,
type
=
int
,
default
=
1
,
help
=
"Number of
decode
workers"
"--
agg
-workers"
,
type
=
int
,
default
=
None
,
help
=
"Number of
aggregated
workers"
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--gpus-per-node"
,
type
=
int
,
default
=
8
,
help
=
"Number of GPUs per node"
"--gpus-per-node"
,
type
=
int
,
default
=
8
,
help
=
"Number of GPUs per node"
...
@@ -142,9 +174,15 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
...
@@ -142,9 +174,15 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--gpu-type"
,
"--gpu-type"
,
choices
=
[
"gb200-fp8"
,
"gb200-fp4"
],
choices
=
available_gpu_types
,
default
=
"gb200-fp8"
,
default
=
available_gpu_types
[
0
]
if
available_gpu_types
else
None
,
help
=
"GPU type to use. You can choose between gb200-fp8 and gb200-fp4."
,
help
=
f
"GPU type to use. Available types:
{
', '
.
join
(
available_gpu_types
)
}
"
,
)
parser
.
add_argument
(
"--script-variant"
,
type
=
str
,
default
=
"default"
,
help
=
"Script variant to use (e.g., 'default', 'optim', 'decode-optim'). Defaults to 'default.sh'"
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
...
@@ -191,31 +229,137 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
...
@@ -191,31 +229,137 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
help
=
"Tries to launch the job multiple times to catch transient errors"
,
help
=
"Tries to launch the job multiple times to catch transient errors"
,
)
)
parser
.
add_argument
(
"--disable-config-dump"
,
action
=
"store_false"
,
dest
=
"enable_config_dump"
,
default
=
True
,
help
=
"Disable dumping config to file on each node (default: config dump is enabled)"
,
)
parser
.
add_argument
(
"--run-in-ci"
,
action
=
"store_true"
,
help
=
"Run in CI mode - use binaries from /configs/ for nats/etcd and install dynamo wheel"
,
)
return
parser
.
parse_args
(
args
)
return
parser
.
parse_args
(
args
)
def
main
(
input_args
:
list
[
str
]
|
None
=
None
):
def
_validate_args
(
args
:
argparse
.
Namespace
)
->
None
:
setup_logging
()
"""Validate arguments and ensure aggregated and disaggregated args are mutually exclusive."""
args
=
_parse_command_line_args
(
input_args
)
has_disagg_args
=
any
(
[
args
.
prefill_nodes
is
not
None
,
args
.
decode_nodes
is
not
None
,
args
.
prefill_workers
is
not
None
,
args
.
decode_workers
is
not
None
,
]
)
has_agg_args
=
any
(
[
args
.
agg_nodes
is
not
None
,
args
.
agg_workers
is
not
None
,
]
)
# Validation
if
has_disagg_args
and
has_agg_args
:
if
args
.
prefill_nodes
%
args
.
prefill_workers
!=
0
:
raise
ValueError
(
raise
ValueError
(
f
"Prefill nodes (
{
args
.
prefill_nodes
}
) must be divisible by prefill workers (
{
args
.
prefill_workers
}
)"
"Cannot specify both aggregated (--agg-nodes, --agg-workers) and "
"disaggregated (--prefill-nodes, --decode-nodes, --prefill-workers, --decode-workers) arguments"
)
)
if
args
.
decode_nodes
%
args
.
decode_workers
!=
0
:
if
has_disagg_args
:
# Validate disaggregated args
if
args
.
prefill_nodes
is
None
or
args
.
decode_nodes
is
None
:
raise
ValueError
(
"Disaggregated mode requires both --prefill-nodes and --decode-nodes"
)
if
args
.
prefill_workers
is
None
or
args
.
decode_workers
is
None
:
raise
ValueError
(
"Disaggregated mode requires both --prefill-workers and --decode-workers"
)
if
args
.
prefill_nodes
%
args
.
prefill_workers
!=
0
:
raise
ValueError
(
f
"Prefill nodes (
{
args
.
prefill_nodes
}
) must be divisible by prefill workers (
{
args
.
prefill_workers
}
)"
)
if
args
.
decode_nodes
%
args
.
decode_workers
!=
0
:
raise
ValueError
(
f
"Decode nodes (
{
args
.
decode_nodes
}
) must be divisible by decode workers (
{
args
.
decode_workers
}
)"
)
# Validate GPU script exists for disaggregated mode
script_dir
=
pathlib
.
Path
(
__file__
).
parent
/
"scripts"
disagg_dir
=
script_dir
/
args
.
gpu_type
/
"disagg"
# Use script variant (defaults to "default")
script_name
=
f
"
{
args
.
script_variant
}
.sh"
gpu_script
=
disagg_dir
/
script_name
if
not
gpu_script
.
exists
():
raise
ValueError
(
f
"Disaggregated GPU script not found:
{
gpu_script
}
. Available GPU types:
{
', '
.
join
(
_get_available_gpu_types
())
}
"
)
if
has_agg_args
:
# Validate aggregated args
if
args
.
agg_nodes
is
None
or
args
.
agg_workers
is
None
:
raise
ValueError
(
"Aggregated mode requires both --agg-nodes and --agg-workers"
)
if
args
.
agg_nodes
%
args
.
agg_workers
!=
0
:
raise
ValueError
(
f
"Aggregated nodes (
{
args
.
agg_nodes
}
) must be divisible by aggregated workers (
{
args
.
agg_workers
}
)"
)
# Validate aggregated GPU script exists
script_dir
=
pathlib
.
Path
(
__file__
).
parent
/
"scripts"
# Remove any -prefill or -decode suffix if present
base_gpu_type
=
args
.
gpu_type
.
replace
(
"-prefill"
,
""
).
replace
(
"-decode"
,
""
)
agg_dir
=
script_dir
/
base_gpu_type
/
"agg"
# Use script variant (defaults to "default")
script_name
=
f
"
{
args
.
script_variant
}
.sh"
agg_gpu_script
=
agg_dir
/
script_name
if
not
agg_gpu_script
.
exists
():
raise
ValueError
(
f
"Aggregated GPU script not found:
{
agg_gpu_script
}
. Available GPU types:
{
', '
.
join
(
_get_available_gpu_types
())
}
"
)
if
not
has_disagg_args
and
not
has_agg_args
:
raise
ValueError
(
raise
ValueError
(
f
"Decode nodes (
{
args
.
decode_nodes
}
) must be divisible by decode workers (
{
args
.
decode_workers
}
)"
"Must specify either aggregated (--agg-nodes, --agg-workers) or "
"disaggregated (--prefill-nodes, --decode-nodes, --prefill-workers, --decode-workers) arguments"
)
)
def
main
(
input_args
:
list
[
str
]
|
None
=
None
):
setup_logging
()
args
=
_parse_command_line_args
(
input_args
)
# Validate arguments
_validate_args
(
args
)
# Determine mode and set defaults
is_aggregated
=
args
.
agg_nodes
is
not
None
if
is_aggregated
:
agg_nodes
=
args
.
agg_nodes
agg_workers
=
args
.
agg_workers
prefill_nodes
=
0
decode_nodes
=
0
prefill_workers
=
0
decode_workers
=
0
total_nodes
=
agg_nodes
else
:
prefill_nodes
=
args
.
prefill_nodes
decode_nodes
=
args
.
decode_nodes
prefill_workers
=
args
.
prefill_workers
decode_workers
=
args
.
decode_workers
agg_nodes
=
0
agg_workers
=
0
total_nodes
=
prefill_nodes
+
decode_nodes
# Validation for multiple frontends
# Validation for multiple frontends
if
args
.
enable_multiple_frontends
:
if
args
.
enable_multiple_frontends
:
if
args
.
num_additional_frontends
<
0
:
if
args
.
num_additional_frontends
<
0
:
raise
ValueError
(
"Number of additional frontends cannot be negative"
)
raise
ValueError
(
"Number of additional frontends cannot be negative"
)
total_nodes
=
args
.
prefill_nodes
+
args
.
decode_nodes
# parse profiler configs
# parse profiler configs
profiler_config
=
{}
profiler_config
=
{}
if
args
.
profiler
:
if
args
.
profiler
:
...
@@ -248,21 +392,34 @@ def main(input_args: list[str] | None = None):
...
@@ -248,21 +392,34 @@ def main(input_args: list[str] | None = None):
else
:
else
:
assert
False
,
profiler_config
[
"type"
]
assert
False
,
profiler_config
[
"type"
]
# Generate timestamp for log directory naming
timestamp
=
datetime
.
now
().
strftime
(
"%Y%m%d_%H%M%S"
)
# Select template based on mode
if
is_aggregated
:
template_path
=
"job_script_template_agg.j2"
else
:
template_path
=
"job_script_template_disagg.j2"
template_vars
=
{
template_vars
=
{
"job_name"
:
args
.
job_name
,
"job_name"
:
args
.
job_name
,
"total_nodes"
:
total_nodes
,
"total_nodes"
:
total_nodes
,
"account"
:
args
.
account
,
"account"
:
args
.
account
,
"time_limit"
:
args
.
time_limit
,
"time_limit"
:
args
.
time_limit
,
"prefill_nodes"
:
args
.
prefill_nodes
,
"prefill_nodes"
:
prefill_nodes
,
"decode_nodes"
:
args
.
decode_nodes
,
"decode_nodes"
:
decode_nodes
,
"prefill_workers"
:
args
.
prefill_workers
,
"prefill_workers"
:
prefill_workers
,
"decode_workers"
:
args
.
decode_workers
,
"decode_workers"
:
decode_workers
,
"agg_nodes"
:
agg_nodes
,
"agg_workers"
:
agg_workers
,
"is_aggregated"
:
is_aggregated
,
"model_dir"
:
args
.
model_dir
,
"model_dir"
:
args
.
model_dir
,
"config_dir"
:
args
.
config_dir
,
"config_dir"
:
args
.
config_dir
,
"container_image"
:
args
.
container_image
,
"container_image"
:
args
.
container_image
,
"gpus_per_node"
:
args
.
gpus_per_node
,
"gpus_per_node"
:
args
.
gpus_per_node
,
"network_interface"
:
args
.
network_interface
,
"network_interface"
:
args
.
network_interface
,
"gpu_type"
:
args
.
gpu_type
,
"gpu_type"
:
args
.
gpu_type
,
"script_variant"
:
args
.
script_variant
,
"partition"
:
args
.
partition
,
"partition"
:
args
.
partition
,
"enable_multiple_frontends"
:
args
.
enable_multiple_frontends
,
"enable_multiple_frontends"
:
args
.
enable_multiple_frontends
,
"num_additional_frontends"
:
args
.
num_additional_frontends
,
"num_additional_frontends"
:
args
.
num_additional_frontends
,
...
@@ -270,27 +427,80 @@ def main(input_args: list[str] | None = None):
...
@@ -270,27 +427,80 @@ def main(input_args: list[str] | None = None):
"do_profile"
:
profiler_config
[
"type"
]
!=
"manual"
,
"do_profile"
:
profiler_config
[
"type"
]
!=
"manual"
,
"profiler_type"
:
profiler_config
[
"type"
],
"profiler_type"
:
profiler_config
[
"type"
],
"profiler_arg"
:
parsable_config
,
"profiler_arg"
:
parsable_config
,
"timestamp"
:
timestamp
,
"enable_config_dump"
:
args
.
enable_config_dump
,
"run_in_ci"
:
args
.
run_in_ci
,
}
}
with
tempfile
.
NamedTemporaryFile
(
mode
=
"w"
,
suffix
=
".sh"
)
as
temp_file
:
# Create temporary file for sbatch script
generate_job_script
(
args
.
template
,
temp_file
.
name
,
**
template_vars
)
temp_file
=
tempfile
.
NamedTemporaryFile
(
mode
=
"w"
,
suffix
=
".sh"
,
delete
=
False
)
temp_path
=
temp_file
.
name
temp_file
.
close
()
try
:
_
,
rendered_script
=
generate_job_script
(
template_path
,
temp_path
,
**
template_vars
)
submitted_job_ids
=
[]
submitted_job_ids
=
[]
job_id
=
submit_job
(
temp_
file
.
name
,
args
.
extra_slurm_args
)
job_id
=
submit_job
(
temp_
path
,
args
.
extra_slurm_args
)
submitted_job_ids
.
append
(
job_id
)
submitted_job_ids
.
append
(
job_id
)
# Create log directory with new naming format IMMEDIATELY after submission
# SLURM will write log.out/log.err to this directory when job starts
if
is_aggregated
:
log_dir_name
=
f
"
{
job_id
}
_
{
agg_workers
}
A_
{
timestamp
}
"
else
:
log_dir_name
=
f
"
{
job_id
}
_
{
prefill_workers
}
P_
{
decode_workers
}
D_
{
timestamp
}
"
log_dir_path
=
os
.
path
.
join
(
"logs"
,
log_dir_name
)
os
.
makedirs
(
log_dir_path
,
exist_ok
=
True
)
# Save rendered sbatch script
sbatch_script_path
=
os
.
path
.
join
(
log_dir_path
,
"sbatch_script.sh"
)
with
open
(
sbatch_script_path
,
"w"
)
as
f
:
f
.
write
(
rendered_script
)
logging
.
info
(
f
"Saved rendered sbatch script to
{
sbatch_script_path
}
"
)
# retries logic
# retries logic
extra_slurm_args_without_dependencies
=
[
if
args
.
retries
>
0
:
x
for
x
in
args
.
extra_slurm_args
if
"dependency"
not
in
x
extra_slurm_args_without_dependencies
=
[
]
x
for
x
in
args
.
extra_slurm_args
if
"dependency"
not
in
x
for
_
in
range
(
args
.
retries
):
dependencies
=
","
.
join
([
f
"afternotok:
{
job
}
"
for
job
in
submitted_job_ids
])
slurm_args
=
extra_slurm_args_without_dependencies
+
[
f
"dependency=
{
dependencies
}
"
]
]
job_id
=
submit_job
(
temp_file
.
name
,
slurm_args
)
for
_
in
range
(
args
.
retries
):
submitted_job_ids
.
append
(
job_id
)
dependencies
=
","
.
join
(
[
f
"afternotok:
{
job
}
"
for
job
in
submitted_job_ids
]
print_welcome_message
(
submitted_job_ids
)
)
slurm_args
=
extra_slurm_args_without_dependencies
+
[
f
"dependency=
{
dependencies
}
"
]
job_id
=
submit_job
(
temp_path
,
slurm_args
)
submitted_job_ids
.
append
(
job_id
)
# Save script for retry job as well
if
is_aggregated
:
retry_log_dir_name
=
f
"
{
job_id
}
_
{
agg_workers
}
A_
{
timestamp
}
"
else
:
retry_log_dir_name
=
(
f
"
{
job_id
}
_
{
prefill_workers
}
P_
{
decode_workers
}
D_
{
timestamp
}
"
)
retry_log_dir_path
=
os
.
path
.
join
(
"logs"
,
retry_log_dir_name
)
os
.
makedirs
(
retry_log_dir_path
,
exist_ok
=
True
)
retry_sbatch_script_path
=
os
.
path
.
join
(
retry_log_dir_path
,
"sbatch_script.sh"
)
with
open
(
retry_sbatch_script_path
,
"w"
)
as
f
:
f
.
write
(
rendered_script
)
logging
.
info
(
f
"Saved rendered sbatch script to
{
retry_sbatch_script_path
}
"
)
print_welcome_message
(
submitted_job_ids
,
log_dir_name
)
finally
:
# Clean up temporary file
try
:
os
.
unlink
(
temp_path
)
except
OSError
:
pass
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment