Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
80dfb82c
Unverified
Commit
80dfb82c
authored
Dec 08, 2025
by
ishandhanani
Committed by
GitHub
Dec 08, 2025
Browse files
feat: slurm jobs added fp4 and 8k1k (#4747)
parent
3fea2e10
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
859 additions
and
203 deletions
+859
-203
examples/backends/sglang/launch/disagg.sh
examples/backends/sglang/launch/disagg.sh
+14
-9
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-low-latency.sh
...g/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-low-latency.sh
+179
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-max-tpt.sh
...glang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-max-tpt.sh
+200
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-middle-curve.sh
.../slurm_jobs/scripts/gb200-fp4/disagg/1k1k-middle-curve.sh
+74
-88
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
...ckends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
+0
-94
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1k1k-low-latency.sh
...g/slurm_jobs/scripts/gb200-fp8/disagg/1k1k-low-latency.sh
+4
-4
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1k1k-max-tpt.sh
...glang/slurm_jobs/scripts/gb200-fp8/disagg/1k1k-max-tpt.sh
+4
-4
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-low-latency.sh
...g/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-low-latency.sh
+184
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-max-tpt.sh
...glang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-max-tpt.sh
+194
-0
examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
+1
-1
examples/backends/sglang/slurm_jobs/submit_disagg.sh
examples/backends/sglang/slurm_jobs/submit_disagg.sh
+5
-3
No files found.
examples/backends/sglang/launch/disagg.sh
View file @
80dfb82c
...
...
@@ -49,31 +49,36 @@ OTEL_SERVICE_NAME=dynamo-frontend \
python3
-m
dynamo.frontend &
DYNAMO_PID
=
$!
#AssertionError: Prefill round robin balance is required when dp size > 1. Please make sure that the prefill instance is launched with `--load-balance-method round_robin` and `--prefill-round-robin-balance` is set for decode server.
# run prefill worker
OTEL_SERVICE_NAME
=
dynamo-worker-prefill
DYN_SYSTEM_PORT
=
${
DYN_SYSTEM_PORT_PREFILL
:-
8081
}
\
python3
-m
dynamo.sglang
\
--model-path
Qwen/Qwen3-0.6B
\
--served-model-name
Qwen/Qwen3-0.6B
\
--model-path
silence09/DeepSeek-R1-Small-2layers
\
--served-model-name
silence09/DeepSeek-R1-Small-2layers
\
--page-size
16
\
--tp
1
\
--tp
2
--dp-size
2
--enable-dp-attention
\
--load-balance-method
round_robin
\
--trust-remote-code
\
--disaggregation-mode
prefill
\
--disaggregation-bootstrap-port
12345
\
--host
0.0.0.0
\
--port
40000
\
--disaggregation-transfer-backend
nixl
\
--enable-metrics
&
--enable-metrics
--log-level
debug
&
PREFILL_PID
=
$!
# run decode worker
OTEL_SERVICE_NAME
=
dynamo-worker-decode
DYN_SYSTEM_PORT
=
${
DYN_SYSTEM_PORT_DECODE
:-
8082
}
\
CUDA_VISIBLE_DEVICES
=
1
python3
-m
dynamo.sglang
\
--model-path
Qwen/Qwen3-0.6B
\
--served-model-name
Qwen/Qwen3-0.6B
\
CUDA_VISIBLE_DEVICES
=
2,3
python3
-m
dynamo.sglang
\
--model-path
silence09/DeepSeek-R1-Small-2layers
\
--served-model-name
silence09/DeepSeek-R1-Small-2layers
\
--page-size
16
\
--tp
1
\
--prefill-round-robin-balance
\
--tp
2
--dp-size
2
--enable-dp-attention
\
--trust-remote-code
\
--disaggregation-mode
decode
\
--disaggregation-bootstrap-port
12345
\
--host
0.0.0.0
\
--disaggregation-transfer-backend
nixl
\
--enable-metrics
--enable-metrics
--log-level
debug
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-low-latency.sh
0 → 100755
View file @
80dfb82c
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#######################################
# Print the command-line usage for this script and terminate.
# The text is written to stderr so it is not mixed into captured stdout.
# Arguments: none
# Outputs:   usage text on stderr
# Returns:   never returns; exits the shell with status 1
#######################################
print_usage() {
    {
        echo "Usage: $0 <mode>"
        echo " mode: prefill or decode"
        echo ""
        echo "Examples:"
        echo " $0 prefill"
        echo " $0 decode"
    } >&2
    exit 1
}
# Check if correct number of arguments provided
if [[ $# -ne 1 ]]; then
    echo "Error: Expected 1 argument, got $#" >&2
    print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument
if [[ "$mode" != "prefill" && "$mode" != "decode" ]]; then
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
fi

echo "Mode: $mode"
echo "Command: dynamo"

# Check if required environment variables are set.
# The names are checked in a fixed order and the script stops at the first
# one that is unset or empty; ${!required_var} reads the variable by name.
for required_var in \
    HOST_IP_MACHINE \
    PORT \
    TOTAL_GPUS \
    RANK \
    TOTAL_NODES \
    USE_INIT_LOCATIONS \
    RUN_IN_CI; do
    if [[ -z "${!required_var}" ]]; then
        echo "Error: ${required_var} environment variable is not set" >&2
        exit 1
    fi
done
# Construct command based on mode

#######################################
# Pre-launch steps shared by the prefill and decode branches: turn on
# command tracing, optionally install the Dynamo wheels, export the torch
# distributed timeout and build the optional --dump-config-to arguments.
# Globals:
#   RUN_IN_CI                          (read)
#   DUMP_CONFIG_PATH                   (read)
#   TORCH_DISTRIBUTED_DEFAULT_TIMEOUT  (exported)
#   dump_config_args                   (written, array)
# Arguments: none
#######################################
prepare_launch() {
    set -x

    # Lower-cased comparison, so "true", "True" and "TRUE" all match.
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi

    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

    # An array (not a concatenated string) so that a DUMP_CONFIG_PATH with
    # whitespace or glob characters is passed to python3 as one argument.
    dump_config_args=()
    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
        dump_config_args=(--dump-config-to "${DUMP_CONFIG_PATH}")
    fi
}

if [ "$mode" = "prefill" ]; then
    prepare_launch

    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_ENABLE_JIT_DEEPGEMM=false \
    SGLANG_ENABLE_FLASHINFER_GEMM=true \
    python3 -m dynamo.sglang \
        --disaggregation-mode prefill \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --trust-remote-code \
        --disable-radix-cache \
        --kv-cache-dtype fp8_e4m3 \
        --attention-backend trtllm_mla \
        --quantization modelopt_fp4 \
        --moe-runner-backend flashinfer_trtllm \
        --stream-interval 10 \
        --watchdog-timeout 1000000 \
        --context-length 2200 \
        --mem-fraction-static 0.95 \
        --max-total-tokens 8192 \
        --chunked-prefill-size 8192 \
        --cuda-graph-max-bs 256 \
        --max-running-requests 512 \
        --scheduler-recv-interval 10 \
        --enable-symm-mem \
        --moe-dense-tp-size 1 \
        --load-balance-method round_robin \
        --disaggregation-bootstrap-port 30001 \
        --data-parallel-size 1 \
        --tensor-parallel-size "$TOTAL_GPUS" \
        --expert-parallel-size 1 \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "${dump_config_args[@]}"

elif [ "$mode" = "decode" ]; then
    prepare_launch

    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_ENABLE_JIT_DEEPGEMM=false \
    SGLANG_ENABLE_FLASHINFER_GEMM=true \
    python3 -m dynamo.sglang \
        --disaggregation-mode decode \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --prefill-round-robin-balance \
        --trust-remote-code \
        --disable-radix-cache \
        --kv-cache-dtype fp8_e4m3 \
        --attention-backend trtllm_mla \
        --quantization modelopt_fp4 \
        --moe-runner-backend flashinfer_trtllm \
        --disaggregation-bootstrap-port 30001 \
        --stream-interval 10 \
        --watchdog-timeout 1000000 \
        --context-length 2200 \
        --mem-fraction-static 0.95 \
        --chunked-prefill-size 8192 \
        --cuda-graph-max-bs 256 \
        --scheduler-recv-interval 10 \
        --enable-symm-mem \
        --moe-dense-tp-size 1 \
        --tensor-parallel-size "$TOTAL_GPUS" \
        --expert-parallel-size 1 \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "${dump_config_args[@]}"
fi
\ No newline at end of file
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-max-tpt.sh
0 → 100755
View file @
80dfb82c
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#######################################
# Print the command-line usage for this script and terminate.
# The text is written to stderr so it is not mixed into captured stdout.
# Arguments: none
# Outputs:   usage text on stderr
# Returns:   never returns; exits the shell with status 1
#######################################
print_usage() {
    {
        echo "Usage: $0 <mode>"
        echo " mode: prefill or decode"
        echo ""
        echo "Examples:"
        echo " $0 prefill"
        echo " $0 decode"
    } >&2
    exit 1
}
# Check if correct number of arguments provided
if [[ $# -ne 1 ]]; then
    echo "Error: Expected 1 argument, got $#" >&2
    print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument
if [[ "$mode" != "prefill" && "$mode" != "decode" ]]; then
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
fi

echo "Mode: $mode"
echo "Command: dynamo"

# Check if required environment variables are set.
# The names are checked in a fixed order and the script stops at the first
# one that is unset or empty; ${!required_var} reads the variable by name.
for required_var in \
    HOST_IP_MACHINE \
    PORT \
    TOTAL_GPUS \
    RANK \
    TOTAL_NODES \
    USE_INIT_LOCATIONS \
    RUN_IN_CI; do
    if [[ -z "${!required_var}" ]]; then
        echo "Error: ${required_var} environment variable is not set" >&2
        exit 1
    fi
done
# Construct command based on mode

#######################################
# Pre-launch steps shared by the prefill and decode branches: turn on
# command tracing, optionally install the Dynamo wheels, export the torch
# distributed timeout and build the optional --dump-config-to arguments.
# Globals:
#   RUN_IN_CI                          (read)
#   DUMP_CONFIG_PATH                   (read)
#   TORCH_DISTRIBUTED_DEFAULT_TIMEOUT  (exported)
#   dump_config_args                   (written, array)
# Arguments: none
#######################################
prepare_launch() {
    set -x

    # Lower-cased comparison, so "true", "True" and "TRUE" all match.
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi

    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

    # An array (not a concatenated string) so that a DUMP_CONFIG_PATH with
    # whitespace or glob characters is passed to python3 as one argument.
    dump_config_args=()
    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
        dump_config_args=(--dump-config-to "${DUMP_CONFIG_PATH}")
    fi
}

if [ "$mode" = "prefill" ]; then
    prepare_launch

    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --trust-remote-code \
        --kv-cache-dtype fp8_e4m3 \
        --attention-backend trtllm_mla \
        --quantization modelopt_fp4 \
        --moe-runner-backend flashinfer_cutlass \
        --disable-radix-cache \
        --disable-chunked-prefix-cache \
        --stream-interval 50 \
        --decode-log-interval 1000 \
        --watchdog-timeout 1000000 \
        --context-length 2176 \
        --disable-shared-experts-fusion \
        --eplb-algorithm deepseek \
        --disaggregation-bootstrap-port 30001 \
        --disaggregation-mode prefill \
        --mem-fraction-static 0.84 \
        --max-total-tokens 131072 \
        --max-prefill-tokens 32768 \
        --chunked-prefill-size 65536 \
        --enable-single-batch-overlap \
        --max-running-requests 30000 \
        --load-balance-method round_robin \
        --disable-cuda-graph \
        --enable-dp-attention \
        --tp-size "$TOTAL_GPUS" \
        --dp-size "$TOTAL_GPUS" \
        --ep-size "$TOTAL_GPUS" \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "${dump_config_args[@]}"

elif [ "$mode" = "decode" ]; then
    prepare_launch

    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 \
    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
    SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --trust-remote-code \
        --kv-cache-dtype fp8_e4m3 \
        --attention-backend trtllm_mla \
        --quantization modelopt_fp4 \
        --moe-runner-backend flashinfer_cutedsl \
        --disable-radix-cache \
        --disable-chunked-prefix-cache \
        --stream-interval 50 \
        --decode-log-interval 1000 \
        --watchdog-timeout 1000000 \
        --context-length 2176 \
        --disable-shared-experts-fusion \
        --eplb-algorithm deepseek \
        --disaggregation-bootstrap-port 30001 \
        --disaggregation-mode decode \
        --mem-fraction-static 0.83 \
        --max-total-tokens 3122380 \
        --chunked-prefill-size 786432 \
        --max-running-requests 67584 \
        --moe-a2a-backend deepep \
        --deepep-mode low_latency \
        --ep-dispatch-algorithm static \
        --ep-num-redundant-experts 32 \
        --cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768 1024 \
        --num-reserved-decode-tokens 112 \
        --moe-dense-tp-size 1 \
        --enable-dp-lm-head \
        --prefill-round-robin-balance \
        --enable-dp-attention \
        --tp-size "$TOTAL_GPUS" \
        --dp-size "$TOTAL_GPUS" \
        --ep-size "$TOTAL_GPUS" \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "${dump_config_args[@]}"
fi
\ No newline at end of file
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/
default
.sh
→
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/
1k1k-middle-curve
.sh
View file @
80dfb82c
...
...
@@ -2,8 +2,6 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This comes from https://github.com/sgl-project/sglang/issues/10903 and uses the low-prec decode setup
# Function to print usage
print_usage
()
{
echo
"Usage:
$0
<mode>"
...
...
@@ -64,152 +62,140 @@ if [ -z "$USE_INIT_LOCATIONS" ]; then
exit
1
fi
if
[
-z
"
$RUN_IN_CI
"
]
;
then
echo
"Error: RUN_IN_CI environment variable is not set"
exit
1
fi
# Construct command based on mode
if
[
"
$mode
"
=
"prefill"
]
;
then
set
-x
# no expert locations collected for fp4 yet
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
" "
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
# we have to install pre-release cutedsl for a integer overflow fix
python3
-m
pip
install
--no-cache-dir
--upgrade
--pre
nvidia-cutlass-dsl
# set your own cache variables here
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
export
FLASHINFER_WORKSPACE_BASE
=
"/configs/flashinfer-cache"
PYTHONUNBUFFERED
=
1
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN
=
1
\
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2
=
1
\
SGL_JIT_DEEPGEMM_PRECOMPILE
=
0
\
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
=
100000
\
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
=
100000
\
SGLANG_DISAGGREGATION_WAITING_TIMEOUT
=
100000
\
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
=
1
\
MC_TE_METRIC
=
true
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
MC_FORCE_MNNVL
=
1
\
NCCL_MNNVL_ENABLE
=
1
\
NCCL_CUMEM_ENABLE
=
1
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
=
0
\
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
=
1
\
PYTHONUNBUFFERED
=
1
\
python3
-m
dynamo.sglang
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--model-path
/model/
\
--skip-tokenizer-init
\
--disaggregation-mode
prefill
\
--trust-remote-code
\
--kv-cache-dtype
fp8_e4m3
\
--attention-backend
trtllm_mla
\
--quantization
modelopt_fp4
\
--moe-runner-backend
flashinfer_cutlass
\
--disable-radix-cache
\
--disable-chunked-prefix-cache
\
--stream-interval
50
\
--decode-log-interval
1000
\
--
max-running-requests
5632
\
--
watchdog-timeout
1000000
\
--context-length
2176
\
--disable-radix-cache
\
--disable-shared-experts-fusion
\
--watchdog-timeout
1000000
\
--disable-chunked-prefix-cache
\
--attention-backend
trtllm_mla
\
--kv-cache-dtype
fp8_e4m3
\
--enable-single-batch-overlap
\
--chunked-prefill-size
65536
\
--eplb-algorithm
deepseek
\
--
trust-remote-code
\
--disa
ble-cuda-graph
\
--
disaggregation-bootstrap-port
30001
\
--disa
ggregation-mode
prefill
\
--mem-fraction-static
0.84
\
--max-total-tokens
131072
\
--max-prefill-tokens
16384
\
--max-prefill-tokens
32768
\
--chunked-prefill-size
65536
\
--enable-single-batch-overlap
\
--max-running-requests
30000
\
--load-balance-method
round_robin
\
--quantization
modelopt_fp4
\
--moe-runner-backend
flashinfer_cutlass
\
--disable-cuda-graph
\
--enable-dp-attention
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--ep-size
"
$TOTAL_GPUS
"
\
--dist-init-addr
"
$HOST_IP_MACHINE
:
$PORT
"
\
--disaggregation-bootstrap-port
30001
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--ep-size
"
$TOTAL_GPUS
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--host
0.0.0.0
\
--stream-interval
50
\
--log-level
debug
${
command_suffix
}
# For now we must keep SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK and cuda-graph-bs at 1024 until
# DeepEP merges in https://github.com/deepseek-ai/DeepEP/pull/440
# the nvidia-cutlass-dsl install fixes https://github.com/flashinfer-ai/flashinfer/issues/1830#issuecomment-3380074018
# which was previously limiting us to DISPATCH_TOKENS and cuda-graph-bs == 384
# For now use 12 nodes for fp4 since flashinfer_cutedsl requires experts per gpu < 8
# We have 288 (256 + 32 redundant) => 288/48 = 6
--host
0.0.0.0
${
command_suffix
}
elif
[
"
$mode
"
=
"decode"
]
;
then
set
-x
# no expert locations collected for fp4 yet
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
" "
;
fi
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
# set your own cache variables here
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
export
FLASHINFER_WORKSPACE_BASE
=
"/configs/flashinfer-cache"
# we have to install pre-release cutedsl for a integer overflow fix
python3
-m
pip
install
--no-cache-dir
--upgrade
--pre
nvidia-cutlass-dsl
command_suffix
=
""
if
[[
-n
"
${
DUMP_CONFIG_PATH
}
"
]]
;
then
command_suffix
=
"
${
command_suffix
}
--dump-config-to
${
DUMP_CONFIG_PATH
}
"
;
fi
PYTHONUNBUFFERED
=
1
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN
=
1
\
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2
=
1
\
SGL_JIT_DEEPGEMM_PRECOMPILE
=
0
\
MC_TE_METRIC
=
true
\
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
=
100000
\
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
=
100000
\
SGLANG_DISAGGREGATION_WAITING_TIMEOUT
=
100000
\
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
=
1
\
MC_TE_METRIC
=
true
\
MC_FORCE_MNNVL
=
1
\
NCCL_MNNVL_ENABLE
=
1
\
NCCL_CUMEM_ENABLE
=
1
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
384
\
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
=
0
\
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
=
1
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
1024
\
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH
=
1
\
SGLANG_FP4_GEMM_BACKEND
=
cutlass
\
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
PYTHONUNBUFFERED
=
1
\
SGLANG_FLASHINFER_FP4_GEMM_BACKEND
=
cutlass
\
python3
-m
dynamo.sglang
\
--served-model-name
deepseek-ai/DeepSeek-R1
\
--model-path
/model/
\
--skip-tokenizer-init
\
--trust-remote-code
\
--disaggregation-mode
decode
\
--host
0.0.0.0
\
--decode-log-interval
1
\
--max-running-requests
67584
\
--context-length
2176
\
--kv-cache-dtype
fp8_e4m3
\
--attention-backend
trtllm_mla
\
--quantization
modelopt_fp4
\
--moe-runner-backend
flashinfer_cutedsl
\
--disable-radix-cache
\
--disable-shared-experts-fusion
\
--watchdog-timeout
1000000
\
--disable-chunked-prefix-cache
\
--attention-backend
trtllm_mla
\
--kv-cache-dtype
fp8_e4m3
\
--enable-dp-attention
\
--chunked-prefill-size
786432
\
--stream-interval
50
\
--decode-log-interval
1000
\
--watchdog-timeout
1000000
\
--context-length
2176
\
--disable-shared-experts-fusion
\
--eplb-algorithm
deepseek
\
--disaggregation-bootstrap-port
30001
\
--disaggregation-mode
decode
\
--mem-fraction-static
0.83
\
--max-total-tokens
3122380
\
--chunked-prefill-size
786432
\
--max-running-requests
67584
\
--enable-single-batch-overlap
\
--moe-a2a-backend
deepep
\
--deepep-mode
low_latency
\
--ep-dispatch-algorithm
static
\
--cuda-graph-bs
384
\
--num-reserved-decode-tokens
112
\
--ep-num-redundant-experts
32
\
--eplb-algorithm
deepseek
\
--cuda-graph-bs
1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768 1024
\
--num-reserved-decode-tokens
112
\
--moe-dense-tp-size
1
\
--enable-dp-lm-head
\
--prefill-round-robin-balance
\
--max-total-tokens
3122380
\
--quantization
modelopt_fp4
\
--moe-runner-backend
flashinfer_cutedsl
\
--enable-dp-attention
\
--tp-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--ep-size
"
$TOTAL_GPUS
"
\
--dist-init-addr
"
$HOST_IP_MACHINE
:
$PORT
"
\
--disaggregation-bootstrap-port
30001
\
--nnodes
"
$TOTAL_NODES
"
\
--node-rank
"
$RANK
"
\
--tp-size
"
$TOTAL_GPUS
"
\
--ep-size
"
$TOTAL_GPUS
"
\
--dp-size
"
$TOTAL_GPUS
"
\
--enable-single-batch-overlap
\
--enable-dp-attention
\
--stream-interval
50
\
--mem-fraction-static
0.82
${
command_suffix
}
fi
--host
0.0.0.0
${
command_suffix
}
fi
\ No newline at end of file
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
deleted
100755 → 0
View file @
3fea2e10
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Simple agg script (not an optimized config)
#######################################
# Print the usage text for the aggregated-mode script and terminate.
# The text is written to stderr so it is not mixed into captured stdout.
# Arguments: none
# Outputs:   usage text on stderr
# Returns:   never returns; exits the shell with status 1
#######################################
print_usage() {
    {
        echo "Usage: $0"
        echo ""
        echo "This script runs aggregated mode (single dynamo.sglang instance)"
    } >&2
    exit 1
}
echo "Mode: aggregated"
echo "Command: dynamo"

# Check if required environment variables are set.
# The names are checked in a fixed order and the script stops at the first
# one that is unset or empty; ${!required_var} reads the variable by name.
for required_var in HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES; do
    if [[ -z "${!required_var}" ]]; then
        echo "Error: ${required_var} environment variable is not set" >&2
        exit 1
    fi
done

# Construct command suffix for config dump.
# An array (not a concatenated string) so that a DUMP_CONFIG_PATH with
# whitespace or glob characters is passed to python3 as one argument.
dump_config_args=()
if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    dump_config_args=(--dump-config-to "${DUMP_CONFIG_PATH}")
fi

set -x

export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
export FLASHINFER_WORKSPACE_BASE="/configs/flashinfer-cache"

DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
MC_TE_METRIC=true \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
MC_FORCE_MNNVL=1 \
NCCL_MNNVL_ENABLE=1 \
NCCL_CUMEM_ENABLE=1 \
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
PYTHONUNBUFFERED=1 \
python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --skip-tokenizer-init \
    --trust-remote-code \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --tp-size "$TOTAL_GPUS" \
    --dp-size "$TOTAL_GPUS" \
    --enable-dp-attention \
    --host 0.0.0.0 \
    --max-running-requests 30000 \
    --context-length 2200 \
    --disable-radix-cache \
    --moe-a2a-backend deepep \
    --load-balance-method round_robin \
    --deepep-mode normal \
    --ep-dispatch-algorithm dynamic \
    --moe-dense-tp-size 1 \
    --enable-dp-lm-head \
    --disable-shared-experts-fusion \
    --ep-num-redundant-experts 32 \
    --eplb-algorithm deepseek \
    --attention-backend trtllm_mla \
    --kv-cache-dtype fp8_e4m3 \
    --watchdog-timeout 1000000 \
    --disable-cuda-graph \
    --chunked-prefill-size 131072 \
    --max-total-tokens 524288 \
    --deepep-config /configs/deepep_config.json \
    --stream-interval 50 \
    --mem-fraction-static 0.75 "${dump_config_args[@]}"
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1
p_4d
.sh
→
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1
k1k-low-latency
.sh
View file @
80dfb82c
...
...
@@ -73,8 +73,8 @@ fi
if
[
"
$mode
"
=
"prefill"
]
;
then
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
6.1
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
6.1
-py3-none-any.whl
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
7.0
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
7.0
-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
...
...
@@ -131,8 +131,8 @@ if [ "$mode" = "prefill" ]; then
elif
[
"
$mode
"
=
"decode"
]
;
then
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
6.1
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
6.1
-py3-none-any.whl
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
7.0
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
7.0
-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
...
...
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/
defaul
t.sh
→
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/
1k1k-max-tp
t.sh
View file @
80dfb82c
...
...
@@ -71,8 +71,8 @@ fi
if
[
"
$mode
"
=
"prefill"
]
;
then
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
6.1
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
6.1
-py3-none-any.whl
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
7.0
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
7.0
-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
...
...
@@ -132,8 +132,8 @@ if [ "$mode" = "prefill" ]; then
elif
[
"
$mode
"
=
"decode"
]
;
then
set
-x
if
[[
"
${
RUN_IN_CI
,,
}
"
==
"true"
]]
;
then
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
6.1
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
6.1
-py3-none-any.whl
python3
-m
pip
install
/configs/ai_dynamo_runtime-0.
7.0
-cp310-abi3-manylinux_2_28_aarch64.whl
python3
-m
pip
install
/configs/ai_dynamo-0.
7.0
-py3-none-any.whl
fi
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGLANG_DG_CACHE_DIR
=
"/configs/dg-10212025"
...
...
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-low-latency.sh
0 → 100755
View file @
80dfb82c
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Print the command-line usage summary and terminate the script.
# Arguments: none ($0 is used as the script name shown in the text).
# Outputs:   usage text on stderr; this is a diagnostic emitted on a failure
#            path, so it is kept off stdout.
# Returns:   never returns; exits the shell with status 1.
print_usage() {
    printf '%s\n' \
        "Usage: $0 <mode>" \
        "  mode: prefill or decode" \
        "" \
        "Examples:" \
        "  $0 prefill" \
        "  $0 decode" >&2
    exit 1
}
# Exactly one positional argument (the worker mode) is accepted.
if (( $# != 1 )); then
    echo "Error: Expected 1 argument, got $#"
    print_usage
fi

# The single argument selects which worker role this invocation runs.
mode=$1

# Reject anything other than the two supported roles.
case "$mode" in
    prefill | decode)
        ;;
    *)
        echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
        print_usage
        ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"
# Check if required environment variables are set.
# Every name listed must be set to a non-empty value; the first one that is
# missing is reported and the script exits with status 1.
for required_var in \
    HOST_IP_MACHINE \
    PORT \
    TOTAL_GPUS \
    RANK \
    TOTAL_NODES \
    USE_INIT_LOCATIONS \
    RUN_IN_CI; do
    # ${!required_var} is bash indirect expansion: the value of the variable
    # whose name is held in required_var.
    if [[ -z "${!required_var}" ]]; then
        echo "Error: ${required_var} environment variable is not set"
        exit 1
    fi
done
unset required_var
# Construct command based on mode

# Shared preamble for both worker roles.
# Globals:   RUN_IN_CI, DUMP_CONFIG_PATH (read);
#            TORCH_DISTRIBUTED_DEFAULT_TIMEOUT, SGLANG_DG_CACHE_DIR (exported);
#            extra_args (written, array of optional trailing CLI arguments).
# Arguments: none
prepare_worker() {
    # When RUN_IN_CI is "true" (compared case-insensitively) install the
    # dynamo wheels found under /configs before launching.
    # NOTE(review): pip failures are not checked, matching the previous
    # behaviour; confirm whether CI should abort here instead.
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi

    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"

    # Optional arguments are collected in an array (not a flat string) so a
    # DUMP_CONFIG_PATH containing whitespace or glob characters reaches the
    # worker as exactly one argument.
    extra_args=()
    if [[ -n "${DUMP_CONFIG_PATH:-}" ]]; then
        extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
    fi
}

# Start the prefill worker.
# Globals:   TOTAL_GPUS, HOST_IP_MACHINE, PORT, TOTAL_NODES, RANK (read).
# Arguments: "$@" - extra arguments appended verbatim after --host.
# Returns:   exit status of the python3 invocation.
launch_prefill_worker() {
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_ENABLE_JIT_DEEPGEMM=false \
    SGLANG_ENABLE_FLASHINFER_GEMM=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --trust-remote-code \
        --kv-cache-dtype fp8_e4m3 \
        --attention-backend trtllm_mla \
        --quantization fp8 \
        --moe-runner-backend flashinfer_trtllm \
        --disable-radix-cache \
        --watchdog-timeout 1000000 \
        --context-length 9600 \
        --disaggregation-mode prefill \
        --mem-fraction-static 0.95 \
        --max-total-tokens 32768 \
        --chunked-prefill-size 24576 \
        --cuda-graph-max-bs 512 \
        --max-running-requests 512 \
        --load-balance-method round_robin \
        --scheduler-recv-interval 10 \
        --enable-flashinfer-allreduce-fusion \
        --moe-dense-tp-size 1 \
        --tensor-parallel-size "$TOTAL_GPUS" \
        --data-parallel-size 1 \
        --expert-parallel-size 1 \
        --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
        --disaggregation-bootstrap-port 30001 \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "$@"
}

# Start the decode worker.
# Globals:   TOTAL_GPUS, HOST_IP_MACHINE, PORT, TOTAL_NODES, RANK (read).
# Arguments: "$@" - extra arguments appended verbatim after --host.
# Returns:   exit status of the python3 invocation.
launch_decode_worker() {
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_ENABLE_JIT_DEEPGEMM=false \
    SGLANG_ENABLE_FLASHINFER_GEMM=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --trust-remote-code \
        --kv-cache-dtype fp8_e4m3 \
        --attention-backend trtllm_mla \
        --quantization fp8 \
        --moe-runner-backend flashinfer_trtllm \
        --disable-radix-cache \
        --watchdog-timeout 1000000 \
        --context-length 9600 \
        --disaggregation-mode decode \
        --mem-fraction-static 0.95 \
        --chunked-prefill-size 8192 \
        --cuda-graph-max-bs 512 \
        --max-running-requests 512 \
        --scheduler-recv-interval 10 \
        --enable-flashinfer-allreduce-fusion \
        --enable-symm-mem \
        --moe-dense-tp-size 1 \
        --prefill-round-robin-balance \
        --tensor-parallel-size "$TOTAL_GPUS" \
        --data-parallel-size 1 \
        --expert-parallel-size 1 \
        --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
        --disaggregation-bootstrap-port 30001 \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "$@"
}

# Dispatch on the validated mode; tracing is enabled before any setup so the
# install and launch commands appear in the job log.
if [ "$mode" = "prefill" ]; then
    set -x
    prepare_worker
    launch_prefill_worker "${extra_args[@]}"
elif [ "$mode" = "decode" ]; then
    set -x
    prepare_worker
    launch_decode_worker "${extra_args[@]}"
fi
examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-max-tpt.sh
0 → 100755
View file @
80dfb82c
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Print the command-line usage summary and terminate the script.
# Arguments: none ($0 is used as the script name shown in the text).
# Outputs:   usage text on stderr; this is a diagnostic emitted on a failure
#            path, so it is kept off stdout.
# Returns:   never returns; exits the shell with status 1.
print_usage() {
    printf '%s\n' \
        "Usage: $0 <mode>" \
        "  mode: prefill or decode" \
        "" \
        "Examples:" \
        "  $0 prefill" \
        "  $0 decode" >&2
    exit 1
}
# The script takes exactly one positional argument: the worker mode.
if [[ $# -ne 1 ]]; then
    echo "Error: Expected 1 argument, got $#"
    print_usage
fi

# Remember the requested role for the dispatch further down.
mode=$1

# Anything that is neither "prefill" nor "decode" is a usage error.
if [[ "$mode" != "prefill" && "$mode" != "decode" ]]; then
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
    print_usage
fi

echo "Mode: $mode"
echo "Command: dynamo"
# Check if required environment variables are set.
# The names are checked in the order listed; the first one that is unset or
# empty is reported and the script exits with status 1.
required_env_vars=(
    HOST_IP_MACHINE
    PORT
    TOTAL_GPUS
    RANK
    TOTAL_NODES
    USE_INIT_LOCATIONS
    RUN_IN_CI
)
for env_name in "${required_env_vars[@]}"; do
    # Indirect expansion: look up the variable named by env_name.
    if [[ -z "${!env_name}" ]]; then
        echo "Error: ${env_name} environment variable is not set"
        exit 1
    fi
done
unset required_env_vars env_name
# Construct command based on mode

# Shared preamble for both worker roles.
# Globals:   RUN_IN_CI, DUMP_CONFIG_PATH (read);
#            TORCH_DISTRIBUTED_DEFAULT_TIMEOUT, SGLANG_DG_CACHE_DIR (exported);
#            extra_args (written, array of optional trailing CLI arguments).
# Arguments: none
prepare_worker() {
    # When RUN_IN_CI is "true" (compared case-insensitively) install the
    # dynamo wheels found under /configs before launching.
    # NOTE(review): pip failures are not checked, matching the previous
    # behaviour; confirm whether CI should abort here instead.
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi

    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"

    # Optional arguments are collected in an array (not a flat string) so a
    # DUMP_CONFIG_PATH containing whitespace or glob characters reaches the
    # worker as exactly one argument.
    extra_args=()
    if [[ -n "${DUMP_CONFIG_PATH:-}" ]]; then
        extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
    fi
}

# Start the prefill worker.
# Globals:   TOTAL_GPUS, HOST_IP_MACHINE, PORT, TOTAL_NODES, RANK (read).
# Arguments: "$@" - extra arguments appended verbatim after --host.
# Returns:   exit status of the python3 invocation.
launch_prefill_worker() {
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    MC_TE_METRIC=true \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    MC_FORCE_MNNVL=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --trust-remote-code \
        --tp-size "$TOTAL_GPUS" \
        --dp-size "$TOTAL_GPUS" \
        --ep-size "$TOTAL_GPUS" \
        --enable-dp-attention \
        --attention-backend trtllm_mla \
        --kv-cache-dtype fp8_e4m3 \
        --disable-radix-cache \
        --stream-interval 50 \
        --max-running-requests 30000 \
        --context-length 9300 \
        --watchdog-timeout 1000000 \
        --disable-shared-experts-fusion \
        --eplb-algorithm deepseek \
        --disaggregation-bootstrap-port 30001 \
        --disaggregation-mode prefill \
        --mem-fraction-static 0.80 \
        --max-total-tokens 524288 \
        --chunked-prefill-size 131072 \
        --load-balance-method round_robin \
        --disable-cuda-graph \
        --moe-a2a-backend deepep \
        --deepep-mode normal \
        --ep-dispatch-algorithm dynamic \
        --moe-dense-tp-size 1 \
        --enable-dp-lm-head \
        --ep-num-redundant-experts 32 \
        --deepep-config /configs/deepep_config.json \
        --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "$@"
}

# Start the decode worker.
# Globals:   TOTAL_GPUS, HOST_IP_MACHINE, PORT, TOTAL_NODES, RANK (read).
# Arguments: "$@" - extra arguments appended verbatim after --host.
# Returns:   exit status of the python3 invocation.
launch_decode_worker() {
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \
    MC_TE_METRIC=true \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --skip-tokenizer-init \
        --trust-remote-code \
        --tp-size "$TOTAL_GPUS" \
        --dp-size "$TOTAL_GPUS" \
        --ep-size "$TOTAL_GPUS" \
        --enable-dp-attention \
        --attention-backend trtllm_mla \
        --kv-cache-dtype fp8_e4m3 \
        --disable-radix-cache \
        --stream-interval 50 \
        --decode-log-interval 1000 \
        --max-running-requests 8192 \
        --context-length 9300 \
        --watchdog-timeout 1000000 \
        --disable-shared-experts-fusion \
        --eplb-algorithm deepseek \
        --disaggregation-bootstrap-port 30001 \
        --disaggregation-mode decode \
        --mem-fraction-static 0.82 \
        --chunked-prefill-size 36864 \
        --moe-a2a-backend deepep \
        --deepep-mode low_latency \
        --ep-dispatch-algorithm static \
        --moe-dense-tp-size 1 \
        --enable-dp-lm-head \
        --prefill-round-robin-balance \
        --ep-num-redundant-experts 32 \
        --deepep-config /configs/deepep_config.json \
        --cuda-graph-max-bs 256 \
        --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --host 0.0.0.0 "$@"
}

# Dispatch on the validated mode; tracing is enabled before any setup so the
# install and launch commands appear in the job log.
if [ "$mode" = "prefill" ]; then
    set -x
    prepare_worker
    launch_prefill_worker "${extra_args[@]}"
elif [ "$mode" = "decode" ]; then
    set -x
    prepare_worker
    launch_decode_worker "${extra_args[@]}"
fi
examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
View file @
80dfb82c
...
...
@@ -373,7 +373,7 @@ def setup_frontend_worker(
# All frontends run the ingress server
frontend_cmd
=
"python3 -m dynamo.frontend --http-port=8000"
if
run_in_ci
:
frontend_cmd
=
"python3 -m pip install /configs/ai_dynamo_runtime-0.
6.1
-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.
6.1
-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
frontend_cmd
=
"python3 -m pip install /configs/ai_dynamo_runtime-0.
7.0
-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.
7.0
-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
return
run_command
(
frontend_cmd
)
...
...
examples/backends/sglang/slurm_jobs/submit_disagg.sh
View file @
80dfb82c
...
...
@@ -48,7 +48,6 @@ check_env MODEL_PATH
check_env CONFIG_DIR
check_env CONTAINER_IMAGE
GPU_TYPE
=
"gb200-fp8"
GPUS_PER_NODE
=
4
:
"
${
NETWORK_INTERFACE
:
=enP6p9s0np0
}
"
...
...
@@ -62,7 +61,8 @@ ISL=$6
OSL
=
$7
CONCURRENCIES
=
$8
REQUEST_RATE
=
$9
SCRIPT_VARIANT
=
${
10
}
GPU_TYPE
=
${
10
}
SCRIPT_VARIANT
=
${
11
}
RETRIES
=
1
# defaults to retry the job 1 time to avoid transient errors
...
...
@@ -86,7 +86,7 @@ command=(
--model-dir
$MODEL_PATH
--config-dir
$CONFIG_DIR
--container-image
$CONTAINER_IMAGE
--gpu-type
$GPU_TYPE
--gpus-per-node
$GPUS_PER_NODE
--network-interface
$NETWORK_INTERFACE
--gpus-per-node
$GPUS_PER_NODE
--network-interface
$NETWORK_INTERFACE
--prefill-nodes
$PREFILL_NODES
--prefill-workers
$PREFILL_WORKERS
--decode-nodes
$DECODE_NODES
--decode-workers
$DECODE_WORKERS
...
...
@@ -96,6 +96,8 @@ command=(
--retries
$RETRIES
--gpu-type
$GPU_TYPE
--run-in-ci
${
SCRIPT_VARIANT_ARGS
[@]
}
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment