Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
9defc01b
Unverified
Commit
9defc01b
authored
Oct 23, 2025
by
ishandhanani
Committed by
GitHub
Oct 24, 2025
Browse files
feat(sglang): experimental gb200 fp4 and updated gb200 fp8 commands (#3745)
parent
7c208309
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
212 additions
and
22 deletions
+212
-22
components/backends/sglang/slurm_jobs/scripts/gb200-fp4.sh
components/backends/sglang/slurm_jobs/scripts/gb200-fp4.sh
+192
-0
components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
+16
-18
components/backends/sglang/slurm_jobs/scripts/worker_setup.py
...onents/backends/sglang/slurm_jobs/scripts/worker_setup.py
+2
-2
components/backends/sglang/slurm_jobs/submit_job_script.py
components/backends/sglang/slurm_jobs/submit_job_script.py
+2
-2
No files found.
components/backends/sglang/slurm_jobs/scripts/gb200-fp4.sh
0 → 100755
View file @
9defc01b
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This comes from https://github.com/sgl-project/sglang/issues/10903 and uses the low-prec decode setup
# Print the command-line synopsis to stdout and terminate the script.
# Always exits with status 1, so control never returns to the caller.
print_usage() {
    printf 'Usage: %s <mode>\n' "$0"
    printf ' mode: prefill or decode\n'
    printf '\n'
    printf 'Examples:\n'
    printf ' %s prefill\n' "$0"
    printf ' %s decode\n' "$0"
    exit 1
}
# Exactly one positional argument (the mode) is accepted; anything else
# reports the count and falls through to print_usage, which exits.
if (( $# != 1 )); then
    echo "Error: Expected 1 argument, got $#"
    print_usage
fi

# The single argument selects which worker role this script launches.
mode=$1

# Only the two literal values below are valid; every other value
# (including the empty string) is rejected.
case "$mode" in
    prefill | decode) ;;
    *)
        echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
        print_usage
        ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"
# Each variable listed here must be set to a non-empty value before the
# worker command is built. They are checked in this order and the script
# stops with status 1 at the first one that is missing or empty.
for required_var in HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS; do
    # ${!required_var} is bash indirect expansion: it yields the value of
    # the variable whose name is stored in required_var.
    if [ -z "${!required_var}" ]; then
        echo "Error: ${required_var} environment variable is not set"
        exit 1
    fi
done
# Construct and run the worker command for the selected mode.
#
# Reads: mode, HOST_IP_MACHINE, PORT, TOTAL_NODES, RANK, TOTAL_GPUS and
# USE_INIT_LOCATIONS (all validated earlier in this script).
# Each branch turns on command tracing (set -x), exports
# TORCH_DISTRIBUTED_DEFAULT_TIMEOUT, and then runs `python3 -m dynamo.sglang`
# with per-command environment overrides.
#
# ${command_suffix} is expanded unquoted on purpose so that its contents are
# word-split into separate arguments (and vanish entirely when it is blank).
if [ "$mode" = "prefill" ]; then
    set -x
    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

    # Start from an empty suffix, as the decode branch does, so a stray
    # command_suffix inherited from the caller's environment can never be
    # appended to the command line.
    command_suffix=""
    # no expert locations collected for fp4 yet, so the suffix stays blank
    # even when USE_INIT_LOCATIONS is "true" (compared case-insensitively).
    if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then
        command_suffix=" "
    fi

    # shellcheck disable=SC2086
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    PYTHONUNBUFFERED=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --skip-tokenizer-init \
        --disaggregation-mode prefill \
        --decode-log-interval 1000 \
        --max-running-requests 5632 \
        --context-length 2176 \
        --disable-radix-cache \
        --disable-shared-experts-fusion \
        --watchdog-timeout 1000000 \
        --disable-chunked-prefix-cache \
        --attention-backend trtllm_mla \
        --kv-cache-dtype fp8_e4m3 \
        --enable-single-batch-overlap \
        --chunked-prefill-size 65536 \
        --eplb-algorithm deepseek \
        --trust-remote-code \
        --disable-cuda-graph \
        --mem-fraction-static 0.84 \
        --max-total-tokens 131072 \
        --max-prefill-tokens 16384 \
        --load-balance-method round_robin \
        --quantization modelopt_fp4 \
        --enable-ep-moe \
        --moe-runner-backend flashinfer_cutlass \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
        --disaggregation-bootstrap-port 30001 \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --tp-size "$TOTAL_GPUS" \
        --dp-size "$TOTAL_GPUS" \
        --enable-dp-attention \
        --host 0.0.0.0 \
        --stream-interval 50 \
        --log-level debug ${command_suffix}

elif [ "$mode" = "decode" ]; then
    set -x

    # Same placeholder handling as the prefill branch: the suffix is blank
    # whether or not USE_INIT_LOCATIONS is "true".
    command_suffix=""
    if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then
        command_suffix=" "
    fi

    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

    # NOTE(review): this command previously passed --mem-fraction-static twice
    # (0.83, then 0.82) and --enable-dp-attention twice. The shadowed
    # duplicates were dropped and the later value (0.82) kept; this assumes
    # dynamo.sglang uses last-one-wins option parsing -- confirm that 0.82,
    # not 0.83, is the intended setting.
    # shellcheck disable=SC2086
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
    MC_TE_METRIC=true \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1408 \
    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
    SGLANG_FP4_GEMM_BACKEND=cutlass \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    PYTHONUNBUFFERED=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
        --skip-tokenizer-init \
        --trust-remote-code \
        --disaggregation-mode decode \
        --host 0.0.0.0 \
        --decode-log-interval 1 \
        --max-running-requests 67584 \
        --context-length 2176 \
        --disable-radix-cache \
        --disable-shared-experts-fusion \
        --watchdog-timeout 1000000 \
        --disable-chunked-prefix-cache \
        --attention-backend trtllm_mla \
        --kv-cache-dtype fp8_e4m3 \
        --enable-dp-attention \
        --chunked-prefill-size 786432 \
        --enable-ep-moe \
        --moe-a2a-backend deepep \
        --deepep-mode low_latency \
        --ep-dispatch-algorithm static \
        --cuda-graph-bs 1408 \
        --num-reserved-decode-tokens 112 \
        --ep-num-redundant-experts 32 \
        --eplb-algorithm deepseek \
        --moe-dense-tp-size 1 \
        --enable-dp-lm-head \
        --prefill-round-robin-balance \
        --max-total-tokens 3122380 \
        --quantization modelopt_fp4 \
        --moe-runner-backend flashinfer_cutedsl \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
        --disaggregation-bootstrap-port 30001 \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
        --tp-size "$TOTAL_GPUS" \
        --dp-size "$TOTAL_GPUS" \
        --enable-single-batch-overlap \
        --stream-interval 50 \
        --mem-fraction-static 0.82 ${command_suffix}
fi
components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
View file @
9defc01b
...
...
@@ -64,13 +64,11 @@ fi
# Construct command based on mode
if
[
"
$mode
"
=
"prefill"
]
;
then
# GB200 dynamo prefill command
set
-x
# SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
# timeouts and kernel cache
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGL_DG_CACHE_DIR
=
"/configs/dgcache/3p1dcache"
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json"
;
fi
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
...
...
@@ -99,9 +97,8 @@ if [ "$mode" = "prefill" ]; then
--dp-size
"
$TOTAL_GPUS
"
\
--enable-dp-attention
\
--host
0.0.0.0
\
--decode-log-interval
1000
\
--max-running-requests
12288
\
--context-length
9600
\
--max-running-requests
30000
\
--context-length
2200
\
--disable-radix-cache
\
--moe-a2a-backend
deepep
\
--load-balance-method
round_robin
\
...
...
@@ -119,28 +116,28 @@ if [ "$mode" = "prefill" ]; then
--max-total-tokens
524288
\
--deepep-config
/configs/deepep_config.json
\
--stream-interval
50
\
--
log-level
debug
${
command_suffix
}
--
mem-fraction-static
0.75
${
command_suffix
}
elif
[
"
$mode
"
=
"decode"
]
;
then
set
-x
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"
;
fi
# timeouts and kernel cache
set
-x
export
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT
=
1800
export
SGL_DG_CACHE_DIR
=
"/configs/dgcache/3p1dcache"
# GB200 dynamo decode command
command_suffix
=
""
if
[[
"
${
USE_INIT_LOCATIONS
,,
}
"
==
"true"
]]
;
then
command_suffix
=
"--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"
;
fi
DYN_SKIP_SGLANG_LOG_FORMATTING
=
1
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
512
\
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
=
768
\
MC_TE_METRIC
=
true
\
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
=
100000
\
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
=
100000
\
SGLANG_DISAGGREGATION_WAITING_TIMEOUT
=
100000
\
SGLANG_DECODE_BOOTSTRAP_TIMEOUT
=
1000
\
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
=
1
\
SGLANG_MOONCAKE_CUSTOM_MEM_POOL
=
True
\
NCCL_MNNVL_ENABLE
=
1
\
MC_FORCE_MNNVL
=
1
\
NCCL_MNNVL_ENABLE
=
1
\
NCCL_CUMEM_ENABLE
=
1
\
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
=
0
\
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
=
1
\
...
...
@@ -160,16 +157,16 @@ elif [ "$mode" = "decode" ]; then
--enable-dp-attention
\
--host
0.0.0.0
\
--decode-log-interval
1000
\
--max-running-requests
36864
\
--context-length
96
00
\
--max-running-requests
45000
\
--context-length
22
00
\
--disable-radix-cache
\
--moe-a2a-backend
deepep
\
--prefill-round-robin-balance
\
--deepep-mode
low_latency
\
--moe-dense-tp-size
1
\
--enable-dp-lm-head
\
--cuda-graph-bs
1 2 4 8 16 24 32 40 48 56 64
80 96 112 128 160 192 224 256 320 384 448 512
\
--cuda-graph-max-bs
512
\
--cuda-graph-bs
1 2 4 8 16 24 32 40 48 56 64
72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768
\
--cuda-graph-max-bs
768
\
--disable-shared-experts-fusion
\
--ep-num-redundant-experts
32
\
--ep-dispatch-algorithm
static
\
...
...
@@ -178,5 +175,6 @@ elif [ "$mode" = "decode" ]; then
--watchdog-timeout
1000000
\
--chunked-prefill-size
36864
\
--stream-interval
50
\
--deepep-config
/configs/deepep_config.json
\
--mem-fraction-static
0.82
${
command_suffix
}
fi
components/backends/sglang/slurm_jobs/scripts/worker_setup.py
View file @
9defc01b
...
...
@@ -175,9 +175,9 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
parser
.
add_argument
(
"--gpu_type"
,
type
=
str
,
choices
=
[
"gb200-fp8"
],
choices
=
[
"gb200-fp8"
,
"gb200-fp4"
],
default
=
"gb200-fp8"
,
help
=
"Type of GPU to use"
,
help
=
"Type of GPU to use
. You can choose between gb200-fp8 and gb200-fp4.
"
,
)
parser
.
add_argument
(
...
...
components/backends/sglang/slurm_jobs/submit_job_script.py
View file @
9defc01b
...
...
@@ -142,9 +142,9 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
)
parser
.
add_argument
(
"--gpu-type"
,
choices
=
[
"gb200-fp8"
],
choices
=
[
"gb200-fp8"
,
"gb200-fp4"
],
default
=
"gb200-fp8"
,
help
=
"GPU type to use"
,
help
=
"GPU type to use
. You can choose between gb200-fp8 and gb200-fp4.
"
,
)
parser
.
add_argument
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment