Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
69817c2d
Unverified
Commit
69817c2d
authored
Dec 10, 2025
by
ishandhanani
Committed by
GitHub
Dec 11, 2025
Browse files
chore: add gb200 fp4 8k1k (#4874)
parent
ac8d36c6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
577 additions
and
0 deletions
+577
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-low-latency.sh
...g/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-low-latency.sh
+181
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-max-tpt.sh
...glang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-max-tpt.sh
+198
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-middle-curve.sh
.../slurm_jobs/scripts/gb200-fp4/disagg/8k1k-middle-curve.sh
+198
-0
No files found.
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-low-latency.sh
0 → 100755
View file @
69817c2d
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Print the usage help and terminate the script.
# Globals:   $0 (read) - shown as the script name in the help text
# Arguments: none
# Outputs:   usage text on stderr (it is only shown on invalid invocations)
# Returns:   never returns; exits the shell with status 1
print_usage() {
  {
    printf 'Usage: %s <mode>\n' "$0"
    printf '  mode: prefill or decode\n'
    printf '\n'
    printf 'Examples:\n'
    printf '  %s prefill\n' "$0"
    printf '  %s decode\n' "$0"
  } >&2
  exit 1
}
# Argument handling: the script takes exactly one positional argument, the
# worker mode. Problems are reported on stderr and the script then exits
# through print_usage (status 1).
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#" >&2
  print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument: only the two literal values below are accepted.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"
# Check that every required environment variable is set and non-empty.
# Variables are checked in a fixed order; the first missing one is reported on
# stderr and the script exits with status 1.
for required_var in \
  HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS RUN_IN_CI; do
  # ${!required_var} is indirect expansion: the value of the variable whose
  # name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set" >&2
    exit 1
  fi
done
unset required_var
# Construct and run the worker command for the requested mode.
#
# launch_worker <mode>
#   mode: "prefill" or "decode"; any other value is a no-op.
# Globals read: RUN_IN_CI, DUMP_CONFIG_PATH, TOTAL_GPUS, HOST_IP_MACHINE, PORT,
#   TOTAL_NODES, RANK.
# Side effects: turns on `set -x` tracing, exports
#   TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800, optionally pip-installs the dynamo
#   wheels from /configs, then runs `python3 -m dynamo.sglang` in the foreground.
# Returns: the exit status of the python3 command (0 for an unrecognised mode).
launch_worker() {
  local worker_mode=$1
  # Optional trailing arguments. Kept as an array so that a DUMP_CONFIG_PATH
  # containing whitespace is still passed as one single argument.
  local -a command_suffix=()

  # Nothing is done for a mode other than prefill/decode.
  if [[ "$worker_mode" != "prefill" && "$worker_mode" != "decode" ]]; then
    return 0
  fi

  set -x

  # When RUN_IN_CI is "true" (compared case-insensitively) the dynamo wheels
  # are installed first. A pip failure does not abort the launch.
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    command_suffix+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply to the python3 invocation only.
  if [[ "$worker_mode" == "prefill" ]]; then
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_ENABLE_JIT_DEEPGEMM=false \
    SGLANG_ENABLE_FLASHINFER_GEMM=true \
    python3 -m dynamo.sglang \
      --served-model-name deepseek-ai/DeepSeek-R1 \
      --model-path /model/ \
      --trust-remote-code \
      --disable-radix-cache \
      --kv-cache-dtype fp8_e4m3 \
      --attention-backend trtllm_mla \
      --quantization modelopt_fp4 \
      --moe-runner-backend flashinfer_trtllm \
      --stream-interval 50 \
      --watchdog-timeout 1000000 \
      --context-length 9600 \
      --mem-fraction-static 0.95 \
      --max-total-tokens 32768 \
      --chunked-prefill-size 24576 \
      --cuda-graph-max-bs 256 \
      --max-running-requests 512 \
      --scheduler-recv-interval 10 \
      --enable-symm-mem \
      --moe-dense-tp-size 1 \
      --load-balance-method round_robin \
      --disaggregation-bootstrap-port 30001 \
      --disaggregation-mode prefill \
      --dp-size 1 \
      --tp-size "$TOTAL_GPUS" \
      --ep-size 1 \
      --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
      --nnodes "$TOTAL_NODES" \
      --node-rank "$RANK" \
      --host 0.0.0.0 "${command_suffix[@]}"
  else
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_ENABLE_JIT_DEEPGEMM=false \
    SGLANG_ENABLE_FLASHINFER_GEMM=true \
    python3 -m dynamo.sglang \
      --served-model-name deepseek-ai/DeepSeek-R1 \
      --model-path /model/ \
      --prefill-round-robin-balance \
      --trust-remote-code \
      --disable-radix-cache \
      --kv-cache-dtype fp8_e4m3 \
      --attention-backend trtllm_mla \
      --quantization modelopt_fp4 \
      --moe-runner-backend flashinfer_trtllm \
      --disaggregation-bootstrap-port 30001 \
      --disaggregation-mode decode \
      --stream-interval 50 \
      --watchdog-timeout 1000000 \
      --context-length 9600 \
      --mem-fraction-static 0.95 \
      --chunked-prefill-size 8192 \
      --cuda-graph-max-bs 256 \
      --scheduler-recv-interval 10 \
      --enable-symm-mem \
      --moe-dense-tp-size 1 \
      --dp-size 1 \
      --tp-size "$TOTAL_GPUS" \
      --ep-size 1 \
      --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
      --nnodes "$TOTAL_NODES" \
      --node-rank "$RANK" \
      --host 0.0.0.0 "${command_suffix[@]}"
  fi
}

launch_worker "$mode"
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-max-tpt.sh
0 → 100755
View file @
69817c2d
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Print the usage help and terminate the script.
# Globals:   $0 (read) - shown as the script name in the help text
# Arguments: none
# Outputs:   usage text on stderr (it is only shown on invalid invocations)
# Returns:   never returns; exits the shell with status 1
print_usage() {
  {
    printf 'Usage: %s <mode>\n' "$0"
    printf '  mode: prefill or decode\n'
    printf '\n'
    printf 'Examples:\n'
    printf '  %s prefill\n' "$0"
    printf '  %s decode\n' "$0"
  } >&2
  exit 1
}
# Argument handling: the script takes exactly one positional argument, the
# worker mode. Problems are reported on stderr and the script then exits
# through print_usage (status 1).
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#" >&2
  print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument: only the two literal values below are accepted.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"
# Check that every required environment variable is set and non-empty.
# Variables are checked in a fixed order; the first missing one is reported on
# stderr and the script exits with status 1.
for required_var in \
  HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS RUN_IN_CI; do
  # ${!required_var} is indirect expansion: the value of the variable whose
  # name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set" >&2
    exit 1
  fi
done
unset required_var
# Construct and run the worker command for the requested mode.
#
# launch_worker <mode>
#   mode: "prefill" or "decode"; any other value is a no-op.
# Globals read: RUN_IN_CI, DUMP_CONFIG_PATH, TOTAL_GPUS, HOST_IP_MACHINE, PORT,
#   TOTAL_NODES, RANK.
# Side effects: turns on `set -x` tracing, exports
#   TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800, optionally pip-installs the dynamo
#   wheels from /configs, then runs `python3 -m dynamo.sglang` in the foreground.
# Returns: the exit status of the python3 command (0 for an unrecognised mode).
launch_worker() {
  local worker_mode=$1
  # Optional trailing arguments. Kept as an array so that a DUMP_CONFIG_PATH
  # containing whitespace is still passed as one single argument.
  local -a command_suffix=()

  # Nothing is done for a mode other than prefill/decode.
  if [[ "$worker_mode" != "prefill" && "$worker_mode" != "decode" ]]; then
    return 0
  fi

  set -x

  # When RUN_IN_CI is "true" (compared case-insensitively) the dynamo wheels
  # are installed first. A pip failure does not abort the launch.
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    command_suffix+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply to the python3 invocation only. The decode
  # worker gets three additional variables compared with the prefill worker.
  if [[ "$worker_mode" == "prefill" ]]; then
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    python3 -m dynamo.sglang \
      --served-model-name deepseek-ai/DeepSeek-R1 \
      --model-path /model/ \
      --trust-remote-code \
      --kv-cache-dtype fp8_e4m3 \
      --attention-backend trtllm_mla \
      --quantization modelopt_fp4 \
      --moe-runner-backend flashinfer_trtllm \
      --disable-radix-cache \
      --disable-chunked-prefix-cache \
      --stream-interval 50 \
      --decode-log-interval 1000 \
      --watchdog-timeout 1000000 \
      --context-length 9600 \
      --disable-shared-experts-fusion \
      --disaggregation-bootstrap-port 30001 \
      --disaggregation-mode prefill \
      --mem-fraction-static 0.95 \
      --max-total-tokens 131072 \
      --max-prefill-tokens 524288 \
      --chunked-prefill-size 131072 \
      --max-running-requests 30000 \
      --load-balance-method round_robin \
      --disable-cuda-graph \
      --tp-size "$TOTAL_GPUS" \
      --dp-size 1 \
      --ep-size 1 \
      --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
      --nnodes "$TOTAL_NODES" \
      --node-rank "$RANK" \
      --host 0.0.0.0 "${command_suffix[@]}"
  else
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=512 \
    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
    SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
    python3 -m dynamo.sglang \
      --served-model-name deepseek-ai/DeepSeek-R1 \
      --model-path /model/ \
      --trust-remote-code \
      --kv-cache-dtype fp8_e4m3 \
      --attention-backend trtllm_mla \
      --quantization modelopt_fp4 \
      --moe-runner-backend flashinfer_cutedsl \
      --disable-radix-cache \
      --disable-chunked-prefix-cache \
      --stream-interval 50 \
      --decode-log-interval 1000 \
      --watchdog-timeout 1000000 \
      --context-length 9600 \
      --disable-shared-experts-fusion \
      --eplb-algorithm deepseek \
      --disaggregation-bootstrap-port 30001 \
      --disaggregation-mode decode \
      --mem-fraction-static 0.83 \
      --max-total-tokens 524288 \
      --chunked-prefill-size 24576 \
      --max-running-requests 16384 \
      --moe-a2a-backend deepep \
      --deepep-mode low_latency \
      --ep-dispatch-algorithm static \
      --ep-num-redundant-experts 32 \
      --cuda-graph-max-bs 512 \
      --num-reserved-decode-tokens 112 \
      --moe-dense-tp-size 1 \
      --enable-dp-lm-head \
      --prefill-round-robin-balance \
      --enable-dp-attention \
      --tp-size "$TOTAL_GPUS" \
      --dp-size "$TOTAL_GPUS" \
      --ep-size "$TOTAL_GPUS" \
      --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
      --nnodes "$TOTAL_NODES" \
      --node-rank "$RANK" \
      --host 0.0.0.0 "${command_suffix[@]}"
  fi
}

launch_worker "$mode"
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-middle-curve.sh
0 → 100755
View file @
69817c2d
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Print the usage help and terminate the script.
# Globals:   $0 (read) - shown as the script name in the help text
# Arguments: none
# Outputs:   usage text on stderr (it is only shown on invalid invocations)
# Returns:   never returns; exits the shell with status 1
print_usage() {
  {
    printf 'Usage: %s <mode>\n' "$0"
    printf '  mode: prefill or decode\n'
    printf '\n'
    printf 'Examples:\n'
    printf '  %s prefill\n' "$0"
    printf '  %s decode\n' "$0"
  } >&2
  exit 1
}
# Argument handling: the script takes exactly one positional argument, the
# worker mode. Problems are reported on stderr and the script then exits
# through print_usage (status 1).
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#" >&2
  print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument: only the two literal values below are accepted.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"
# Check that every required environment variable is set and non-empty.
# Variables are checked in a fixed order; the first missing one is reported on
# stderr and the script exits with status 1.
for required_var in \
  HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS RUN_IN_CI; do
  # ${!required_var} is indirect expansion: the value of the variable whose
  # name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set" >&2
    exit 1
  fi
done
unset required_var
# Construct and run the worker command for the requested mode.
#
# launch_worker <mode>
#   mode: "prefill" or "decode"; any other value is a no-op.
# Globals read: RUN_IN_CI, DUMP_CONFIG_PATH, TOTAL_GPUS, HOST_IP_MACHINE, PORT,
#   TOTAL_NODES, RANK.
# Side effects: turns on `set -x` tracing, exports
#   TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800, optionally pip-installs the dynamo
#   wheels from /configs, then runs `python3 -m dynamo.sglang` in the foreground.
# Returns: the exit status of the python3 command (0 for an unrecognised mode).
launch_worker() {
  local worker_mode=$1
  # Optional trailing arguments. Kept as an array so that a DUMP_CONFIG_PATH
  # containing whitespace is still passed as one single argument.
  local -a command_suffix=()

  # Nothing is done for a mode other than prefill/decode.
  if [[ "$worker_mode" != "prefill" && "$worker_mode" != "decode" ]]; then
    return 0
  fi

  set -x

  # When RUN_IN_CI is "true" (compared case-insensitively) the dynamo wheels
  # are installed first. A pip failure does not abort the launch.
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    command_suffix+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply to the python3 invocation only. The decode
  # worker gets three additional variables compared with the prefill worker.
  if [[ "$worker_mode" == "prefill" ]]; then
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    python3 -m dynamo.sglang \
      --served-model-name deepseek-ai/DeepSeek-R1 \
      --model-path /model/ \
      --trust-remote-code \
      --kv-cache-dtype fp8_e4m3 \
      --attention-backend trtllm_mla \
      --quantization modelopt_fp4 \
      --moe-runner-backend flashinfer_trtllm \
      --disable-radix-cache \
      --disable-chunked-prefix-cache \
      --stream-interval 50 \
      --decode-log-interval 1000 \
      --watchdog-timeout 1000000 \
      --context-length 9600 \
      --disable-shared-experts-fusion \
      --disaggregation-bootstrap-port 30001 \
      --disaggregation-mode prefill \
      --mem-fraction-static 0.95 \
      --max-total-tokens 131072 \
      --max-prefill-tokens 524288 \
      --chunked-prefill-size 131072 \
      --max-running-requests 30000 \
      --load-balance-method round_robin \
      --disable-cuda-graph \
      --tp-size "$TOTAL_GPUS" \
      --dp-size 1 \
      --ep-size 1 \
      --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
      --nnodes "$TOTAL_NODES" \
      --node-rank "$RANK" \
      --host 0.0.0.0 "${command_suffix[@]}"
  else
    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=512 \
    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
    SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
    python3 -m dynamo.sglang \
      --served-model-name deepseek-ai/DeepSeek-R1 \
      --model-path /model/ \
      --trust-remote-code \
      --kv-cache-dtype fp8_e4m3 \
      --attention-backend trtllm_mla \
      --quantization modelopt_fp4 \
      --moe-runner-backend flashinfer_cutedsl \
      --disable-radix-cache \
      --disable-chunked-prefix-cache \
      --stream-interval 50 \
      --decode-log-interval 1000 \
      --watchdog-timeout 1000000 \
      --context-length 9600 \
      --disable-shared-experts-fusion \
      --eplb-algorithm deepseek \
      --disaggregation-bootstrap-port 30001 \
      --disaggregation-mode decode \
      --mem-fraction-static 0.83 \
      --max-total-tokens 524288 \
      --chunked-prefill-size 24576 \
      --max-running-requests 16384 \
      --moe-a2a-backend deepep \
      --deepep-mode low_latency \
      --ep-dispatch-algorithm static \
      --ep-num-redundant-experts 32 \
      --cuda-graph-max-bs 512 \
      --num-reserved-decode-tokens 112 \
      --moe-dense-tp-size 1 \
      --enable-dp-lm-head \
      --prefill-round-robin-balance \
      --enable-dp-attention \
      --tp-size "$TOTAL_GPUS" \
      --dp-size "$TOTAL_GPUS" \
      --ep-size "$TOTAL_GPUS" \
      --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
      --nnodes "$TOTAL_NODES" \
      --node-rank "$RANK" \
      --host 0.0.0.0 "${command_suffix[@]}"
  fi
}

launch_worker "$mode"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment