Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
69817c2d
"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "329752d9d359ca8c805ba2a0543e339ff4a044ec"
Unverified
Commit
69817c2d
authored
Dec 10, 2025
by
ishandhanani
Committed by
GitHub
Dec 11, 2025
Browse files
chore: add gb200 fp4 8k1k (#4874)
parent
ac8d36c6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
577 additions
and
0 deletions
+577
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-low-latency.sh
...g/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-low-latency.sh
+181
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-max-tpt.sh
...glang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-max-tpt.sh
+198
-0
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-middle-curve.sh
.../slurm_jobs/scripts/gb200-fp4/disagg/8k1k-middle-curve.sh
+198
-0
No files found.
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-low-latency.sh
0 → 100755
View file @
69817c2d
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Function to print usage
#
# Writes the usage text to stderr and terminates the script with exit
# status 1. This function never returns to its caller.
#
# Globals:   none
# Arguments: none ($0 is interpolated into the text)
# Outputs:   usage text on stderr
# Returns:   does not return; exits 1
print_usage() {
  # The usage text is only shown on the failure path, so it belongs on
  # stderr rather than mixed into regular stdout output.
  {
    echo "Usage: $0 <mode>"
    echo " mode: prefill or decode"
    echo ""
    echo "Examples:"
    echo " $0 prefill"
    echo " $0 decode"
  } >&2
  exit 1
}
# Check if correct number of arguments provided
# Exactly one positional argument (the mode) is expected; anything else is
# reported on stderr and handed to print_usage.
if [[ $# -ne 1 ]]; then
  echo "Error: Expected 1 argument, got $#" >&2
  print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument
# Only the two literal values below are accepted; any other value is
# reported on stderr and handed to print_usage.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
    ;;
esac

# Informational banner; stays on stdout.
echo "Mode: $mode"
echo "Command: dynamo"
# Check if required environment variables are set
#
# Every variable named below must be non-empty. Each missing name is
# reported on stderr, and the script exits with status 1 if any is missing.
#
# NOTE(review): USE_INIT_LOCATIONS is validated here but is not referenced
# anywhere else in this script — presumably it is consumed by the launched
# process or kept for parity with sibling scripts; confirm.
required_env_vars=(
  HOST_IP_MACHINE
  PORT
  TOTAL_GPUS
  RANK
  TOTAL_NODES
  USE_INIT_LOCATIONS
  RUN_IN_CI
)

missing_env=0
for required_var in "${required_env_vars[@]}"; do
  # ${!required_var} is bash indirect expansion: it yields the value of the
  # variable whose name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set" >&2
    missing_env=1
  fi
done

if ((missing_env)); then
  exit 1
fi
# Construct command based on mode
#
# Runs `python3 -m dynamo.sglang` with the flag set for the requested mode.
# In both branches:
#   * `set -x` turns on command tracing.
#   * When RUN_IN_CI equals "true" (compared case-insensitively), the two
#     wheels under /configs are pip-installed first.
#   * TORCH_DISTRIBUTED_DEFAULT_TIMEOUT is exported as 1800.
#   * When DUMP_CONFIG_PATH is non-empty, `--dump-config-to <path>` is
#     appended to the command line.
#
# The optional trailing arguments are collected in an array and expanded
# quoted, so a DUMP_CONFIG_PATH containing whitespace or glob characters is
# passed to python3 as a single, unmodified argument.
if [[ "$mode" == "prefill" ]]; then
  set -x

  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  extra_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply only to the environment of this python3
  # invocation.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_ENABLE_JIT_DEEPGEMM=false \
  SGLANG_ENABLE_FLASHINFER_GEMM=true \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --disable-radix-cache \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --stream-interval 50 \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --mem-fraction-static 0.95 \
    --max-total-tokens 32768 \
    --chunked-prefill-size 24576 \
    --cuda-graph-max-bs 256 \
    --max-running-requests 512 \
    --scheduler-recv-interval 10 \
    --enable-symm-mem \
    --moe-dense-tp-size 1 \
    --load-balance-method round_robin \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode prefill \
    --dp-size 1 \
    --tp-size "${TOTAL_GPUS}" \
    --ep-size 1 \
    --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
    --nnodes "${TOTAL_NODES}" \
    --node-rank "${RANK}" \
    --host 0.0.0.0 "${extra_args[@]}"

elif [[ "$mode" == "decode" ]]; then
  set -x

  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  extra_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply only to the environment of this python3
  # invocation.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_ENABLE_JIT_DEEPGEMM=false \
  SGLANG_ENABLE_FLASHINFER_GEMM=true \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --prefill-round-robin-balance \
    --trust-remote-code \
    --disable-radix-cache \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode decode \
    --stream-interval 50 \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --mem-fraction-static 0.95 \
    --chunked-prefill-size 8192 \
    --cuda-graph-max-bs 256 \
    --scheduler-recv-interval 10 \
    --enable-symm-mem \
    --moe-dense-tp-size 1 \
    --dp-size 1 \
    --tp-size "${TOTAL_GPUS}" \
    --ep-size 1 \
    --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
    --nnodes "${TOTAL_NODES}" \
    --node-rank "${RANK}" \
    --host 0.0.0.0 "${extra_args[@]}"
fi
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-max-tpt.sh
0 → 100755
View file @
69817c2d
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Function to print usage
#
# Writes the usage text to stderr and terminates the script with exit
# status 1. This function never returns to its caller.
#
# Globals:   none
# Arguments: none ($0 is interpolated into the text)
# Outputs:   usage text on stderr
# Returns:   does not return; exits 1
print_usage() {
  # The usage text is only shown on the failure path, so it belongs on
  # stderr rather than mixed into regular stdout output.
  {
    echo "Usage: $0 <mode>"
    echo " mode: prefill or decode"
    echo ""
    echo "Examples:"
    echo " $0 prefill"
    echo " $0 decode"
  } >&2
  exit 1
}
# Check if correct number of arguments provided
# Exactly one positional argument (the mode) is expected; anything else is
# reported on stderr and handed to print_usage.
if [[ $# -ne 1 ]]; then
  echo "Error: Expected 1 argument, got $#" >&2
  print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument
# Only the two literal values below are accepted; any other value is
# reported on stderr and handed to print_usage.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
    ;;
esac

# Informational banner; stays on stdout.
echo "Mode: $mode"
echo "Command: dynamo"
# Check if required environment variables are set
#
# Every variable named below must be non-empty. Each missing name is
# reported on stderr, and the script exits with status 1 if any is missing.
#
# NOTE(review): USE_INIT_LOCATIONS is validated here but is not referenced
# anywhere else in this script — presumably it is consumed by the launched
# process or kept for parity with sibling scripts; confirm.
required_env_vars=(
  HOST_IP_MACHINE
  PORT
  TOTAL_GPUS
  RANK
  TOTAL_NODES
  USE_INIT_LOCATIONS
  RUN_IN_CI
)

missing_env=0
for required_var in "${required_env_vars[@]}"; do
  # ${!required_var} is bash indirect expansion: it yields the value of the
  # variable whose name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set" >&2
    missing_env=1
  fi
done

if ((missing_env)); then
  exit 1
fi
# Construct command based on mode
#
# Runs `python3 -m dynamo.sglang` with the flag set for the requested mode.
# In both branches:
#   * `set -x` turns on command tracing.
#   * When RUN_IN_CI equals "true" (compared case-insensitively), the two
#     wheels under /configs are pip-installed first.
#   * TORCH_DISTRIBUTED_DEFAULT_TIMEOUT is exported as 1800.
#   * When DUMP_CONFIG_PATH is non-empty, `--dump-config-to <path>` is
#     appended to the command line.
#
# The optional trailing arguments are collected in an array and expanded
# quoted, so a DUMP_CONFIG_PATH containing whitespace or glob characters is
# passed to python3 as a single, unmodified argument.
if [[ "$mode" == "prefill" ]]; then
  set -x

  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  extra_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply only to the environment of this python3
  # invocation.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --disable-radix-cache \
    --disable-chunked-prefix-cache \
    --stream-interval 50 \
    --decode-log-interval 1000 \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --disable-shared-experts-fusion \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode prefill \
    --mem-fraction-static 0.95 \
    --max-total-tokens 131072 \
    --max-prefill-tokens 524288 \
    --chunked-prefill-size 131072 \
    --max-running-requests 30000 \
    --load-balance-method round_robin \
    --disable-cuda-graph \
    --tp-size "${TOTAL_GPUS}" \
    --dp-size 1 \
    --ep-size 1 \
    --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
    --nnodes "${TOTAL_NODES}" \
    --node-rank "${RANK}" \
    --host 0.0.0.0 "${extra_args[@]}"

elif [[ "$mode" == "decode" ]]; then
  set -x

  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  extra_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply only to the environment of this python3
  # invocation.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=512 \
  SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
  SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_cutedsl \
    --disable-radix-cache \
    --disable-chunked-prefix-cache \
    --stream-interval 50 \
    --decode-log-interval 1000 \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --disable-shared-experts-fusion \
    --eplb-algorithm deepseek \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode decode \
    --mem-fraction-static 0.83 \
    --max-total-tokens 524288 \
    --chunked-prefill-size 24576 \
    --max-running-requests 16384 \
    --moe-a2a-backend deepep \
    --deepep-mode low_latency \
    --ep-dispatch-algorithm static \
    --ep-num-redundant-experts 32 \
    --cuda-graph-max-bs 512 \
    --num-reserved-decode-tokens 112 \
    --moe-dense-tp-size 1 \
    --enable-dp-lm-head \
    --prefill-round-robin-balance \
    --enable-dp-attention \
    --tp-size "${TOTAL_GPUS}" \
    --dp-size "${TOTAL_GPUS}" \
    --ep-size "${TOTAL_GPUS}" \
    --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
    --nnodes "${TOTAL_NODES}" \
    --node-rank "${RANK}" \
    --host 0.0.0.0 "${extra_args[@]}"
fi
examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/8k1k-middle-curve.sh
0 → 100755
View file @
69817c2d
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Function to print usage
#
# Writes the usage text to stderr and terminates the script with exit
# status 1. This function never returns to its caller.
#
# Globals:   none
# Arguments: none ($0 is interpolated into the text)
# Outputs:   usage text on stderr
# Returns:   does not return; exits 1
print_usage() {
  # The usage text is only shown on the failure path, so it belongs on
  # stderr rather than mixed into regular stdout output.
  {
    echo "Usage: $0 <mode>"
    echo " mode: prefill or decode"
    echo ""
    echo "Examples:"
    echo " $0 prefill"
    echo " $0 decode"
  } >&2
  exit 1
}
# Check if correct number of arguments provided
# Exactly one positional argument (the mode) is expected; anything else is
# reported on stderr and handed to print_usage.
if [[ $# -ne 1 ]]; then
  echo "Error: Expected 1 argument, got $#" >&2
  print_usage
fi

# Parse arguments
mode=$1

# Validate mode argument
# Only the two literal values below are accepted; any other value is
# reported on stderr and handed to print_usage.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'" >&2
    print_usage
    ;;
esac

# Informational banner; stays on stdout.
echo "Mode: $mode"
echo "Command: dynamo"
# Check if required environment variables are set
#
# Every variable named below must be non-empty. Each missing name is
# reported on stderr, and the script exits with status 1 if any is missing.
#
# NOTE(review): USE_INIT_LOCATIONS is validated here but is not referenced
# anywhere else in this script — presumably it is consumed by the launched
# process or kept for parity with sibling scripts; confirm.
required_env_vars=(
  HOST_IP_MACHINE
  PORT
  TOTAL_GPUS
  RANK
  TOTAL_NODES
  USE_INIT_LOCATIONS
  RUN_IN_CI
)

missing_env=0
for required_var in "${required_env_vars[@]}"; do
  # ${!required_var} is bash indirect expansion: it yields the value of the
  # variable whose name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set" >&2
    missing_env=1
  fi
done

if ((missing_env)); then
  exit 1
fi
# Construct command based on mode
#
# Runs `python3 -m dynamo.sglang` with the flag set for the requested mode.
# In both branches:
#   * `set -x` turns on command tracing.
#   * When RUN_IN_CI equals "true" (compared case-insensitively), the two
#     wheels under /configs are pip-installed first.
#   * TORCH_DISTRIBUTED_DEFAULT_TIMEOUT is exported as 1800.
#   * When DUMP_CONFIG_PATH is non-empty, `--dump-config-to <path>` is
#     appended to the command line.
#
# The optional trailing arguments are collected in an array and expanded
# quoted, so a DUMP_CONFIG_PATH containing whitespace or glob characters is
# passed to python3 as a single, unmodified argument.
if [[ "$mode" == "prefill" ]]; then
  set -x

  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  extra_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply only to the environment of this python3
  # invocation.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --disable-radix-cache \
    --disable-chunked-prefix-cache \
    --stream-interval 50 \
    --decode-log-interval 1000 \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --disable-shared-experts-fusion \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode prefill \
    --mem-fraction-static 0.95 \
    --max-total-tokens 131072 \
    --max-prefill-tokens 524288 \
    --chunked-prefill-size 131072 \
    --max-running-requests 30000 \
    --load-balance-method round_robin \
    --disable-cuda-graph \
    --tp-size "${TOTAL_GPUS}" \
    --dp-size 1 \
    --ep-size 1 \
    --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
    --nnodes "${TOTAL_NODES}" \
    --node-rank "${RANK}" \
    --host 0.0.0.0 "${extra_args[@]}"

elif [[ "$mode" == "decode" ]]; then
  set -x

  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi

  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800

  extra_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    extra_args+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi

  # The VAR=value prefixes apply only to the environment of this python3
  # invocation.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=512 \
  SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
  SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_cutedsl \
    --disable-radix-cache \
    --disable-chunked-prefix-cache \
    --stream-interval 50 \
    --decode-log-interval 1000 \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --disable-shared-experts-fusion \
    --eplb-algorithm deepseek \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode decode \
    --mem-fraction-static 0.83 \
    --max-total-tokens 524288 \
    --chunked-prefill-size 24576 \
    --max-running-requests 16384 \
    --moe-a2a-backend deepep \
    --deepep-mode low_latency \
    --ep-dispatch-algorithm static \
    --ep-num-redundant-experts 32 \
    --cuda-graph-max-bs 512 \
    --num-reserved-decode-tokens 112 \
    --moe-dense-tp-size 1 \
    --enable-dp-lm-head \
    --prefill-round-robin-balance \
    --enable-dp-attention \
    --tp-size "${TOTAL_GPUS}" \
    --dp-size "${TOTAL_GPUS}" \
    --ep-size "${TOTAL_GPUS}" \
    --dist-init-addr "${HOST_IP_MACHINE}:${PORT}" \
    --nnodes "${TOTAL_NODES}" \
    --node-rank "${RANK}" \
    --host 0.0.0.0 "${extra_args[@]}"
fi
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment