Unverified Commit 9defc01b authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

feat(sglang): experimental gb200 fp4 and updated gb200 fp8 commands (#3745)

parent 7c208309
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Derived from https://github.com/sgl-project/sglang/issues/10903; uses the low-precision decode setup described there.
# Print the usage banner to stdout and terminate with exit status 1.
# Takes no arguments; $0 is interpolated as the script name in the banner.
print_usage() {
  printf '%s\n' \
    "Usage: $0 <mode>" \
    " mode: prefill or decode" \
    "" \
    "Examples:" \
    " $0 prefill" \
    " $0 decode"
  exit 1
}
# Exactly one positional argument (the mode) is accepted; anything else
# reports the count and falls through to print_usage, which exits 1.
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#"
  print_usage
fi

# The single positional argument selects which worker command is launched.
mode=$1

# Only "prefill" and "decode" are valid; every other value is rejected
# with an error followed by the usage text.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"
# Every variable below must be set to a non-empty value before launching.
# They are checked in this order and the script exits 1 at the first one
# that is missing or empty.
for required_env in \
  HOST_IP_MACHINE \
  PORT \
  TOTAL_GPUS \
  RANK \
  TOTAL_NODES \
  USE_INIT_LOCATIONS; do
  # ${!required_env} is indirect expansion: the value of the variable whose
  # name is currently held in required_env.
  if [[ -z "${!required_env}" ]]; then
    echo "Error: $required_env environment variable is not set"
    exit 1
  fi
done
# Construct command based on mode.
#
# Both branches launch `python3 -m dynamo.sglang` serving
# deepseek-ai/DeepSeek-R1 from /model/ with `--quantization modelopt_fp4`,
# differing in disaggregation mode, environment overrides and tuning flags.
#
# Environment read here:
#   HOST_IP_MACHINE, PORT  - joined as "<host>:<port>" for --dist-init-addr
#   TOTAL_NODES            - passed to --nnodes
#   RANK                   - passed to --node-rank
#   TOTAL_GPUS             - passed to both --tp-size and --dp-size
#   USE_INIT_LOCATIONS     - compared case-insensitively against "true"
#
# `set -x` is enabled in each branch so the fully expanded command is traced.
# ${command_suffix} is expanded unquoted on purpose so that it can contribute
# zero or more extra arguments through word splitting.
if [ "$mode" = "prefill" ]; then
  set -x
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  # Start from an empty suffix so that a command_suffix value inherited from
  # the caller's environment can never leak onto the command line (the decode
  # branch below already resets it the same way).
  command_suffix=""
  # no expert locations collected for fp4 yet, so enabling USE_INIT_LOCATIONS
  # only sets a blank placeholder that word-splits to no extra arguments.
  if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix=" "; fi
  # Intentional word splitting of command_suffix at the end of the command.
  # shellcheck disable=SC2086
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  MC_TE_METRIC=true \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  PYTHONUNBUFFERED=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --skip-tokenizer-init \
    --disaggregation-mode prefill \
    --decode-log-interval 1000 \
    --max-running-requests 5632 \
    --context-length 2176 \
    --disable-radix-cache \
    --disable-shared-experts-fusion \
    --watchdog-timeout 1000000 \
    --disable-chunked-prefix-cache \
    --attention-backend trtllm_mla \
    --kv-cache-dtype fp8_e4m3 \
    --enable-single-batch-overlap \
    --chunked-prefill-size 65536 \
    --eplb-algorithm deepseek \
    --trust-remote-code \
    --disable-cuda-graph \
    --mem-fraction-static 0.84 \
    --max-total-tokens 131072 \
    --max-prefill-tokens 16384 \
    --load-balance-method round_robin \
    --quantization modelopt_fp4 \
    --enable-ep-moe \
    --moe-runner-backend flashinfer_cutlass \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --disaggregation-bootstrap-port 30001 \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --tp-size "$TOTAL_GPUS" \
    --dp-size "$TOTAL_GPUS" \
    --enable-dp-attention \
    --host 0.0.0.0 \
    --stream-interval 50 \
    --log-level debug ${command_suffix}
elif [ "$mode" = "decode" ]; then
  set -x
  command_suffix=""
  # Same blank placeholder as in the prefill branch: it expands to no extra
  # arguments.
  if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix=" "; fi
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  # NOTE(review): this command used to pass --mem-fraction-static twice
  # (0.83, then 0.82) and --enable-dp-attention twice. Assuming the launcher
  # parses flags with argparse, the last occurrence wins, so 0.82 was the
  # value in effect; only that one is kept. TODO confirm 0.82 is intended.
  # Intentional word splitting of command_suffix at the end of the command.
  # shellcheck disable=SC2086
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
  MC_TE_METRIC=true \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1408 \
  SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
  SGLANG_FP4_GEMM_BACKEND=cutlass \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  PYTHONUNBUFFERED=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --skip-tokenizer-init \
    --trust-remote-code \
    --disaggregation-mode decode \
    --host 0.0.0.0 \
    --decode-log-interval 1 \
    --max-running-requests 67584 \
    --context-length 2176 \
    --disable-radix-cache \
    --disable-shared-experts-fusion \
    --watchdog-timeout 1000000 \
    --disable-chunked-prefix-cache \
    --attention-backend trtllm_mla \
    --kv-cache-dtype fp8_e4m3 \
    --enable-dp-attention \
    --chunked-prefill-size 786432 \
    --enable-ep-moe \
    --moe-a2a-backend deepep \
    --deepep-mode low_latency \
    --ep-dispatch-algorithm static \
    --cuda-graph-bs 1408 \
    --num-reserved-decode-tokens 112 \
    --ep-num-redundant-experts 32 \
    --eplb-algorithm deepseek \
    --moe-dense-tp-size 1 \
    --enable-dp-lm-head \
    --prefill-round-robin-balance \
    --max-total-tokens 3122380 \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_cutedsl \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --disaggregation-bootstrap-port 30001 \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --tp-size "$TOTAL_GPUS" \
    --dp-size "$TOTAL_GPUS" \
    --enable-single-batch-overlap \
    --stream-interval 50 \
    --mem-fraction-static 0.82 ${command_suffix}
fi
...@@ -64,13 +64,11 @@ fi ...@@ -64,13 +64,11 @@ fi
# Construct command based on mode # Construct command based on mode
if [ "$mode" = "prefill" ]; then if [ "$mode" = "prefill" ]; then
# GB200 dynamo prefill command
set -x set -x
# SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
# timeouts and kernel cache
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800 export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGL_DG_CACHE_DIR="/configs/dgcache/3p1dcache" export SGL_DG_CACHE_DIR="/configs/dgcache/3p1dcache"
command_suffix=""
if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix="--init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json"; fi if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix="--init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json"; fi
DYN_SKIP_SGLANG_LOG_FORMATTING=1 \ DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
...@@ -99,9 +97,8 @@ if [ "$mode" = "prefill" ]; then ...@@ -99,9 +97,8 @@ if [ "$mode" = "prefill" ]; then
--dp-size "$TOTAL_GPUS" \ --dp-size "$TOTAL_GPUS" \
--enable-dp-attention \ --enable-dp-attention \
--host 0.0.0.0 \ --host 0.0.0.0 \
--decode-log-interval 1000 \ --max-running-requests 30000 \
--max-running-requests 12288 \ --context-length 2200 \
--context-length 9600 \
--disable-radix-cache \ --disable-radix-cache \
--moe-a2a-backend deepep \ --moe-a2a-backend deepep \
--load-balance-method round_robin \ --load-balance-method round_robin \
...@@ -119,28 +116,28 @@ if [ "$mode" = "prefill" ]; then ...@@ -119,28 +116,28 @@ if [ "$mode" = "prefill" ]; then
--max-total-tokens 524288 \ --max-total-tokens 524288 \
--deepep-config /configs/deepep_config.json \ --deepep-config /configs/deepep_config.json \
--stream-interval 50 \ --stream-interval 50 \
--log-level debug ${command_suffix} --mem-fraction-static 0.75 ${command_suffix}
elif [ "$mode" = "decode" ]; then elif [ "$mode" = "decode" ]; then
set -x set -x
command_suffix="" set -x
if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix="--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"; fi
# timeouts and kernel cache
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800 export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGL_DG_CACHE_DIR="/configs/dgcache/3p1dcache" export SGL_DG_CACHE_DIR="/configs/dgcache/3p1dcache"
# GB200 dynamo decode command command_suffix=""
if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix="--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"; fi
DYN_SKIP_SGLANG_LOG_FORMATTING=1 \ DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=512 \ SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \
MC_TE_METRIC=true \ MC_TE_METRIC=true \
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
NCCL_MNNVL_ENABLE=1 \
MC_FORCE_MNNVL=1 \ MC_FORCE_MNNVL=1 \
NCCL_MNNVL_ENABLE=1 \
NCCL_CUMEM_ENABLE=1 \ NCCL_CUMEM_ENABLE=1 \
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
...@@ -160,16 +157,16 @@ elif [ "$mode" = "decode" ]; then ...@@ -160,16 +157,16 @@ elif [ "$mode" = "decode" ]; then
--enable-dp-attention \ --enable-dp-attention \
--host 0.0.0.0 \ --host 0.0.0.0 \
--decode-log-interval 1000 \ --decode-log-interval 1000 \
--max-running-requests 36864 \ --max-running-requests 45000 \
--context-length 9600 \ --context-length 2200 \
--disable-radix-cache \ --disable-radix-cache \
--moe-a2a-backend deepep \ --moe-a2a-backend deepep \
--prefill-round-robin-balance \ --prefill-round-robin-balance \
--deepep-mode low_latency \ --deepep-mode low_latency \
--moe-dense-tp-size 1 \ --moe-dense-tp-size 1 \
--enable-dp-lm-head \ --enable-dp-lm-head \
--cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 80 96 112 128 160 192 224 256 320 384 448 512 \ --cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768 \
--cuda-graph-max-bs 512 \ --cuda-graph-max-bs 768 \
--disable-shared-experts-fusion \ --disable-shared-experts-fusion \
--ep-num-redundant-experts 32 \ --ep-num-redundant-experts 32 \
--ep-dispatch-algorithm static \ --ep-dispatch-algorithm static \
...@@ -178,5 +175,6 @@ elif [ "$mode" = "decode" ]; then ...@@ -178,5 +175,6 @@ elif [ "$mode" = "decode" ]; then
--watchdog-timeout 1000000 \ --watchdog-timeout 1000000 \
--chunked-prefill-size 36864 \ --chunked-prefill-size 36864 \
--stream-interval 50 \ --stream-interval 50 \
--deepep-config /configs/deepep_config.json \
--mem-fraction-static 0.82 ${command_suffix} --mem-fraction-static 0.82 ${command_suffix}
fi fi
...@@ -175,9 +175,9 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac ...@@ -175,9 +175,9 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
parser.add_argument( parser.add_argument(
"--gpu_type", "--gpu_type",
type=str, type=str,
choices=["gb200-fp8"], choices=["gb200-fp8", "gb200-fp4"],
default="gb200-fp8", default="gb200-fp8",
help="Type of GPU to use", help="Type of GPU to use. You can choose between gb200-fp8 and gb200-fp4.",
) )
parser.add_argument( parser.add_argument(
......
...@@ -142,9 +142,9 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac ...@@ -142,9 +142,9 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
) )
parser.add_argument( parser.add_argument(
"--gpu-type", "--gpu-type",
choices=["gb200-fp8"], choices=["gb200-fp8", "gb200-fp4"],
default="gb200-fp8", default="gb200-fp8",
help="GPU type to use", help="GPU type to use. You can choose between gb200-fp8 and gb200-fp4.",
) )
parser.add_argument( parser.add_argument(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment