Unverified Commit 80dfb82c authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

feat: slurm jobs added fp4 and 8k1k (#4747)

parent 3fea2e10
......@@ -49,31 +49,36 @@ OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
#AssertionError: Prefill round robin balance is required when dp size > 1. Please make sure that the prefill instance is launched with `--load-balance-method round_robin` and `--prefill-round-robin-balance` is set for decode server.
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--model-path silence09/DeepSeek-R1-Small-2layers \
--served-model-name silence09/DeepSeek-R1-Small-2layers \
--page-size 16 \
--tp 1 \
--tp 2 --dp-size 2 --enable-dp-attention \
--load-balance-method round_robin \
--trust-remote-code \
--disaggregation-mode prefill \
--disaggregation-bootstrap-port 12345 \
--host 0.0.0.0 \
--port 40000 \
--disaggregation-transfer-backend nixl \
--enable-metrics &
--enable-metrics --log-level debug &
PREFILL_PID=$!
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang \
--model-path silence09/DeepSeek-R1-Small-2layers \
--served-model-name silence09/DeepSeek-R1-Small-2layers \
--page-size 16 \
--tp 1 \
--prefill-round-robin-balance \
--tp 2 --dp-size 2 --enable-dp-attention \
--trust-remote-code \
--disaggregation-mode decode \
--disaggregation-bootstrap-port 12345 \
--host 0.0.0.0 \
--disaggregation-transfer-backend nixl \
--enable-metrics
--enable-metrics --log-level debug
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#######################################
# Print the command-line usage text and terminate the script.
# Arguments: none
# Outputs:   usage text on stdout (uses $0 as the script name)
# Returns:   never returns; exits with status 1
#######################################
print_usage() {
  printf '%s\n' \
    "Usage: $0 <mode>" \
    " mode: prefill or decode" \
    "" \
    "Examples:" \
    " $0 prefill" \
    " $0 decode"
  exit 1
}
# Exactly one positional argument (the mode) is accepted.
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#"
  print_usage
fi

# The single argument selects which worker role to launch.
mode=$1

# Only the two disaggregated roles are valid modes.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"

# Every variable below must be set and non-empty; the first missing one
# (checked in this order) aborts the script with status 1.
for required_var in HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS RUN_IN_CI; do
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set"
    exit 1
  fi
done
# Construct command based on mode

#######################################
# Install the pinned Dynamo wheels from /configs when running in CI.
# Globals:   RUN_IN_CI (read; compared case-insensitively with "true")
# Arguments: none
#######################################
install_ci_wheels() {
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi
}

#######################################
# Build the optional "--dump-config-to <path>" arguments.
# An array is used (instead of a string expanded unquoted) so that a
# DUMP_CONFIG_PATH containing whitespace or glob characters is passed to
# the worker as exactly one argument.
# Globals:   DUMP_CONFIG_PATH (read), dump_config_args (written, array)
# Arguments: none
#######################################
build_dump_config_args() {
  dump_config_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    dump_config_args=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi
}

if [ "$mode" = "prefill" ]; then
  # Trace every command from here on so the full launch line is logged.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  build_dump_config_args
  # Launch the prefill worker; the leading assignments are passed as
  # environment variables to this one python3 process only.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_ENABLE_JIT_DEEPGEMM=false \
  SGLANG_ENABLE_FLASHINFER_GEMM=true \
  python3 -m dynamo.sglang \
    --disaggregation-mode prefill \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --disable-radix-cache \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --stream-interval 10 \
    --watchdog-timeout 1000000 \
    --context-length 2200 \
    --mem-fraction-static 0.95 \
    --max-total-tokens 8192 \
    --chunked-prefill-size 8192 \
    --cuda-graph-max-bs 256 \
    --max-running-requests 512 \
    --scheduler-recv-interval 10 \
    --enable-symm-mem \
    --moe-dense-tp-size 1 \
    --load-balance-method round_robin \
    --disaggregation-bootstrap-port 30001 \
    --data-parallel-size 1 \
    --tensor-parallel-size "$TOTAL_GPUS" \
    --expert-parallel-size 1 \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${dump_config_args[@]}"
elif [ "$mode" = "decode" ]; then
  # Trace every command from here on so the full launch line is logged.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  build_dump_config_args
  # Launch the decode worker; it uses the same bootstrap port (30001) as
  # the prefill launch above.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_ENABLE_JIT_DEEPGEMM=false \
  SGLANG_ENABLE_FLASHINFER_GEMM=true \
  python3 -m dynamo.sglang \
    --disaggregation-mode decode \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --prefill-round-robin-balance \
    --trust-remote-code \
    --disable-radix-cache \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_trtllm \
    --disaggregation-bootstrap-port 30001 \
    --stream-interval 10 \
    --watchdog-timeout 1000000 \
    --context-length 2200 \
    --mem-fraction-static 0.95 \
    --chunked-prefill-size 8192 \
    --cuda-graph-max-bs 256 \
    --scheduler-recv-interval 10 \
    --enable-symm-mem \
    --moe-dense-tp-size 1 \
    --tensor-parallel-size "$TOTAL_GPUS" \
    --expert-parallel-size 1 \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${dump_config_args[@]}"
fi
\ No newline at end of file
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#######################################
# Show how to invoke this script, then stop.
# Arguments: none
# Outputs:   usage text on stdout (uses $0 as the script name)
# Returns:   never returns; exits with status 1
#######################################
print_usage() {
  printf '%s\n' \
    "Usage: $0 <mode>" \
    " mode: prefill or decode" \
    "" \
    "Examples:" \
    " $0 prefill" \
    " $0 decode"
  exit 1
}
# Exactly one positional argument (the mode) is accepted.
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#"
  print_usage
fi

# The single argument selects which worker role to launch.
mode=$1

# Only the two disaggregated roles are valid modes.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"

# Every variable below must be set and non-empty; the first missing one
# (checked in this order) aborts the script with status 1.
for required_var in HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS RUN_IN_CI; do
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set"
    exit 1
  fi
done
# Construct command based on mode

#######################################
# Install the pinned Dynamo wheels from /configs when running in CI.
# Globals:   RUN_IN_CI (read; compared case-insensitively with "true")
# Arguments: none
#######################################
install_ci_wheels() {
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi
}

#######################################
# Build the optional "--dump-config-to <path>" arguments.
# An array is used (instead of a string expanded unquoted) so that a
# DUMP_CONFIG_PATH containing whitespace or glob characters is passed to
# the worker as exactly one argument.
# Globals:   DUMP_CONFIG_PATH (read), dump_config_args (written, array)
# Arguments: none
#######################################
build_dump_config_args() {
  dump_config_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    dump_config_args=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi
}

if [ "$mode" = "prefill" ]; then
  # Trace every command from here on so the full launch line is logged.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  build_dump_config_args
  # Launch the prefill worker; the leading assignments are passed as
  # environment variables to this one python3 process only.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_cutlass \
    --disable-radix-cache \
    --disable-chunked-prefix-cache \
    --stream-interval 50 \
    --decode-log-interval 1000 \
    --watchdog-timeout 1000000 \
    --context-length 2176 \
    --disable-shared-experts-fusion \
    --eplb-algorithm deepseek \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode prefill \
    --mem-fraction-static 0.84 \
    --max-total-tokens 131072 \
    --max-prefill-tokens 32768 \
    --chunked-prefill-size 65536 \
    --enable-single-batch-overlap \
    --max-running-requests 30000 \
    --load-balance-method round_robin \
    --disable-cuda-graph \
    --enable-dp-attention \
    --tp-size "$TOTAL_GPUS" \
    --dp-size "$TOTAL_GPUS" \
    --ep-size "$TOTAL_GPUS" \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${dump_config_args[@]}"
elif [ "$mode" = "decode" ]; then
  # Trace every command from here on so the full launch line is logged.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  build_dump_config_args
  # Launch the decode worker; it uses the same bootstrap port (30001) as
  # the prefill launch above.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
  SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 \
  SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
  SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization modelopt_fp4 \
    --moe-runner-backend flashinfer_cutedsl \
    --disable-radix-cache \
    --disable-chunked-prefix-cache \
    --stream-interval 50 \
    --decode-log-interval 1000 \
    --watchdog-timeout 1000000 \
    --context-length 2176 \
    --disable-shared-experts-fusion \
    --eplb-algorithm deepseek \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode decode \
    --mem-fraction-static 0.83 \
    --max-total-tokens 3122380 \
    --chunked-prefill-size 786432 \
    --max-running-requests 67584 \
    --moe-a2a-backend deepep \
    --deepep-mode low_latency \
    --ep-dispatch-algorithm static \
    --ep-num-redundant-experts 32 \
    --cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768 1024 \
    --num-reserved-decode-tokens 112 \
    --moe-dense-tp-size 1 \
    --enable-dp-lm-head \
    --prefill-round-robin-balance \
    --enable-dp-attention \
    --tp-size "$TOTAL_GPUS" \
    --dp-size "$TOTAL_GPUS" \
    --ep-size "$TOTAL_GPUS" \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${dump_config_args[@]}"
fi
\ No newline at end of file
......@@ -2,8 +2,6 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This comes from https://github.com/sgl-project/sglang/issues/10903 and uses the low-prec decode setup
# Function to print usage
print_usage() {
echo "Usage: $0 <mode>"
......@@ -64,152 +62,140 @@ if [ -z "$USE_INIT_LOCATIONS" ]; then
exit 1
fi
if [ -z "$RUN_IN_CI" ]; then
echo "Error: RUN_IN_CI environment variable is not set"
exit 1
fi
# Construct command based on mode
if [ "$mode" = "prefill" ]; then
set -x
# no expert locations collected for fp4 yet
if [[ "${RUN_IN_CI,,}" == "true" ]]; then
python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
command_suffix=""
if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix=" "; fi
if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
# we have to install pre-release cutedsl for an integer overflow fix
python3 -m pip install --no-cache-dir --upgrade --pre nvidia-cutlass-dsl
# set your own cache variables here
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
export FLASHINFER_WORKSPACE_BASE="/configs/flashinfer-cache"
PYTHONUNBUFFERED=1 \
DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
MC_TE_METRIC=true \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
MC_FORCE_MNNVL=1 \
NCCL_MNNVL_ENABLE=1 \
NCCL_CUMEM_ENABLE=1 \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
PYTHONUNBUFFERED=1 \
python3 -m dynamo.sglang \
--served-model-name deepseek-ai/DeepSeek-R1 \
--model-path /model/ \
--skip-tokenizer-init \
--disaggregation-mode prefill \
--trust-remote-code \
--kv-cache-dtype fp8_e4m3 \
--attention-backend trtllm_mla \
--quantization modelopt_fp4 \
--moe-runner-backend flashinfer_cutlass \
--disable-radix-cache \
--disable-chunked-prefix-cache \
--stream-interval 50 \
--decode-log-interval 1000 \
--max-running-requests 5632 \
--watchdog-timeout 1000000 \
--context-length 2176 \
--disable-radix-cache \
--disable-shared-experts-fusion \
--watchdog-timeout 1000000 \
--disable-chunked-prefix-cache \
--attention-backend trtllm_mla \
--kv-cache-dtype fp8_e4m3 \
--enable-single-batch-overlap \
--chunked-prefill-size 65536 \
--eplb-algorithm deepseek \
--trust-remote-code \
--disable-cuda-graph \
--disaggregation-bootstrap-port 30001 \
--disaggregation-mode prefill \
--mem-fraction-static 0.84 \
--max-total-tokens 131072 \
--max-prefill-tokens 16384 \
--max-prefill-tokens 32768 \
--chunked-prefill-size 65536 \
--enable-single-batch-overlap \
--max-running-requests 30000 \
--load-balance-method round_robin \
--quantization modelopt_fp4 \
--moe-runner-backend flashinfer_cutlass \
--disable-cuda-graph \
--enable-dp-attention \
--tp-size "$TOTAL_GPUS" \
--dp-size "$TOTAL_GPUS" \
--ep-size "$TOTAL_GPUS" \
--dist-init-addr "$HOST_IP_MACHINE:$PORT" \
--disaggregation-bootstrap-port 30001 \
--nnodes "$TOTAL_NODES" \
--node-rank "$RANK" \
--ep-size "$TOTAL_GPUS" \
--tp-size "$TOTAL_GPUS" \
--dp-size "$TOTAL_GPUS" \
--enable-dp-attention \
--host 0.0.0.0 \
--stream-interval 50 \
--log-level debug ${command_suffix}
# For now we must keep SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK and cuda-graph-bs at 1024 until
# DeepEP merges in https://github.com/deepseek-ai/DeepEP/pull/440
# the nvidia-cutlass-dsl install fixes https://github.com/flashinfer-ai/flashinfer/issues/1830#issuecomment-3380074018
# which was previously limiting us to DISPATCH_TOKENS and cuda-graph-bs == 384
# For now use 12 nodes for fp4 since flashinfer_cutedsl requires experts per gpu < 8
# We have 288 (256 + 32 redundant) => 288/48 = 6
--host 0.0.0.0 ${command_suffix}
elif [ "$mode" = "decode" ]; then
set -x
# no expert locations collected for fp4 yet
command_suffix=""
if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix=" "; fi
if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
# set your own cache variables here
if [[ "${RUN_IN_CI,,}" == "true" ]]; then
python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
export FLASHINFER_WORKSPACE_BASE="/configs/flashinfer-cache"
# we have to install pre-release cutedsl for an integer overflow fix
python3 -m pip install --no-cache-dir --upgrade --pre nvidia-cutlass-dsl
command_suffix=""
if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
PYTHONUNBUFFERED=1 \
DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
MC_TE_METRIC=true \
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
MC_TE_METRIC=true \
MC_FORCE_MNNVL=1 \
NCCL_MNNVL_ENABLE=1 \
NCCL_CUMEM_ENABLE=1 \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=384 \
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 \
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
SGLANG_FP4_GEMM_BACKEND=cutlass \
DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
PYTHONUNBUFFERED=1 \
SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
python3 -m dynamo.sglang \
--served-model-name deepseek-ai/DeepSeek-R1 \
--model-path /model/ \
--skip-tokenizer-init \
--trust-remote-code \
--disaggregation-mode decode \
--host 0.0.0.0 \
--decode-log-interval 1 \
--max-running-requests 67584 \
--context-length 2176 \
--kv-cache-dtype fp8_e4m3 \
--attention-backend trtllm_mla \
--quantization modelopt_fp4 \
--moe-runner-backend flashinfer_cutedsl \
--disable-radix-cache \
--disable-shared-experts-fusion \
--watchdog-timeout 1000000 \
--disable-chunked-prefix-cache \
--attention-backend trtllm_mla \
--kv-cache-dtype fp8_e4m3 \
--enable-dp-attention \
--chunked-prefill-size 786432 \
--stream-interval 50 \
--decode-log-interval 1000 \
--watchdog-timeout 1000000 \
--context-length 2176 \
--disable-shared-experts-fusion \
--eplb-algorithm deepseek \
--disaggregation-bootstrap-port 30001 \
--disaggregation-mode decode \
--mem-fraction-static 0.83 \
--max-total-tokens 3122380 \
--chunked-prefill-size 786432 \
--max-running-requests 67584 \
--enable-single-batch-overlap \
--moe-a2a-backend deepep \
--deepep-mode low_latency \
--ep-dispatch-algorithm static \
--cuda-graph-bs 384 \
--num-reserved-decode-tokens 112 \
--ep-num-redundant-experts 32 \
--eplb-algorithm deepseek \
--cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768 1024 \
--num-reserved-decode-tokens 112 \
--moe-dense-tp-size 1 \
--enable-dp-lm-head \
--prefill-round-robin-balance \
--max-total-tokens 3122380 \
--quantization modelopt_fp4 \
--moe-runner-backend flashinfer_cutedsl \
--enable-dp-attention \
--tp-size "$TOTAL_GPUS" \
--dp-size "$TOTAL_GPUS" \
--ep-size "$TOTAL_GPUS" \
--dist-init-addr "$HOST_IP_MACHINE:$PORT" \
--disaggregation-bootstrap-port 30001 \
--nnodes "$TOTAL_NODES" \
--node-rank "$RANK" \
--tp-size "$TOTAL_GPUS" \
--ep-size "$TOTAL_GPUS" \
--dp-size "$TOTAL_GPUS" \
--enable-single-batch-overlap \
--enable-dp-attention \
--stream-interval 50 \
--mem-fraction-static 0.82 ${command_suffix}
--host 0.0.0.0 ${command_suffix}
fi
\ No newline at end of file
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Simple agg script (not an optimized config)
#######################################
# Print the usage text for the aggregated-mode script and terminate.
# Arguments: none
# Outputs:   usage text on stdout (uses $0 as the script name)
# Returns:   never returns; exits with status 1
#######################################
print_usage() {
  printf '%s\n' \
    "Usage: $0" \
    "" \
    "This script runs aggregated mode (single dynamo.sglang instance)"
  exit 1
}
echo "Mode: aggregated"
echo "Command: dynamo"

# Every variable below must be set and non-empty; the first missing one
# (checked in this order) aborts the script with status 1.
for required_var in HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES; do
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set"
    exit 1
  fi
done
#######################################
# Build the optional "--dump-config-to <path>" arguments.
# An array is used (instead of a string expanded unquoted) so that a
# DUMP_CONFIG_PATH containing whitespace or glob characters is passed to
# the worker as exactly one argument.
# Globals:   DUMP_CONFIG_PATH (read), dump_config_args (written, array)
# Arguments: none
#######################################
build_dump_config_args() {
  dump_config_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    dump_config_args=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi
}

# Construct command suffix for config dump
build_dump_config_args

# Trace every command from here on so the full launch line is logged.
set -x
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
export FLASHINFER_WORKSPACE_BASE="/configs/flashinfer-cache"

# Launch the single aggregated worker; the leading assignments are passed as
# environment variables to this one python3 process only.
DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
MC_TE_METRIC=true \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
MC_FORCE_MNNVL=1 \
NCCL_MNNVL_ENABLE=1 \
NCCL_CUMEM_ENABLE=1 \
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
PYTHONUNBUFFERED=1 \
python3 -m dynamo.sglang \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --model-path /model/ \
  --skip-tokenizer-init \
  --trust-remote-code \
  --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
  --nnodes "$TOTAL_NODES" \
  --node-rank "$RANK" \
  --tp-size "$TOTAL_GPUS" \
  --dp-size "$TOTAL_GPUS" \
  --enable-dp-attention \
  --host 0.0.0.0 \
  --max-running-requests 30000 \
  --context-length 2200 \
  --disable-radix-cache \
  --moe-a2a-backend deepep \
  --load-balance-method round_robin \
  --deepep-mode normal \
  --ep-dispatch-algorithm dynamic \
  --moe-dense-tp-size 1 \
  --enable-dp-lm-head \
  --disable-shared-experts-fusion \
  --ep-num-redundant-experts 32 \
  --eplb-algorithm deepseek \
  --attention-backend trtllm_mla \
  --kv-cache-dtype fp8_e4m3 \
  --watchdog-timeout 1000000 \
  --disable-cuda-graph \
  --chunked-prefill-size 131072 \
  --max-total-tokens 524288 \
  --deepep-config /configs/deepep_config.json \
  --stream-interval 50 \
  --mem-fraction-static 0.75 "${dump_config_args[@]}"
......@@ -73,8 +73,8 @@ fi
if [ "$mode" = "prefill" ]; then
set -x
if [[ "${RUN_IN_CI,,}" == "true" ]]; then
python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
......@@ -131,8 +131,8 @@ if [ "$mode" = "prefill" ]; then
elif [ "$mode" = "decode" ]; then
set -x
if [[ "${RUN_IN_CI,,}" == "true" ]]; then
python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
......
......@@ -71,8 +71,8 @@ fi
if [ "$mode" = "prefill" ]; then
set -x
if [[ "${RUN_IN_CI,,}" == "true" ]]; then
python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
......@@ -132,8 +132,8 @@ if [ "$mode" = "prefill" ]; then
elif [ "$mode" = "decode" ]; then
set -x
if [[ "${RUN_IN_CI,,}" == "true" ]]; then
python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
fi
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#######################################
# Emit the invocation help for this script and exit.
# Arguments: none
# Outputs:   usage text on stdout (uses $0 as the script name)
# Returns:   never returns; exits with status 1
#######################################
print_usage() {
  printf '%s\n' \
    "Usage: $0 <mode>" \
    " mode: prefill or decode" \
    "" \
    "Examples:" \
    " $0 prefill" \
    " $0 decode"
  exit 1
}
# Exactly one positional argument (the mode) is accepted.
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#"
  print_usage
fi

# The single argument selects which worker role to launch.
mode=$1

# Only the two disaggregated roles are valid modes.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"

# Every variable below must be set and non-empty; the first missing one
# (checked in this order) aborts the script with status 1.
for required_var in HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS RUN_IN_CI; do
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set"
    exit 1
  fi
done
# Construct command based on mode

#######################################
# Install the pinned Dynamo wheels from /configs when running in CI.
# Globals:   RUN_IN_CI (read; compared case-insensitively with "true")
# Arguments: none
#######################################
install_ci_wheels() {
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi
}

#######################################
# Build the optional "--dump-config-to <path>" arguments.
# An array is used (instead of a string expanded unquoted) so that a
# DUMP_CONFIG_PATH containing whitespace or glob characters is passed to
# the worker as exactly one argument.
# Globals:   DUMP_CONFIG_PATH (read), dump_config_args (written, array)
# Arguments: none
#######################################
build_dump_config_args() {
  dump_config_args=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    dump_config_args=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi
}

if [ "$mode" = "prefill" ]; then
  # Trace every command from here on so the full launch line is logged.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
  build_dump_config_args
  # Launch the prefill worker; the leading assignments are passed as
  # environment variables to this one python3 process only.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_ENABLE_JIT_DEEPGEMM=false \
  SGLANG_ENABLE_FLASHINFER_GEMM=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization fp8 \
    --moe-runner-backend flashinfer_trtllm \
    --disable-radix-cache \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --disaggregation-mode prefill \
    --mem-fraction-static 0.95 \
    --max-total-tokens 32768 \
    --chunked-prefill-size 24576 \
    --cuda-graph-max-bs 512 \
    --max-running-requests 512 \
    --load-balance-method round_robin \
    --scheduler-recv-interval 10 \
    --enable-flashinfer-allreduce-fusion \
    --moe-dense-tp-size 1 \
    --tensor-parallel-size "$TOTAL_GPUS" \
    --data-parallel-size 1 \
    --expert-parallel-size 1 \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --disaggregation-bootstrap-port 30001 \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${dump_config_args[@]}"
elif [ "$mode" = "decode" ]; then
  # Trace every command from here on so the full launch line is logged.
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
  build_dump_config_args
  # Launch the decode worker; it uses the same bootstrap port (30001) as
  # the prefill launch above.
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_ENABLE_JIT_DEEPGEMM=false \
  SGLANG_ENABLE_FLASHINFER_GEMM=1 \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  MC_TE_METRIC=true \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --kv-cache-dtype fp8_e4m3 \
    --attention-backend trtllm_mla \
    --quantization fp8 \
    --moe-runner-backend flashinfer_trtllm \
    --disable-radix-cache \
    --watchdog-timeout 1000000 \
    --context-length 9600 \
    --disaggregation-mode decode \
    --mem-fraction-static 0.95 \
    --chunked-prefill-size 8192 \
    --cuda-graph-max-bs 512 \
    --max-running-requests 512 \
    --scheduler-recv-interval 10 \
    --enable-flashinfer-allreduce-fusion \
    --enable-symm-mem \
    --moe-dense-tp-size 1 \
    --prefill-round-robin-balance \
    --tensor-parallel-size "$TOTAL_GPUS" \
    --data-parallel-size 1 \
    --expert-parallel-size 1 \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --disaggregation-bootstrap-port 30001 \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${dump_config_args[@]}"
fi
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#######################################
# Write the usage summary for this script and abort.
# Arguments: none
# Outputs:   usage text on stdout (uses $0 as the script name)
# Returns:   never returns; exits with status 1
#######################################
print_usage() {
  printf '%s\n' \
    "Usage: $0 <mode>" \
    " mode: prefill or decode" \
    "" \
    "Examples:" \
    " $0 prefill" \
    " $0 decode"
  exit 1
}
# Exactly one positional argument (the mode) is accepted.
if (( $# != 1 )); then
  echo "Error: Expected 1 argument, got $#"
  print_usage
fi

# The single argument selects which worker role to launch.
mode=$1

# Only the two disaggregated roles are valid modes.
case "$mode" in
  prefill | decode) ;;
  *)
    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
    print_usage
    ;;
esac

echo "Mode: $mode"
echo "Command: dynamo"

# Every variable below must be set and non-empty; the first missing one
# (checked in this order) aborts the script with status 1.
for required_var in HOST_IP_MACHINE PORT TOTAL_GPUS RANK TOTAL_NODES USE_INIT_LOCATIONS RUN_IN_CI; do
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} environment variable is not set"
    exit 1
  fi
done
#######################################
# Install the pinned Dynamo wheels from /configs when running in CI.
# Globals:   RUN_IN_CI (read; compared case-insensitively with "true")
# Arguments: none
# Returns:   0 when RUN_IN_CI is not "true", otherwise the status of the
#            last pip invocation. Callers ignore it, so an install failure
#            does not stop the launch (same as the previous inline code).
#######################################
install_ci_wheels() {
  if [[ "${RUN_IN_CI,,}" == "true" ]]; then
    python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
    python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
  fi
}

#######################################
# Build the optional trailing arguments for the worker command.
# The result is stored in an array so that a DUMP_CONFIG_PATH containing
# whitespace or glob characters is passed through as exactly one argument
# instead of being word-split by an unquoted string expansion.
# Globals:   DUMP_CONFIG_PATH (read, optional)
#            command_suffix   (written; array, empty when the path is unset)
# Arguments: none
# Returns:   0
#######################################
build_command_suffix() {
  command_suffix=()
  if [[ -n "${DUMP_CONFIG_PATH}" ]]; then
    command_suffix+=(--dump-config-to "${DUMP_CONFIG_PATH}")
  fi
}

# Launch the worker for the requested mode. Both branches trace their
# commands (set -x), optionally install the CI wheels, export the same two
# settings, and then run dynamo.sglang in the foreground with --tp-size,
# --dp-size and --ep-size all set to TOTAL_GPUS. The per-command variable
# assignments in front of python3 apply to that process only.
if [[ "$mode" == "prefill" ]]; then
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
  build_command_suffix
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  MC_TE_METRIC=true \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  MC_FORCE_MNNVL=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --trust-remote-code \
    --tp-size "$TOTAL_GPUS" \
    --dp-size "$TOTAL_GPUS" \
    --ep-size "$TOTAL_GPUS" \
    --enable-dp-attention \
    --attention-backend trtllm_mla \
    --kv-cache-dtype fp8_e4m3 \
    --disable-radix-cache \
    --stream-interval 50 \
    --max-running-requests 30000 \
    --context-length 9300 \
    --watchdog-timeout 1000000 \
    --disable-shared-experts-fusion \
    --eplb-algorithm deepseek \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode prefill \
    --mem-fraction-static 0.80 \
    --max-total-tokens 524288 \
    --chunked-prefill-size 131072 \
    --load-balance-method round_robin \
    --disable-cuda-graph \
    --moe-a2a-backend deepep \
    --deepep-mode normal \
    --ep-dispatch-algorithm dynamic \
    --moe-dense-tp-size 1 \
    --enable-dp-lm-head \
    --ep-num-redundant-experts 32 \
    --deepep-config /configs/deepep_config.json \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${command_suffix[@]}"
elif [[ "$mode" == "decode" ]]; then
  set -x
  install_ci_wheels
  export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
  export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
  build_command_suffix
  PYTHONUNBUFFERED=1 \
  DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \
  MC_TE_METRIC=true \
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
  SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
  MC_FORCE_MNNVL=1 \
  NCCL_MNNVL_ENABLE=1 \
  NCCL_CUMEM_ENABLE=1 \
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
  python3 -m dynamo.sglang \
    --served-model-name deepseek-ai/DeepSeek-R1 \
    --model-path /model/ \
    --skip-tokenizer-init \
    --trust-remote-code \
    --tp-size "$TOTAL_GPUS" \
    --dp-size "$TOTAL_GPUS" \
    --ep-size "$TOTAL_GPUS" \
    --enable-dp-attention \
    --attention-backend trtllm_mla \
    --kv-cache-dtype fp8_e4m3 \
    --disable-radix-cache \
    --stream-interval 50 \
    --decode-log-interval 1000 \
    --max-running-requests 8192 \
    --context-length 9300 \
    --watchdog-timeout 1000000 \
    --disable-shared-experts-fusion \
    --eplb-algorithm deepseek \
    --disaggregation-bootstrap-port 30001 \
    --disaggregation-mode decode \
    --mem-fraction-static 0.82 \
    --chunked-prefill-size 36864 \
    --moe-a2a-backend deepep \
    --deepep-mode low_latency \
    --ep-dispatch-algorithm static \
    --moe-dense-tp-size 1 \
    --enable-dp-lm-head \
    --prefill-round-robin-balance \
    --ep-num-redundant-experts 32 \
    --deepep-config /configs/deepep_config.json \
    --cuda-graph-max-bs 256 \
    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
    --nnodes "$TOTAL_NODES" \
    --node-rank "$RANK" \
    --host 0.0.0.0 "${command_suffix[@]}"
fi
......@@ -373,7 +373,7 @@ def setup_frontend_worker(
# All frontends run the ingress server
frontend_cmd = "python3 -m dynamo.frontend --http-port=8000"
if run_in_ci:
frontend_cmd = "python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
frontend_cmd = "python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
return run_command(frontend_cmd)
......
......@@ -48,7 +48,6 @@ check_env MODEL_PATH
check_env CONFIG_DIR
check_env CONTAINER_IMAGE
GPU_TYPE="gb200-fp8"
GPUS_PER_NODE=4
: "${NETWORK_INTERFACE:=enP6p9s0np0}"
......@@ -62,7 +61,8 @@ ISL=$6
OSL=$7
CONCURRENCIES=$8
REQUEST_RATE=$9
SCRIPT_VARIANT=${10}
GPU_TYPE=${10}
SCRIPT_VARIANT=${11}
RETRIES=1 # defaults to retry the job 1 time to avoid transient errors
......@@ -86,7 +86,7 @@ command=(
--model-dir $MODEL_PATH --config-dir $CONFIG_DIR
--container-image $CONTAINER_IMAGE
--gpu-type $GPU_TYPE --gpus-per-node $GPUS_PER_NODE --network-interface $NETWORK_INTERFACE
--gpus-per-node $GPUS_PER_NODE --network-interface $NETWORK_INTERFACE
--prefill-nodes $PREFILL_NODES --prefill-workers $PREFILL_WORKERS
--decode-nodes $DECODE_NODES --decode-workers $DECODE_WORKERS
......@@ -96,6 +96,8 @@ command=(
--retries $RETRIES
--gpu-type $GPU_TYPE
--run-in-ci
${SCRIPT_VARIANT_ARGS[@]}
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment