feat: slurm jobs added fp4 and 8k1k (#4747)

80dfb82c · ishandhanani · GitHub · 3fea2e10 · 80dfb82c · 80dfb82c
Unverified Commit 80dfb82c authored Dec 08, 2025 by ishandhanani Committed by GitHub Dec 08, 2025
11 changed files
--- a/examples/backends/sglang/launch/disagg.sh
+++ b/examples/backends/sglang/launch/disagg.sh
@@ -49,31 +49,36 @@ OTEL_SERVICE_NAME=dynamo-frontend \
 python3 -m dynamo.frontend &
 DYNAMO_PID=$!

+# NOTE: with dp size > 1 the launch fails with "AssertionError: Prefill round robin balance is required when dp size > 1" unless the prefill instance is launched with `--load-balance-method round_robin` and the decode server is launched with `--prefill-round-robin-balance`.
+
 # run prefill worker
 OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8081} \
 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --model-path silence09/DeepSeek-R1-Small-2layers \
+  --served-model-name silence09/DeepSeek-R1-Small-2layers \
  --page-size 16 \
-  --tp 1 \
+  --tp 2 --dp-size 2 --enable-dp-attention \
+  --load-balance-method round_robin \
  --trust-remote-code \
  --disaggregation-mode prefill \
  --disaggregation-bootstrap-port 12345 \
  --host 0.0.0.0 \
+  --port 40000 \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics &
+  --enable-metrics --log-level debug &
 PREFILL_PID=$!

 # run decode worker
 OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE:-8082} \
-CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
-  --served-model-name Qwen/Qwen3-0.6B \
+CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang \
+  --model-path silence09/DeepSeek-R1-Small-2layers \
+  --served-model-name silence09/DeepSeek-R1-Small-2layers \
  --page-size 16 \
-  --tp 1 \
+  --prefill-round-robin-balance \
+  --tp 2 --dp-size 2 --enable-dp-attention \
  --trust-remote-code \
  --disaggregation-mode decode \
  --disaggregation-bootstrap-port 12345 \
  --host 0.0.0.0 \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics
+  --enable-metrics --log-level debug
--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-low-latency.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-low-latency.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Print the expected invocation (a single <mode> argument) to stdout, then exit with status 1
+print_usage() {
+    echo "Usage: $0 <mode>"  # $0 expands to the path this script was invoked as
+    echo "  mode: prefill or decode"
+    echo ""
+    echo "Examples:"
+    echo "  $0 prefill"
+    echo "  $0 decode"
+    exit 1  # the script calls this directly (no subshell), so this ends the whole script
+}
+
+# Require exactly one positional argument; otherwise report the count and show usage (which exits 1)
+if [ $# -ne 1 ]; then
+    echo "Error: Expected 1 argument, got $#"
+    print_usage
+fi
+
+# The single positional argument selects which worker role this script launches
+mode=$1
+
+# Reject anything other than the two literal values "prefill" and "decode"
+if [ "$mode" != "prefill" ] && [ "$mode" != "decode" ]; then
+    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
+    print_usage
+fi
+
+echo "Mode: $mode"
+echo "Command: dynamo"  # fixed label; not derived from any variable
+
+# Exit 1 on the first required variable that is unset or empty (-z is true for both); messages go to stdout
+if [ -z "$HOST_IP_MACHINE" ]; then  # joined with PORT to form --dist-init-addr
+    echo "Error: HOST_IP_MACHINE environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$PORT" ]; then  # second half of --dist-init-addr
+    echo "Error: PORT environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$TOTAL_GPUS" ]; then  # passed as --tensor-parallel-size
+    echo "Error: TOTAL_GPUS environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$RANK" ]; then  # passed as --node-rank
+    echo "Error: RANK environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$TOTAL_NODES" ]; then  # passed as --nnodes
+    echo "Error: TOTAL_NODES environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$USE_INIT_LOCATIONS" ]; then  # required here, although nothing later in this script reads it
+    echo "Error: USE_INIT_LOCATIONS environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$RUN_IN_CI" ]; then  # "true" (any letter case) enables the wheel installs in the launch section
+    echo "Error: RUN_IN_CI environment variable is not set"
+    exit 1
+fi
+
+# Launch the dynamo.sglang worker for the selected mode; each branch runs python3 in the foreground
+if [ "$mode" = "prefill" ]; then
+    set -x  # from here on, print each command (after expansion) as it runs
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then  # ,, lowercases the value, so TRUE/True also match
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl  # exit status is not checked; the script continues if an install fails
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800  # exported; the VAR=value prefixes on the python3 command below apply to that one process only
+
+    command_suffix=""  # optional extra flags appended to the end of the launch command
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi  # DUMP_CONFIG_PATH is optional; unset/empty adds nothing
+
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    SGLANG_ENABLE_JIT_DEEPGEMM=false \
+    SGLANG_ENABLE_FLASHINFER_GEMM=true \
+    python3 -m dynamo.sglang \
+        --disaggregation-mode prefill \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --trust-remote-code \
+        --disable-radix-cache \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization modelopt_fp4 \
+        --moe-runner-backend flashinfer_trtllm \
+        --stream-interval 10 \
+        --watchdog-timeout 1000000 \
+        --context-length 2200 \
+        --mem-fraction-static 0.95 \
+        --max-total-tokens 8192 \
+        --chunked-prefill-size 8192 \
+        --cuda-graph-max-bs 256 \
+        --max-running-requests 512 \
+        --scheduler-recv-interval 10 \
+        --enable-symm-mem \
+        --moe-dense-tp-size 1 \
+        --load-balance-method round_robin \
+        --disaggregation-bootstrap-port 30001 \
+        --data-parallel-size 1 \
+        --tensor-parallel-size "$TOTAL_GPUS" \
+        --expert-parallel-size 1 \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}  # unquoted, so the suffix word-splits into separate arguments (a path containing whitespace would split too)
+
+elif [ "$mode" = "decode" ]; then  # same setup and environment prefixes as prefill; the flag list differs (e.g. --prefill-round-robin-balance, no --load-balance-method)
+    set -x  # from here on, print each command (after expansion) as it runs
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then  # case-insensitive match on "true"
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl  # exit status is not checked
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
+
+    command_suffix=""  # optional extra flags appended to the end of the launch command
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi  # DUMP_CONFIG_PATH is optional
+
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    SGLANG_ENABLE_JIT_DEEPGEMM=false \
+    SGLANG_ENABLE_FLASHINFER_GEMM=true \
+    python3 -m dynamo.sglang \
+        --disaggregation-mode decode \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --prefill-round-robin-balance \
+        --trust-remote-code \
+        --disable-radix-cache \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization modelopt_fp4 \
+        --moe-runner-backend flashinfer_trtllm \
+        --disaggregation-bootstrap-port 30001 \
+        --stream-interval 10 \
+        --watchdog-timeout 1000000 \
+        --context-length 2200 \
+        --mem-fraction-static 0.95 \
+        --chunked-prefill-size 8192 \
+        --cuda-graph-max-bs 256 \
+        --scheduler-recv-interval 10 \
+        --enable-symm-mem \
+        --moe-dense-tp-size 1 \
+        --tensor-parallel-size "$TOTAL_GPUS" \
+        --expert-parallel-size 1 \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}  # unquoted, so the suffix word-splits into separate arguments
+fi
\ No newline at end of file
--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-max-tpt.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/1k1k-max-tpt.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Print the expected invocation (a single <mode> argument) to stdout, then exit with status 1
+print_usage() {
+    echo "Usage: $0 <mode>"  # $0 expands to the path this script was invoked as
+    echo "  mode: prefill or decode"
+    echo ""
+    echo "Examples:"
+    echo "  $0 prefill"
+    echo "  $0 decode"
+    exit 1  # the script calls this directly (no subshell), so this ends the whole script
+}
+
+# Require exactly one positional argument; otherwise report the count and show usage (which exits 1)
+if [ $# -ne 1 ]; then
+    echo "Error: Expected 1 argument, got $#"
+    print_usage
+fi
+
+# The single positional argument selects which worker role this script launches
+mode=$1
+
+# Reject anything other than the two literal values "prefill" and "decode"
+if [ "$mode" != "prefill" ] && [ "$mode" != "decode" ]; then
+    echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
+    print_usage
+fi
+
+echo "Mode: $mode"
+echo "Command: dynamo"  # fixed label; not derived from any variable
+
+# Exit 1 on the first required variable that is unset or empty (-z is true for both); messages go to stdout
+if [ -z "$HOST_IP_MACHINE" ]; then  # joined with PORT to form --dist-init-addr
+    echo "Error: HOST_IP_MACHINE environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$PORT" ]; then  # second half of --dist-init-addr
+    echo "Error: PORT environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$TOTAL_GPUS" ]; then  # passed as --tp-size, --dp-size and --ep-size
+    echo "Error: TOTAL_GPUS environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$RANK" ]; then  # passed as --node-rank
+    echo "Error: RANK environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$TOTAL_NODES" ]; then  # passed as --nnodes
+    echo "Error: TOTAL_NODES environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$USE_INIT_LOCATIONS" ]; then  # required here, although nothing later in this script reads it
+    echo "Error: USE_INIT_LOCATIONS environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$RUN_IN_CI" ]; then  # "true" (any letter case) enables the wheel installs in the launch section
+    echo "Error: RUN_IN_CI environment variable is not set"
+    exit 1
+fi
+
+# Launch the dynamo.sglang worker for the selected mode; each branch runs python3 in the foreground
+if [ "$mode" = "prefill" ]; then
+    set -x  # from here on, print each command (after expansion) as it runs
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then  # ,, lowercases the value, so TRUE/True also match
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl  # exit status is not checked; the script continues if an install fails
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800  # exported; the VAR=value prefixes on the python3 command below apply to that one process only
+
+    command_suffix=""  # optional extra flags appended to the end of the launch command
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi  # DUMP_CONFIG_PATH is optional; unset/empty adds nothing
+
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
+    MC_TE_METRIC=true \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    python3 -m dynamo.sglang \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --trust-remote-code \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization modelopt_fp4 \
+        --moe-runner-backend flashinfer_cutlass \
+        --disable-radix-cache \
+        --disable-chunked-prefix-cache \
+        --stream-interval 50 \
+        --decode-log-interval 1000 \
+        --watchdog-timeout 1000000 \
+        --context-length 2176 \
+        --disable-shared-experts-fusion \
+        --eplb-algorithm deepseek \
+        --disaggregation-bootstrap-port 30001 \
+        --disaggregation-mode prefill \
+        --mem-fraction-static 0.84 \
+        --max-total-tokens 131072 \
+        --max-prefill-tokens 32768 \
+        --chunked-prefill-size 65536 \
+        --enable-single-batch-overlap \
+        --max-running-requests 30000 \
+        --load-balance-method round_robin \
+        --disable-cuda-graph \
+        --enable-dp-attention \
+        --tp-size "$TOTAL_GPUS" \
+        --dp-size "$TOTAL_GPUS" \
+        --ep-size "$TOTAL_GPUS" \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}  # unquoted, so the suffix word-splits into separate arguments (a path containing whitespace would split too)
+
+elif [ "$mode" = "decode" ]; then  # same setup as prefill, plus three extra env prefixes and a different flag list (flashinfer_cutedsl, deepep, explicit --cuda-graph-bs sizes)
+    set -x  # from here on, print each command (after expansion) as it runs
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then  # case-insensitive match on "true"
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl  # exit status is not checked
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
+
+    command_suffix=""  # optional extra flags appended to the end of the launch command
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi  # DUMP_CONFIG_PATH is optional
+
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
+    MC_TE_METRIC=true \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 \
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
+    python3 -m dynamo.sglang \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --trust-remote-code \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization modelopt_fp4 \
+        --moe-runner-backend flashinfer_cutedsl \
+        --disable-radix-cache \
+        --disable-chunked-prefix-cache \
+        --stream-interval 50 \
+        --decode-log-interval 1000 \
+        --watchdog-timeout 1000000 \
+        --context-length 2176 \
+        --disable-shared-experts-fusion \
+        --eplb-algorithm deepseek \
+        --disaggregation-bootstrap-port 30001 \
+        --disaggregation-mode decode \
+        --mem-fraction-static 0.83 \
+        --max-total-tokens 3122380 \
+        --chunked-prefill-size 786432 \
+        --max-running-requests 67584 \
+        --moe-a2a-backend deepep \
+        --deepep-mode low_latency \
+        --ep-dispatch-algorithm static \
+        --ep-num-redundant-experts 32 \
+        --cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768 1024 \
+        --num-reserved-decode-tokens 112 \
+        --moe-dense-tp-size 1 \
+        --enable-dp-lm-head \
+        --prefill-round-robin-balance \
+        --enable-dp-attention \
+        --tp-size "$TOTAL_GPUS" \
+        --dp-size "$TOTAL_GPUS" \
+        --ep-size "$TOTAL_GPUS" \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}  # unquoted, so the suffix word-splits into separate arguments
+fi
\ No newline at end of file
--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/default.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp4/disagg/default.sh
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-# This comes from https://github.com/sgl-project/sglang/issues/10903 and uses the low-prec decode setup
-
 # Function to print usage
 print_usage() {
    echo "Usage: $0 <mode>"
@@ -64,152 +62,140 @@ if [ -z "$USE_INIT_LOCATIONS" ]; then
    exit 1
 fi

+if [ -z "$RUN_IN_CI" ]; then
+    echo "Error: RUN_IN_CI environment variable is not set"
+    exit 1
+fi
+
 # Construct command based on mode
 if [ "$mode" = "prefill" ]; then
    set -x
-    # no expert locations collected for fp4 yet
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
+
    command_suffix=""
-    if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix=" "; fi
    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi

-    # we have to install pre-release cutedsl for a integer overflow fix
-    python3 -m pip install --no-cache-dir --upgrade --pre nvidia-cutlass-dsl
-
-    # set your own cache variables here
-    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
-    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
-    export FLASHINFER_WORKSPACE_BASE="/configs/flashinfer-cache"
-
+    PYTHONUNBUFFERED=1 \
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
-    SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
    MC_TE_METRIC=true \
-    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    MC_FORCE_MNNVL=1 \
    NCCL_MNNVL_ENABLE=1 \
    NCCL_CUMEM_ENABLE=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
-    PYTHONUNBUFFERED=1 \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
-        --skip-tokenizer-init \
-        --disaggregation-mode prefill \
+        --trust-remote-code \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization modelopt_fp4 \
+        --moe-runner-backend flashinfer_cutlass \
+        --disable-radix-cache \
+        --disable-chunked-prefix-cache \
+        --stream-interval 50 \
        --decode-log-interval 1000 \
-        --max-running-requests 5632 \
+        --watchdog-timeout 1000000 \
        --context-length 2176 \
-        --disable-radix-cache \
        --disable-shared-experts-fusion \
-        --watchdog-timeout 1000000 \
-        --disable-chunked-prefix-cache \
-        --attention-backend trtllm_mla \
-        --kv-cache-dtype fp8_e4m3 \
-        --enable-single-batch-overlap \
-        --chunked-prefill-size 65536 \
        --eplb-algorithm deepseek \
-        --trust-remote-code \
-        --disable-cuda-graph \
+        --disaggregation-bootstrap-port 30001 \
+        --disaggregation-mode prefill \
        --mem-fraction-static 0.84 \
        --max-total-tokens 131072 \
-        --max-prefill-tokens 16384 \
+        --max-prefill-tokens 32768 \
+        --chunked-prefill-size 65536 \
+        --enable-single-batch-overlap \
+        --max-running-requests 30000 \
        --load-balance-method round_robin \
-        --quantization modelopt_fp4 \
-        --moe-runner-backend flashinfer_cutlass \
+        --disable-cuda-graph \
+        --enable-dp-attention \
+        --tp-size "$TOTAL_GPUS" \
+        --dp-size "$TOTAL_GPUS" \
+        --ep-size "$TOTAL_GPUS" \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
-        --disaggregation-bootstrap-port 30001 \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
-        --ep-size "$TOTAL_GPUS" \
-        --tp-size "$TOTAL_GPUS" \
-        --dp-size "$TOTAL_GPUS" \
-        --enable-dp-attention \
-        --host 0.0.0.0 \
-        --stream-interval 50 \
-        --log-level debug ${command_suffix}
-
-# For now we must keep SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK and cuda-graph-bs at 1024 until
-# DeepEP merges in https://github.com/deepseek-ai/DeepEP/pull/440
-# the nvidia-cutlass-dsl install fixes https://github.com/flashinfer-ai/flashinfer/issues/1830#issuecomment-3380074018
-# which was previously limiting us to DISPATCH_TOKENS and cuda-graph-bs == 384
-# For now use 12 nodes for fp4 since flashinfer_cutedsl requires experts per gpu < 8
-# We have 288 (256 + 32 redundant) => 288/48 = 6
+        --host 0.0.0.0 ${command_suffix}

 elif [ "$mode" = "decode" ]; then
    set -x
-    # no expert locations collected for fp4 yet
-    command_suffix=""
-    if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix=" "; fi
-    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
-
-    # set your own cache variables here
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
-    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
-    export FLASHINFER_WORKSPACE_BASE="/configs/flashinfer-cache"

-    # we have to install pre-release cutedsl for a integer overflow fix
-    python3 -m pip install --no-cache-dir --upgrade --pre nvidia-cutlass-dsl
+    command_suffix=""
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi

+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1 \
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1 \
-    SGL_JIT_DEEPGEMM_PRECOMPILE=0 \
-    MC_TE_METRIC=true \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
+    MC_TE_METRIC=true \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=384 \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 \
    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH=1 \
-    SGLANG_FP4_GEMM_BACKEND=cutlass \
-    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
-    PYTHONUNBUFFERED=1 \
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND=cutlass \
    python3 -m dynamo.sglang \
        --served-model-name deepseek-ai/DeepSeek-R1 \
        --model-path /model/ \
-        --skip-tokenizer-init \
        --trust-remote-code \
-        --disaggregation-mode decode \
-        --host 0.0.0.0 \
-        --decode-log-interval 1 \
-        --max-running-requests 67584 \
-        --context-length 2176 \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization modelopt_fp4 \
+        --moe-runner-backend flashinfer_cutedsl \
        --disable-radix-cache \
-        --disable-shared-experts-fusion \
-        --watchdog-timeout 1000000 \
        --disable-chunked-prefix-cache \
-        --attention-backend trtllm_mla \
-        --kv-cache-dtype fp8_e4m3 \
-        --enable-dp-attention \
-        --chunked-prefill-size 786432 \
+        --stream-interval 50 \
+        --decode-log-interval 1000 \
+        --watchdog-timeout 1000000 \
+        --context-length 2176 \
+        --disable-shared-experts-fusion \
+        --eplb-algorithm deepseek \
+        --disaggregation-bootstrap-port 30001 \
+        --disaggregation-mode decode \
        --mem-fraction-static 0.83 \
+        --max-total-tokens 3122380 \
+        --chunked-prefill-size 786432 \
+        --max-running-requests 67584 \
+        --enable-single-batch-overlap \
        --moe-a2a-backend deepep \
        --deepep-mode low_latency \
        --ep-dispatch-algorithm static \
-        --cuda-graph-bs 384 \
-        --num-reserved-decode-tokens 112 \
        --ep-num-redundant-experts 32 \
-        --eplb-algorithm deepseek \
+        --cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 264 272 280 288 296 304 312 320 328 336 344 352 360 368 376 384 416 448 480 512 544 576 608 640 672 704 736 768 1024 \
+        --num-reserved-decode-tokens 112 \
        --moe-dense-tp-size 1 \
        --enable-dp-lm-head \
        --prefill-round-robin-balance \
-        --max-total-tokens 3122380 \
-        --quantization modelopt_fp4 \
-        --moe-runner-backend flashinfer_cutedsl \
+        --enable-dp-attention \
+        --tp-size "$TOTAL_GPUS" \
+        --dp-size "$TOTAL_GPUS" \
+        --ep-size "$TOTAL_GPUS" \
        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
-        --disaggregation-bootstrap-port 30001 \
        --nnodes "$TOTAL_NODES" \
        --node-rank "$RANK" \
-        --tp-size "$TOTAL_GPUS" \
-        --ep-size "$TOTAL_GPUS" \
-        --dp-size "$TOTAL_GPUS" \
-        --enable-single-batch-overlap \
-        --enable-dp-attention \
-        --stream-interval 50 \
-        --mem-fraction-static 0.82 ${command_suffix}
-fi
+        --host 0.0.0.0 ${command_suffix}
+fi
\ No newline at end of file
--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/agg/default.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-# Simple agg script (not an optimized config)
-
-print_usage() {
-    echo "Usage: $0"
-    echo ""
-    echo "This script runs aggregated mode (single dynamo.sglang instance)"
-    exit 1
-}
-
-echo "Mode: aggregated"
-echo "Command: dynamo"
-
-# Check if required environment variables are set
-if [ -z "$HOST_IP_MACHINE" ]; then
-    echo "Error: HOST_IP_MACHINE environment variable is not set"
-    exit 1
-fi
-
-if [ -z "$PORT" ]; then
-    echo "Error: PORT environment variable is not set"
-    exit 1
-fi
-
-if [ -z "$TOTAL_GPUS" ]; then
-    echo "Error: TOTAL_GPUS environment variable is not set"
-    exit 1
-fi
-
-if [ -z "$RANK" ]; then
-    echo "Error: RANK environment variable is not set"
-    exit 1
-fi
-
-if [ -z "$TOTAL_NODES" ]; then
-    echo "Error: TOTAL_NODES environment variable is not set"
-    exit 1
-fi
-
-# Construct command suffix for config dump
-command_suffix=""
-if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="--dump-config-to ${DUMP_CONFIG_PATH}"; fi
-
-set -x
-export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
-export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
-export FLASHINFER_WORKSPACE_BASE="/configs/flashinfer-cache"
-
-DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
-MC_TE_METRIC=true \
-SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
-MC_FORCE_MNNVL=1 \
-NCCL_MNNVL_ENABLE=1 \
-NCCL_CUMEM_ENABLE=1 \
-SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
-PYTHONUNBUFFERED=1 \
-python3 -m dynamo.sglang \
-    --served-model-name deepseek-ai/DeepSeek-R1 \
-    --model-path /model/ \
-    --skip-tokenizer-init \
-    --trust-remote-code \
-    --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
-    --nnodes "$TOTAL_NODES" \
-    --node-rank "$RANK" \
-    --tp-size "$TOTAL_GPUS" \
-    --dp-size "$TOTAL_GPUS" \
-    --enable-dp-attention \
-    --host 0.0.0.0 \
-    --max-running-requests 30000 \
-    --context-length 2200 \
-    --disable-radix-cache \
-    --moe-a2a-backend deepep \
-    --load-balance-method round_robin \
-    --deepep-mode normal \
-    --ep-dispatch-algorithm dynamic \
-    --moe-dense-tp-size 1 \
-    --enable-dp-lm-head \
-    --disable-shared-experts-fusion \
-    --ep-num-redundant-experts 32 \
-    --eplb-algorithm deepseek \
-    --attention-backend trtllm_mla \
-    --kv-cache-dtype fp8_e4m3 \
-    --watchdog-timeout 1000000 \
-    --disable-cuda-graph \
-    --chunked-prefill-size 131072 \
-    --max-total-tokens 524288 \
-    --deepep-config /configs/deepep_config.json \
-    --stream-interval 50 \
-    --mem-fraction-static 0.75 ${command_suffix}
-
-
--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1p_4d.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/1p_4d.sh
@@ -73,8 +73,8 @@ fi
 if [ "$mode" = "prefill" ]; then
    set -x
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
-        python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
-        python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi
    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
@@ -131,8 +131,8 @@ if [ "$mode" = "prefill" ]; then
 elif [ "$mode" = "decode" ]; then
    set -x
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
-        python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
-        python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi
    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"

--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/default.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/default.sh
@@ -71,8 +71,8 @@ fi
 if [ "$mode" = "prefill" ]; then
    set -x
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
-        python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
-        python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi
    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
@@ -132,8 +132,8 @@ if [ "$mode" = "prefill" ]; then
 elif [ "$mode" = "decode" ]; then
    set -x
    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
-        python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl
-        python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
    fi
    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"

--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-low-latency.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-low-latency.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Emit the usage banner on stdout, then abort the script with status 1.
+print_usage() {
+    printf '%s\n' \
+        "Usage: $0 <mode>" \
+        "  mode: prefill or decode" \
+        "" \
+        "Examples:" \
+        "  $0 prefill" \
+        "  $0 decode"
+    exit 1
+}
+
+# Exactly one positional argument (the worker mode) is accepted.
+if (( $# != 1 )); then
+    echo "Error: Expected 1 argument, got $#"
+    print_usage
+fi
+
+# The single argument selects which worker this script launches.
+mode=$1
+
+# Anything other than the two supported modes is rejected with the usage text.
+case "$mode" in
+    prefill|decode)
+        ;;
+    *)
+        echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
+        print_usage
+        ;;
+esac
+
+echo "Mode: $mode"
+echo "Command: dynamo"
+
+# Every variable named below must hold a non-empty value; the first one that
+# does not aborts the script with status 1. USE_INIT_LOCATIONS is validated
+# here even though it is not referenced again in this script.
+for required_var in \
+    HOST_IP_MACHINE \
+    PORT \
+    TOTAL_GPUS \
+    RANK \
+    TOTAL_NODES \
+    USE_INIT_LOCATIONS \
+    RUN_IN_CI
+do
+    # ${!required_var} reads the value of the variable whose name is stored in required_var.
+    if [[ -z "${!required_var}" ]]; then
+        echo "Error: ${required_var} environment variable is not set"
+        exit 1
+    fi
+done
+
+# Launch the dynamo.sglang worker for the requested mode.
+# Both branches serve deepseek-ai/DeepSeek-R1 from /model/ with FP8 quantization
+# and an fp8_e4m3 KV cache, use tensor parallelism across $TOTAL_GPUS with
+# data-parallel and expert-parallel size 1, and join the same distributed group
+# via $HOST_IP_MACHINE:$PORT. They differ in --disaggregation-mode and in a few
+# scheduler/memory flags noted in each branch.
+if [ "$mode" = "prefill" ]; then
+    # Trace each command as it runs (set -x prints the expanded commands to stderr).
+    set -x
+    # When RUN_IN_CI equals "true" (compared case-insensitively via ${VAR,,}),
+    # install the Dynamo 0.7.0 runtime and package wheels from /configs first.
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    # Exported, so these two settings are inherited by the worker process below.
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
+    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
+
+    # Optional trailing arguments: only populated when DUMP_CONFIG_PATH is non-empty.
+    # It is expanded unquoted on the launch line so that the flag and its value
+    # become separate words; a path containing whitespace would be split as well.
+    command_suffix=""
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
+
+    # The VAR=value prefixes apply only to this python3 invocation.
+    # Prefill-specific flags: --disaggregation-mode prefill, --max-total-tokens 32768,
+    # --chunked-prefill-size 24576 and --load-balance-method round_robin.
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    SGLANG_ENABLE_JIT_DEEPGEMM=false \
+    SGLANG_ENABLE_FLASHINFER_GEMM=1 \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    MC_TE_METRIC=true \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    python3 -m dynamo.sglang \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --trust-remote-code \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization fp8 \
+        --moe-runner-backend flashinfer_trtllm \
+        --disable-radix-cache \
+        --watchdog-timeout 1000000 \
+        --context-length 9600 \
+        --disaggregation-mode prefill \
+        --mem-fraction-static 0.95 \
+        --max-total-tokens 32768 \
+        --chunked-prefill-size 24576 \
+        --cuda-graph-max-bs 512 \
+        --max-running-requests 512 \
+        --load-balance-method round_robin \
+        --scheduler-recv-interval 10 \
+        --enable-flashinfer-allreduce-fusion \
+        --moe-dense-tp-size 1 \
+        --tensor-parallel-size "$TOTAL_GPUS" \
+        --data-parallel-size 1 \
+        --expert-parallel-size 1 \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --disaggregation-bootstrap-port 30001 \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}
+
+elif [ "$mode" = "decode" ]; then
+    # Same preparation as the prefill branch: tracing, optional CI wheel install,
+    # exported timeouts/cache dir, and the optional --dump-config-to suffix.
+    set -x
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
+    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
+
+    command_suffix=""
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
+
+    # Relative to prefill, the decode launch additionally sets
+    # SGLANG_DECODE_BOOTSTRAP_TIMEOUT and SGLANG_HACK_SEQ_BOOTSTRAP_ROOM, passes
+    # --enable-symm-mem and --prefill-round-robin-balance, uses
+    # --chunked-prefill-size 8192, and omits --max-total-tokens and
+    # --load-balance-method.
+    # NOTE(review): --prefill-round-robin-balance here appears to be the decode-side
+    # counterpart of the prefill branch's --load-balance-method round_robin — confirm.
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    SGLANG_ENABLE_JIT_DEEPGEMM=false \
+    SGLANG_ENABLE_FLASHINFER_GEMM=1 \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    MC_TE_METRIC=true \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    python3 -m dynamo.sglang \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --trust-remote-code \
+        --kv-cache-dtype fp8_e4m3 \
+        --attention-backend trtllm_mla \
+        --quantization fp8 \
+        --moe-runner-backend flashinfer_trtllm \
+        --disable-radix-cache \
+        --watchdog-timeout 1000000 \
+        --context-length 9600 \
+        --disaggregation-mode decode \
+        --mem-fraction-static 0.95 \
+        --chunked-prefill-size 8192 \
+        --cuda-graph-max-bs 512 \
+        --max-running-requests 512 \
+        --scheduler-recv-interval 10 \
+        --enable-flashinfer-allreduce-fusion \
+        --enable-symm-mem \
+        --moe-dense-tp-size 1 \
+        --prefill-round-robin-balance \
+        --tensor-parallel-size "$TOTAL_GPUS" \
+        --data-parallel-size 1 \
+        --expert-parallel-size 1 \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --disaggregation-bootstrap-port 30001 \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}
+fi
--- a/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-max-tpt.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/gb200-fp8/disagg/8k1k-max-tpt.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Emit the usage banner on stdout, then abort the script with status 1.
+print_usage() {
+    printf '%s\n' \
+        "Usage: $0 <mode>" \
+        "  mode: prefill or decode" \
+        "" \
+        "Examples:" \
+        "  $0 prefill" \
+        "  $0 decode"
+    exit 1
+}
+
+# Exactly one positional argument (the worker mode) is accepted.
+if (( $# != 1 )); then
+    echo "Error: Expected 1 argument, got $#"
+    print_usage
+fi
+
+# The single argument selects which worker this script launches.
+mode=$1
+
+# Anything other than the two supported modes is rejected with the usage text.
+case "$mode" in
+    prefill|decode)
+        ;;
+    *)
+        echo "Error: mode must be 'prefill' or 'decode', got '$mode'"
+        print_usage
+        ;;
+esac
+
+echo "Mode: $mode"
+echo "Command: dynamo"
+
+# Every variable named below must hold a non-empty value; the first one that
+# does not aborts the script with status 1. USE_INIT_LOCATIONS is validated
+# here even though it is not referenced again in this script.
+for required_var in \
+    HOST_IP_MACHINE \
+    PORT \
+    TOTAL_GPUS \
+    RANK \
+    TOTAL_NODES \
+    USE_INIT_LOCATIONS \
+    RUN_IN_CI
+do
+    # ${!required_var} reads the value of the variable whose name is stored in required_var.
+    if [[ -z "${!required_var}" ]]; then
+        echo "Error: ${required_var} environment variable is not set"
+        exit 1
+    fi
+done
+
+# Launch the dynamo.sglang worker for the requested mode.
+# Both branches serve deepseek-ai/DeepSeek-R1 from /model/ with an fp8_e4m3 KV
+# cache, set tensor-, data- and expert-parallel sizes all to $TOTAL_GPUS with
+# --enable-dp-attention, use the deepep MoE all-to-all backend configured from
+# /configs/deepep_config.json, and join the same distributed group via
+# $HOST_IP_MACHINE:$PORT. They differ in --disaggregation-mode and in the
+# scheduler/memory/DeepEP flags noted in each branch.
+if [ "$mode" = "prefill" ]; then
+    # Trace each command as it runs (set -x prints the expanded commands to stderr).
+    set -x
+    # When RUN_IN_CI equals "true" (compared case-insensitively via ${VAR,,}),
+    # install the Dynamo 0.7.0 runtime and package wheels from /configs first.
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    # Exported, so these two settings are inherited by the worker process below.
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
+    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
+
+    # Optional trailing arguments: only populated when DUMP_CONFIG_PATH is non-empty.
+    # It is expanded unquoted on the launch line so that the flag and its value
+    # become separate words; a path containing whitespace would be split as well.
+    command_suffix=""
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
+
+    # The VAR=value prefixes apply only to this python3 invocation.
+    # Prefill-specific flags: --disaggregation-mode prefill, --max-running-requests 30000,
+    # --mem-fraction-static 0.80, --max-total-tokens 524288, --chunked-prefill-size 131072,
+    # --load-balance-method round_robin, --disable-cuda-graph, --deepep-mode normal
+    # and --ep-dispatch-algorithm dynamic.
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    MC_TE_METRIC=true \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    MC_FORCE_MNNVL=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    python3 -m dynamo.sglang \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --trust-remote-code \
+        --tp-size "$TOTAL_GPUS" \
+        --dp-size "$TOTAL_GPUS" \
+        --ep-size "$TOTAL_GPUS" \
+        --enable-dp-attention \
+        --attention-backend trtllm_mla \
+        --kv-cache-dtype fp8_e4m3 \
+        --disable-radix-cache \
+        --stream-interval 50 \
+        --max-running-requests 30000 \
+        --context-length 9300 \
+        --watchdog-timeout 1000000 \
+        --disable-shared-experts-fusion \
+        --eplb-algorithm deepseek \
+        --disaggregation-bootstrap-port 30001 \
+        --disaggregation-mode prefill \
+        --mem-fraction-static 0.80 \
+        --max-total-tokens 524288 \
+        --chunked-prefill-size 131072 \
+        --load-balance-method round_robin \
+        --disable-cuda-graph \
+        --moe-a2a-backend deepep \
+        --deepep-mode normal \
+        --ep-dispatch-algorithm dynamic \
+        --moe-dense-tp-size 1 \
+        --enable-dp-lm-head \
+        --ep-num-redundant-experts 32 \
+        --deepep-config /configs/deepep_config.json \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}
+
+elif [ "$mode" = "decode" ]; then
+    # Same preparation as the prefill branch: tracing, optional CI wheel install,
+    # exported timeouts/cache dir, and the optional --dump-config-to suffix.
+    set -x
+    if [[ "${RUN_IN_CI,,}" == "true" ]]; then
+        python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl
+        python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl
+    fi
+    export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
+    export SGLANG_DG_CACHE_DIR="/configs/dg-10212025"
+
+    command_suffix=""
+    if [[ -n "${DUMP_CONFIG_PATH}" ]]; then command_suffix="${command_suffix} --dump-config-to ${DUMP_CONFIG_PATH}"; fi
+
+    # Relative to prefill, the decode launch additionally sets
+    # SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK, SGLANG_DECODE_BOOTSTRAP_TIMEOUT
+    # and SGLANG_HACK_SEQ_BOOTSTRAP_ROOM, and passes --skip-tokenizer-init,
+    # --decode-log-interval 1000, --prefill-round-robin-balance and
+    # --cuda-graph-max-bs 256 (CUDA graphs are not disabled here). It uses
+    # --max-running-requests 8192, --mem-fraction-static 0.82,
+    # --chunked-prefill-size 36864, --deepep-mode low_latency and
+    # --ep-dispatch-algorithm static, and omits --max-total-tokens and
+    # --load-balance-method.
+    # NOTE(review): --prefill-round-robin-balance here appears to be the decode-side
+    # counterpart of the prefill branch's --load-balance-method round_robin — confirm.
+    PYTHONUNBUFFERED=1 \
+    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \
+    MC_TE_METRIC=true \
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT=1000 \
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
+    MC_FORCE_MNNVL=1 \
+    NCCL_MNNVL_ENABLE=1 \
+    NCCL_CUMEM_ENABLE=1 \
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
+    python3 -m dynamo.sglang \
+        --served-model-name deepseek-ai/DeepSeek-R1 \
+        --model-path /model/ \
+        --skip-tokenizer-init \
+        --trust-remote-code \
+        --tp-size "$TOTAL_GPUS" \
+        --dp-size "$TOTAL_GPUS" \
+        --ep-size "$TOTAL_GPUS" \
+        --enable-dp-attention \
+        --attention-backend trtllm_mla \
+        --kv-cache-dtype fp8_e4m3 \
+        --disable-radix-cache \
+        --stream-interval 50 \
+        --decode-log-interval 1000 \
+        --max-running-requests 8192 \
+        --context-length 9300 \
+        --watchdog-timeout 1000000 \
+        --disable-shared-experts-fusion \
+        --eplb-algorithm deepseek \
+        --disaggregation-bootstrap-port 30001 \
+        --disaggregation-mode decode \
+        --mem-fraction-static 0.82 \
+        --chunked-prefill-size 36864 \
+        --moe-a2a-backend deepep \
+        --deepep-mode low_latency \
+        --ep-dispatch-algorithm static \
+        --moe-dense-tp-size 1 \
+        --enable-dp-lm-head \
+        --prefill-round-robin-balance \
+        --ep-num-redundant-experts 32 \
+        --deepep-config /configs/deepep_config.json \
+        --cuda-graph-max-bs 256 \
+        --dist-init-addr "$HOST_IP_MACHINE:$PORT" \
+        --nnodes "$TOTAL_NODES" \
+        --node-rank "$RANK" \
+        --host 0.0.0.0 ${command_suffix}
+fi
--- a/examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
+++ b/examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
@@ -373,7 +373,7 @@ def setup_frontend_worker(
    # All frontends run the ingress server
    frontend_cmd = "python3 -m dynamo.frontend --http-port=8000"
    if run_in_ci:
-        frontend_cmd = "python3 -m pip install /configs/ai_dynamo_runtime-0.6.1-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.6.1-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
+        frontend_cmd = "python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
    return run_command(frontend_cmd)



--- a/examples/backends/sglang/slurm_jobs/submit_disagg.sh
+++ b/examples/backends/sglang/slurm_jobs/submit_disagg.sh
@@ -48,7 +48,6 @@ check_env MODEL_PATH
 check_env CONFIG_DIR
 check_env CONTAINER_IMAGE

-GPU_TYPE="gb200-fp8"
 GPUS_PER_NODE=4
 : "${NETWORK_INTERFACE:=enP6p9s0np0}"

@@ -62,7 +61,8 @@ ISL=$6
 OSL=$7
 CONCURRENCIES=$8
 REQUEST_RATE=$9
-SCRIPT_VARIANT=${10}
+GPU_TYPE=${10}
+SCRIPT_VARIANT=${11}

 RETRIES=1 # defaults to retry the job 1 time to avoid transient errors

@@ -86,7 +86,7 @@ command=(
    --model-dir $MODEL_PATH --config-dir $CONFIG_DIR
    --container-image $CONTAINER_IMAGE

-    --gpu-type $GPU_TYPE --gpus-per-node $GPUS_PER_NODE --network-interface $NETWORK_INTERFACE
+    --gpus-per-node $GPUS_PER_NODE --network-interface $NETWORK_INTERFACE

    --prefill-nodes $PREFILL_NODES --prefill-workers $PREFILL_WORKERS
    --decode-nodes $DECODE_NODES --decode-workers $DECODE_WORKERS
@@ -96,6 +96,8 @@ command=(

    --retries $RETRIES

+    --gpu-type $GPU_TYPE
+
    --run-in-ci
    ${SCRIPT_VARIANT_ARGS[@]}
 )