dsr1_dep.sh

#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

set -ex

# Default values
NUM_NODES=""
NODE_RANK=""
GPUS_PER_NODE=""
MASTER_ADDR="localhost"
LOG_DIR="./logs"
MODEL="deepseek-ai/DeepSeek-R1"

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --num-nodes)
            NUM_NODES="$2"
            shift 2
            ;;
        --node-rank)
            NODE_RANK="$2"
            shift 2
            ;;
        --gpus-per-node)
            GPUS_PER_NODE="$2"
            shift 2
            ;;
        --master-addr)
            MASTER_ADDR="$2"
            shift 2
            ;;
        --log-dir)
            LOG_DIR="$2"
            shift 2
            ;;
        --model)
            MODEL="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --num-nodes N         Number of nodes in the cluster (required, int)"
            echo "  --node-rank M         Rank of this node (0-based, required, int)"
            echo "  --gpus-per-node L     Number of GPUs per node (required, int)"
            echo "  --master-addr ADDR    Master node address (default: localhost)"
            echo "  --log-dir DIR         Directory for log files (default: ./logs)"
            echo "  --model MODEL         Model name to use (default: ${MODEL})"
            echo "  -h, --help            Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use --help for usage information"
            exit 1
            ;;
    esac
done

# Validate required arguments
if [ -z "$NUM_NODES" ] || [ -z "$NODE_RANK" ] || [ -z "$GPUS_PER_NODE" ]; then
    echo "Error: Missing required arguments"
    echo "Required: --num-nodes, --node-rank, --gpus-per-node"
    echo "Use --help for usage information"
    exit 1
fi

# Calculate data parallel size
DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE))

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching DeepSeek-R1 Data Parallel (Multi-Node)"
echo "=========================================="
echo "Model:       $MODEL"
if [ "$NODE_RANK" -eq 0 ]; then
echo "Frontend:    http://localhost:$HTTP_PORT"
fi
echo "Number of nodes: $NUM_NODES"
echo "Node rank:       $NODE_RANK"
echo "GPUs per node:   $GPUS_PER_NODE"
echo "Data parallel:   $DATA_PARALLEL_SIZE"
echo "Master address:  $MASTER_ADDR"
echo "Log directory:   $LOG_DIR"
echo "=========================================="
if [ "$NODE_RANK" -eq 0 ]; then
echo ""
echo "Example test command:"
echo ""
echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo "    -H 'Content-Type: application/json' \\"
echo "    -d '{"
echo "      \"model\": \"${MODEL}\","
echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo "      \"max_tokens\": 32"
echo "    }'"
echo ""
echo "=========================================="
fi

trap 'echo Cleaning up...; kill 0' EXIT

# run ingress if it's node 0
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
if [ $NODE_RANK -eq 0 ]; then
    DYN_LOG=debug python -m dynamo.frontend --router-mode kv 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
fi

mkdir -p $LOG_DIR

# Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo
# [NOTE] depending on the warmup and KV allocation setting of vLLM,
# the GPU memory requires for vLLM reservation and runtime spike (not
# reserved by vLLM) can be different and cause model fails to start,
# adjust '--gpu-memory-utilization' as needed
for ((i=0; i<GPUS_PER_NODE; i++)); do
    dp_rank=$((i + NODE_RANK * GPUS_PER_NODE))
    CUDA_VISIBLE_DEVICES=$i \
        VLLM_NIXL_SIDE_CHANNEL_PORT=$((20096 + i)) \
        VLLM_ALL2ALL_BACKEND="deepep_low_latency" \
        VLLM_USE_DEEP_GEMM=1 \
        VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1 \
        python3 -m dynamo.vllm \
        --model $MODEL \
        --data_parallel_size $DATA_PARALLEL_SIZE \
        --data-parallel-rank $dp_rank \
        --enable-expert-parallel \
        --max-model-len 4096 \
        --data-parallel-address $MASTER_ADDR \
        --data-parallel-rpc-port 13345 \
        --gpu-memory-utilization 0.91 \
        --enforce-eager \
        --kv-events-config "{\"publisher\":\"zmq\",\"topic\":\"kv-events\",\"endpoint\":\"tcp://*:$((20080 + i))\",\"enable_kv_cache_events\":true}" 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}.log &
done

echo "All workers starting. (press Ctrl+C to stop)..."
wait