#!/bin/bash # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -ex SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" source "$SCRIPT_DIR/../../../common/launch_utils.sh" # Default values NUM_NODES="" NODE_RANK="" GPUS_PER_NODE="" MASTER_ADDR="localhost" LOG_DIR="./logs" MODEL="deepseek-ai/DeepSeek-R1" # Parse command line arguments while [[ $# -gt 0 ]]; do case $1 in --num-nodes) NUM_NODES="$2" shift 2 ;; --node-rank) NODE_RANK="$2" shift 2 ;; --gpus-per-node) GPUS_PER_NODE="$2" shift 2 ;; --master-addr) MASTER_ADDR="$2" shift 2 ;; --log-dir) LOG_DIR="$2" shift 2 ;; --model) MODEL="$2" shift 2 ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "Options:" echo " --num-nodes N Number of nodes in the cluster (required, int)" echo " --node-rank M Rank of this node (0-based, required, int)" echo " --gpus-per-node L Number of GPUs per node (required, int)" echo " --master-addr ADDR Master node address (default: localhost)" echo " --log-dir DIR Directory for log files (default: ./logs)" echo " --model MODEL Model name to use (default: ${MODEL})" echo " -h, --help Show this help message" exit 0 ;; *) echo "Unknown option: $1" echo "Use --help for usage information" exit 1 ;; esac done # Validate required arguments if [ -z "$NUM_NODES" ] || [ -z "$NODE_RANK" ] || [ -z "$GPUS_PER_NODE" ]; then echo "Error: Missing required arguments" echo "Required: --num-nodes, --node-rank, --gpus-per-node" echo "Use --help for usage information" exit 1 fi # Calculate data parallel size DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE)) HTTP_PORT="${DYN_HTTP_PORT:-8000}" print_launch_banner --no-curl "Launching DeepSeek-R1 Data Parallel (Multi-Node)" "$MODEL" "$HTTP_PORT" \ "Number of nodes: $NUM_NODES" \ "Node rank: $NODE_RANK" \ "GPUs per node: $GPUS_PER_NODE" \ "Data parallel: $DATA_PARALLEL_SIZE" \ "Master address: $MASTER_ADDR" \ "Log directory: $LOG_DIR" if [ "$NODE_RANK" -eq 0 ]; then echo "" echo "Example test command:" echo "" cat <&1 | tee $LOG_DIR/dsr1_dep_ingress.log & fi mkdir -p $LOG_DIR # Data Parallel Attention / Expert Parallelism # Routing to DP workers managed by Dynamo # [NOTE] depending on the warmup and KV allocation setting of vLLM, # the GPU memory requires for vLLM reservation and runtime spike (not # reserved by vLLM) can be different and cause model fails to start, # adjust '--gpu-memory-utilization' as needed GPU_MEM_UTIL="0.91" KV_BYTES="${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" if [[ -n "$KV_BYTES" ]]; then GPU_MEM_ARGS="--kv-cache-memory-bytes $KV_BYTES --gpu-memory-utilization 0.01" else GPU_MEM_ARGS="--gpu-memory-utilization $GPU_MEM_UTIL" fi dp_start_rank=$((NODE_RANK * GPUS_PER_NODE)) VLLM_NIXL_SIDE_CHANNEL_PORT=20096 \ VLLM_ALL2ALL_BACKEND="deepep_low_latency" \ VLLM_USE_DEEP_GEMM=1 \ VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1 \ python3 -m dynamo.vllm \ --model $MODEL \ --data-parallel-hybrid-lb \ --data-parallel-size $DATA_PARALLEL_SIZE \ --data-parallel-size-local $GPUS_PER_NODE \ --data-parallel-start-rank $dp_start_rank \ --enable-expert-parallel \ --max-model-len 4096 \ --data-parallel-address $MASTER_ADDR \ --data-parallel-rpc-port 13345 \ $GPU_MEM_ARGS \ --enforce-eager \ --kv-events-config "{\"publisher\":\"zmq\",\"topic\":\"kv-events\",\"endpoint\":\"tcp://*:20080\",\"enable_kv_cache_events\":true}" 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_start_rank}.log & echo "All workers starting. (press Ctrl+C to stop)..." # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest wait_any_exit