ci_start_disaggregation_servers.sh 2.98 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash

# Model served by every prefill/decode server launched below.
# Export MODEL_PATH before running the script to use a different checkpoint;
# when it is unset or empty the default path is used.
MODEL_PATH="${MODEL_PATH:-/raid/models/meta-llama/Llama-3.1-8B-Instruct}"

# Print the name of the first InfiniBand device whose first reported port
# state is PORT_ACTIVE.
#
# Probes mlx5_0 .. mlx5_11 in order with `ibv_devinfo` and inspects the first
# "state:" line of each device's report.
#
# Outputs: the device name on stdout; an error message on stderr when no
#          device qualifies.
# Returns: 0 when a device was found, 1 otherwise.
find_active_ib_device() {
    # Keep the working variables local so a direct call does not clobber globals.
    local device info state
    for device in mlx5_{0..11}; do
        # Query each device once; a non-zero status means it cannot be read.
        info=$(ibv_devinfo "$device" 2>/dev/null) || continue
        # Second field of the first "state:" line, e.g. PORT_ACTIVE.
        state=$(awk '/state:/ { print $2; exit }' <<<"$info")
        if [[ "$state" == "PORT_ACTIVE" ]]; then
            echo "$device"
            return 0
        fi
    done
    echo "No active IB device found" >&2
    return 1
}

# Get the first available active IB device. Abort when none is found rather
# than launching every server with an empty --disaggregation-ib-device value.
if ! DEVICE=$(find_active_ib_device); then
  echo "Cannot start disaggregation servers without an active IB device" >&2
  exit 1
fi
echo "Using IB device: $DEVICE"

# Start one PREFILL server per GPU (0-3) in the background. GPU i listens on
# 127.0.0.(i+1), HTTP port 30001+i, with bootstrap port 9001+i.
for i in 0 1 2 3; do
  HOST="127.0.0.$((i + 1))"
  PORT=$((30001 + i))
  BOOTSTRAP_PORT=$((9001 + i))
  echo "Launching PREFILL server on GPU $i at $HOST:$PORT (bootstrap: $BOOTSTRAP_PORT)"

  # Collect the server flags first so the launch itself stays on one line.
  prefill_args=(
    --model-path "$MODEL_PATH"
    --disaggregation-mode prefill
    --host "$HOST"
    --port "$PORT"
    --disaggregation-ib-device "$DEVICE"
    --disaggregation-bootstrap-port "$BOOTSTRAP_PORT"
  )
  CUDA_VISIBLE_DEVICES=$i python3 -m sglang.launch_server "${prefill_args[@]}" &
done

# Start one DECODE server per GPU (4-7) in the background. GPU i listens on
# 127.0.0.(i+1), HTTP port 30001+i.
for i in 4 5 6 7; do
  HOST="127.0.0.$((i + 1))"
  PORT=$((30001 + i))
  echo "Launching DECODE server on GPU $i at $HOST:$PORT"

  # Collect the server flags first so the launch itself stays on one line.
  decode_args=(
    --model-path "$MODEL_PATH"
    --disaggregation-mode decode
    --host "$HOST"
    --port "$PORT"
    --disaggregation-ib-device "$DEVICE"
    --base-gpu-id 0
  )
  CUDA_VISIBLE_DEVICES=$i python3 -m sglang.launch_server "${decode_args[@]}" &
done

# Wait for disaggregation servers to initialize
echo "Waiting for disaggregation servers to initialize..."

# Health check with 5-minute timeout: poll /health on all 8 servers every
# 10 seconds until every one answers successfully or TIMEOUT seconds elapse.
TIMEOUT=300
START_TIME=$(date +%s)

echo "Checking health of all 8 servers..."
while true; do
    CURRENT_TIME=$(date +%s)
    ELAPSED=$((CURRENT_TIME - START_TIME))

    if (( ELAPSED >= TIMEOUT )); then
        echo "❌ Timeout: Servers did not become healthy within 5 minutes"
        # Stop the servers launched in the background so a failed start does
        # not leave them running after this script exits.
        BACKGROUND_PIDS=$(jobs -p)
        if [[ -n "$BACKGROUND_PIDS" ]]; then
            # shellcheck disable=SC2086 # word splitting of the PID list is intended
            kill $BACKGROUND_PIDS 2>/dev/null
        fi
        exit 1
    fi

    HEALTHY_COUNT=0
    # Check all 8 servers (127.0.0.1-8:30001-30008)
    for i in {1..8}; do
        # Bound each probe so a server that accepts the connection but never
        # answers cannot stall the loop past the overall timeout.
        if curl -s -f --connect-timeout 2 --max-time 5 \
            "http://127.0.0.$i:$((30000 + i))/health" >/dev/null 2>&1; then
            HEALTHY_COUNT=$((HEALTHY_COUNT + 1))
        fi
    done

    echo "Healthy servers: $HEALTHY_COUNT/8 (elapsed: ${ELAPSED}s)"

    if (( HEALTHY_COUNT == 8 )); then
        echo "✅ All 8 servers are healthy!"
        break
    else
        sleep 10  # Wait 10 seconds before next check
    fi
done

# Start the PD-disaggregation router in the background on 127.0.0.9:8000.
# Each --prefill entry is a server URL followed by its bootstrap port; each
# --decode entry is a server URL only.
echo "Launching router at 127.0.0.9:8000..."
router_args=(
  --pd-disaggregation
  --policy power_of_two
  --prefill http://127.0.0.1:30001 9001
  --prefill http://127.0.0.2:30002 9002
  --prefill http://127.0.0.3:30003 9003
  --prefill http://127.0.0.4:30004 9004
  --decode http://127.0.0.5:30005
  --decode http://127.0.0.6:30006
  --decode http://127.0.0.7:30007
  --decode http://127.0.0.8:30008
  --host 127.0.0.9
  --port 8000
)
python3 -m sglang_router.launch_router "${router_args[@]}" &

# Block until every background job started by this script has finished.
wait