run-multi-node-test.sh 3.85 KB
Newer Older
1
2
3
4
#!/bin/bash

set -euox pipefail

# Detect whether this host should be treated as a ROCm machine.
# Any single indicator below is sufficient:
#   - the path /dev/kfd exists
#   - the directory /opt/rocm exists
#   - a rocm-smi command is found on PATH
#   - ROCM_HOME is set to a non-empty value
# Result: IS_ROCM=1 when at least one indicator matches, otherwise IS_ROCM=0.
#
# ROCM_HOME is expanded with an empty default because `set -u` is active:
# a bare "$ROCM_HOME" would abort the script with "unbound variable" on hosts
# where the variable is unset and none of the earlier indicators matched.
if [ -e /dev/kfd ] || \
    [ -d /opt/rocm ] || \
    command -v rocm-smi &> /dev/null || \
    [ -n "${ROCM_HOME:-}" ]; then
    IS_ROCM=1
else
    IS_ROCM=0
fi

16
# Parse and validate the command line.
#
# Positional arguments:
#   $1  WORKING_DIR   directory the commands are run from (each command is
#                     prefixed with a `cd` into it)
#   $2  NUM_NODES     number of containers ("nodes") to start
#   $3  NUM_GPUS      number of GPU indices assigned to each node
#   $4  DOCKER_IMAGE  image used for every node
#   $5..              one command per node; COMMANDS[0] belongs to node0
#
# All diagnostics go to stderr and the script exits with status 1.
if [[ $# -lt 4 ]]; then
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" >&2
    exit 1
fi

WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4

shift 4
COMMANDS=("$@")

# The counts are used in arithmetic and in `[ ... -ne ... ]` comparisons.
# A non-integer value makes `[` return an error status that `if` treats as
# "false", which would silently skip the command-count check below, so the
# values are validated explicitly first.
if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then
    echo "NUM_NODES must be a positive integer, got: $NUM_NODES" >&2
    exit 1
fi

# Zero is passed through unchanged; only non-numeric input is rejected.
if ! [[ "$NUM_GPUS" =~ ^(0|[1-9][0-9]*)$ ]]; then
    echo "NUM_GPUS must be a non-negative integer, got: $NUM_GPUS" >&2
    exit 1
fi

if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
    {
        echo "The number of commands must be equal to the number of nodes."
        echo "Number of nodes: $NUM_NODES"
        echo "Number of commands: ${#COMMANDS[@]}"
    } >&2
    exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
    echo "$command"
done

40

41
42
43
44
45
46
# Create the Docker network "docker-net" with subnet 192.168.10.0/24.
# start_nodes attaches every container to this network with a fixed --ip.
start_network() {
    local -a net_args=(--subnet=192.168.10.0/24 docker-net)
    docker network create "${net_args[@]}"
}

# Start NUM_NODES detached containers on docker-net and join them into a
# Ray cluster.
#
# Globals (read): NUM_NODES, NUM_GPUS, IS_ROCM, DOCKER_IMAGE
#
# Node i is given the GPU indices i*NUM_GPUS .. (i+1)*NUM_GPUS-1 and the
# address 192.168.10.(10+i). node0 runs the Ray head; every other node is
# started as a worker that connects to 192.168.10.10:6379.
start_nodes() {
    local node node_gpu gpu_list
    local -a gpu_args

    for ((node = 0; node < NUM_NODES; node++)); do
        # comma-separated list of the GPU indices that belong to this node
        gpu_list=''
        for ((node_gpu = 0; node_gpu < NUM_GPUS; node_gpu++)); do
            gpu_list+="${gpu_list:+,}$((node * NUM_GPUS + node_gpu))"
        done

        # The options are kept in an array so that each one reaches docker as
        # exactly one argument without relying on unquoted word-splitting.
        if [ "$IS_ROCM" -eq 1 ]; then
            gpu_args=(--device /dev/kfd --device /dev/dri -e "HIP_VISIBLE_DEVICES=${gpu_list}")
        else
            # the double quotes are deliberately part of the value handed to
            # --gpus, so the comma-separated list stays a single device spec
            gpu_args=(--gpus "\"device=${gpu_list}\"")
        fi

        # start the container in detached mode
        # things to note:
        # 1. --shm-size=10.24gb is required. don't use --ipc=host
        # 2. pass HF_TOKEN to the container
        # 3. map the huggingface cache directory to the container
        # 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
        #    starting from 192.168.10.11)
        docker run -d "${gpu_args[@]}" --shm-size=10.24gb -e HF_TOKEN \
            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
            --network docker-net --ip "192.168.10.$((10 + node))" --rm "$DOCKER_IMAGE" \
            /bin/bash -c "tail -f /dev/null"

        # organize containers into a ray cluster
        if [ "$node" -eq 0 ]; then
            # start the ray head node
            docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
            # wait for the head node to be ready
            sleep 10
        else
            # start the ray worker nodes, and connect them to the head node
            docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
        fi
    done

    # wait for the cluster to be ready
    sleep 10

    # print the cluster status
    docker exec node0 /bin/bash -c "ray status"
}

# Run one command per node inside the already-running containers.
#
# Globals (read): NUM_NODES, NUM_GPUS, WORKING_DIR, COMMANDS
#
# important: iterate in reverse order to start the head node last
# we start the worker nodes first, in detached mode, and then start the head node
# in the foreground, so that the output of the head node is visible in the buildkite logs
run_nodes() {
    local node node_gpu gpu_list

    for ((node = NUM_NODES - 1; node >= 0; node--)); do
        # GPU indices that belong to this node; used only for the log line
        gpu_list=''
        for ((node_gpu = 0; node_gpu < NUM_GPUS; node_gpu++)); do
            gpu_list+="${gpu_list:+,}$((node * NUM_GPUS + node_gpu))"
        done
        echo "Running node$node with GPU devices: \"device=${gpu_list}\""

        # `cd ... || exit 1` makes the in-container shell stop when the
        # working directory cannot be entered, instead of running the command
        # from whatever directory the shell happened to start in.
        if [ "$node" -ne 0 ]; then
            docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR || exit 1; ${COMMANDS[$node]}"
        else
            docker exec "node$node" /bin/bash -c "cd $WORKING_DIR || exit 1; ${COMMANDS[$node]}"
        fi
    done
}
# Stop every node container and remove the docker-net network.
#
# Globals (read): NUM_NODES
#
# This function is installed as the EXIT trap while `set -e` is active, so
# each step is guarded: a failing `docker stop` (for example because that
# container was never started) is reported on stderr but does not prevent the
# remaining containers from being stopped or the network from being removed.
cleanup() {
    local node
    for ((node = 0; node < NUM_NODES; node++)); do
        docker stop "node$node" || echo "cleanup: failed to stop node$node" >&2
    done
    docker network rm docker-net || echo "cleanup: failed to remove network docker-net" >&2
}
# Entry point: register cleanup as the EXIT trap first, so it runs on every
# exit path, then create the network, start the node containers and run the
# per-node commands.
main() {
    trap cleanup EXIT
    start_network
    start_nodes
    run_nodes
}

main