#!/bin/bash
#
# docker-cluster-up.sh
#
# Starts one identically named Docker container on the current (master) node
# and, through pdsh, on every worker node, so the containers can reach each
# other by fixed hostnames over SSH.
#
# Abort on the first command that fails.
set -e

# =====================================================================
# Node mapping table
# 1. The current node is the master; WORKER_CONFIG must not include it.
# 2. Column 1 is the physical machine's hostname or IP; column 2 is the
#    hostname that node is given inside the Docker containers.
# =====================================================================
MASTER_NODE=$(hostname)
DOCKER_MASTER=node01
WORKER_CONFIG="
node2    node02
node3    node03
master   node04
"
# =====================================================================
# Default values
# =====================================================================
# Behaviour switches (0 = off, 1 = on).
FORCE_RM=0
SKIP_PULL=0

# Make sure this port is available.
SSH_PORT="3333"

# Change this to the master node's application directory
# (rocHPL, rocHPCG, etc.).
WORKDIR="$PWD"

CONTAINER_NAME="cluster-dtk26-20260204"
IMAGE_NAME="harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204"

# Print the command-line synopsis to stdout, showing the current values of
# IMAGE_NAME, CONTAINER_NAME and SSH_PORT as defaults, then terminate the
# script with exit status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [-f|--force] [-i|--image IMAGE] [-n|--name NAME] [-p|--port PORT] [-w|--workdir WORKDIR]" \
    "  -f, --force              Force remove existing container before starting" \
    "  -i, --image    IMAGE     Docker image name (default: ${IMAGE_NAME})" \
    "  -n, --name     NAME      Container name across all nodes (default: ${CONTAINER_NAME})" \
    "  -p, --port     PORT      SSH port inside container (default: ${SSH_PORT})" \
    "  -w, --workdir  WORKDIR   Workspace directory to mount for the master node (default: \$PWD)" \
    "  --no-pull                Skip pulling the image" \
    "  -h, --help               Show this help message"
  exit 1
}

# Parse command-line options. Long options need the enhanced (util-linux)
# getopt; on a parse error getopt reports the problem and usage() exits.
# -h/--help is registered here so the documented help flag is handled
# directly instead of going through getopt's "invalid option" error path.
OPTS=$(getopt -o fhi:n:p:w: --long force,help,image:,name:,port:,workdir:,no-pull -n "$0" -- "$@") || usage

# getopt emits a shell-quoted argument list; eval re-reads it so that values
# containing spaces stay single positional parameters.
eval set -- "${OPTS}"

while true; do
  case "$1" in
    -f|--force)      FORCE_RM=1; shift ;;
    -h|--help)       usage ;;
    -i|--image)      IMAGE_NAME="$2"; shift 2 ;;
    -n|--name)       CONTAINER_NAME="$2"; shift 2 ;;
    -p|--port)       SSH_PORT="$2"; shift 2 ;;
    -w|--workdir)    WORKDIR="$2"; shift 2 ;;
    --no-pull)       SKIP_PULL=1; shift ;;
    --)              shift; break ;;
    *)               usage ;;
  esac
done
# =====================================================================
# Host resolution
# =====================================================================
#######################################
# Resolve a hostname (or IP literal) to an IP address.
# Lookup order: IPv4 literal passthrough, `getent ahosts`, the address
# printed in the header line of `ping`, then a direct scan of /etc/hosts.
# Arguments: $1 - hostname or IP address to resolve
# Outputs:   the resolved address on stdout; an empty line if every
#            method failed (callers must check for an empty result)
# Returns:   always 0
#######################################
resolve_ip() {
  local target=$1
  local ip=""

  # The node table may list an IP instead of a hostname; nothing to look up.
  if [[ "$target" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then
    echo "$target"
    return 0
  fi

  # Try getent; `|| true` keeps a failed lookup from aborting under set -e.
  if command -v getent >/dev/null 2>&1; then
    ip=$(getent ahosts "$target" 2>/dev/null | awk '{print $1}' | head -n 1 || true)
  fi

  # Try ping: the address is the parenthesised part of its "PING ..." line.
  if [ -z "$ip" ] && command -v ping >/dev/null 2>&1; then
    ip=$(ping -c 1 -n "$target" 2>/dev/null | awk -F'[()]' '/PING/{print $2}' || true)
  fi

  # Fall back to /etc/hosts. Every name column is checked (not only the
  # first two), and anything after an inline '#' comment is ignored.
  if [ -z "$ip" ] && [ -r /etc/hosts ]; then
    ip=$(awk -v h="$target" '
      /^[[:space:]]*#/ { next }
      {
        for (i = 2; i <= NF; i++) {
          if ($i ~ /^#/) break
          if ($i == h) { print $1; exit }
        }
      }' /etc/hosts || true)
  fi

  echo "$ip"
}

# Resolve the master node and start the --add-host option list with its
# container hostname. An empty result is fatal: it would otherwise produce
# an invalid "--add-host <name>:" option for every container.
MASTER_IP=$(resolve_ip "${MASTER_NODE}")
if [ -z "${MASTER_IP}" ]; then
  echo "[Error] Failed to resolve IP for master node '${MASTER_NODE}'!" >&2
  exit 1
fi

# A loopback address would make every other node map the master's container
# hostname to itself, so flag it instead of silently continuing.
case "${MASTER_IP}" in
  127.*|::1)
    echo "[WARN] Master node '${MASTER_NODE}' resolved to loopback address ${MASTER_IP}; other nodes will not be able to reach it." >&2
    ;;
esac

DOCKER_ADD_HOSTS="--add-host ${DOCKER_MASTER}:${MASTER_IP}"
echo "[INFO] Master node: ${MASTER_NODE} -> ${DOCKER_MASTER} (${MASTER_IP})"

# Walk the WORKER_CONFIG table: each non-blank, non-comment line gives a
# physical host (column 1) and its container hostname (column 2). Every
# worker contributes one --add-host entry and is collected for pdsh.
WORKER_NODE_ARR=()
while read -r phys_host std_name _; do
  # Skip blank lines and lines starting with '#'.
  [[ -z "$phys_host" || "$phys_host" == \#* ]] && continue

  # A line with a single column would yield a broken "--add-host :IP".
  if [ -z "$std_name" ]; then
    echo "[Error] Missing container hostname for worker node '$phys_host'!" >&2
    exit 1
  fi

  IP=$(resolve_ip "$phys_host")
  if [ -z "$IP" ]; then
    echo "[Error] Failed to resolve IP for worker node '$phys_host'!" >&2
    exit 1
  fi

  DOCKER_ADD_HOSTS="${DOCKER_ADD_HOSTS} --add-host ${std_name}:${IP}"
  WORKER_NODE_ARR+=("$phys_host")

  echo "[INFO] Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG"

# Comma-separated host lists in the form passed to `pdsh -w`; IFS is changed
# only inside the command-substitution subshell.
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")

ALL_NODES="${MASTER_NODE}"
[ -n "$WORKER_NODES" ] && ALL_NODES="${ALL_NODES},${WORKER_NODES}"

# =====================================================================
# Start containers
# =====================================================================
# Pull the image on the master and all workers unless --no-pull was given.
if [ "${SKIP_PULL}" == "1" ]; then
  echo "[INFO] Image pulling skipped."
else
  echo "[INFO] Pulling the image on each node..."
  # With -S pdsh returns the largest remote exit status, so a failed pull on
  # any node stops the script under `set -e`.
  pdsh -w "${ALL_NODES}" -S "docker pull ${IMAGE_NAME} >/dev/null"
fi
# With -f/--force, remove any container of the same name on every node first.
if [ "${FORCE_RM}" == "1" ]; then
  echo "[INFO] Force removing existing containers..."
  # Best effort: `|| true` keeps a node without such a container from
  # counting as a failure.
  pdsh -w "${ALL_NODES}" -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"
fi
echo "[INFO] Starting docker containers..."
# Make sure the master's workspace exists before it is bind-mounted. Quoted
# so a path containing spaces is created as one directory; `--` guards
# against a path that begins with '-'.
mkdir -p -- "${WORKDIR}"
# `docker run` options shared by the master and worker containers.
# DOCKER_ARGS stays a single space-separated string because it is both
# word-split for the local `docker run` and interpolated into the remote
# pdsh command line. DOCKER_ADD_HOSTS is deliberately left unquoted so its
# "--add-host name:ip" pairs become separate tokens.
docker_opts=(
  "--name=${CONTAINER_NAME}"
  -v /opt/hyhal:/opt/hyhal:ro
  -v /root/.ssh:/root/.ssh
  -w /workspace
  ${DOCKER_ADD_HOSTS}
  --network=host
  --ipc=host
  --device=/dev/kfd
  --device=/dev/mkfd
  --device=/dev/dri
  --shm-size=512G
  --privileged
  --group-add video
  --cap-add=SYS_PTRACE
  -u root
  --security-opt seccomp=unconfined
)
DOCKER_ARGS="${docker_opts[*]}"
unset docker_opts

# Start the container on the current (master) node. Only this container
# bind-mounts WORKDIR at /workspace. Its command creates /run/sshd, starts
# sshd on SSH_PORT and then sleeps forever to keep the container alive.
# DOCKER_ARGS is a flat option string and must be word-split, so it is the
# one expansion intentionally left unquoted.
# shellcheck disable=SC2086
docker run -itd \
  ${DOCKER_ARGS} \
  -v "${WORKDIR}:/workspace" \
  "${IMAGE_NAME}" \
  bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

# Start the same container on every worker node through pdsh. Workers get
# no WORKDIR bind mount; the rest of the command line matches the master's.
# The whole `docker run` line is expanded locally and sent as one remote
# command string, hence the single quotes around the inner bash -c script.
if [ -n "$WORKER_NODES" ]; then
  pdsh -w "${WORKER_NODES}" -S "docker run -itd \
    ${DOCKER_ARGS} \
    ${IMAGE_NAME} \
    bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity'"
fi
# Closing summary: tell the operator how to enter the local container.
printf '%s\n' \
  "[INFO] All containers are ready!" \
  "[INFO] To access the container on the current node, run:" \
  "       docker exec -it ${CONTAINER_NAME} bash"