docker-cluster-up.sh 4.95 KB
Newer Older
1
2
3
4
5
6
7
8
9
#!/bin/bash
set -e

# =====================================================================
# Node mapping table
# 1. The node running this script is the master node; WORKER_CONFIG must
#    not list the master node.
# 2. Column 1 is the physical machine's hostname or IP; column 2 is the
#    hostname that node gets inside the Docker containers.
# =====================================================================
# `uname -n` is the fallback for hosts without the hostname utility.
MASTER_NODE=$(hostname 2>/dev/null || uname -n)
DOCKER_MASTER=node01
WORKER_CONFIG="
node2    node02
node3    node03
master   node04
"

# =====================================================================
# Defaults and command-line arguments
# =====================================================================
# Each default below can be overridden by the matching command-line option.
IMAGE_NAME=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME=cluster-dtk26-20260204
SSH_PORT=3333     # make sure this port is available
WORKDIR="${PWD}"  # change to the application directory (rocHPL/rocHPCG, etc.) on the master node
FORCE_RM=0        # 1 = remove an existing container of the same name first
SKIP_PULL=0       # 1 = do not pull the image before starting

#######################################
# Print the command-line help and terminate the script.
# Globals:   IMAGE_NAME, CONTAINER_NAME, SSH_PORT (read, shown as defaults)
# Outputs:   help text on stdout
# Returns:   never returns; exits the shell with status 1
#######################################
usage() {
  # One printf call, one argument per output line; the workdir line is
  # single-quoted so that "$PWD" is shown literally.
  printf '%s\n' \
    "Usage: $0 [-f|--force] [-i|--image IMAGE] [-n|--name NAME] [-p|--port PORT] [-w|--workdir WORKDIR]" \
    "  -f, --force              Force remove existing container before starting" \
    "  -i, --image    IMAGE     Docker image name (default: ${IMAGE_NAME})" \
    "  -n, --name     NAME      Container name across all nodes (default: ${CONTAINER_NAME})" \
    "  -p, --port     PORT      SSH port inside container (default: ${SSH_PORT})" \
    '  -w, --workdir  WORKDIR   Workspace directory to mount for the master node (default: $PWD)' \
    "  --no-pull                Skip pulling the image" \
    "  -h, --help               Show this help message"
  exit 1
}

# Parse the command line with getopt (long options are required).
# -h/--help is registered explicitly: usage() advertises it, and without the
# registration it was only reachable through getopt's "invalid option" error.
OPTS=$(getopt -o fi:n:p:w:h --long force,image:,name:,port:,workdir:,no-pull,help -n "$0" -- "$@") || usage
# getopt prints a shell-quoted argument list; eval turns it back into "$@".
eval set -- "${OPTS}"

while true; do
  case "$1" in
    -f|--force)      FORCE_RM=1; shift ;;
    -i|--image)      IMAGE_NAME="$2"; shift 2 ;;
    -n|--name)       CONTAINER_NAME="$2"; shift 2 ;;
    -p|--port)       SSH_PORT="$2"; shift 2 ;;
    -w|--workdir)    WORKDIR="$2"; shift 2 ;;
    --no-pull)       SKIP_PULL=1; shift ;;
    -h|--help)       usage ;;
    --)              shift; break ;;  # end of options
    *)               usage ;;
  esac
done

# =====================================================================
# Resolve hosts
# =====================================================================
#######################################
# Resolve a hostname (or IP literal) to a single IP address.
# Lookup order: `getent ahosts`, then the address printed in ping's
# "PING" banner, then a scan of the hosts file.
# Arguments:
#   $1 - hostname or IP to resolve
#   $2 - optional hosts file used by the last fallback (default: /etc/hosts)
# Outputs:
#   The first address found on stdout, or an empty line when nothing matched.
# Returns:
#   0 always; callers detect failure by testing for empty output.
#######################################
resolve_ip() {
  local target=${1:-}
  local hosts_file=${2:-/etc/hosts}
  local ip=""

  # Nothing to look up: report "unresolved" without invoking any resolver.
  if [ -z "$target" ]; then
    printf '\n'
    return 0
  fi

  # Try getent
  if command -v getent >/dev/null 2>&1; then
    ip=$(getent ahosts "$target" 2>/dev/null | awk 'NR == 1 {print $1; exit}' || true)
  fi

  # Try ping: only the banner "PING name (addr)" is needed, so a single
  # probe with a short reply timeout is enough.
  if [ -z "$ip" ] && command -v ping >/dev/null 2>&1; then
    ip=$(ping -c 1 -W 1 -n "$target" 2>/dev/null | awk -F'[()]' '/PING/{print $2; exit}' || true)
  fi

  # Fallback to the hosts file: strip comments (whole-line and trailing),
  # then match the target against every name column, not just the first two.
  # A failing awk is tolerated so the caller can print its own error.
  if [ -z "$ip" ] && [ -r "$hosts_file" ]; then
    ip=$(awk -v h="$target" '
      { sub(/#.*/, "") }
      { for (i = 2; i <= NF; i++) if ($i == h) { print $1; exit } }
    ' "$hosts_file" 2>/dev/null || true)
  fi

  printf '%s\n' "$ip"
}

# Resolve the master node; its mapping opens the --add-host list that is
# later extended with one entry per worker.
MASTER_IP=$(resolve_ip "${MASTER_NODE}")
if [ -z "${MASTER_IP}" ]; then
  # Same failure handling as for workers: an empty address would otherwise
  # produce a malformed "--add-host name:" option.
  echo "[Error] Failed to resolve IP for master node '${MASTER_NODE}'!" >&2
  exit 1
fi
case "${MASTER_IP}" in
  127.*|::1)
    # A loopback address is only meaningful on this machine; containers on
    # other nodes would map the master name to themselves.
    echo "[WARN] Master node '${MASTER_NODE}' resolved to loopback address ${MASTER_IP}; other nodes cannot reach it via '${DOCKER_MASTER}'." >&2
    ;;
esac
DOCKER_ADD_HOSTS="--add-host ${DOCKER_MASTER}:${MASTER_IP}"
echo "[INFO] Master node: ${MASTER_NODE} -> ${DOCKER_MASTER} (${MASTER_IP})"

# Walk WORKER_CONFIG row by row: column 1 is the physical host, column 2 the
# container hostname. Each valid row adds an --add-host mapping and records
# the physical host for pdsh.
WORKER_NODE_ARR=()
# "rest" swallows any extra columns so they do not end up in std_name.
while read -r phys_host std_name rest; do
  # Skip blank lines and comment lines.
  if [[ -z "$phys_host" || "$phys_host" == \#* ]]; then
    continue
  fi

  # A row without the second column would yield "--add-host :IP".
  if [[ -z "$std_name" ]]; then
    echo "[Error] Missing container hostname for worker node '$phys_host' in WORKER_CONFIG!" >&2
    exit 1
  fi

  IP=$(resolve_ip "$phys_host")
  if [[ -z "$IP" ]]; then
    echo "[Error] Failed to resolve IP for worker node '$phys_host'!" >&2
    exit 1
  fi

  DOCKER_ADD_HOSTS="${DOCKER_ADD_HOSTS} --add-host ${std_name}:${IP}"
  WORKER_NODE_ARR+=("$phys_host")

  echo "[INFO] Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG"

# Comma-separated host lists in the form pdsh -w expects.
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")

ALL_NODES="${MASTER_NODE}"
if [[ -n "$WORKER_NODES" ]]; then
  ALL_NODES="${ALL_NODES},${WORKER_NODES}"
fi

# =====================================================================
# Start containers
# =====================================================================
# Pull the image on every node unless --no-pull was given.
if [[ "${SKIP_PULL}" == "1" ]]; then
  echo "[INFO] Image pulling skipped."
else
  echo "[INFO] Pulling the image on each node..."
  # pdsh -S returns the largest remote exit status, so a failed pull on any
  # node stops the script through set -e.
  pdsh -w "${ALL_NODES}" -S "docker pull ${IMAGE_NAME} >/dev/null"
fi

# With --force, remove any container of the same name on every node first.
if [[ "${FORCE_RM}" == "1" ]]; then
  echo "[INFO] Force removing existing containers..."
  # "|| true": a node without such a container is not an error.
  pdsh -w "${ALL_NODES}" -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"
fi

echo "[INFO] Starting docker containers..."
# Quoted so that a workspace path containing spaces stays one argument.
mkdir -p -- "${WORKDIR}"
# Docker interprets a non-absolute -v source as a named volume, so turn a
# relative --workdir into an absolute path before it is used as a bind mount.
WORKDIR=$(CDPATH='' cd -- "${WORKDIR}" && pwd)
DOCKER_ARGS="--name=${CONTAINER_NAME} \
  -v /opt/hyhal:/opt/hyhal:ro \
  -v /root/.ssh:/root/.ssh \
  -w /workspace \
  ${DOCKER_ADD_HOSTS} \
  --network=host \
  --ipc=host \
  --device=/dev/kfd \
  --device=/dev/mkfd \
  --device=/dev/dri \
  --shm-size=512G \
  --privileged \
  --group-add video \
  --cap-add=SYS_PTRACE \
  -u root \
  --security-opt seccomp=unconfined"

# Start the container on the current (master) node with the workspace
# directory bind-mounted at /workspace. The container launches sshd on
# SSH_PORT and then idles so it stays up.
# shellcheck disable=SC2086 # DOCKER_ARGS is a flat option string that must word-split
docker run -itd \
  ${DOCKER_ARGS} \
  -v "${WORKDIR}:/workspace" \
  "${IMAGE_NAME}" \
  bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

# Start the same container on every worker through pdsh. Workers get no
# /workspace bind mount. All variables are expanded locally, before the
# command string is sent to the remote shells.
if [[ -n "${WORKER_NODES}" ]]; then
  pdsh -w "${WORKER_NODES}" -S "docker run -itd \
    ${DOCKER_ARGS} \
    ${IMAGE_NAME} \
    bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity'"
fi

# Closing summary with the command for entering the local container.
printf '%s\n' \
  "[INFO] All containers are ready!" \
  "[INFO] To access the container on the current node, run:" \
  "       docker exec -it ${CONTAINER_NAME} bash"