#!/bin/bash
# Configuration for starting the same container on the current (master) node
# and on every worker node listed in WORKER_CONFIG.
set -e

IMAGE_NAME=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME=rochpl-dtk26-20260204
SSH_PORT=3333           # make sure this port is available
WORKSPACE_DIR="${PWD}"  # change to the rocHPL directory on the master node

# =====================================================================
# Node mapping table
# 1. The current node is the master; WORKER_CONFIG does not include it
# 2. Column 1 is the physical machine's hostname or IP, column 2 is the
#    hostname used for that node inside the docker containers
# =====================================================================
MASTER_NODE=$(hostname)
DOCKER_MASTER=node01
WORKER_CONFIG="
node2    node02
node3    node03
master   node04
"

# =====================================================================
# Host resolution
# =====================================================================
#######################################
# Resolve a hostname (or IPv4 literal) to an IP address.
# Lookup order: IPv4 literal passthrough, `getent ahosts`, the header line
# printed by `ping`, then a scan of /etc/hosts. Failures of an individual
# method are deliberately swallowed so the next method can be tried.
# Arguments: $1 - hostname or IP address to resolve
# Outputs:   the first address found on stdout; an empty line when nothing
#            resolved (callers must check for an empty result)
#######################################
resolve_ip() {
  local target=$1
  local ip=""
  local ipv4_re='^[0-9]{1,3}(\.[0-9]{1,3}){3}$'

  # The node table allows an IP in the first column; no lookup is needed then.
  if [[ "$target" =~ $ipv4_re ]]; then
    echo "$target"
    return 0
  fi

  # Try getent
  if command -v getent >/dev/null 2>&1; then
    ip=$(getent ahosts "$target" 2>/dev/null | awk 'NR == 1 {print $1; exit}' || true)
  fi

  # Try ping: only the "PING host (addr)" header is parsed, so a short reply
  # timeout (-W 1) keeps unreachable hosts from stalling the script.
  if [ -z "$ip" ] && command -v ping >/dev/null 2>&1; then
    ip=$(ping -c 1 -W 1 -n "$target" 2>/dev/null | awk -F'[()]' '/PING/ {print $2; exit}' || true)
  fi

  # Fallback to /etc/hosts: strip comments, then match the target against
  # every name column (canonical name and all aliases).
  if [ -z "$ip" ]; then
    ip=$(awk -v h="$target" '
      { sub(/#.*/, "") }
      { for (i = 2; i <= NF; i++) if ($i == h) { print $1; exit } }
    ' /etc/hosts 2>/dev/null || true)
  fi

  echo "$ip"
}

# Resolve the master first; DOCKER_ADD_HOSTS accumulates one
# "--add-host <container-hostname>:<ip>" option per node.
MASTER_IP=$(resolve_ip "${MASTER_NODE}")
if [ -z "$MASTER_IP" ]; then
  # An empty address would yield an invalid "--add-host name:" option.
  echo "Error: failed to resolve IP for master node '${MASTER_NODE}'!" >&2
  exit 1
fi
if [[ "$MASTER_IP" == 127.* || "$MASTER_IP" == "::1" ]]; then
  # NOTE(review): some distros map the local hostname to a loopback address
  # in /etc/hosts; worker containers cannot reach the master through it.
  echo "Warning: master node '${MASTER_NODE}' resolved to loopback address ${MASTER_IP}" >&2
fi
DOCKER_ADD_HOSTS="--add-host ${DOCKER_MASTER}:${MASTER_IP}"
echo "Master node: ${MASTER_NODE} -> ${DOCKER_MASTER} (${MASTER_IP})"

# Walk WORKER_CONFIG line by line: column 1 is the physical host, column 2 the
# container hostname; any further columns are ignored.
WORKER_NODE_ARR=()
while read -r phys_host std_name _; do
  # Skip blank lines and lines starting with '#'.
  if [[ -z "$phys_host" || "$phys_host" == \#* ]]; then
    continue
  fi

  if [ -z "$std_name" ]; then
    echo "Error: missing container hostname for worker node '$phys_host'!" >&2
    exit 1
  fi

  IP=$(resolve_ip "$phys_host")
  if [ -z "$IP" ]; then
    echo "Error: failed to resolve IP for worker node '$phys_host'!" >&2
    exit 1
  fi

  DOCKER_ADD_HOSTS="${DOCKER_ADD_HOSTS} --add-host ${std_name}:${IP}"
  WORKER_NODE_ARR+=("$phys_host")

  echo "Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG"

# Comma-separated host lists in the form expected by `pdsh -w`.
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")

ALL_NODES="${MASTER_NODE}"
if [ -n "$WORKER_NODES" ]; then
  ALL_NODES="${ALL_NODES},${WORKER_NODES}"
fi

# =====================================================================
# Start containers
# =====================================================================
echo "[1/3] Pulling the image on each node..."
pdsh -w "${ALL_NODES}" -S "docker pull ${IMAGE_NAME} >/dev/null"

# Passing -f as the first argument force-removes an existing container with
# the same name on every node before starting new ones.
if [[ "${1:-}" == "-f" ]]; then
  echo "[2/3] Force cleaning up old container..."
  pdsh -w "${ALL_NODES}" -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"
else
  echo "[2/3] Skipping cleanup."
fi

echo "[3/3] Starting containers..."
mkdir -p -- "${WORKSPACE_DIR}"

# Options shared by the master and worker containers. Kept as one flat string
# because it is also embedded verbatim in the remote pdsh command line.
DOCKER_ARGS="--name=${CONTAINER_NAME} \
  -v /opt/hyhal:/opt/hyhal:ro \
  -v /root/.ssh:/root/.ssh \
  -w /workspace \
  ${DOCKER_ADD_HOSTS} \
  --network=host \
  --ipc=host \
  --device=/dev/kfd \
  --device=/dev/mkfd \
  --device=/dev/dri \
  --shm-size=512G \
  --privileged \
  --group-add video \
  --cap-add=SYS_PTRACE \
  -u root \
  --security-opt seccomp=unconfined"

# Master container: the only one that bind-mounts WORKSPACE_DIR. The path is
# quoted so a working directory containing spaces stays a single argument.
# shellcheck disable=SC2086 # DOCKER_ARGS must be word-split into options
docker run -itd \
  ${DOCKER_ARGS} \
  -v "${WORKSPACE_DIR}:/workspace" \
  "${IMAGE_NAME}" \
  bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

# Worker containers: same options, started through pdsh, without the
# workspace bind mount. SSH_PORT is expanded locally before the command is sent.
if [ -n "$WORKER_NODES" ]; then
  pdsh -w "${WORKER_NODES}" -S "docker run -itd \
    ${DOCKER_ARGS} \
    ${IMAGE_NAME} \
    bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity'"
fi

echo "All containers are ready!"
echo "To access the container on the current node, run:"
echo "docker exec -it ${CONTAINER_NAME} bash"