Commit 01be5357 authored by one's avatar one
Browse files

[rocHPL] Add a script to run containers on multiple nodes

parent 135d8afb
#!/bin/bash
# Configuration for launching one rocHPL container per node.
# Abort as soon as any command fails.
set -e

# ---------------------------------------------------------------------
# Container settings
# ---------------------------------------------------------------------
CONTAINER_NAME="rochpl-dtk26-20260204"
IMAGE_NAME="harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204"
SSH_PORT="3333"       # make sure this port is available
WORKSPACE_DIR="$PWD"  # change this to the rocHPL directory on the master node

# =====================================================================
# Node mapping table
# 1. The current node is the master node; WORKER_CONFIG must not
#    include the master node.
# 2. Column 1 is the physical machine's hostname or IP, column 2 is the
#    hostname used inside the docker container.
# =====================================================================
MASTER_NODE="$(hostname)"
WORKER_CONFIG="
node2 node02
node3 node03
master node04
"
# =====================================================================
# Resolve hosts
# =====================================================================
#######################################
# Resolve a hostname (or IPv4 literal) to a single IP address.
#
# Lookup order:
#   1. a valid IPv4 literal is returned unchanged (no external tools needed)
#   2. `getent ahosts` (first address listed)
#   3. the address printed in the header line of `ping -c 1 -n`
#   4. /etc/hosts, matching the canonical name or ANY alias on a line
#
# Arguments: $1 - hostname or IP address to resolve
# Outputs:   the resolved address on stdout, or an empty line when the
#            target is empty or cannot be resolved
# Returns:   always 0; callers detect failure by testing for empty output
#######################################
resolve_ip() {
  local target=${1:-}
  local ip=""
  local ipv4_re='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
  local octet
  local is_ipv4=0

  # An empty target must never resolve: without this guard the /etc/hosts
  # comparison could match lines whose trailing fields are empty.
  if [ -z "$target" ]; then
    echo ""
    return 0
  fi

  # The node table may list IPs directly; a well-formed IPv4 literal is
  # already the answer.
  if [[ "$target" =~ $ipv4_re ]]; then
    is_ipv4=1
    for octet in "${BASH_REMATCH[@]:1}"; do
      # 10# forces base 10 so octets such as "08" are not parsed as octal.
      (( 10#$octet <= 255 )) || is_ipv4=0
    done
  fi
  if (( is_ipv4 )); then
    echo "$target"
    return 0
  fi

  # Try getent
  if command -v getent >/dev/null 2>&1; then
    ip=$(getent ahosts "$target" 2>/dev/null | awk '{print $1}' | head -n 1)
  fi

  # Try ping: the header line looks like "PING name (a.b.c.d) ...".
  if [ -z "$ip" ] && command -v ping >/dev/null 2>&1; then
    ip=$(ping -c 1 -n "$target" 2>/dev/null | awk -F'[()]' '/PING/{print $2}')
  fi

  # Fallback to /etc/hosts: strip comments, then compare the target with
  # every name on the line (field 1 is the address).
  if [ -z "$ip" ] && [ -r /etc/hosts ]; then
    ip=$(awk -v h="$target" '
      { sub(/#.*/, "") }
      {
        for (i = 2; i <= NF; i++) {
          if ($i == h) { print $1; exit }
        }
      }
    ' /etc/hosts)
  fi

  echo "$ip"
}
# ---------------------------------------------------------------------
# Build the host mapping passed to every container via --add-host.
# The master (this host) is always known as node01 inside the containers;
# each worker gets the container hostname from column 2 of WORKER_CONFIG.
# Produces: MASTER_IP, DOCKER_ADD_HOSTS, WORKER_NODE_ARR, WORKER_NODES,
#           ALL_NODES.
# ---------------------------------------------------------------------
MASTER_IP=$(resolve_ip "${MASTER_NODE}")
# Fail early, like the workers do: an empty IP would produce an invalid
# "--add-host node01:" argument.
if [ -z "$MASTER_IP" ]; then
  echo "Error: failed to resolve IP for master node '${MASTER_NODE}'!" >&2
  exit 1
fi
DOCKER_ADD_HOSTS="--add-host node01:${MASTER_IP}"
echo "Master node: ${MASTER_NODE} -> node01 (${MASTER_IP})"

WORKER_NODE_ARR=()
# "rest" absorbs any extra columns so they do not end up in std_name.
while read -r phys_host std_name rest; do
  # Skip blank lines and comment lines.
  if [[ -z "$phys_host" || "$phys_host" == \#* ]]; then
    continue
  fi
  # Column 2 is mandatory; without it the mapping would be "--add-host :IP".
  if [[ -z "$std_name" || "$std_name" == \#* ]]; then
    echo "Error: missing container hostname for worker node '$phys_host'!" >&2
    exit 1
  fi
  IP=$(resolve_ip "$phys_host")
  if [ -z "$IP" ]; then
    echo "Error: failed to resolve IP for worker node '$phys_host'!" >&2
    exit 1
  fi
  DOCKER_ADD_HOSTS="${DOCKER_ADD_HOSTS} --add-host ${std_name}:${IP}"
  WORKER_NODE_ARR+=("$phys_host")
  echo "Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG"

# The same node01 mapping is handed to the worker containers, so a loopback
# master address would make node01 point at each worker itself.
if [ "${#WORKER_NODE_ARR[@]}" -gt 0 ] \
  && [[ "$MASTER_IP" == 127.* || "$MASTER_IP" == "::1" ]]; then
  echo "Warning: master node '${MASTER_NODE}' resolved to loopback address ${MASTER_IP}; worker containers will not be able to reach node01." >&2
fi

# Comma-separated host lists in the form expected by "pdsh -w".
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")
ALL_NODES="${MASTER_NODE}"
if [ -n "$WORKER_NODES" ]; then
  ALL_NODES="${ALL_NODES},${WORKER_NODES}"
fi
# =====================================================================
# Start the containers
#   [1/3] pull the image on every node
#   [2/3] with "-f" as the first argument, remove an existing container
#   [3/3] start the container locally (with the workspace bind-mounted)
#         and on every worker node (without the workspace mount)
# =====================================================================
echo "[1/3] Pulling the image on each node..."
# -S makes pdsh return the largest remote exit status, so a failed pull on
# any node stops the script (set -e).
pdsh -w "${ALL_NODES}" -S "docker pull ${IMAGE_NAME} >/dev/null"

if [ "${1:-}" == "-f" ]; then
  echo "[2/3] Force cleaning up old container..."
  # Best effort: a node without the container must not abort the run.
  pdsh -w "${ALL_NODES}" -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"
else
  echo "[2/3] Skipping cleanup."
fi

echo "[3/3] Starting containers..."
# Quoted so that a workspace path containing spaces stays a single argument.
mkdir -p -- "${WORKSPACE_DIR}"

# Options shared by the local and the remote "docker run". Kept as one flat
# string because it is also embedded in the remote pdsh command line.
DOCKER_ARGS="--name=${CONTAINER_NAME} \
-v /opt/hyhal:/opt/hyhal:ro \
-v /root/.ssh:/root/.ssh \
-w /workspace \
${DOCKER_ADD_HOSTS} \
--network=host \
--ipc=host \
--device=/dev/kfd \
--device=/dev/mkfd \
--device=/dev/dri \
--shm-size=512G \
--privileged \
--group-add video \
--cap-add=SYS_PTRACE \
-u root \
--security-opt seccomp=unconfined"

# Command run inside every container: start sshd on SSH_PORT, then keep the
# container alive. It must not contain single quotes because it is wrapped
# in single quotes for the remote launch below.
CONTAINER_CMD="mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

# Master node: same options plus the workspace bind mount.
# shellcheck disable=SC2086 # DOCKER_ARGS must word-split into separate options
docker run -itd \
  ${DOCKER_ARGS} \
  -v "${WORKSPACE_DIR}:/workspace" \
  "${IMAGE_NAME}" \
  bash -c "${CONTAINER_CMD}"

# Worker nodes: the command string is expanded locally and executed remotely.
if [ -n "$WORKER_NODES" ]; then
  pdsh -w "${WORKER_NODES}" -S "docker run -itd ${DOCKER_ARGS} ${IMAGE_NAME} bash -c '${CONTAINER_CMD}'"
fi

echo "All containers are ready!"
echo "To access the container on the current node, run:"
echo "docker exec -it ${CONTAINER_NAME} bash"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment