#!/bin/bash
#
# Start one docker container per cluster node (the current node acts as the
# master, the workers are taken from WORKER_CONFIG). Every container is given
# --add-host entries that map the container hostnames to the node IPs and runs
# an sshd listening on SSH_PORT.
#
# External tools used: getopt (with long-option support), docker, and pdsh
# (for the image pull, the forced removal and the worker start-up).
set -e

# =====================================================================
# Node mapping table
#   1. The current node is the master; WORKER_CONFIG must not list it.
#   2. Column 1 is the physical machine's hostname or IP, column 2 is the
#      hostname that node gets inside the docker containers.
#   Blank lines and lines starting with '#' are ignored.
# =====================================================================
MASTER_NODE=$(hostname)
DOCKER_MASTER=node01
WORKER_CONFIG="
node2 node02
node3 node03
master node04
"

# =====================================================================
# Defaults (overridable from the command line, see usage)
# =====================================================================
IMAGE_NAME=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME=cluster-dtk26-20260204
SSH_PORT=3333     # make sure this port is free on every node
WORKDIR="${PWD}"  # set to the master node's rocHPL/rocHPCG/... application directory
FORCE_RM=0
SKIP_PULL=0

#######################################
# Print the command-line help and exit.
# Arguments: $1 - exit status (default: 1). With status 0 the text goes to
#                 stdout, otherwise to stderr.
#######################################
usage() {
  local status=${1:-1}
  local fd=1
  if [[ "${status}" != "0" ]]; then
    fd=2
  fi
  {
    echo "Usage: $0 [-f|--force] [-i|--image IMAGE] [-n|--name NAME] [-p|--port PORT] [-w|--workdir WORKDIR]"
    echo "  -f, --force            Force remove existing container before starting"
    echo "  -i, --image IMAGE      Docker image name (default: ${IMAGE_NAME})"
    echo "  -n, --name NAME        Container name across all nodes (default: ${CONTAINER_NAME})"
    echo "  -p, --port PORT        SSH port inside container (default: ${SSH_PORT})"
    echo "  -w, --workdir WORKDIR  Workspace directory to mount for the master node (default: \$PWD)"
    echo "      --no-pull          Skip pulling the image"
    echo "  -h, --help             Show this help message"
  } >&"${fd}"
  exit "${status}"
}

# -h/--help must be declared here as well, otherwise getopt rejects the very
# option the help text advertises.
OPTS=$(getopt -o fhi:n:p:w: --long force,help,image:,name:,port:,workdir:,no-pull -n "$0" -- "$@") || usage
# getopt emits a shell-quoted argument list; `eval set --` is the documented
# way to load it back into the positional parameters.
eval set -- "${OPTS}"

while true; do
  case "$1" in
    -f|--force)   FORCE_RM=1; shift ;;
    -h|--help)    usage 0 ;;
    -i|--image)   IMAGE_NAME="$2"; shift 2 ;;
    -n|--name)    CONTAINER_NAME="$2"; shift 2 ;;
    -p|--port)    SSH_PORT="$2"; shift 2 ;;
    -w|--workdir) WORKDIR="$2"; shift 2 ;;
    --no-pull)    SKIP_PULL=1; shift ;;
    --)           shift; break ;;
    *)            usage ;;
  esac
done

# The port is spliced into the command line executed inside the containers,
# so reject anything that is not a plain TCP port number.
if ! [[ "${SSH_PORT}" =~ ^[0-9]+$ ]] \
  || (( 10#${SSH_PORT} < 1 || 10#${SSH_PORT} > 65535 )); then
  echo "[Error] Invalid SSH port '${SSH_PORT}' (expected an integer in 1-65535)." >&2
  exit 1
fi
SSH_PORT=$((10#${SSH_PORT}))

# =====================================================================
# Host resolution
# =====================================================================

#######################################
# Resolve a hostname (or IP) to an address.
# Tries, in order: `getent ahosts`, the address printed in the banner of
# `ping -c 1 -n`, and finally a lookup in /etc/hosts.
# Arguments: $1 - hostname or IP to resolve
# Outputs:   the first address found on stdout; an empty line if none
# Returns:   0 (callers must test the output for emptiness)
#######################################
resolve_ip() {
  local target=$1
  local ip=""

  # 1) Name service switch.
  if command -v getent >/dev/null 2>&1; then
    ip=$(getent ahosts "$target" 2>/dev/null | awk '{print $1; exit}' || true)
  fi

  # 2) First line of ping output: "PING host (a.b.c.d) ...".
  if [[ -z "$ip" ]] && command -v ping >/dev/null 2>&1; then
    ip=$(ping -c 1 -n "$target" 2>/dev/null \
      | awk -F'[()]' '/PING/{print $2; exit}' || true)
  fi

  # 3) /etc/hosts: match the canonical name or any alias; inline comments
  #    are stripped so that words after '#' can never match.
  if [[ -z "$ip" ]]; then
    ip=$(awk -v h="$target" '
      { sub(/#.*/, "") }
      NF >= 2 {
        for (i = 2; i <= NF; i++) {
          if ($i == h) { print $1; exit }
        }
      }' /etc/hosts 2>/dev/null || true)
  fi

  echo "$ip"
}

MASTER_IP=$(resolve_ip "${MASTER_NODE}")
if [[ -z "${MASTER_IP}" ]]; then
  echo "[Error] Failed to resolve IP for master node '${MASTER_NODE}'!" >&2
  exit 1
fi
case "${MASTER_IP}" in
  127.*|::1)
    # A loopback address would be handed to the workers' containers as the
    # master's address, where it points back at the worker itself.
    echo "[WARN] Master node '${MASTER_NODE}' resolved to loopback address ${MASTER_IP}; workers will not be able to reach it via '${DOCKER_MASTER}'." >&2
    ;;
esac

ADD_HOST_ARGS=(--add-host "${DOCKER_MASTER}:${MASTER_IP}")
echo "[INFO] Master node: ${MASTER_NODE} -> ${DOCKER_MASTER} (${MASTER_IP})"

WORKER_NODE_ARR=()
while read -r phys_host std_name _; do
  # Skip blank lines and comment lines.
  if [[ -z "$phys_host" || "$phys_host" == \#* ]]; then
    continue
  fi
  if [[ -z "$std_name" ]]; then
    echo "[Error] Missing container hostname for worker node '$phys_host' in WORKER_CONFIG!" >&2
    exit 1
  fi
  IP=$(resolve_ip "$phys_host")
  if [[ -z "$IP" ]]; then
    echo "[Error] Failed to resolve IP for worker node '$phys_host'!" >&2
    exit 1
  fi
  ADD_HOST_ARGS+=(--add-host "${std_name}:${IP}")
  WORKER_NODE_ARR+=("$phys_host")
  echo "[INFO] Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG"

# Comma-separated host lists in the form expected by `pdsh -w`.
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")
ALL_NODES="${MASTER_NODE}"
if [[ -n "$WORKER_NODES" ]]; then
  ALL_NODES="${ALL_NODES},${WORKER_NODES}"
fi

# =====================================================================
# Start the containers
# =====================================================================

# Values embedded in command strings that a remote shell re-parses are
# shell-quoted first.
printf -v Q_IMAGE_NAME '%q' "${IMAGE_NAME}"
printf -v Q_CONTAINER_NAME '%q' "${CONTAINER_NAME}"

if [[ "${SKIP_PULL}" == "1" ]]; then
  echo "[INFO] Image pulling skipped."
else
  echo "[INFO] Pulling the image on each node..."
  pdsh -w "${ALL_NODES}" -S "docker pull ${Q_IMAGE_NAME} >/dev/null"
fi

if [[ "${FORCE_RM}" == "1" ]]; then
  echo "[INFO] Force removing existing containers..."
  # Best effort: a node without the container must not abort the run.
  pdsh -w "${ALL_NODES}" -S "docker rm -f ${Q_CONTAINER_NAME} 2>/dev/null || true"
fi

echo "[INFO] Starting docker containers..."
mkdir -p -- "${WORKDIR}"
# Bind mounts need an absolute host path; this also normalizes a relative
# --workdir argument.
WORKDIR=$(cd -- "${WORKDIR}" && pwd)

# Options shared by the master and the worker containers.
DOCKER_ARGS=(
  --name="${CONTAINER_NAME}"
  -v /opt/hyhal:/opt/hyhal:ro
  -v /root/.ssh:/root/.ssh
  -w /workspace
  "${ADD_HOST_ARGS[@]}"
  --network=host
  --ipc=host
  --device=/dev/kfd
  --device=/dev/mkfd
  --device=/dev/dri
  --shm-size=512G
  --privileged
  --group-add video
  --cap-add=SYS_PTRACE
  -u root
  --security-opt seccomp=unconfined
)

# Command run inside every container: start sshd, then idle so the
# container stays up.
CONTAINER_CMD="mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

# Master: started locally, with WORKDIR mounted at /workspace.
docker run -itd \
  "${DOCKER_ARGS[@]}" \
  -v "${WORKDIR}:/workspace" \
  "${IMAGE_NAME}" \
  bash -c "${CONTAINER_CMD}"

# Workers: same options but without the WORKDIR mount, started through pdsh.
if [[ -n "$WORKER_NODES" ]]; then
  WORKER_CMD=(docker run -itd "${DOCKER_ARGS[@]}" "${IMAGE_NAME}" bash -c "${CONTAINER_CMD}")
  # The remote shell parses the command once more, so quote every word.
  printf -v WORKER_CMD_STR '%q ' "${WORKER_CMD[@]}"
  pdsh -w "${WORKER_NODES}" -S "${WORKER_CMD_STR}"
fi

echo "[INFO] All containers are ready!"
echo "[INFO] To access the container on the current node, run:"
echo "    docker exec -it ${CONTAINER_NAME} bash"