#!/bin/bash
#
# Start one docker container per cluster node and run sshd inside each of them.
#
# The node this script runs on is the master: its container is started with a
# local `docker run` and gets WORKDIR bind-mounted at /workspace. Every other
# node listed in CLUSTER_CONFIG is a worker and is started through pdsh.
#
# Requires: docker, pdsh, util-linux getopt (long options).
set -e

# =====================================================================
# Node mapping table
#   1. The current node is the master node; a working directory is mounted
#      into its container.
#   2. Column 1 is the physical machine's hostname or IP, column 2 is the
#      hostname given to the docker container on that machine.
#   One "physical-host container-hostname" pair per line; blank lines and
#   lines starting with '#' are ignored.
#   NOTE(review): master detection and the worker-side rename both compare
#   column 1 with the output of `hostname`, so a node listed by IP is not
#   matched by either of them.
# =====================================================================
CLUSTER_CONFIG="
$(hostname) node01
node1 node02
node2 node03
node3 node04
"

# =====================================================================
# Defaults and command-line arguments
# =====================================================================
IMAGE_NAME=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME=cluster-dtk26-20260204
SSH_PORT=3333     # make sure this port is available
WORKDIR="${PWD}"  # change to the master node's application directory (rocHPL/rocHPCG, etc.)
FORCE_RM=0
SKIP_PULL=0

#######################################
# Print the help text and exit.
# Globals:   IMAGE_NAME, CONTAINER_NAME, SSH_PORT (read, shown as defaults)
# Arguments: $1 - exit status (optional, default 1)
# Outputs:   help text on stdout
#######################################
usage() {
  local status=${1:-1}
  echo "Usage: $0 [-f|--force] [-i|--image IMAGE] [-n|--name NAME] [-p|--port PORT] [-w|--workdir WORKDIR]"
  echo " -f, --force Force remove existing container before starting"
  echo " -i, --image IMAGE Docker image name (default: ${IMAGE_NAME})"
  echo " -n, --name NAME Container name across all nodes (default: ${CONTAINER_NAME})"
  echo " -p, --port PORT SSH port inside container (default: ${SSH_PORT})"
  echo " -w, --workdir WORKDIR Workspace directory to mount for the master node (default: \$PWD)"
  echo " --no-pull Skip pulling the image"
  echo " -h, --help Show this help message"
  exit "${status}"
}

OPTS=$(getopt -o hfi:n:p:w: --long help,force,image:,name:,port:,workdir:,no-pull -n "$0" -- "$@") || usage
# getopt emits a shell-quoted argument list; eval is the documented way to load it.
eval set -- "${OPTS}"

while true; do
  case "$1" in
    -h|--help)    usage 0 ;;
    -f|--force)   FORCE_RM=1; shift ;;
    -i|--image)   IMAGE_NAME="$2"; shift 2 ;;
    -n|--name)    CONTAINER_NAME="$2"; shift 2 ;;
    -p|--port)    SSH_PORT="$2"; shift 2 ;;
    -w|--workdir) WORKDIR="$2"; shift 2 ;;
    --no-pull)    SKIP_PULL=1; shift ;;
    --)           shift; break ;;
    *)            usage ;;
  esac
done

# The port is interpolated into the `bash -c` scripts below, so it must be a
# plain number. 10# forces base 10 so a leading zero is not read as octal.
if ! [[ "${SSH_PORT}" =~ ^[0-9]{1,5}$ ]] \
  || (( 10#${SSH_PORT} < 1 || 10#${SSH_PORT} > 65535 )); then
  echo "[Error] Invalid SSH port '${SSH_PORT}': expected an integer in 1-65535." >&2
  exit 1
fi

# =====================================================================
# Resolve hosts
# =====================================================================

#######################################
# Resolve a hostname (or IP) to a single address.
# Tries, in order: `getent ahosts`, the address printed by `ping`, and a
# lookup of the name among the entries of /etc/hosts.
# Arguments: $1 - hostname or IP to resolve
# Outputs:   the address on stdout, or an empty line if nothing matched
#######################################
resolve_ip() {
  local target=$1
  local ip=""

  # Try getent: first field of the first line it prints.
  if command -v getent >/dev/null 2>&1; then
    ip=$(getent ahosts "${target}" 2>/dev/null | awk '{print $1; exit}' || true)
  fi

  # Try ping: the address is the text between the first pair of parentheses
  # on the "PING ..." banner line.
  if [[ -z "${ip}" ]] && command -v ping >/dev/null 2>&1; then
    ip=$(ping -c 1 -n "${target}" 2>/dev/null | awk -F'[()]' '/PING/{print $2; exit}' || true)
  fi

  # Fall back to /etc/hosts, matching the name against every alias on a line
  # and stopping at a trailing comment.
  if [[ -z "${ip}" ]]; then
    ip=$(awk -v h="${target}" '
      /^[[:space:]]*#/ { next }
      {
        for (i = 2; i <= NF; i++) {
          if ($i ~ /^#/) break
          if ($i == h) { print $1; exit }
        }
      }' /etc/hosts 2>/dev/null || true)
  fi

  echo "${ip}"
}

MASTER_NODE=$(hostname)
DOCKER_MASTER=""
MASTER_IP=""
MAPPING_STR=""
ADD_HOST_ARGS=()
WORKER_NODE_ARR=()

# Read the hostname mapping table.
while read -r phys_host std_name _; do
  [[ -z "${phys_host}" || "${phys_host}" == \#* ]] && continue

  if [[ -z "${std_name}" ]]; then
    echo "[Error] Node '${phys_host}' has no container hostname in CLUSTER_CONFIG!" >&2
    exit 1
  fi

  IP=$(resolve_ip "${phys_host}")
  if [[ -z "${IP}" ]]; then
    echo "[Error] Failed to resolve IP for node '$phys_host'!" >&2
    exit 1
  fi
  # This address is handed to every container via --add-host, so a loopback
  # result would make the other nodes resolve this name to themselves.
  if [[ "${IP}" == 127.* || "${IP}" == "::1" ]]; then
    echo "[Warning] Node '${phys_host}' resolved to loopback address ${IP}; other nodes cannot reach it as '${std_name}'." >&2
  fi

  # Host entries for all nodes are added to every container.
  ADD_HOST_ARGS+=(--add-host "${std_name}:${IP}")

  if [[ -z "${MAPPING_STR}" ]]; then
    MAPPING_STR="${phys_host}:${std_name}"
  else
    MAPPING_STR="${MAPPING_STR},${phys_host}:${std_name}"
  fi

  # Is this entry the local physical machine (the master node)?
  if [[ "${phys_host}" == "${MASTER_NODE}" ]]; then
    DOCKER_MASTER=${std_name}
    MASTER_IP=${IP}
    echo "[INFO] Master node: ${phys_host} -> ${std_name} (${IP})"
  else
    WORKER_NODE_ARR+=("${phys_host}")
    echo "[INFO] Worker node: ${phys_host} -> ${std_name} (${IP})"
  fi
done <<< "${CLUSTER_CONFIG}"

if [[ -z "${DOCKER_MASTER}" ]]; then
  echo "[Error] Current node '$MASTER_NODE' is not found in CLUSTER_CONFIG!" >&2
  exit 1
fi

# Comma-separated host lists in the form pdsh -w expects.
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")
ALL_NODES="${MASTER_NODE}"
if [[ -n "${WORKER_NODES}" ]]; then
  ALL_NODES="${ALL_NODES},${WORKER_NODES}"
fi

# =====================================================================
# Start containers
# =====================================================================
# Values embedded in a pdsh command line are re-parsed by the remote shell,
# so they are escaped with printf %q first.
if [[ "${SKIP_PULL}" == "1" ]]; then
  echo "[INFO] Image pulling skipped."
else
  echo "[INFO] Pulling the image on each node..."
  pdsh -w "${ALL_NODES}" -S "docker pull $(printf '%q' "${IMAGE_NAME}") >/dev/null"
fi

if [[ "${FORCE_RM}" == "1" ]]; then
  echo "[INFO] Force removing existing containers..."
  # Best effort: a node without the container must not fail the run.
  pdsh -w "${ALL_NODES}" -S "docker rm -f $(printf '%q' "${CONTAINER_NAME}") 2>/dev/null || true"
fi

echo "[INFO] Starting docker containers..."
mkdir -p -- "${WORKDIR}"
# docker -v treats a non-absolute source as a volume name rather than a bind
# mount, so make the path absolute before using it.
WORKDIR=$(cd -- "${WORKDIR}" && pwd) || {
  echo "[Error] Cannot enter workdir '${WORKDIR}'!" >&2
  exit 1
}

# Arguments shared by the master and worker containers.
DOCKER_RUN_ARGS=(
  "--name=${CONTAINER_NAME}"
  -e "NODE_MAPPING=${MAPPING_STR}"
  -v /opt/hyhal:/opt/hyhal:ro
  -v /root/.ssh:/root/.ssh
  -w /workspace
  "${ADD_HOST_ARGS[@]}"
  --network=host
  --ipc=host
  --device=/dev/kfd
  --device=/dev/mkfd
  --device=/dev/dri
  --shm-size=512G
  --privileged
  --group-add video
  --cap-add=SYS_PTRACE
  -u root
  --security-opt seccomp=unconfined
)

# Master container: the only one that gets WORKDIR mounted.
docker run -itd \
  "${DOCKER_RUN_ARGS[@]}" \
  -v "${WORKDIR}:/workspace" \
  "${IMAGE_NAME}" \
  bash -c "hostname ${DOCKER_MASTER} && mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

if [[ -n "${WORKER_NODES}" ]]; then
  # Runs inside each worker container: look up the container's initial
  # hostname in NODE_MAPPING, rename the container to the mapped name if an
  # entry matches, then start sshd and keep the container alive.
  WORKER_INIT='PHYS=$(hostname); for m in ${NODE_MAPPING//,/ }; do [ "${m%%:*}" = "$PHYS" ] && hostname ${m##*:} && break; done; mkdir -p /run/sshd && /usr/sbin/sshd -p '"${SSH_PORT}"'; sleep infinity'
  REMOTE_CMD=$(printf '%q ' docker run -itd "${DOCKER_RUN_ARGS[@]}" "${IMAGE_NAME}" bash -c "${WORKER_INIT}")
  pdsh -w "${WORKER_NODES}" -S "${REMOTE_CMD}"
fi

echo "[INFO] All containers are ready!"
echo "[INFO] To access the container on the current node, run:"
echo
echo " docker exec -it ${CONTAINER_NAME} bash"
echo