#!/bin/bash
#
# Start the rocHPL containers on this host (the master node) and, through
# pdsh, on every worker node listed in WORKER_CONFIG. Each container gets
# --add-host entries mapping the container hostnames to the resolved IPs
# of the physical nodes, and runs an sshd listening on SSH_PORT.
#
# Usage: <this-script> [-f]
#   -f   force-remove an existing container named CONTAINER_NAME on all
#        nodes before starting the new ones
#
# External commands used: hostname, awk, pdsh, docker (getent and ping are
# used for name resolution when available).
set -euo pipefail

IMAGE_NAME=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME=rochpl-dtk26-20260204
SSH_PORT=3333           # make sure this port is free
WORKSPACE_DIR="${PWD}"  # change to the rocHPL directory on the master node

# =====================================================================
# Node mapping table
# 1. The current node is the master node; WORKER_CONFIG must not list it.
# 2. Column 1 is the physical machine's hostname or IP, column 2 is the
#    hostname used for that node inside the docker containers.
# =====================================================================
MASTER_NODE=$(hostname)
DOCKER_MASTER=node01
WORKER_CONFIG="
node2   node02
node3   node03
master  node04
"

# =====================================================================
# Host resolution
# =====================================================================

#######################################
# Resolve a hostname (or IP) to an IP address.
# Tries, in order: `getent ahosts`, the header line of `ping`, and a
# lookup of the name among all aliases in /etc/hosts.
# Arguments: $1 - hostname or IP to resolve
# Outputs:   the resolved address on stdout; an empty line if every
#            method failed
# Returns:   always 0; callers must check for an empty result
#######################################
resolve_ip() {
  local target=$1
  local ip=""

  # Try getent. Lookup failures are expected here and simply fall
  # through to the next method, hence the `|| true`.
  if command -v getent >/dev/null 2>&1; then
    ip=$(getent ahosts "$target" 2>/dev/null \
      | awk 'NR == 1 {print $1; exit}' || true)
  fi

  # Try ping: the address is printed in parentheses on the "PING" line.
  # -W 1 bounds the wait so an unreachable host does not stall the script.
  if [[ -z "$ip" ]] && command -v ping >/dev/null 2>&1; then
    ip=$(ping -c 1 -W 1 -n "$target" 2>/dev/null \
      | awk -F'[()]' '/PING/ {print $2; exit}' || true)
  fi

  # Fall back to /etc/hosts, matching the canonical name or any alias and
  # ignoring comment lines and trailing comments.
  if [[ -z "$ip" ]]; then
    ip=$(awk -v h="$target" '
      /^[[:space:]]*#/ { next }
      {
        for (i = 2; i <= NF; i++) {
          if ($i ~ /^#/) break
          if ($i == h) { print $1; exit }
        }
      }' /etc/hosts 2>/dev/null || true)
  fi

  printf '%s\n' "$ip"
}

#######################################
# Print a warning when a node resolved to a loopback address: such an
# address, handed to the other nodes via --add-host, points each of them
# back at itself instead of at the intended node.
# Arguments: $1 - physical hostname, $2 - resolved IP
# Outputs:   a warning on stderr (nothing when the address is fine)
#######################################
warn_if_loopback() {
  local host=$1
  local ip=$2
  if [[ "$ip" == 127.* || "$ip" == "::1" ]]; then
    echo "Warning: '${host}' resolved to loopback address ${ip};" \
      "other nodes will not reach it through this address." >&2
  fi
}

MASTER_IP=$(resolve_ip "${MASTER_NODE}")
if [[ -z "$MASTER_IP" ]]; then
  echo "Error: failed to resolve IP for master node '${MASTER_NODE}'!" >&2
  exit 1
fi
warn_if_loopback "${MASTER_NODE}" "${MASTER_IP}"

# --add-host arguments are collected in an array so that every option and
# value stays a separate word without relying on word splitting.
DOCKER_ADD_HOSTS=(--add-host "${DOCKER_MASTER}:${MASTER_IP}")
echo "Master node: ${MASTER_NODE} -> ${DOCKER_MASTER} (${MASTER_IP})"

WORKER_NODE_ARR=()
while read -r phys_host std_name _; do
  # Skip blank lines and comment lines in the mapping table.
  if [[ -z "$phys_host" || "$phys_host" == \#* ]]; then
    continue
  fi

  if [[ -z "$std_name" ]]; then
    echo "Error: no container hostname given for worker node '$phys_host'!" >&2
    exit 1
  fi

  IP=$(resolve_ip "$phys_host")
  if [[ -z "$IP" ]]; then
    echo "Error: failed to resolve IP for worker node '$phys_host'!" >&2
    exit 1
  fi
  warn_if_loopback "$phys_host" "$IP"

  DOCKER_ADD_HOSTS+=(--add-host "${std_name}:${IP}")
  WORKER_NODE_ARR+=("$phys_host")
  echo "Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG"

# Comma-separated host lists in the form pdsh -w expects. The length check
# keeps `set -u` from tripping on an empty array in older bash versions.
WORKER_NODES=""
if (( ${#WORKER_NODE_ARR[@]} > 0 )); then
  WORKER_NODES=$(IFS=,; printf '%s' "${WORKER_NODE_ARR[*]}")
fi

ALL_NODES="${MASTER_NODE}"
if [[ -n "$WORKER_NODES" ]]; then
  ALL_NODES="${ALL_NODES},${WORKER_NODES}"
fi

# =====================================================================
# Start the containers
# =====================================================================
echo "[1/3] Pulling the image on each node..."
pdsh -w "${ALL_NODES}" -S "docker pull ${IMAGE_NAME} >/dev/null"

if [[ "${1:-}" == "-f" ]]; then
  echo "[2/3] Force cleaning up old container..."
  # Best effort: a node without an old container is not an error.
  pdsh -w "${ALL_NODES}" -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"
else
  echo "[2/3] Skipping cleanup."
fi

echo "[3/3] Starting containers..."
mkdir -p -- "${WORKSPACE_DIR}"

# Options shared by the master and the worker containers.
DOCKER_ARGS=(
  "--name=${CONTAINER_NAME}"
  -v /opt/hyhal:/opt/hyhal:ro
  -v /root/.ssh:/root/.ssh
  -w /workspace
  "${DOCKER_ADD_HOSTS[@]}"
  --network=host
  --ipc=host
  --device=/dev/kfd
  --device=/dev/mkfd
  --device=/dev/dri
  --shm-size=512G
  --privileged
  --group-add video
  --cap-add=SYS_PTRACE
  -u root
  --security-opt seccomp=unconfined
)

# Command run inside every container: start sshd on SSH_PORT, then block
# forever so the detached container stays up.
CONTAINER_CMD="mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

# Master node: same options plus the workspace bind mount.
docker run -itd \
  "${DOCKER_ARGS[@]}" \
  -v "${WORKSPACE_DIR}:/workspace" \
  "${IMAGE_NAME}" \
  bash -c "${CONTAINER_CMD}"

# Worker nodes: started remotely through pdsh, without the workspace mount.
if [[ -n "$WORKER_NODES" ]]; then
  # pdsh takes the remote command as one string, so the argument array is
  # re-quoted with %q to survive parsing by the remote shell.
  printf -v REMOTE_DOCKER_ARGS '%q ' "${DOCKER_ARGS[@]}"
  pdsh -w "${WORKER_NODES}" -S \
    "docker run -itd ${REMOTE_DOCKER_ARGS}${IMAGE_NAME} bash -c '${CONTAINER_CMD}'"
fi

echo "All containers are ready!"
echo "To access the container on the current node, run:"
echo "docker exec -it ${CONTAINER_NAME} bash"