Commit b094b3f8 authored by one
Browse files

[cluster] Add a lightweight container launcher

parent c84e5170
#!/bin/bash #!/bin/bash
set -e set -e
IMAGE_NAME=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME=rochpl-dtk26-20260204
SSH_PORT=3333 # 确保端口可用
WORKSPACE_DIR="${PWD}" # 修改为主节点的rocHPL目录
# ===================================================================== # =====================================================================
# 节点映射表 # 节点映射表
# 1. 当前节点是主节点,WORKER_CONFIG不包括主节点 # 1. 当前节点是主节点,WORKER_CONFIG不包括主节点
...@@ -19,6 +14,44 @@ node3 node03 ...@@ -19,6 +14,44 @@ node3 node03
master node04 master node04
" "
# =====================================================================
# Default values
# =====================================================================
# Boolean switches (0 = off, 1 = on).
SKIP_PULL=0
FORCE_RM=0

# Change to the application directory (rocHPL/rocHPCG, etc.) on the master node.
WORKDIR="${PWD}"

# Make sure this port is available.
SSH_PORT="3333"

CONTAINER_NAME="cluster-dtk26-20260204"
IMAGE_NAME="harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204"
#######################################
# Print the command-line help text and terminate the script.
# Globals:
#   IMAGE_NAME, CONTAINER_NAME, SSH_PORT (read) - shown as the defaults.
# Arguments:
#   $1 - optional exit status (default: 1, i.e. a usage error).
# Outputs:
#   The help text, on stdout when the status is 0 and on stderr otherwise.
# Returns:
#   Never returns; exits the shell with the requested status.
#######################################
usage() {
  local status="${1:-1}"
  local fd=1

  # Help printed because of a usage error is a diagnostic, so keep it off
  # stdout; help that was explicitly requested (status 0) goes to stdout.
  if [ "${status}" != "0" ]; then
    fd=2
  fi

  {
    echo "Usage: $0 [-f|--force] [-i|--image IMAGE] [-n|--name NAME] [-p|--port PORT] [-w|--workdir WORKDIR] [--no-pull] [-h|--help]"
    echo " -f, --force Force remove existing container before starting"
    echo " -i, --image IMAGE Docker image name (default: ${IMAGE_NAME})"
    echo " -n, --name NAME Container name across all nodes (default: ${CONTAINER_NAME})"
    echo " -p, --port PORT SSH port inside container (default: ${SSH_PORT})"
    echo " -w, --workdir WORKDIR Workspace directory to mount for the master node (default: \$PWD)"
    echo " --no-pull Skip pulling the image"
    echo " -h, --help Show this help message"
  } >&"${fd}"

  exit "${status}"
}
# Parse command-line options with enhanced getopt (long-option support).
# getopt normalizes and re-quotes the arguments; on an invalid option it
# prints its own diagnostic and returns non-zero, in which case usage is shown.
OPTS=$(getopt -o fhi:n:p:w: --long force,help,image:,name:,port:,workdir:,no-pull -n "$0" -- "$@") || usage

# getopt emits a shell-quoted string, so eval is required to restore the
# original word boundaries into the positional parameters.
eval set -- "${OPTS}"

while true; do
  case "$1" in
    -f|--force)
      FORCE_RM=1
      shift
      ;;
    -h|--help)
      # Explicit help request; 0 is passed as the desired exit status
      # (has no effect if usage does not accept a status argument).
      usage 0
      ;;
    -i|--image)
      IMAGE_NAME="$2"
      shift 2
      ;;
    -n|--name)
      CONTAINER_NAME="$2"
      shift 2
      ;;
    -p|--port)
      SSH_PORT="$2"
      shift 2
      ;;
    -w|--workdir)
      WORKDIR="$2"
      shift 2
      ;;
    --no-pull)
      SKIP_PULL=1
      shift
      ;;
    --)
      # End-of-options marker inserted by getopt.
      shift
      break
      ;;
    *)
      usage
      ;;
  esac
done
# ===================================================================== # =====================================================================
# 解析hosts # 解析hosts
# ===================================================================== # =====================================================================
...@@ -46,7 +79,7 @@ resolve_ip() { ...@@ -46,7 +79,7 @@ resolve_ip() {
MASTER_IP=$(resolve_ip ${MASTER_NODE}) MASTER_IP=$(resolve_ip ${MASTER_NODE})
DOCKER_ADD_HOSTS="--add-host ${DOCKER_MASTER}:${MASTER_IP}" DOCKER_ADD_HOSTS="--add-host ${DOCKER_MASTER}:${MASTER_IP}"
echo "Master node: ${MASTER_NODE} -> ${DOCKER_MASTER} (${MASTER_IP})" echo "[INFO] Master node: ${MASTER_NODE} -> ${DOCKER_MASTER} (${MASTER_IP})"
WORKER_NODE_ARR=() WORKER_NODE_ARR=()
while read -r phys_host std_name rest; do while read -r phys_host std_name rest; do
...@@ -54,14 +87,14 @@ while read -r phys_host std_name rest; do ...@@ -54,14 +87,14 @@ while read -r phys_host std_name rest; do
IP=$(resolve_ip "$phys_host") IP=$(resolve_ip "$phys_host")
if [ -z "$IP" ]; then if [ -z "$IP" ]; then
echo "Error: failed to resolve IP for worker node '$phys_host'!" echo "[Error] Failed to resolve IP for worker node '$phys_host'!"
exit 1 exit 1
fi fi
DOCKER_ADD_HOSTS="${DOCKER_ADD_HOSTS} --add-host ${std_name}:${IP}" DOCKER_ADD_HOSTS="${DOCKER_ADD_HOSTS} --add-host ${std_name}:${IP}"
WORKER_NODE_ARR+=("$phys_host") WORKER_NODE_ARR+=("$phys_host")
echo "Worker node: ${phys_host} -> ${std_name} (${IP})" echo "[INFO] Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG" done <<< "$WORKER_CONFIG"
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}") WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")
...@@ -72,18 +105,20 @@ ALL_NODES="${MASTER_NODE}" ...@@ -72,18 +105,20 @@ ALL_NODES="${MASTER_NODE}"
# ===================================================================== # =====================================================================
# 启动容器 # 启动容器
# ===================================================================== # =====================================================================
echo "[1/3] Pulling the image on each node..." if [ "${SKIP_PULL}" == "1" ]; then
pdsh -w ${ALL_NODES} -S "docker pull ${IMAGE_NAME} >/dev/null" echo "[INFO] Image pulling skipped."
else
echo "[INFO] Pulling the image on each node..."
pdsh -w ${ALL_NODES} -S "docker pull ${IMAGE_NAME} >/dev/null"
fi
if [ "$1" == "-f" ]; then if [ "${FORCE_RM}" == "1" ]; then
echo "[2/3] Force cleaning up old container..." echo "[INFO] Force removing existing containers..."
pdsh -w ${ALL_NODES} -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true" pdsh -w ${ALL_NODES} -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"
else
echo "[2/3] Skipping cleanup."
fi fi
echo "[3/3] Starting containers..." echo "[INFO] Starting docker containers..."
mkdir -p ${WORKSPACE_DIR} mkdir -p ${WORKDIR}
DOCKER_ARGS="--name=${CONTAINER_NAME} \ DOCKER_ARGS="--name=${CONTAINER_NAME} \
-v /opt/hyhal:/opt/hyhal:ro \ -v /opt/hyhal:/opt/hyhal:ro \
-v /root/.ssh:/root/.ssh \ -v /root/.ssh:/root/.ssh \
...@@ -103,7 +138,7 @@ DOCKER_ARGS="--name=${CONTAINER_NAME} \ ...@@ -103,7 +138,7 @@ DOCKER_ARGS="--name=${CONTAINER_NAME} \
docker run -itd \ docker run -itd \
${DOCKER_ARGS} \ ${DOCKER_ARGS} \
-v ${WORKSPACE_DIR}:/workspace \ -v ${WORKDIR}:/workspace \
${IMAGE_NAME} \ ${IMAGE_NAME} \
bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity" bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"
...@@ -114,6 +149,6 @@ if [ -n "$WORKER_NODES" ]; then ...@@ -114,6 +149,6 @@ if [ -n "$WORKER_NODES" ]; then
bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity'" bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity'"
fi fi
echo "All containers are ready!" echo "[INFO] All containers are ready!"
echo "To access the container on the current node, run:" echo "[INFO] To access the container on the current node, run:"
echo "docker exec -it ${CONTAINER_NAME} bash" echo " docker exec -it ${CONTAINER_NAME} bash"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment