Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
hg-misc-tools
Commits
b094b3f8
Commit
b094b3f8
authored
Mar 02, 2026
by
one
Browse files
[cluster] Add a lightweight container launcher
parent
c84e5170
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
55 additions
and
20 deletions
+55
-20
projects/cluster/docker-cluster-up.sh
projects/cluster/docker-cluster-up.sh
+55
-20
No files found.
projects/
rocHPL/run-docker
.sh
→
projects/
cluster/docker-cluster-up
.sh
View file @
b094b3f8
#!/bin/bash
#!/bin/bash
set
-e
set
-e
IMAGE_NAME
=
harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME
=
rochpl-dtk26-20260204
SSH_PORT
=
3333
# 确保端口可用
WORKSPACE_DIR
=
"
${
PWD
}
"
# 修改为主节点的rocHPL目录
# =====================================================================
# =====================================================================
# 节点映射表
# 节点映射表
# 1. 当前节点是主节点,WORKER_CONFIG不包括主节点
# 1. 当前节点是主节点,WORKER_CONFIG不包括主节点
...
@@ -19,6 +14,44 @@ node3 node03
...
@@ -19,6 +14,44 @@ node3 node03
master node04
master node04
"
"
# =====================================================================
# 默认值
# =====================================================================
IMAGE_NAME
=
harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
CONTAINER_NAME
=
cluster-dtk26-20260204
SSH_PORT
=
3333
# 确保端口可用
WORKDIR
=
"
${
PWD
}
"
# 修改为主节点的rocHPL/rocHPCG等应用目录
FORCE_RM
=
0
SKIP_PULL
=
0
usage
()
{
echo
"Usage:
$0
[-f|--force] [-i|--image IMAGE] [-n|--name NAME] [-p|--port PORT] [-w|--workdir WORKDIR]"
echo
" -f, --force Force remove existing container before starting"
echo
" -i, --image IMAGE Docker image name (default:
${
IMAGE_NAME
}
)"
echo
" -n, --name NAME Container name across all nodes (default:
${
CONTAINER_NAME
}
)"
echo
" -p, --port PORT SSH port inside container (default:
${
SSH_PORT
}
)"
echo
" -w, --workdir WORKDIR Workspace directory to mount for the master node (default:
\$
PWD)"
echo
" --no-pull Skip pulling the image"
echo
" -h, --help Show this help message"
exit
1
}
OPTS
=
$(
getopt
-o
fi
:n:p:w:
--long
force,image:,name:,port:,workdir:,no-pull
-n
"
$0
"
--
"
$@
"
)
||
usage
eval set
--
"
${
OPTS
}
"
while
true
;
do
case
"
$1
"
in
-f
|
--force
)
FORCE_RM
=
1
;
shift
;;
-i
|
--image
)
IMAGE_NAME
=
"
$2
"
;
shift
2
;;
-n
|
--name
)
CONTAINER_NAME
=
"
$2
"
;
shift
2
;;
-p
|
--port
)
SSH_PORT
=
"
$2
"
;
shift
2
;;
-w
|
--workdir
)
WORKDIR
=
"
$2
"
;
shift
2
;;
--no-pull
)
SKIP_PULL
=
1
;
shift
;;
--
)
shift
;
break
;;
*
)
usage
;;
esac
done
# =====================================================================
# =====================================================================
# 解析hosts
# 解析hosts
# =====================================================================
# =====================================================================
...
@@ -46,7 +79,7 @@ resolve_ip() {
...
@@ -46,7 +79,7 @@ resolve_ip() {
MASTER_IP
=
$(
resolve_ip
${
MASTER_NODE
}
)
MASTER_IP
=
$(
resolve_ip
${
MASTER_NODE
}
)
DOCKER_ADD_HOSTS
=
"--add-host
${
DOCKER_MASTER
}
:
${
MASTER_IP
}
"
DOCKER_ADD_HOSTS
=
"--add-host
${
DOCKER_MASTER
}
:
${
MASTER_IP
}
"
echo
"Master node:
${
MASTER_NODE
}
->
${
DOCKER_MASTER
}
(
${
MASTER_IP
}
)"
echo
"
[INFO]
Master node:
${
MASTER_NODE
}
->
${
DOCKER_MASTER
}
(
${
MASTER_IP
}
)"
WORKER_NODE_ARR
=()
WORKER_NODE_ARR
=()
while
read
-r
phys_host std_name rest
;
do
while
read
-r
phys_host std_name rest
;
do
...
@@ -54,14 +87,14 @@ while read -r phys_host std_name rest; do
...
@@ -54,14 +87,14 @@ while read -r phys_host std_name rest; do
IP
=
$(
resolve_ip
"
$phys_host
"
)
IP
=
$(
resolve_ip
"
$phys_host
"
)
if
[
-z
"
$IP
"
]
;
then
if
[
-z
"
$IP
"
]
;
then
echo
"Error
: f
ailed to resolve IP for worker node '
$phys_host
'!"
echo
"
[
Error
] F
ailed to resolve IP for worker node '
$phys_host
'!"
exit
1
exit
1
fi
fi
DOCKER_ADD_HOSTS
=
"
${
DOCKER_ADD_HOSTS
}
--add-host
${
std_name
}
:
${
IP
}
"
DOCKER_ADD_HOSTS
=
"
${
DOCKER_ADD_HOSTS
}
--add-host
${
std_name
}
:
${
IP
}
"
WORKER_NODE_ARR+
=(
"
$phys_host
"
)
WORKER_NODE_ARR+
=(
"
$phys_host
"
)
echo
"Worker node:
${
phys_host
}
->
${
std_name
}
(
${
IP
}
)"
echo
"
[INFO]
Worker node:
${
phys_host
}
->
${
std_name
}
(
${
IP
}
)"
done
<<<
"
$WORKER_CONFIG
"
done
<<<
"
$WORKER_CONFIG
"
WORKER_NODES
=
$(
IFS
=
,
;
echo
"
${
WORKER_NODE_ARR
[*]
}
"
)
WORKER_NODES
=
$(
IFS
=
,
;
echo
"
${
WORKER_NODE_ARR
[*]
}
"
)
...
@@ -72,18 +105,20 @@ ALL_NODES="${MASTER_NODE}"
...
@@ -72,18 +105,20 @@ ALL_NODES="${MASTER_NODE}"
# =====================================================================
# =====================================================================
# 启动容器
# 启动容器
# =====================================================================
# =====================================================================
echo
"[1/3] Pulling the image on each node..."
if
[
"
${
SKIP_PULL
}
"
==
"1"
]
;
then
pdsh
-w
${
ALL_NODES
}
-S
"docker pull
${
IMAGE_NAME
}
>/dev/null"
echo
"[INFO] Image pulling skipped."
else
echo
"[INFO] Pulling the image on each node..."
pdsh
-w
${
ALL_NODES
}
-S
"docker pull
${
IMAGE_NAME
}
>/dev/null"
fi
if
[
"
$
1
"
==
"
-f
"
]
;
then
if
[
"
$
{
FORCE_RM
}
"
==
"
1
"
]
;
then
echo
"[
2/3
] Force
cleaning up old
container..."
echo
"[
INFO
] Force
removing existing
container
s
..."
pdsh
-w
${
ALL_NODES
}
-S
"docker rm -f
${
CONTAINER_NAME
}
2>/dev/null || true"
pdsh
-w
${
ALL_NODES
}
-S
"docker rm -f
${
CONTAINER_NAME
}
2>/dev/null || true"
else
echo
"[2/3] Skipping cleanup."
fi
fi
echo
"[
3/3
] Starting containers..."
echo
"[
INFO
] Starting
docker
containers..."
mkdir
-p
${
WORK
SPACE_
DIR
}
mkdir
-p
${
WORKDIR
}
DOCKER_ARGS
=
"--name=
${
CONTAINER_NAME
}
\
DOCKER_ARGS
=
"--name=
${
CONTAINER_NAME
}
\
-v /opt/hyhal:/opt/hyhal:ro
\
-v /opt/hyhal:/opt/hyhal:ro
\
-v /root/.ssh:/root/.ssh
\
-v /root/.ssh:/root/.ssh
\
...
@@ -103,7 +138,7 @@ DOCKER_ARGS="--name=${CONTAINER_NAME} \
...
@@ -103,7 +138,7 @@ DOCKER_ARGS="--name=${CONTAINER_NAME} \
docker run
-itd
\
docker run
-itd
\
${
DOCKER_ARGS
}
\
${
DOCKER_ARGS
}
\
-v
${
WORK
SPACE_
DIR
}
:/workspace
\
-v
${
WORKDIR
}
:/workspace
\
${
IMAGE_NAME
}
\
${
IMAGE_NAME
}
\
bash
-c
"mkdir -p /run/sshd && /usr/sbin/sshd -p
${
SSH_PORT
}
; sleep infinity"
bash
-c
"mkdir -p /run/sshd && /usr/sbin/sshd -p
${
SSH_PORT
}
; sleep infinity"
...
@@ -114,6 +149,6 @@ if [ -n "$WORKER_NODES" ]; then
...
@@ -114,6 +149,6 @@ if [ -n "$WORKER_NODES" ]; then
bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p
${
SSH_PORT
}
; sleep infinity'"
bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p
${
SSH_PORT
}
; sleep infinity'"
fi
fi
echo
"All containers are ready!"
echo
"
[INFO]
All containers are ready!"
echo
"To access the container on the current node, run:"
echo
"
[INFO]
To access the container on the current node, run:"
echo
"docker exec -it
${
CONTAINER_NAME
}
bash"
echo
"
docker exec -it
${
CONTAINER_NAME
}
bash"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment