Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
hg-misc-tools
Commits
01be5357
Commit
01be5357
authored
Mar 02, 2026
by
one
Browse files
[rocHPL] Add a script to run containers on multiple nodes
parent
135d8afb
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
118 additions
and
0 deletions
+118
-0
projects/rocHPL/run-docker.sh
projects/rocHPL/run-docker.sh
+118
-0
No files found.
projects/rocHPL/run-docker.sh
0 → 100644
View file @
01be5357
#!/bin/bash
# Launch the rocHPL container on the current (master) node and on every
# worker node listed in WORKER_CONFIG.
set -e

# --- Container settings ----------------------------------------------
IMAGE_NAME="harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204"
CONTAINER_NAME="rochpl-dtk26-20260204"
# Port passed to sshd inside the container; make sure it is free.
SSH_PORT="3333"
# Change this to the rocHPL directory on the master node.
WORKSPACE_DIR="${PWD}"

# =====================================================================
# Node mapping table
# 1. The current node is the master node; WORKER_CONFIG must not list it.
# 2. Column 1 is the physical machine's hostname or IP, column 2 is the
#    hostname used for it inside the docker containers.
# =====================================================================
MASTER_NODE="$(hostname)"
WORKER_CONFIG="
node2 node02
node3 node03
master node04
"
# =====================================================================
# Resolve hosts
# =====================================================================
#######################################
# Resolve a host name to one IP address.
# Lookup order (first non-empty answer wins):
#   1. `getent ahosts`   - first address column of the first line
#   2. `ping -c 1 -n`    - the address shown in parentheses on the PING line
#   3. the hosts file    - first entry that lists the name in any name column
# Arguments:
#   $1 - host name (or IP) to resolve
#   $2 - optional hosts file for step 3 (default: /etc/hosts)
# Outputs:
#   The address on stdout, or an empty line when nothing matched; callers
#   are expected to test for an empty result.
#######################################
resolve_ip() {
    local target=$1
    local hosts_file=${2:-/etc/hosts}
    local ip=""

    # Try getent
    if command -v getent >/dev/null 2>&1; then
        ip=$(getent ahosts "$target" 2>/dev/null | awk '{print $1}' | head -n 1)
    fi

    # Try ping
    if [ -z "$ip" ] && command -v ping >/dev/null 2>&1; then
        ip=$(ping -c 1 -n "$target" 2>/dev/null | awk -F '[()]' '/PING/{print $2}')
    fi

    # Fallback to the hosts file. Comments (whole-line, indented or
    # trailing) are stripped first so they can never be mistaken for an
    # address or alias, then every name column is compared, not only the
    # first two.
    if [ -z "$ip" ]; then
        ip=$(awk -v h="$target" '
            {
                sub(/#.*/, "")
                for (i = 2; i <= NF; i++) {
                    if ($i == h) { print $1; exit }
                }
            }
        ' "$hosts_file")
    fi

    echo "$ip"
}
# ---------------------------------------------------------------------
# Build the --add-host list: the master is published to the containers
# as node01, each worker under the name given in WORKER_CONFIG column 2.
# ---------------------------------------------------------------------
MASTER_IP=$(resolve_ip "${MASTER_NODE}")
# Fail early, the same way an unresolvable worker does, instead of
# handing docker an "--add-host node01:" entry with no address.
if [[ -z "${MASTER_IP}" ]]; then
    echo "Error: failed to resolve IP for master node '${MASTER_NODE}'!" >&2
    exit 1
fi
DOCKER_ADD_HOSTS="--add-host node01:${MASTER_IP}"
echo "Master node: ${MASTER_NODE} -> node01 (${MASTER_IP})"

WORKER_NODE_ARR=()
while read -r phys_host std_name _; do
    # Skip blank lines and lines commented out with '#'.
    if [[ -z "$phys_host" || "$phys_host" == \#* ]]; then
        continue
    fi

    # A row without a second column would yield "--add-host :<ip>".
    if [[ -z "$std_name" ]]; then
        echo "Error: worker node '$phys_host' has no container hostname in WORKER_CONFIG!" >&2
        exit 1
    fi

    IP=$(resolve_ip "$phys_host")
    if [[ -z "$IP" ]]; then
        echo "Error: failed to resolve IP for worker node '$phys_host'!" >&2
        exit 1
    fi

    DOCKER_ADD_HOSTS="${DOCKER_ADD_HOSTS} --add-host ${std_name}:${IP}"
    WORKER_NODE_ARR+=("$phys_host")
    echo "Worker node: ${phys_host} -> ${std_name} (${IP})"
done <<< "$WORKER_CONFIG"

# Comma-separated host lists for `pdsh -w`; the IFS change is confined to
# the command-substitution subshell.
WORKER_NODES=$(IFS=,; echo "${WORKER_NODE_ARR[*]}")
ALL_NODES="${MASTER_NODE}"
if [[ -n "$WORKER_NODES" ]]; then
    ALL_NODES="${ALL_NODES},${WORKER_NODES}"
fi

# =====================================================================
# Start the containers
# =====================================================================
echo "[1/3] Pulling the image on each node..."
pdsh -w "${ALL_NODES}" -S "docker pull ${IMAGE_NAME} >/dev/null"

# Passing -f as the first argument removes an existing container of the
# same name on every node before starting a new one.
if [[ "${1:-}" == "-f" ]]; then
    echo "[2/3] Force cleaning up old container..."
    pdsh -w "${ALL_NODES}" -S "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"
else
    echo "[2/3] Skipping cleanup."
fi

echo "[3/3] Starting containers..."
mkdir -p -- "${WORKSPACE_DIR}"

# Options shared by the master and worker containers. Kept as one flat
# string because it is also embedded verbatim in the remote pdsh command.
DOCKER_ARGS="--name=${CONTAINER_NAME} \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v /root/.ssh:/root/.ssh \
    -w /workspace \
    ${DOCKER_ADD_HOSTS} \
    --network=host \
    --ipc=host \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=512G \
    --privileged \
    --group-add video \
    --cap-add=SYS_PTRACE \
    -u root \
    --security-opt seccomp=unconfined"

# Master container: additionally bind-mounts WORKSPACE_DIR at /workspace.
# DOCKER_ARGS is expanded unquoted on purpose so it splits into options.
# shellcheck disable=SC2086
docker run -itd \
    ${DOCKER_ARGS} \
    -v "${WORKSPACE_DIR}:/workspace" \
    "${IMAGE_NAME}" \
    bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity"

# Worker containers, started remotely through pdsh.
# NOTE(review): workers get "-w /workspace" but no bind mount of
# WORKSPACE_DIR - confirm this is intended.
if [[ -n "$WORKER_NODES" ]]; then
    pdsh -w "${WORKER_NODES}" -S "docker run -itd \
        ${DOCKER_ARGS} \
        ${IMAGE_NAME} \
        bash -c 'mkdir -p /run/sshd && /usr/sbin/sshd -p ${SSH_PORT}; sleep infinity'"
fi

echo "All containers are ready!"
echo "To access the container on the current node, run:"
echo "docker exec -it ${CONTAINER_NAME} bash"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment