Unverified commit 7ee82bef, authored by Alexei-V-Ivanov-AMD and committed by GitHub
Browse files

[CI/Build] Adding functionality to reset the node's GPUs before processing. (#4213)

parent fbf152d9
...@@ -5,6 +5,19 @@ set -ex ...@@ -5,6 +5,19 @@ set -ex
# Print ROCm version
rocminfo
# Request a GPU reset by writing "reset" to the node's GPU state file, then
# poll that file every 3 seconds until it contains "clean".
# NOTE(review): presumably a host-side agent watches this file and rewrites it
# once the reset has finished -- confirm against the node setup.
#
# The wait is bounded so that a reset which never completes fails the job
# instead of hanging it indefinitely. Override the limit (in seconds) with
# GPU_RESET_TIMEOUT_SECONDS; the default is 600.
gpu_state_file=/opt/amdgpu/etc/gpu_state
gpu_reset_timeout=${GPU_RESET_TIMEOUT_SECONDS:-600}
gpu_reset_waited=0

echo "reset" > "${gpu_state_file}"
while true; do
  # Sleep before the first check: the file still holds "reset" right after
  # the write above.
  sleep 3
  gpu_reset_waited=$((gpu_reset_waited + 3))
  if grep -q clean "${gpu_state_file}"; then
    echo "GPUs state is \"clean\""
    break
  fi
  if [ "${gpu_reset_waited}" -ge "${gpu_reset_timeout}" ]; then
    echo "Timed out after ${gpu_reset_waited}s waiting for GPUs state to become \"clean\"" >&2
    exit 1
  fi
done
# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .
...@@ -14,7 +27,8 @@ trap remove_docker_container EXIT ...@@ -14,7 +27,8 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image
# Export HIP_VISIBLE_DEVICES before starting the container; `-e NAME` with no
# value forwards the variable's current value from this shell into the
# container.
export HIP_VISIBLE_DEVICES=1
# Start the API server in the background so the script can go on to wait for it.
docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
# Wait for the server to start # Wait for the server to start
wait_for_server_to_start() { wait_for_server_to_start() {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment