Unverified Commit 80c92753 authored by Alexei-V-Ivanov-AMD's avatar Alexei-V-Ivanov-AMD Committed by GitHub
Browse files

Enabling cooperative multi-gpu tests on multi-gpu nodes (#27986)


Signed-off-by: default avatarAlexei V. Ivanov <alexei.ivanov@amd.com>
parent e50c4546
......@@ -173,6 +173,14 @@ fi
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
......@@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
......@@ -217,8 +226,8 @@ else
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment