[Hardware][AMD][CI] Refactor AMD tests to properly use BuildKite parallelism (#32745)

Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>

[Hardware][AMD][CI] Refactor AMD tests to properly use BuildKite parallelism (#32745)
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
08e09499 · Matt · GitHub · d88a1df6 · 08e09499 · 08e09499
Unverified Commit 08e09499 authored Feb 04, 2026 by Matt Committed by GitHub Feb 04, 2026
Show whitespace changes
Inline Side-by-side

Showing with 14 additions and 61 deletions

.buildkite/scripts/hardware_ci/run-amd-test.sh .buildkite/scripts/hardware_ci/run-amd-test.sh +5 -52

.buildkite/test-amd.yaml .buildkite/test-amd.yaml +9 -9

No files found.
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -87,7 +87,7 @@ mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

 commands=$@
-echo "Commands:$commands"
+echo "Raw commands: $commands"

 commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

@@ -169,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+commands=$(echo "$commands" | sed 's/ \\ / /g')
+echo "Final commands: $commands"
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -176,7 +179,6 @@ fi
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13


-PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

 # Test that we're launching on the machine that has
@@ -187,56 +189,7 @@ if [[ -z "$render_gid" ]]; then
  exit 1
 fi

-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
-if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used
-  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
-  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    # assign shard-id for each shard
-    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
-    echo "Shard ${GPU} commands:$commands_gpu"
-    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
-    docker run \
-        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-        --network=host \
-        --shm-size=16gb \
-        --group-add "$render_gid" \
-        --rm \
-        -e HIP_VISIBLE_DEVICES="${GPU}" \
-        -e HF_TOKEN \
-        -e AWS_ACCESS_KEY_ID \
-        -e AWS_SECRET_ACCESS_KEY \
-        -v "${HF_CACHE}:${HF_MOUNT}" \
-        -e "HF_HOME=${HF_MOUNT}" \
-        -e "PYTHONPATH=${MYPYTHONPATH}" \
-        --name "${container_name}_${GPU}" \
-        "${image_name}" \
-        /bin/bash -c "${commands_gpu}" \
-        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
-    PIDS+=($!)
-  done
-  #wait for all processes to finish and collect exit codes
-  for pid in "${PIDS[@]}"; do
-    wait "${pid}"
-    STATUS+=($?)
-  done
-  at_least_one_shard_with_tests=0
-  for st in "${STATUS[@]}"; do
-    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
-      echo "One of the processes failed with $st"
-      exit "${st}"
-    elif [[ ${st} -eq 5 ]]; then
-      echo "Shard exited with status 5 (no tests collected) - treating as success"
-    else # This means st is 0
-      at_least_one_shard_with_tests=1
-    fi
-  done
-  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
-    echo "All shards reported no tests collected. Failing the build."
-    exit 1
-  fi
-
-elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then

  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')


--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -542,7 +542,7 @@ steps:
 - label: LoRA Test %N # 20min each
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/lora
@@ -636,7 +636,7 @@ steps:
 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/attention/
@@ -651,7 +651,7 @@ steps:
 - label: Kernels Quantization Test %N # 64min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/
@@ -664,7 +664,7 @@ steps:
 - label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
@@ -742,7 +742,7 @@ steps:
 - label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
@@ -753,7 +753,7 @@ steps:
 - label: Benchmarks CLI Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
@@ -827,7 +827,7 @@ steps:
 - label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
@@ -888,7 +888,7 @@ steps:
 - label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
@@ -909,7 +909,7 @@ steps:
 - label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies: