Enabling "2 node" distributed tests in the AMD CI pipeline. (#32719)

Signed-off-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com> Co-authored-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com> Co-authored-by: TJian <tunjian.tan@embeddedllm.com>

Enabling "2 node" distributed tests in the AMD CI pipeline. (#32719)
Signed-off-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com> Co-authored-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com> Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
3c3c547c · Alexei-V-Ivanov-AMD · GitHub · 1cbccb6d · 3c3c547c · 3c3c547c
Unverified Commit 3c3c547c authored Jan 27, 2026 by Alexei-V-Ivanov-AMD Committed by GitHub Jan 27, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 46 additions and 6 deletions

.buildkite/scripts/hardware_ci/run-amd-test.sh .buildkite/scripts/hardware_ci/run-amd-test.sh +40 -0

.buildkite/test-amd.yaml .buildkite/test-amd.yaml +6 -6

No files found.
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -44,6 +44,17 @@ cleanup_docker() {
  fi
 }
+cleanup_network() {  # Stop containers node0..node(NUM_NODES-1) if present, then remove the docker-net network if present.
+  for node in $(seq 0 $((NUM_NODES-1))); do  # NUM_NODES is read from the calling environment
+    if docker ps -a -q -f name="node${node}" | grep -q .; then  # 'docker ps -a -q -f name=...' prints matching container IDs, including stopped ones
+      docker stop "node${node}"
+    fi
+  done
+  if docker network ls | grep docker-net; then  # grep is not silenced here, so the matching row is echoed to the log
+    docker network rm docker-net
+  fi
+}
 # Call the cleanup docker function
 cleanup_docker
@@ -224,6 +235,35 @@ if [[ $commands == *"--shard-id="* ]]; then
    echo "All shards reported no tests collected. Failing the build."
    exit 1
  fi
+elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then  # branch for the two-node test group: pair up per-node commands and run them via run-multi-node-test.sh
+  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')  # keeps the text between "Docker version " and ", build"
+  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then  # captures: 1 = text before the lists, 2 = first bracketed list, 3 = second bracketed list
+      prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')  # leading text with every ';' removed
+      echo "PREFIX: ${prefix}"
+      export composite_command="(command rocm-smi || true)"  # chain starts with a rocm-smi call whose failure is ignored
+      myIFS=$IFS  # remember IFS so it can be restored after the comma split
+      IFS=','
+      read -ra node0 <<< ${BASH_REMATCH[2]}  # comma-separated entries of the first list
+      read -ra node1 <<< ${BASH_REMATCH[3]}  # comma-separated entries of the second list
+      IFS=$myIFS
+      for i in "${!node0[@]}";do 
+        command_node_0=$(echo ${node0[i]} | sed 's/\"//g')  # strip double quotes from the entry
+        command_node_1=$(echo ${node1[i]} | sed 's/\"//g')  # NOTE(review): node1 is indexed by node0's indices - assumes both lists have the same length; confirm
+        export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
+        echo "COMMANDS: ${commands}"
+        composite_command=$(echo "${composite_command} && ${commands}")  # joined with && so a failing invocation stops the rest of the chain
+      done
+      /bin/bash -c "${composite_command}"  # NOTE(review): the exit status is not captured before cleanup_network runs - confirm test failures still fail the build
+      cleanup_network
+  else
+      echo "Failed to parse node commands! Exiting."  # reached when $commands does not match the two-list pattern above
+      cleanup_network
+      exit 111
+  fi
 else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \

--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1278,7 +1278,7 @@ steps:
 - label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdmultinode]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@@ -1292,15 +1292,15 @@ steps:
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
-  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)  | grep 'Same node test passed'   | grep 'Node count test passed'
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py 
-    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py 
    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py 
-    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py 
    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
 - label: Distributed Tests (2 GPUs) # 68min