Unverified Commit 3c3c547c authored by Alexei-V-Ivanov-AMD's avatar Alexei-V-Ivanov-AMD Committed by GitHub
Browse files

Enabling "2 node" distributed tests in the AMD CI pipeline. (#32719)


Signed-off-by: default avatarDCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com>
Co-authored-by: default avatarDCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com>
Co-authored-by: default avatarTJian <tunjian.tan@embeddedllm.com>
parent 1cbccb6d
...@@ -44,6 +44,17 @@ cleanup_docker() { ...@@ -44,6 +44,17 @@ cleanup_docker() {
fi fi
} }
cleanup_network() {
for node in $(seq 0 $((NUM_NODES-1))); do
if docker pr -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}"
fi
done
if docker network ls | grep docker-net; then
docker network rm docker-net
fi
}
# Call the cleanup docker function # Call the cleanup docker function
cleanup_docker cleanup_docker
...@@ -224,6 +235,35 @@ if [[ $commands == *"--shard-id="* ]]; then ...@@ -224,6 +235,35 @@ if [[ $commands == *"--shard-id="* ]]; then
echo "All shards reported no tests collected. Failing the build." echo "All shards reported no tests collected. Failing the build."
exit 1 exit 1
fi fi
elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
myIFS=$IFS
IFS=','
read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${commands}"
composite_command=$(echo "${composite_command} && ${commands}")
done
/bin/bash -c "${composite_command}"
cleanup_network
else
echo "Failed to parse node commands! Exiting."
cleanup_network
exit 111
fi
else else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \
......
...@@ -1278,7 +1278,7 @@ steps: ...@@ -1278,7 +1278,7 @@ steps:
- label: 2 Node Tests (4 GPUs in total) # 16min - label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdmultinode]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
...@@ -1292,15 +1292,15 @@ steps: ...@@ -1292,15 +1292,15 @@ steps:
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed'
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 68min
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment