Commit abad43bf authored by one
Browse files

[rccl-tests] Add topo mapping file

parent 8b3fbed1
...@@ -133,9 +133,9 @@ else ...@@ -133,9 +133,9 @@ else
if [ ${#copyto_hosts[@]} -gt 0 ]; then if [ ${#copyto_hosts[@]} -gt 0 ]; then
echo "[WRAPPER] Copying files to remote nodes in parallel: ${copyto_hosts[*]}" echo "[WRAPPER] Copying files to remote nodes in parallel: ${copyto_hosts[*]}"
for node in "${copyto_hosts[@]}"; do for node in "${copyto_hosts[@]}"; do
rsync -az -e "ssh -p ${ssh_port}" ${PWD}/build ${PWD}/scripts ${rccltest_runscript} ${NCCL_TOPO_FILE} ${NCCL_GRAPH_FILE} "${node}:${PWD}/" & rsync -azP -e "ssh -p ${ssh_port}" ${PWD}/build ${PWD}/scripts ${rccltest_runscript} ${NCCL_TOPO_FILE} ${NCCL_GRAPH_FILE} ${NCCL_TOPO_MAPPING_FILE} "${node}:${PWD}/" &
rsync -az -e "ssh -p ${ssh_port}" /opt/dtk/rccl/lib ${node}:/opt/dtk/rccl/ & rsync -azP -e "ssh -p ${ssh_port}" /opt/dtk/rccl/lib ${node}:/opt/dtk/rccl/ &
rsync -az -e "ssh -p ${ssh_port}" /opt/mpi /opt/ucx ${node}:/opt/ & rsync -azP -e "ssh -p ${ssh_port}" /opt/mpi /opt/ucx ${node}:/opt/ &
done done
wait wait
echo "[WRAPPER] Files synchronized successfully." echo "[WRAPPER] Files synchronized successfully."
......
...@@ -28,7 +28,8 @@ export NCCL_NET_GDR_READ=1 ...@@ -28,7 +28,8 @@ export NCCL_NET_GDR_READ=1
# export NCCL_PROTO=Simple # export NCCL_PROTO=Simple
export NCCL_SIMPLE_CHANNELS=32 export NCCL_SIMPLE_CHANNELS=32
unset NCCL_NCHANNELS_PER_PEER unset NCCL_NCHANNELS_PER_PEER
export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml export NCCL_TOPO_MAPPING_FILE=${PWD}/topo-mapping-bw1000.xml
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml # set either the topo file or the topo mapping file; only one of the two is needed
# export NCCL_GRAPH_FILE=${PWD}/graph-16r-allreduce.xml # export NCCL_GRAPH_FILE=${PWD}/graph-16r-allreduce.xml
./mpirun_rccltest -np 2 \ ./mpirun_rccltest -np 2 \
./build/all_reduce_perf -b 4 -e 16G -f 2 -w 3 -g 1 ./build/all_reduce_perf -b 4 -e 16G -f 2 -w 3 -g 1
......
...@@ -26,7 +26,8 @@ export NCCL_P2P_LEVEL=SYS ...@@ -26,7 +26,8 @@ export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=PHB export NCCL_NET_GDR_LEVEL=PHB
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
unset NCCL_NCHANNELS_PER_PEER unset NCCL_NCHANNELS_PER_PEER
export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml export NCCL_TOPO_MAPPING_FILE=${PWD}/topo-mapping-bw1000.xml
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml # set either the topo file or the topo mapping file; only one of the two is needed
./mpirun_rccltest -np 2 \ ./mpirun_rccltest -np 2 \
./build/alltoall_perf -b 32 -e 16G -f 2 -w 3 -g 1 ./build/alltoall_perf -b 32 -e 16G -f 2 -w 3 -g 1
./mpirun_rccltest -np 4 \ ./mpirun_rccltest -np 4 \
......
...@@ -16,7 +16,8 @@ export NCCL_P2P_LEVEL=SYS ...@@ -16,7 +16,8 @@ export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=PHB export NCCL_NET_GDR_LEVEL=PHB
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
unset NCCL_NCHANNELS_PER_PEER unset NCCL_NCHANNELS_PER_PEER
export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml export NCCL_TOPO_MAPPING_FILE=${PWD}/topo-mapping-bw1000.xml
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml # set either the topo file or the topo mapping file; only one of the two is needed
for g in {0..7}; do for g in {0..7}; do
echo echo
......
<!--
  Groups NIC ids and GPU device indices under per-CPU elements.

  Layout as written below: a single <group> holds four <cpu> elements
  (numaid 3, 0, 7 and 4, in that order). Each <cpu> has one <pci> child
  that lists two <nic> ids and two <gpu> dev indices; GPUs 0 to 7 each
  appear exactly once.

  NOTE(review): presumably this is the file the run scripts point
  NCCL_TOPO_MAPPING_FILE at (topo-mapping-bw1000.xml); confirm the file
  name and the expected schema against the RCCL build in use.
  NOTE(review): the <group> name looks like an encoded platform
  signature; the meaning of its fields is not visible here.
  NOTE(review): NIC ids mlx5_5 and mlx5_6 are not listed; confirm that
  this is intentional.
-->
<system version="2">
<group name="gfx936_8_x86_64_HygonGenuine_mlx5_11_InfiniBand_40-200-200-200-200-40-5-200-200-200-200_1_8_1">
<cpu numaid="3">
<pci>
<nic id="mlx5_1"/>
<nic id="mlx5_2"/>
<gpu dev="0"/>
<gpu dev="1"/>
</pci>
</cpu>
<cpu numaid="0">
<pci>
<nic id="mlx5_3"/>
<nic id="mlx5_4"/>
<gpu dev="2"/>
<gpu dev="3"/>
</pci>
</cpu>
<cpu numaid="7">
<pci>
<nic id="mlx5_7"/>
<nic id="mlx5_8"/>
<gpu dev="4"/>
<gpu dev="5"/>
</pci>
</cpu>
<cpu numaid="4">
<pci>
<nic id="mlx5_9"/>
<nic id="mlx5_10"/>
<gpu dev="6"/>
<gpu dev="7"/>
</pci>
</cpu>
</group>
</system>
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment