#!/bin/bash
#
# Launch the alltoall_perf benchmark through mpirun: 16 processes over two
# hosts (master and node1, 8 slots each).
#
# All file paths below are relative, so run this script from the directory
# that contains ./build/alltoall_perf and the topology XML file.

set -euo pipefail

# Exported into this shell's environment only; it is not listed among the
# -x options below.
export NCCL_TOPO_DUMP_FILE=./topo-dump.xml

# Fail with a clear message (same status the shell would use) when the
# launcher is not installed.
if ! command -v mpirun >/dev/null; then
  printf 'error: mpirun not found in PATH\n' >&2
  exit 127
fi

# Arguments are collected in an array so each option can be commented and
# toggled without breaking a backslash continuation chain.
mpirun_args=(
  # Hosts / process layout.
  -H master:8,node1:8
  --prefix /opt/mpi
  -np 16
  --allow-run-as-root

  # Launcher and TCP transport settings.
  --mca plm_rsh_args "-p 2222"
  --mca btl_tcp_if_include p14p2

  # Environment passed along with -x: the first two take their values from
  # the calling environment, the rest are set explicitly here.
  -x ROCM_PATH
  -x LD_LIBRARY_PATH
  -x NCCL_TOPO_FILE=./topo-0507-115-update.xml
  -x NCCL_DEBUG=WARN
  -x NCCL_SOCKET_IFNAME=p14p2
  -x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  -x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1

  # Benchmark binary and its own options.
  ./build/alltoall_perf -b 7618 -e 1G -f 2 -g 1 -d half
)

mpirun "${mpirun_args[@]}"

# Alternative / currently disabled options, kept for reference:
#
#   -x NCCL_TOPO_FILE=./topo.xml
#   -x NCCL_TOPO_FILE=/data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
#   -x NCCL_IB_QP_PER_CONNECTION=4
#   -x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#   -x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10
#   --mca plm rsh
#   --mca plm_rsh_agent ssh
#   -x NCCL_GRAPH_FILE=./topo.xml
#   -x NCCL_ALGO=tree,ring
#   --mca plm_rsh_args "-2 -o StrictHostKeyChecking=no"   # communicate over SSH
#   -x NCCL_DEBUG=INFO        # emit debug logs to help locate problems
#   -x NCCL_IB_GID_INDEX=3    # with multiple IB subnets, select the GID index