test-run.sh 978 Bytes
Newer Older
wangkaixiong's avatar
init  
wangkaixiong committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#!/bin/bash

# Launch the all_reduce_perf benchmark across two hosts ("master" and
# "node1"), 8 slots each, 16 ranks in total.

# Options consumed by mpirun itself.
launcher_opts=(
    -H master:8,node1:8           # host list with per-host slot counts
    --prefix /opt/mpi             # MPI installation prefix
    -np 16                        # total number of ranks to start
    --allow-run-as-root
    --mca plm_rsh_args "-p 2222"  # extra args for the rsh/ssh launcher (presumably the SSH port)
)

# Environment settings forwarded to every rank; each one becomes "-x NAME=VALUE".
exported_env=(
    NCCL_DEBUG=WARN
    HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    HSA_FORCE_FINE_GRAIN_PCIE=1
    NCCL_MAX_NCHANNELS=20
    NCCL_MIN_NCHANNELS=20
    NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10
)

env_flags=()
for setting in "${exported_env[@]}"; do
    env_flags+=(-x "$setting")
done

# Benchmark binary and its arguments. Per the nccl-tests conventions:
# -b/-e are the min/max message sizes, -f the size multiplier between steps,
# -g the GPUs per thread and -d the data type.
benchmark=(./build/all_reduce_perf -b 7618 -e 1G -f 2 -g 1 -d half)

# Keep mpirun as the final command so its exit status is the script's status.
mpirun "${launcher_opts[@]}" "${env_flags[@]}" "${benchmark[@]}"

    #-x NCCL_IB_QP_PER_CONNECTION=4 \
    #-x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 \
	# -x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
    # --mca plm rsh \
    # --mca plm_rsh_agent ssh \
    # -x NCCL_GRAPH_FILE=./topo.xml \
    # -x NCCL_ALGO=tree,ring \
    # --mca plm_rsh_args "-2 -o StrictHostKeyChecking=no" \  # communicate over SSH
    # -x NCCL_DEBUG=INFO  # print debug logs to help locate problems
    # -x NCCL_IB_GID_INDEX=3  # if IB spans multiple subnets, specify the GID index