# env.sh
# Environment variables for the communication stack (NCCL/RCCL, Gloo, rocSHMEM) and Triton.
# These variables should not be modified.

# Absolute path of the directory that contains the invoking script ($0).
# All expansions are quoted so paths containing whitespace survive, and `--`
# stops a $0 that begins with "-" (e.g. "-bash" in a login shell) from being
# parsed as an option by dirname/cd.
# NOTE(review): when this file is sourced, $0 names the caller, not this file;
# "${BASH_SOURCE[0]}" may be what is intended -- confirm against the callers.
CURRENT_DIR="$( cd -- "$( dirname -- "$0" )" && pwd )"
# Two directory levels above CURRENT_DIR.
MEGATRON_PATH="$( dirname -- "$( dirname -- "${CURRENT_DIR}" )" )"

# NCCL / RCCL settings shared by every machine profile.
export NCCL_ALGO=Ring
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=4
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
# Triton's home directory is pointed at /tmp.
export TRITON_HOME=/tmp

# These variables must be adapted to the environment of the machine you are using.
# Enable exactly one of the profiles below [wz, 508-shca, 508-mlnx]: uncomment its block and comment out the others.

# wz
# Network interface used by Gloo and the InfiniBand device:port list for NCCL.
export GLOO_SOCKET_IFNAME="enp33s0f3u1"
export NCCL_IB_HCA="mlx5_0:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1"

# Topology files, both located under ${MEGATRON_PATH}/requirements.
export NCCL_TOPO_FILE="${MEGATRON_PATH}/requirements/topo-input.xml"
export ROCSHMEM_TOPO_FILE_FORCE="${MEGATRON_PATH}/requirements/topo.config"

# rocSHMEM limits and device allow-list.
# Heap size is 10 * 1024^3 (presumably bytes, i.e. 10 GiB -- confirm).
export ROCSHMEM_HEAP_SIZE="10737418240"
export ROCSHMEM_MAX_NUM_CONTEXTS="48"
export ROCSHMEM_ALLOWED_IBV_DEVICES="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9"

# 508-shca
# module load app/rccl/shca_rdma_plugins/v8 
# module load app/rccl/tests 
# module load app/rccl/topos/default
# module load mpi/openmpi/5.0.3/gcc-8.5.0/shca_ucx-1.15.0
# export NCCL_IB_HCA=shca_0:1,shca_1:1,shca_2:1,shca_3:1
# export NCCL_PXN_DISABLE=0
# export RCCL_PXN_GPU_BALANCE=1
# export RCCL_NET_PLANE="shca_0,shca_3|shca_1,shca_2"
# export NCCL_PLUGIN_P2P=ib
# export NCCL_SOCKET_IFNAME=ib0
# export SHCA_DEBUG_MASK=0
# export SHCA_CMR_LOG_LEVEL=1
# export SHCA_SHUT_UP_FWB=0
# export NCCL_NET_PLUGIN=shca
# export UCX_IB_NUM_PATHS=1

# 508-mlnx
# module load app/rccl/tests
# module load app/rccl/topos/default
# module load mpi/hpcx/2.18.0/gcc-8.5.0/mlnx
# export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1
# export NCCL_PXN_DISABLE=0
# export NCCL_NCHANNELS_PER_PEER=32
# export NCCL_MIN_P2P_NCHANNELS=32
# export NCCL_MAX_P2P_NCHANNELS=32
# export RCCL_P2P_XHCL_CHANNEL_NUM=30