rochpl-scripts-2xbw1000.patch 4.92 KB
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..94f7b0d 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
   fi
 
   case "${ID}" in
-    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
         true
         ;;
     *)  printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
 inputfile=false
 cmdrun=false
 
+tpl_dir="$(dirname "$(readlink -f "$0")")/../tpl"
+ompi_prefix="${tpl_dir}/openmpi"
+ompi_lib_dir="${ompi_prefix}/lib"
+ucx_lib_dir="${tpl_dir}/ucx/lib"
+# Prefer the bundled OpenMPI/UCX; append the old LD_LIBRARY_PATH only if non-empty (an empty entry means cwd).
+export LD_LIBRARY_PATH="${ompi_lib_dir}:${ucx_lib_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+export OPAL_PREFIX="${ompi_prefix}"
 devices=
 
 # #################################################
@@ -111,21 +118,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
 num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
 total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
 
-#Default MPI options
-mpi_args=
-
-#Check if using OpenMPI
-if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
-  mpi_args+=" --map-by node --rank-by slot --bind-to none "
-
-  #Check if this is OpenMPI+UCX
-  ompi_info=$(dirname ${mpi_bin})/ompi_info
-  if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
-    # ucx-specific args
-    mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
-  fi
-fi
-
 # #################################################
 # Parameter parsing
 # #################################################
@@ -153,7 +145,7 @@ while true; do
         exit 0
         ;;
     --version)
-        ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+        ${mpi_bin} --allow-run-as-root -np 1 ${rochpl_runscript} --version
         exit 0
         ;;
     -P)
@@ -218,5 +210,24 @@ if [ ! -z "${devices}" ]; then
   rochpl_args+=" --devices=${devices}"
 fi
 
+echo "Copying files..."
+rsync -az -e 'ssh -p 3333' build tpl "${filename}" node02:/workspace/ || echo "warning: rsync to node02 failed; remote files may be stale" >&2
+
 #run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root \
+  --prefix "${ompi_prefix}" \
+  --map-by ppr:8:node --bind-to none \
+  --mca pml ucx \
+  --mca btl ^openib \
+  --mca btl_tcp_if_include p14p2 \
+  --mca plm_rsh_args "-p 3333" \
+  --mca coll_hcoll_enable 0 \
+  -x UCX_TLS=self,sm,rocm,rc \
+  -x UCX_RNDV_SCHEME=put_zcopy \
+  -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
+  -x UCX_MEMTYPE_CACHE=n \
+  -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+  -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+  -np 16 \
+  -H node01:8,node02:8 \
+  ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..68c1958 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
   fi
 
   case "${ID}" in
-    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
         true
         ;;
     *)  printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
 
 devices=
 
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir="$(dirname "$(readlink -f "$0")")/../tpl"
+ucx_lib_dir="${tpl_dir}/ucx/lib"  # bundled UCX, searched ahead of ${rocm_dir}/lib
+export LD_LIBRARY_PATH="${rocblas_dir}:${blas_dir}:${ucx_lib_dir}:${rocm_dir}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"  # no trailing empty (cwd) entry
 
 # #################################################
 # Parameter parsing
@@ -274,7 +276,7 @@ myq=$((rank/p))
 cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
 
 #construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
 
 #count the cpus per core
 threads_per_core=$(echo "${cpulist}" | grep -c ".*	0	.*")
@@ -361,9 +363,23 @@ export OMP_NUM_THREADS=${omp_num_threads}
 export OMP_PLACES=${omp_places}
 export OMP_PROC_BIND=true
 
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+    "mlx5_1:1"   # GPU 0 -> NUMA 3
+    "mlx5_2:1"   # GPU 1 -> NUMA 3
+    "mlx5_3:1"   # GPU 2 -> NUMA 0
+    "mlx5_4:1"   # GPU 3 -> NUMA 0
+    "mlx5_7:1"   # GPU 4 -> NUMA 7
+    "mlx5_8:1"   # GPU 5 -> NUMA 7
+    "mlx5_9:1"   # GPU 6 -> NUMA 4
+    "mlx5_10:1"  # GPU 7 -> NUMA 4
+)
+
+myib=${IB_MAP[$rank]}
+export UCX_NET_DEVICES=${myib}
 
 if [[ $globalRank -lt $size ]]; then
-  echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+  echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
 fi
 
 rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +390,5 @@ else
 fi
 
 #run
-${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args}
+numactl -N ${mynuma} -m ${mynuma} ${rochpl_bin} ${rochpl_args}