Run rocHPL through the OpenMPI/UCX stack bundled under tpl/.

scripts/mpirun_rochpl.in
  * accept ID=kylin and ID=rocky in supported_distro
  * put tpl/openmpi/lib and tpl/ucx/lib on LD_LIBRARY_PATH, export OPAL_PREFIX
  * replace the auto-detected mpi_args with an explicit launch line
    (hosts node01/node02, ssh port 3333, 8 ranks per node)
  * rsync build, tpl and the input file to node02 before launching

scripts/run_rochpl.in
  * accept ID=kylin and ID=rocky in supported_distro
  * put tpl/ucx/lib on LD_LIBRARY_PATH
  * query device/NUMA topology with hy-smi
  * export a per-rank UCX_NET_DEVICES taken from a fixed IB_MAP table
  * start rochpl under numactl, bound to ${mynuma}

Review fixes folded into this revision
  * launch with -np ${np} again instead of a literal 16, so the rank count
    follows the -P/-Q values passed on in rochpl_args
  * do not leave a trailing ':' on LD_LIBRARY_PATH when it was unset
  * abort when the rsync to node02 fails instead of launching anyway
  * fall back to UCX_NET_DEVICES=all when IB_MAP has no entry for the rank
  * use hy-smi only when it is on PATH, else ${rocm_dir}/bin/rocm-smi
  * drop the commented-out launch line; quote the numactl node arguments

NOTE(review): whitespace on context lines was reconstructed; apply with
"git apply --ignore-whitespace" or "patch -l" and confirm against the tree.

diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..a0e8a41 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
   fi
 
   case "${ID}" in
-    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
         true
         ;;
     *)  printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
 inputfile=false
 cmdrun=false
 
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=${ompi_lib_dir}:${ucx_lib_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+export OPAL_PREFIX=$ompi_prefix
 devices=
 
 # #################################################
@@ -111,21 +118,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
 num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
 total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
 
-#Default MPI options
-mpi_args=
-
-#Check if using OpenMPI
-if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
-  mpi_args+=" --map-by node --rank-by slot --bind-to none "
-
-  #Check if this is OpenMPI+UCX
-  ompi_info=$(dirname ${mpi_bin})/ompi_info
-  if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
-    # ucx-specific args
-    mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
-  fi
-fi
-
 # #################################################
 # Parameter parsing
 # #################################################
@@ -153,7 +145,7 @@ while true; do
         exit 0
         ;;
     --version)
-        ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+        ${mpi_bin} --allow-run-as-root -np 1 ${rochpl_runscript} --version
         exit 0
         ;;
     -P)
@@ -218,5 +210,25 @@ if [ ! -z "${devices}" ]; then
   rochpl_args+=" --devices=${devices}"
 fi
 
+echo "Copying files..."
+rsync -az -e 'ssh -p 3333' build tpl "${filename}" node02:/workspace/ || { echo "rsync to node02 failed" >&2; exit 1; }
+
 #run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root \
+  --prefix ${ompi_prefix} \
+  --map-by ppr:8:node --bind-to none \
+  --mca pml ucx \
+  --mca osc ucx \
+  --mca btl ^openib \
+  --mca btl_tcp_if_include p14p2 \
+  --mca plm_rsh_args "-p 3333" \
+  --mca coll_hcoll_enable 0 \
+  -x UCX_TLS=self,sm,rocm,rc \
+  -x UCX_RNDV_SCHEME=put_zcopy \
+  -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
+  -x UCX_MEMTYPE_CACHE=n \
+  -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+  -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+  -np ${np} \
+  -H node01:8,node02:8 \
+  ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..68c1958 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
   fi
 
   case "${ID}" in
-    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
         true
         ;;
     *)  printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
 
 devices=
 
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${ucx_lib_dir}:${rocm_dir}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 
 # #################################################
 # Parameter parsing
@@ -274,7 +276,9 @@ myq=$((rank/p))
 cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
 
 #construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+#prefer hy-smi when it is on PATH, otherwise fall back to rocm-smi
+smi_bin=$(command -v hy-smi || echo "${rocm_dir}/bin/rocm-smi")
+devicelist=$("${smi_bin}" --csv --showtoponuma | tail -n +2 | tr ',' "\t")
 
 #count the cpus per core
 threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -361,9 +365,23 @@ export OMP_NUM_THREADS=${omp_num_threads}
 export OMP_PLACES=${omp_places}
 export OMP_PROC_BIND=true
 
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+  "mlx5_1:1"   # GPU 0 -> NUMA 3
+  "mlx5_2:1"   # GPU 1 -> NUMA 3
+  "mlx5_3:1"   # GPU 2 -> NUMA 0
+  "mlx5_4:1"   # GPU 3 -> NUMA 0
+  "mlx5_7:1"   # GPU 4 -> NUMA 7
+  "mlx5_8:1"   # GPU 5 -> NUMA 7
+  "mlx5_9:1"   # GPU 6 -> NUMA 4
+  "mlx5_10:1"  # GPU 7 -> NUMA 4
+)
+
+myib=${IB_MAP[$rank]:-all}
+export UCX_NET_DEVICES=${myib}
 
 if [[ $globalRank -lt $size ]]; then
-  echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+  echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
 fi
 
 rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +392,4 @@ else
 fi
 
 #run
-${rochpl_bin} ${rochpl_args}
+numactl -N "${mynuma}" -m "${mynuma}" ${rochpl_bin} ${rochpl_args}