diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in index 155f502..0f15d38 100755 --- a/scripts/mpirun_rochpl.in +++ b/scripts/mpirun_rochpl.in @@ -31,6 +31,9 @@ function display_help() echo " [-i] Input file. When set, all other commnand " echo " line parameters are ignored, and problem " echo " parameters are read from input file. " + echo " [-H|--hosts] Comma-separated list of nodes to run on. " + echo " [--tcp-iface] TCP interface to use for communication. " + echo " [--port] SSH port to use for remote connections. " echo " [-h|--help] prints this help message " echo " [--version] Print rocHPL version number. " } @@ -46,7 +49,7 @@ supported_distro( ) fi case "${ID}" in - debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos) + debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky) true ;; *) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n" @@ -80,6 +83,15 @@ fi # The following function exits script if an unsupported distro is detected supported_distro +# Detect the number of GPUs per node +ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID") +if [[ -z "${ngpu_per_node}" || "${ngpu_per_node}" -eq 0 ]]; then + echo "Failed to get the number of GPUs per node via hy-smi. Defaulting to 8." + ngpu_per_node=8 +else + echo "Detected ${ngpu_per_node} GPUs per node." 
+fi + # ################################################# # global variables # ################################################# @@ -101,7 +113,17 @@ filename=HPL.dat inputfile=false cmdrun=false +tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl +ompi_prefix=$tpl_dir/openmpi +ompi_lib_dir=$tpl_dir/openmpi/lib +ucx_lib_dir=$tpl_dir/ucx/lib + +export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH +export OPAL_PREFIX=$ompi_prefix devices= +nodes= +tcp_iface=p14p2 +ssh_port=3333 # ################################################# # MPI Args @@ -111,21 +133,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) -#Default MPI options -mpi_args= - -#Check if using OpenMPI -if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then - mpi_args+=" --map-by node --rank-by slot --bind-to none " - - #Check if this is OpenMPI+UCX - ompi_info=$(dirname ${mpi_bin})/ompi_info - if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then - # ucx-specific args - mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}" - fi -fi - # ################################################# # Parameter parsing # ################################################# @@ -133,7 +140,7 @@ fi # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ $? 
-eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,it:,help,version,devices:, --options hP:Q:p:q:N:i:f: -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,it:,help,version,devices:,hosts:,port:,tcp-iface:, --options hP:Q:p:q:N:i:f:H: -- "$@") else echo "Need a new version of getopt" exit 1 @@ -153,7 +160,7 @@ while true; do exit 0 ;; --version) - ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version + ${mpi_bin} --allow-run-as-root -np 1 ${rochpl_runscript} --version exit 0 ;; -P) @@ -189,6 +196,15 @@ while true; do --devices) devices=${2} shift 2 ;; + -H|--hosts) + nodes=${2} + shift 2 ;; + --tcp-iface) + tcp_iface=${2} + shift 2 ;; + --port) + ssh_port=${2} + shift 2 ;; --) shift ; break ;; *) echo "Unexpected command line parameter received; aborting"; exit 1 @@ -218,5 +234,77 @@ if [ ! -z "${devices}" ]; then rochpl_args+=" --devices=${devices}" fi -#run -${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args} +# Run single-node test if --hosts is not set +if [ -z "${nodes}" ]; then + echo "No compute nodes specified. Running in single-node mode." + + ${mpi_bin} --allow-run-as-root \ + --bind-to none \ + --mca pml ucx \ + --mca osc ucx \ + --mca btl ^vader,tcp,openib,uct \ + --mca coll ^hcoll \ + -x UCX_TLS=self,sm,rocm \ + -x UCX_RNDV_SCHEME=put_zcopy \ + -x UCX_MEMTYPE_CACHE=n \ + -x HSA_FORCE_FINE_GRAIN_PCIE=1 \ + -np ${np} \ + ${rochpl_runscript} ${rochpl_args} +else + echo "Running in multi-node mode. 
Using nodes: ${nodes}" + echo "Using TCP interface: ${tcp_iface}" + echo "Using SSH port: ${ssh_port}" + + # Set rank counts for hosts + IFS=',' read -ra node_array <<< "${nodes}" + hosts_string="" + for node in "${node_array[@]}"; do + hosts_string+="${node}:${ngpu_per_node}," + done + hosts_string="${hosts_string%,}" + + echo "MPI hosts: ${hosts_string}" + + # Copy files to other nodes + current_node=$(hostname) + copyto_hosts=() + for node in "${node_array[@]}"; do + if [[ "${node}" != "${current_node}" ]]; then + copyto_hosts+=("${node}") + fi + done + + # Copy files using rsync only if there are other nodes to copy to + if [ ${#copyto_hosts[@]} -gt 0 ]; then + echo "Copying files to other nodes in parallel: ${copyto_hosts[@]}" + for node in "${copyto_hosts[@]}"; do + if [[ "${inputfile}" == false ]]; then + rsync -az -e "ssh -p ${ssh_port}" build tpl "${node}:/workspace/" & + else + rsync -az -e "ssh -p ${ssh_port}" build tpl "${filename}" "${node}:/workspace/" & + fi + done + wait + echo "Files synchronized successfully."
+ fi + + # Multi-node run + ${mpi_bin} --allow-run-as-root \ + --prefix ${ompi_prefix} \ + --map-by ppr:${ngpu_per_node}:node --bind-to none \ + --mca pml ucx \ + --mca osc ucx \ + --mca btl ^openib \ + --mca btl_tcp_if_include ${tcp_iface} \ + --mca plm_rsh_args "-p ${ssh_port}" \ + --mca coll_hcoll_enable 0 \ + -x UCX_TLS=self,sm,rocm,rc \ + -x UCX_RNDV_SCHEME=put_zcopy \ + -x UCX_RNDV_FRAG_MEM_TYPE=rocm \ + -x UCX_MEMTYPE_CACHE=n \ + -x HSA_FORCE_FINE_GRAIN_PCIE=1 \ + -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \ + -np ${np} \ + -H ${hosts_string} \ + ${rochpl_runscript} ${rochpl_args} +fi \ No newline at end of file diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in index 1522e5d..68c1958 100755 --- a/scripts/run_rochpl.in +++ b/scripts/run_rochpl.in @@ -46,7 +46,7 @@ supported_distro( ) fi case "${ID}" in - debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos) + debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky) true ;; *) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n" @@ -104,7 +104,9 @@ cmdrun=false devices= -export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH +tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl +ucx_lib_dir=$tpl_dir/ucx/lib +export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH # ################################################# # Parameter parsing @@ -274,7 +276,7 @@ myq=$((rank/p)) cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s) #construct list of devices and their numa affinities -devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t") +devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t") #count the cpus per core threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*") @@ -361,9 +363,23 @@ export OMP_NUM_THREADS=${omp_num_threads} export OMP_PLACES=${omp_places} export OMP_PROC_BIND=true +# 
Hard-coded IB mapping for now +declare -a IB_MAP=( + "mlx5_1:1" # GPU 0 -> NUMA 3 + "mlx5_2:1" # GPU 1 -> NUMA 3 + "mlx5_3:1" # GPU 2 -> NUMA 0 + "mlx5_4:1" # GPU 3 -> NUMA 0 + "mlx5_7:1" # GPU 4 -> NUMA 7 + "mlx5_8:1" # GPU 5 -> NUMA 7 + "mlx5_9:1" # GPU 6 -> NUMA 4 + "mlx5_10:1" # GPU 7 -> NUMA 4 +) + +myib=${IB_MAP[$rank]} +export UCX_NET_DEVICES=${myib} if [[ $globalRank -lt $size ]]; then - echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places" + echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places" fi rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}" @@ -374,4 +390,5 @@ else fi #run -${rochpl_bin} ${rochpl_args} +#${rochpl_bin} ${rochpl_args} +numactl -N ${mynuma} -m ${mynuma} ${rochpl_bin} ${rochpl_args}