"git@developer.sourcefind.cn:jerrrrry/infinicore.git" did not exist on "527086e975deb9b1dbb327692e83a7082f6ae6e2"
Commit 6d1f7610 authored by one's avatar one
Browse files

Update rocHPL patches

parent ee64defa
## Single Node with BW1000 or 8xBW150
Patch files:
- `rochpl-install.patch`
- `rochpl-scripts-bw.patch`
## Two Nodes with 2xBW1000
Patch files:
- `rochpl-install.patch`
- `rochpl-scripts-2xbw1000.patch`
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b80b24..563122a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,7 +83,7 @@ foreach(i ${rochpl_device_source})
endforeach()
# HIP flags workaround while target_compile_options does not work
-list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp")
+list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp --gpu-max-threads-per-block=1024")
list(APPEND CMAKE_HOST_FLAGS "-Wno-deprecated-declarations")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -94,25 +94,51 @@ else()
list(APPEND CMAKE_HOST_FLAGS "-O3;-march=native")
endif()
-# GPU arch targets
-set(TARGETS "gfx900;gfx906")
-if(HIP_VERSION VERSION_GREATER_EQUAL "3.7")
- set(TARGETS "${TARGETS};gfx908")
-endif()
-if(HIP_VERSION VERSION_GREATER_EQUAL "4.3")
- set(TARGETS "${TARGETS};gfx90a")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "5.7")
- set(TARGETS "${TARGETS};gfx942")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "6.5")
- set(TARGETS "${TARGETS};gfx950;gfx1100")
+set(ARCHS "") # use plural to indicate list
+if(DEFINED HPL_BUILD_ARCH AND NOT HPL_BUILD_ARCH STREQUAL "")
+ string(REPLACE "," ";" ARCHS "${HPL_BUILD_ARCH}")
+ list(TRANSFORM ARCHS STRIP)
+ list(REMOVE_DUPLICATES ARCHS)
+ message(STATUS "Using manually specified GPU targets: ${ARCHS}")
+else()
+ message(STATUS "Detecting available architecture")
+ ############ Find using rocminfo #####################
+ find_program(ROCMINFO_EXECUTABLE rocminfo)
+ if(ROCMINFO_EXECUTABLE)
+ execute_process(
+ COMMAND ${ROCMINFO_EXECUTABLE}
+ OUTPUT_VARIABLE ROCMINFO_OUTPUT
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+ # 1) Only match lines where the token follows "Name:"
+ string(REGEX MATCHALL "Name:[ \t]+gfx[0-9a-z]+" ARCH_MATCHES "${ROCMINFO_OUTPUT}")
+
+ # 2) Strip the leading "Name: " to keep just gfx tokens
+ string(REGEX REPLACE "Name:[ \t]+" "" ARCHS "${ARCH_MATCHES}")
+
+ # 3) Remove duplicates
+ list(REMOVE_DUPLICATES ARCHS)
+
+ foreach(match ${ARCHS})
+ string(REGEX REPLACE "Name:\\s+" "" arch "${match}")
+ list(APPEND ARCH "${arch}")
+ endforeach()
+ endif()
endif()
if (HIP_VERSION VERSION_GREATER_EQUAL "7.0")
set(TARGETS "${TARGETS};gfx1201")
endif()
-foreach(target ${TARGETS})
+if(ARCHS STREQUAL "")
+ message(FATAL_ERROR "No GPU architectures detected via rocminfo and no BUILD_ARCH specified. Use ./install.sh --arch=gfxXXX")
+endif()
+
+message(STATUS "Building for GPU architecture: ${ARCHS}")
+
+# Generate HIP_HIPCC_FLAGS
+foreach(target ${ARCHS})
list(APPEND HIP_HIPCC_FLAGS "--offload-arch=${target}")
endforeach()
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 6d6be5d..468420e 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -101,7 +101,8 @@ if(NOT ROCM_FOUND)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
- find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ # find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ set(CMAKE_MODULE_PATH "${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}/share/rocm/cmake;${CMAKE_MODULE_PATH}")
endif()
include(ROCMSetupVersion)
diff --git a/install.sh b/install.sh
index b30a3fb..9ada030 100755
--- a/install.sh
+++ b/install.sh
@@ -17,6 +17,7 @@ function display_help()
echo " [--with-rocm=<dir>] Path to ROCm install (Default: /opt/rocm)"
echo " [--with-rocblas=<dir>] Path to rocBLAS library (Default: /opt/rocm/rocblas)"
echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI)"
+ echo " [--arch] Specify comma separated architecture list to build (Default: detect from rocm_agent_enumerator)"
echo " [--with-mpi-gtl=<dir>] Path to external MPI-GTL install (Optional: defaults to no gtl support)"
echo " [--verbose-print] Verbose output during HPL setup (Default: true)"
echo " [--progress-report] Print progress report to terminal during HPL run (Default: true)"
@@ -33,7 +34,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -68,11 +69,11 @@ exit_with_error( )
printf "sudo apt install -y ${library_dependencies_ubuntu[*]}\n"
;;
- centos|rhel|tencentos)
+ centos|rhel|tencentos|kylin)
printf "sudo yum -y --nogpgcheck install ${library_dependencies_centos[*]}\n"
;;
- fedora)
+ fedora|rocky)
printf "sudo dnf install -y ${library_dependencies_fedora[*]}\n"
;;
@@ -152,6 +153,11 @@ install_openmpi( )
exit 3
fi
+ UCX_ROOT="$(pwd)/tpl/ucx"
+ export LD_LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LD_LIBRARY_PATH}"
+ export LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LIBRARY_PATH}"
+ export CPATH="${UCX_ROOT}/include:${CPATH}"
+
if [ ! -d "./tpl/openmpi" ]; then
mkdir -p tpl && cd tpl
git clone --branch v5.0.7 --recursive https://github.com/open-mpi/ompi.git openmpi
@@ -232,7 +238,7 @@ enable_tracing=false
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
- GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
+ GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,arch:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
else
echo "Need a new version of getopt"
exit_with_error 1
@@ -263,6 +269,9 @@ while true; do
--with-mpi)
with_mpi=${2}
shift 2 ;;
+ --arch)
+ arch=${2}
+ shift 2 ;;
--with-mpi-gtl)
with_mpi_gtl=${2}
shift 2 ;;
@@ -347,6 +356,9 @@ pushd .
if [[ "${enable_tracing}" == on || "${enable_tracing}" == true || "${enable_tracing}" == 1 || "${enable_tracing}" == enabled ]]; then
cmake_common_options="${cmake_common_options} -DHPL_TRACING=ON"
fi
+ if [[ -n "${arch}" ]]; then
+ cmake_common_options="${cmake_common_options} -DHPL_BUILD_ARCH=${arch}"
+ fi
shopt -u nocasematch
# Build library with AMD toolchain because of existence of device kernels
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..05a96e0 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
inputfile=false
cmdrun=false
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
devices=
# #################################################
@@ -117,12 +124,16 @@ mpi_args=
#Check if using OpenMPI
if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
mpi_args+=" --map-by node --rank-by slot --bind-to none "
+ #mpi_args+=" --map-by numa:PE=16 --bind-to core --report-bindings "
#Check if this is OpenMPI+UCX
ompi_info=$(dirname ${mpi_bin})/ompi_info
if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
# ucx-specific args
- mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
+ mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct \
+ -x UCX_TLS=self,sm,rocm_ipc,rocm_copy,rc_mlx5 \
+ -x UCX_MEMTYPE_CACHE=n \
+ ${mpi_args}"
fi
fi
@@ -153,7 +164,7 @@ while true; do
exit 0
;;
--version)
- ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+ ${mpi_bin} --allow-run-as-root -np 1 ${mpi_args} ${rochpl_runscript} --version
exit 0
;;
-P)
@@ -219,4 +230,4 @@ if [ ! -z "${devices}" ]; then
fi
#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..1380d3a 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
devices=
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
@@ -274,7 +276,7 @@ myq=$((rank/p))
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -363,7 +365,8 @@ export OMP_PROC_BIND=true
if [[ $globalRank -lt $size ]]; then
- echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+ echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
+# echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma"
fi
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +377,5 @@ else
fi
#run
-${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args}
+numactl --cpunodebind=${mynuma} --membind=${mynuma} ${rochpl_bin} ${rochpl_args}
diff --git a/src/HPL_pdtest.cpp b/src/HPL_pdtest.cpp
index 94a0d3f..3135763 100644
--- a/src/HPL_pdtest.cpp
+++ b/src/HPL_pdtest.cpp
@@ -212,7 +212,7 @@ void HPL_pdtest(HPL_T_test* TEST,
ctime(&current_time_end));
}
#ifdef HPL_PROGRESS_REPORT
- printf("Final Score: %7.4e GFLOPS \n", Gflops);
+ printf("Final Score: %7.9e GFLOPS \n", Gflops);
#endif
}
#ifdef HPL_DETAILED_TIMING
diff --git a/src/pgesv/HPL_pdgesv.cpp b/src/pgesv/HPL_pdgesv.cpp
index d6c99c3..280a9a5 100644
--- a/src/pgesv/HPL_pdgesv.cpp
+++ b/src/pgesv/HPL_pdgesv.cpp
@@ -336,7 +336,7 @@ void HPL_pdgesv(HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_pmat* A) {
printf(" %9.3e |", step_gflops);
#endif
- printf(" %9.3e \n", gflops);
+ printf(" %9.9e \n", gflops);
}
#endif
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..2344410 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
inputfile=false
cmdrun=false
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
devices=
# #################################################
@@ -111,21 +118,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
-#Default MPI options
-mpi_args=
-
-#Check if using OpenMPI
-if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
- mpi_args+=" --map-by node --rank-by slot --bind-to none "
-
- #Check if this is OpenMPI+UCX
- ompi_info=$(dirname ${mpi_bin})/ompi_info
- if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
- # ucx-specific args
- mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
- fi
-fi
-
# #################################################
# Parameter parsing
# #################################################
@@ -153,7 +145,7 @@ while true; do
exit 0
;;
--version)
- ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+ ${mpi_bin} --allow-run-as-root -np 1 ${rochpl_runscript} --version
exit 0
;;
-P)
@@ -219,4 +211,11 @@ if [ ! -z "${devices}" ]; then
fi
#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root \
+ --bind-to none \
+ --mca pml ucx \
+ --mca btl ^vader,tcp,openib,uct \
+ -x UCX_TLS=self,sm,rocm,rc \
+ -x UCX_MEMTYPE_CACHE=n \
+ -np ${np} \
+ ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..3f840a7 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
devices=
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
@@ -274,7 +276,7 @@ myq=$((rank/p))
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -363,7 +365,7 @@ export OMP_PROC_BIND=true
if [[ $globalRank -lt $size ]]; then
- echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+ echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
fi
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +376,5 @@ else
fi
#run
-${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args}
+numactl --cpunodebind=${mynuma} --membind=${mynuma} ${rochpl_bin} ${rochpl_args}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment