rochpl-install.patch 15 KB
Newer Older
one's avatar
one committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b80b24..563122a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,7 +83,7 @@ foreach(i ${rochpl_device_source})
 endforeach()
 
 # HIP flags workaround while target_compile_options does not work
-list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp")
+list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp --gpu-max-threads-per-block=1024")
 list(APPEND CMAKE_HOST_FLAGS "-Wno-deprecated-declarations")
 
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -94,25 +94,47 @@ else()
   list(APPEND CMAKE_HOST_FLAGS "-O3;-march=native")
 endif()
 
-# GPU arch targets
-set(TARGETS "gfx900;gfx906")
-if(HIP_VERSION VERSION_GREATER_EQUAL "3.7")
-  set(TARGETS "${TARGETS};gfx908")
-endif()
-if(HIP_VERSION VERSION_GREATER_EQUAL "4.3")
-  set(TARGETS "${TARGETS};gfx90a")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "5.7")
-  set(TARGETS "${TARGETS};gfx942")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "6.5")
-  set(TARGETS "${TARGETS};gfx950;gfx1100")
+# GPU arch targets: use HPL_BUILD_ARCH (comma separated) when it is set,
+# otherwise build for the gfx agents that rocminfo reports on this machine.
+set(ARCHS "")  # use plural to indicate list
+if(DEFINED HPL_BUILD_ARCH AND NOT HPL_BUILD_ARCH STREQUAL "")
+  string(REPLACE "," ";" ARCHS "${HPL_BUILD_ARCH}")
+  list(TRANSFORM ARCHS STRIP)
+  list(REMOVE_DUPLICATES ARCHS)
+  message(STATUS "Using manually specified GPU targets: ${ARCHS}")
+else()
+  message(STATUS "Detecting available architecture")
+  find_program(ROCMINFO_EXECUTABLE rocminfo)
+  if(ROCMINFO_EXECUTABLE)
+    execute_process(
+      COMMAND ${ROCMINFO_EXECUTABLE}
+      OUTPUT_VARIABLE ROCMINFO_OUTPUT
+      ERROR_QUIET
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    # 1) Only match lines where the token follows "Name:"
+    string(REGEX MATCHALL "Name:[ \t]+gfx[0-9a-z]+" ARCH_MATCHES "${ROCMINFO_OUTPUT}")
+
+    # 2) Strip the leading "Name:   " to keep just gfx tokens
+    string(REGEX REPLACE "Name:[ \t]+" "" ARCHS "${ARCH_MATCHES}")
+
+    # 3) Remove duplicates
+    list(REMOVE_DUPLICATES ARCHS)
+  endif()
 endif()
 if (HIP_VERSION VERSION_GREATER_EQUAL "7.0")
   set(TARGETS "${TARGETS};gfx1201")
 endif()
 
-foreach(target ${TARGETS})
+if(ARCHS STREQUAL "")
+  message(FATAL_ERROR "No GPU architectures detected via rocminfo and HPL_BUILD_ARCH was not specified. Use ./install.sh --arch=gfxXXX or cmake -DHPL_BUILD_ARCH=gfxXXX")
+endif()
+
+message(STATUS "Building for GPU architecture: ${ARCHS}")
+
+# Generate HIP_HIPCC_FLAGS
+foreach(target ${ARCHS})
   list(APPEND HIP_HIPCC_FLAGS "--offload-arch=${target}")
 endforeach()
 
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 6d6be5d..d11c01a 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -83,25 +83,36 @@ find_package(ROCmCMakeBuildTools QUIET CONFIG PATHS ${CMAKE_PREFIX_PATH})
 if(NOT ROCM_FOUND)
   set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
   set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
-  file(DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip
-       ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log)
-
-  list(GET status 0 status_code)
-  list(GET status 1 status_string)
-
-  if(NOT status_code EQUAL 0)
-    message(FATAL_ERROR "error: downloading
-    'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed
-    status_code: ${status_code}
-    status_string: ${status_string}
-    log: ${log}
-    ")
+  set(rocm_cmake_zip ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip)
+
+  # Reuse a previously downloaded archive; only download when it is missing
+  if(EXISTS "${rocm_cmake_zip}")
+    message(STATUS "Using existing rocm-cmake zip file: ${rocm_cmake_zip}")
+  else()
+    file(DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip
+         ${rocm_cmake_zip} STATUS status LOG log)
+
+    list(GET status 0 status_code)
+    list(GET status 1 status_string)
+
+    if(NOT status_code EQUAL 0)
+      # Drop whatever was written so the EXISTS shortcut above cannot pick up
+      # a broken archive on the next configure run
+      file(REMOVE "${rocm_cmake_zip}")
+      message(FATAL_ERROR "error: downloading
+      'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed
+      status_code: ${status_code}
+      status_string: ${status_string}
+      log: ${log}
+      ")
+    endif()
   endif()
 
-  execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
+  execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${rocm_cmake_zip}
                   WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
 
-  find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+  # Expose the extracted rocm-cmake modules to the include() calls below
+  set(CMAKE_MODULE_PATH "${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}/share/rocm/cmake;${CMAKE_MODULE_PATH}")
 endif()
 
 include(ROCMSetupVersion)
diff --git a/install.sh b/install.sh
index b30a3fb..4d3284b 100755
--- a/install.sh
+++ b/install.sh
@@ -2,7 +2,7 @@
 # Author: Nico Trost
 # Modified by: Noel Chalmers
 
-#set -x #echo on
+# set -euo pipefail
 
 # #################################################
 # helper functions
@@ -17,6 +17,7 @@ function display_help()
   echo "    [--with-rocm=<dir>] Path to ROCm install (Default: /opt/rocm)"
   echo "    [--with-rocblas=<dir>] Path to rocBLAS library (Default: /opt/rocm/rocblas)"
   echo "    [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI)"
+  echo "    [--arch=<list>] Comma separated list of GPU architectures to build, e.g. gfx90a,gfx942 (Default: detect with rocminfo)"
   echo "    [--with-mpi-gtl=<dir>] Path to external MPI-GTL install (Optional: defaults to no gtl support)"
   echo "    [--verbose-print] Verbose output during HPL setup (Default: true)"
   echo "    [--progress-report] Print progress report to terminal during HPL run (Default: true)"
@@ -33,7 +34,7 @@ supported_distro( )
   fi
 
   case "${ID}" in
-    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
         true
         ;;
     *)  printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -68,11 +69,11 @@ exit_with_error( )
         printf "sudo apt install -y ${library_dependencies_ubuntu[*]}\n"
         ;;
 
-      centos|rhel|tencentos)
+      centos|rhel|tencentos|kylin)
         printf "sudo yum -y --nogpgcheck install ${library_dependencies_centos[*]}\n"
         ;;
 
-      fedora)
+      fedora|rocky)
         printf "sudo dnf install -y ${library_dependencies_fedora[*]}\n"
         ;;
 
@@ -105,42 +106,63 @@ check_exit_code( )
 # Clone and build OpenMPI+UCX in rochpl/tpl
 install_openmpi( )
 {
+  local install_dir=${PWD}/tpl
+  local ucx_prefix=${install_dir}/ucx
+  local ompi_prefix=${install_dir}/openmpi
+
   #OpenMPI and UCX install to one of these locations depending on OS
-  ucx_lib_folder=./tpl/ucx/lib
-  ompi_lib_folder=./tpl/openmpi/lib
-  ucx_lib64_folder=./tpl/ucx/lib64
-  ompi_lib64_folder=./tpl/openmpi/lib64
-
-  if [ ! -d "./tpl/ucx" ]; then
-    mkdir -p tpl && cd tpl
-    git clone --branch v1.18.0 https://github.com/openucx/ucx.git ucx
-    check_exit_code 2
-    cd ucx;
-    ./autogen.sh; ./autogen.sh #why do we have to run this twice?
+  local ucx_lib_folder=${ucx_prefix}/lib
+  local ompi_lib_folder=${ompi_prefix}/lib
+  local ucx_lib64_folder=${ucx_prefix}/lib64
+  local ompi_lib64_folder=${ompi_prefix}/lib64
+
+  # Create the tpl directory (downloads and extraction use absolute paths, so no cd is needed)
+  mkdir -p "${install_dir}"
+
+  local ucx_version=1.20.0
+  local ucx_src=${install_dir}/ucx-${ucx_version}
+  local ucx_tarball=${install_dir}/ucx-${ucx_version}.tar.gz
+  local ompi_version=5.0.9
+  local ompi_src=${install_dir}/openmpi-${ompi_version}
+  local ompi_tarball=${install_dir}/openmpi-${ompi_version}.tar.gz
+
+  # Download UCX on demand
+  if [ ! -d "${ucx_src}" ]; then
+    if [ ! -f "${ucx_tarball}" ]; then
+        wget -O "${ucx_tarball}" https://github.com/openucx/ucx/releases/download/v${ucx_version}/ucx-${ucx_version}.tar.gz || { rm -f "${ucx_tarball}"; false; }
+    fi
     check_exit_code 2
-    mkdir build; cd build
-    ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java
+    tar -zxf "${ucx_tarball}" -C "${install_dir}"
     check_exit_code 2
-    make -j$(nproc)
-    check_exit_code 2
-    make install
+  fi
+  # Download OpenMPI on demand
+  if [ ! -d "${ompi_src}" ]; then
+    if [ ! -f "${ompi_tarball}" ]; then
+        wget -O "${ompi_tarball}" https://download.open-mpi.org/release/open-mpi/v${ompi_version%.*}/openmpi-${ompi_version}.tar.gz || { rm -f "${ompi_tarball}"; false; }
+    fi
     check_exit_code 2
-    cd ../../..
-  elif ([ ! -f "${ucx_lib_folder}/libucm.so" ] || [ ! -f "${ucx_lib_folder}/libucp.so" ]  || \
-        [ ! -f "${ucx_lib_folder}/libucs.so" ] || [ ! -f "${ucx_lib_folder}/libuct.so" ]) && \
-       ([ ! -f "${ucx_lib64_folder}/libucm.so" ] || [ ! -f "${ucx_lib64_folder}/libucp.so" ]  || \
-        [ ! -f "${ucx_lib64_folder}/libucs.so" ] || [ ! -f "${ucx_lib64_folder}/libuct.so" ]); then
-    cd tpl/ucx;
-    ./autogen.sh; ./autogen.sh
+    tar -zxf "${ompi_tarball}" -C "${install_dir}"
     check_exit_code 2
-    mkdir build; cd build
-    ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java
+  fi
+
+  # Build UCX on demand
+  if ([ ! -f "${ucx_lib_folder}/libucm.so" ] || [ ! -f "${ucx_lib_folder}/libucp.so" ]  || \
+      [ ! -f "${ucx_lib_folder}/libucs.so" ] || [ ! -f "${ucx_lib_folder}/libuct.so" ]) && \
+     ([ ! -f "${ucx_lib64_folder}/libucm.so" ] || [ ! -f "${ucx_lib64_folder}/libucp.so" ]  || \
+      [ ! -f "${ucx_lib64_folder}/libucs.so" ] || [ ! -f "${ucx_lib64_folder}/libuct.so" ]); then
+    cd ${ucx_src};
+    ./contrib/configure-release --prefix=${ucx_prefix} \
+      --enable-cma --enable-mt \
+      --with-mlx5 --with-rc --with-ud --with-dc --with-dm --with-ib_hw_tm \
+      --with-verbs=/usr/include --with-rdmacm=/usr \
+      --with-rocm=${with_rocm} \
+      --without-knem --without-cuda --without-java
     check_exit_code 2
     make -j$(nproc)
     check_exit_code 2
     make install
     check_exit_code 2
-    cd ../../..
+    cd ${install_dir}/..
   fi
 
   # Check for successful build
@@ -152,31 +174,26 @@ install_openmpi( )
     exit 3
   fi
 
-  if [ ! -d "./tpl/openmpi" ]; then
-    mkdir -p tpl && cd tpl
-    git clone --branch v5.0.7 --recursive https://github.com/open-mpi/ompi.git openmpi
-    check_exit_code 2
-    cd openmpi; ./autogen.pl;
-    check_exit_code 2
-    mkdir build; cd build
-    ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs --disable-man-pages --enable-mca-no-build=btl-uct
-    check_exit_code 2
-    make -j$(nproc)
-    check_exit_code 2
-    make install
-    check_exit_code 2
-    cd ../../..
-  elif [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then
-    cd tpl/openmpi; ./autogen.pl;
-    check_exit_code 2
-    mkdir build; cd build
-    ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs --disable-man-pages --enable-mca-no-build=btl-uct
+  export LD_LIBRARY_PATH="${ucx_lib_folder}:${ucx_lib64_folder}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+  export LIBRARY_PATH="${ucx_lib_folder}:${ucx_lib64_folder}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
+  export CPATH="${ucx_prefix}/include${CPATH:+:${CPATH}}"
+
+  # Build OpenMPI on demand
+  if [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then
+    cd ${ompi_src}
+    ./configure --prefix=${ompi_prefix} \
+      --with-ucx=${ucx_prefix} \
+      --with-rocm=${with_rocm} \
+      --disable-man-pages \
+      --enable-builtin-atomics \
+      --enable-wrapper-rpath \
+      --without-verbs --enable-mca-no-build=btl-uct
     check_exit_code 2
     make -j$(nproc)
     check_exit_code 2
     make install
     check_exit_code 2
-    cd ../../..
+    cd ${install_dir}/..
   fi
 
   # Check for successful build
@@ -184,6 +201,11 @@ install_openmpi( )
     echo "Error: OpenMPI install unsuccessful."
     exit_with_error 2
   fi
+
+  export LD_LIBRARY_PATH="${ompi_lib_folder}:${ompi_lib64_folder}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+  export LIBRARY_PATH="${ompi_lib_folder}:${ompi_lib64_folder}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
+  export CPATH="${ompi_prefix}/include${CPATH:+:${CPATH}}"
+  export OPAL_PREFIX=${ompi_prefix}
 }
 
 # #################################################
@@ -232,7 +254,7 @@ enable_tracing=false
 # check if we have a modern version of getopt that can handle whitespace and long parameters
 getopt -T
 if [[ $? -eq 4 ]]; then
-  GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
+  GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,arch:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
 else
   echo "Need a new version of getopt"
   exit_with_error 1
@@ -263,6 +285,9 @@ while true; do
     --with-mpi)
         with_mpi=${2}
         shift 2 ;;
+    --arch)
+        arch=${2}
+        shift 2 ;;
     --with-mpi-gtl)
         with_mpi_gtl=${2}
         shift 2 ;;
@@ -294,9 +319,6 @@ printf "\033[32mCreating project build directory in: \033[33m${build_dir}\033[0m
 # #################################################
 # prep
 # #################################################
-# ensure a clean build environment
-rm -rf ${build_dir}
-
 # Default cmake executable is called cmake
 cmake_executable=cmake
 
@@ -311,7 +333,7 @@ pushd .
   # #################################################
   if [[ "${with_mpi}" == tpl/openmpi ]]; then
 
-    with_mpi=${PWD}/tpl/openmpi
+    with_mpi="${PWD}/tpl/openmpi"
     install_openmpi
 
   fi
@@ -347,11 +369,14 @@ pushd .
   if [[ "${enable_tracing}" == on || "${enable_tracing}" == true || "${enable_tracing}" == 1 || "${enable_tracing}" == enabled ]]; then
     cmake_common_options="${cmake_common_options} -DHPL_TRACING=ON"
   fi
+  if [[ -n "${arch:-}" ]]; then
+    cmake_common_options="${cmake_common_options} -DHPL_BUILD_ARCH=${arch//[[:space:]]/}"
+  fi
   shopt -u nocasematch
 
   # Build library with AMD toolchain because of existence of device kernels
   mkdir -p ${build_dir} && cd ${build_dir}
-  ${cmake_executable} ${cmake_common_options} ..
+  ${cmake_executable} --fresh ${cmake_common_options} ..
   check_exit_code 2
 
   if [[ -e build.ninja ]]; then
diff --git a/src/HPL_pdtest.cpp b/src/HPL_pdtest.cpp
index 94a0d3f..3135763 100644
--- a/src/HPL_pdtest.cpp
+++ b/src/HPL_pdtest.cpp
@@ -212,7 +212,7 @@ void HPL_pdtest(HPL_T_test* TEST,
                     ctime(&current_time_end));
       }
 #ifdef HPL_PROGRESS_REPORT
-      printf("Final Score:    %7.4e GFLOPS \n", Gflops);
+      printf("Final Score:    %7.9e GFLOPS \n", Gflops);
 #endif
     }
 #ifdef HPL_DETAILED_TIMING
diff --git a/src/pgesv/HPL_pdgesv.cpp b/src/pgesv/HPL_pdgesv.cpp
index d6c99c3..280a9a5 100644
--- a/src/pgesv/HPL_pdgesv.cpp
+++ b/src/pgesv/HPL_pdgesv.cpp
@@ -336,7 +336,7 @@ void HPL_pdgesv(HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_pmat* A) {
       printf("  %9.3e  |", step_gflops);
 #endif
 
-      printf("    %9.3e   \n", gflops);
+      printf("    %9.9e   \n", gflops);
     }
 #endif