Commit c85bab5e authored by one's avatar one
Browse files

first commit

parents
# Linux line endings
* text=auto eol=lf
# Binary files
*.png binary
*.jpg binary
*.so binary
\ No newline at end of file
diff --git a/evo2/test/test_evo2_generation.py b/evo2/test/test_evo2_generation.py
index 789ecb3..2345c75 100644
--- a/evo2/test/test_evo2_generation.py
+++ b/evo2/test/test_evo2_generation.py
@@ -4,6 +4,7 @@ from importlib import resources
from pathlib import Path
from typing import List, Optional, Union
import numpy as np
+import time
import torch
@@ -65,6 +66,8 @@ def generate_and_score(*, sequences, model, generations_per_prompt=5, n_tokens=5
target = targets[i]
with torch.inference_mode():
+ if torch.cuda.is_available(): torch.cuda.synchronize()
+ elapsed_time = -time.perf_counter()
generated = model.generate(
prompt_seqs=[prompt],
n_tokens=n_tokens,
@@ -72,7 +75,10 @@ def generate_and_score(*, sequences, model, generations_per_prompt=5, n_tokens=5
top_k=top_k,
top_p=top_p,
)
-
+ if torch.cuda.is_available(): torch.cuda.synchronize()
+ elapsed_time += time.perf_counter()
+ print(f"[{i}] Time for model.generate: {elapsed_time:.3f} s")
+
decoded_seq = generated.sequences[0] # Assuming generate returns list of sequences
score = calculate_sequence_identity(decoded_seq, target)
scores.append(score)
@@ -94,6 +100,7 @@ def main():
parser = argparse.ArgumentParser(description="Test Evo2 Model Generation")
parser.add_argument("--model_name", choices=['evo2_7b', 'evo2_40b', 'evo2_1b_base'], default='evo2_7b',
help="Model to test (supports evo2_7b, evo2_40b, evo2_1b_base)")
+ parser.add_argument("--local_path", type=str, default=None)
args = parser.parse_args()
@@ -101,8 +108,8 @@ def main():
torch.manual_seed(1)
torch.cuda.manual_seed(1)
- model = Evo2(args.model_name)
-
+ model = Evo2(args.model_name, local_path=args.local_path)
+
# Test parameters: greedy sampling of 500 tokens
test_params = {
'n_tokens': 500,
@@ -140,4 +147,4 @@ def main():
print(f"\nTest Failed: Expected {expected_score}%, got {mean_score}%")
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff --git a/pyproject.toml b/pyproject.toml
index fb1c1e6..6c7bd64 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "evo2"
version = "0.3.0"
description = "Genome modeling across all domains of life"
readme = "README.md"
-requires-python = ">=3.11,<3.13"
+requires-python = ">=3.10,<3.13"
license = "Apache-2.0"
authors = [
{name = "Evo 2 Team"},
--- vortex/model/utils.py.orig 2026-01-19 10:41:45.455424578 +0800
+++ vortex/model/utils.py 2026-01-19 10:47:28.980582986 +0800
@@ -114,7 +114,7 @@
mmap=True,
- # Make sure PyTorch is not issuing a warning regarding potential
- # security issues.
- weights_only=True,
+ # SECURITY: weights_only=False makes torch.load run the full pickle
+ # unpickler; only load checkpoints that come from a trusted source.
+ weights_only=False,
)
model.to_bfloat16_except_pr_lc(to_float32=True)
--- vortex/model/attention.py.orig 2026-01-19 10:41:45.453424571 +0800
+++ vortex/model/attention.py 2026-01-19 10:47:28.981582989 +0800
@@ -26,6 +26,7 @@
FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None
from vortex.model.rotary import RotaryEmbedding
+from flash_attn.flash_attn_interface import flash_attn_kvpacked_func as dcu_flash_attn_kvpacked_fun
# From https://github.com/ofirpress/attention_with_linear_biases/blob/4b92f28a005ead2567abe2359f633e73e08f3833/fairseq/models/transformer.py#L742
@@ -215,16 +216,19 @@
batch_size, seqlen_q = q.shape[0], q.shape[1]
seqlen_k = kv.shape[1]
assert kv.shape[0] == batch_size and kv.shape[4] == q.shape[3]
- return local_flash_attn_kvpacked_func(
- q,
- kv,
- self.drop.p if self.training else 0.0,
- causal=causal,
- softmax_scale=self.softmax_scale,
- alibi_slopes=self.alibi_slopes,
- window_size=self.window_size,
- deterministic=self.deterministic,
- )
+ return dcu_flash_attn_kvpacked_fun(
+ q,
+ kv,
+ self.drop.p if self.training else 0.0,
+ softmax_scale=self.softmax_scale,
+ causal=causal,
+ alibi_slopes=self.alibi_slopes,
+ window_size=self.window_size,
+ deterministic=self.deterministic,
+ softcap=0.0,
+ return_attn_probs=False,
+ bhsd=False
+ )
class SelfAttention(nn.Module):
--- vortex/ops/attn_interface.py.orig 2026-01-19 10:41:45.456424582 +0800
+++ vortex/ops/attn_interface.py 2026-01-19 10:47:28.983582996 +0800
@@ -58,7 +58,7 @@
return_softmax: bool,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
- out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.fwd(
+ out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_gpu.fwd(
q,
k,
v,
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b80b24..563122a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,7 +83,7 @@ foreach(i ${rochpl_device_source})
endforeach()
# HIP flags workaround while target_compile_options does not work
-list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp")
+list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp --gpu-max-threads-per-block=1024")
list(APPEND CMAKE_HOST_FLAGS "-Wno-deprecated-declarations")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -94,25 +94,51 @@ else()
list(APPEND CMAKE_HOST_FLAGS "-O3;-march=native")
endif()
-# GPU arch targets
-set(TARGETS "gfx900;gfx906")
-if(HIP_VERSION VERSION_GREATER_EQUAL "3.7")
- set(TARGETS "${TARGETS};gfx908")
-endif()
-if(HIP_VERSION VERSION_GREATER_EQUAL "4.3")
- set(TARGETS "${TARGETS};gfx90a")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "5.7")
- set(TARGETS "${TARGETS};gfx942")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "6.5")
- set(TARGETS "${TARGETS};gfx950;gfx1100")
+set(ARCHS "") # use plural to indicate list
+if(DEFINED HPL_BUILD_ARCH AND NOT HPL_BUILD_ARCH STREQUAL "")
+ string(REPLACE "," ";" ARCHS "${HPL_BUILD_ARCH}")
+ list(TRANSFORM ARCHS STRIP)
+ list(REMOVE_DUPLICATES ARCHS)
+ message(STATUS "Using manually specified GPU targets: ${ARCHS}")
+else()
+ message(STATUS "Detecting available architecture")
+ ############ Find using rocminfo #####################
+ find_program(ROCMINFO_EXECUTABLE rocminfo)
+ if(ROCMINFO_EXECUTABLE)
+ execute_process(
+ COMMAND ${ROCMINFO_EXECUTABLE}
+ OUTPUT_VARIABLE ROCMINFO_OUTPUT
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+ # 1) Only match lines where the token follows "Name:"
+ string(REGEX MATCHALL "Name:[ \t]+gfx[0-9a-z]+" ARCH_MATCHES "${ROCMINFO_OUTPUT}")
+
+ # 2) Strip the leading "Name: " to keep just gfx tokens
+ string(REGEX REPLACE "Name:[ \t]+" "" ARCHS "${ARCH_MATCHES}")
+
+ # 3) Remove duplicates
+ list(REMOVE_DUPLICATES ARCHS)
+
+ foreach(match ${ARCHS})
+ string(REGEX REPLACE "Name:\\s+" "" arch "${match}")
+ list(APPEND ARCH "${arch}")
+ endforeach()
+ endif()
endif()
if (HIP_VERSION VERSION_GREATER_EQUAL "7.0")
set(TARGETS "${TARGETS};gfx1201")
endif()
-foreach(target ${TARGETS})
+if(ARCHS STREQUAL "")
+ message(FATAL_ERROR "No GPU architectures detected via rocminfo and no HPL_BUILD_ARCH specified. Use ./install.sh --arch=gfxXXX")
+endif()
+
+message(STATUS "Building for GPU architecture: ${ARCHS}")
+
+# Generate HIP_HIPCC_FLAGS
+foreach(target ${ARCHS})
list(APPEND HIP_HIPCC_FLAGS "--offload-arch=${target}")
endforeach()
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 6d6be5d..468420e 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -101,7 +101,8 @@ if(NOT ROCM_FOUND)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
- find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ # find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ set(CMAKE_MODULE_PATH "${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}/share/rocm/cmake;${CMAKE_MODULE_PATH}")
endif()
include(ROCMSetupVersion)
diff --git a/install.sh b/install.sh
index b30a3fb..9ada030 100755
--- a/install.sh
+++ b/install.sh
@@ -17,6 +17,7 @@ function display_help()
echo " [--with-rocm=<dir>] Path to ROCm install (Default: /opt/rocm)"
echo " [--with-rocblas=<dir>] Path to rocBLAS library (Default: /opt/rocm/rocblas)"
echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI)"
+ echo " [--arch=<list>] Comma separated GPU architecture list to build, e.g. gfx90a,gfx942 (Default: detect with rocminfo)"
echo " [--with-mpi-gtl=<dir>] Path to external MPI-GTL install (Optional: defaults to no gtl support)"
echo " [--verbose-print] Verbose output during HPL setup (Default: true)"
echo " [--progress-report] Print progress report to terminal during HPL run (Default: true)"
@@ -33,7 +34,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -68,11 +69,11 @@ exit_with_error( )
printf "sudo apt install -y ${library_dependencies_ubuntu[*]}\n"
;;
- centos|rhel|tencentos)
+ centos|rhel|tencentos|kylin)
printf "sudo yum -y --nogpgcheck install ${library_dependencies_centos[*]}\n"
;;
- fedora)
+ fedora|rocky)
printf "sudo dnf install -y ${library_dependencies_fedora[*]}\n"
;;
@@ -152,6 +153,11 @@ install_openmpi( )
exit 3
fi
+ UCX_ROOT="$(pwd)/tpl/ucx"
+ export LD_LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LD_LIBRARY_PATH}"
+ export LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LIBRARY_PATH}"
+ export CPATH="${UCX_ROOT}/include:${CPATH}"
+
if [ ! -d "./tpl/openmpi" ]; then
mkdir -p tpl && cd tpl
git clone --branch v5.0.7 --recursive https://github.com/open-mpi/ompi.git openmpi
@@ -232,7 +238,7 @@ enable_tracing=false
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
- GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
+ GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,arch:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
else
echo "Need a new version of getopt"
exit_with_error 1
@@ -263,6 +269,9 @@ while true; do
--with-mpi)
with_mpi=${2}
shift 2 ;;
+ --arch)
+ arch=${2}
+ shift 2 ;;
--with-mpi-gtl)
with_mpi_gtl=${2}
shift 2 ;;
@@ -347,6 +356,9 @@ pushd .
if [[ "${enable_tracing}" == on || "${enable_tracing}" == true || "${enable_tracing}" == 1 || "${enable_tracing}" == enabled ]]; then
cmake_common_options="${cmake_common_options} -DHPL_TRACING=ON"
fi
+ if [[ -n "${arch}" ]]; then
+ cmake_common_options="${cmake_common_options} -DHPL_BUILD_ARCH=${arch}"
+ fi
shopt -u nocasematch
# Build library with AMD toolchain because of existence of device kernels
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..05a96e0 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
inputfile=false
cmdrun=false
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
devices=
# #################################################
@@ -117,12 +124,16 @@ mpi_args=
#Check if using OpenMPI
if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
mpi_args+=" --map-by node --rank-by slot --bind-to none "
+ #mpi_args+=" --map-by numa:PE=16 --bind-to core --report-bindings "
#Check if this is OpenMPI+UCX
ompi_info=$(dirname ${mpi_bin})/ompi_info
if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
# ucx-specific args
- mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
+ mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct \
+ -x UCX_TLS=self,sm,rocm_ipc,rocm_copy,rc_mlx5 \
+ -x UCX_MEMTYPE_CACHE=n \
+ ${mpi_args}"
fi
fi
@@ -153,7 +164,7 @@ while true; do
exit 0
;;
--version)
- ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+ ${mpi_bin} --allow-run-as-root -np 1 ${mpi_args} ${rochpl_runscript} --version
exit 0
;;
-P)
@@ -219,4 +230,4 @@ if [ ! -z "${devices}" ]; then
fi
#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..1380d3a 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
devices=
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
@@ -274,7 +276,7 @@ myq=$((rank/p))
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -363,7 +365,8 @@ export OMP_PROC_BIND=true
if [[ $globalRank -lt $size ]]; then
- echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+ echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
+# echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma"
fi
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +377,5 @@ else
fi
#run
-${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args}
+numactl --cpunodebind=${mynuma} --membind=${mynuma} ${rochpl_bin} ${rochpl_args}
diff --git a/src/HPL_pdtest.cpp b/src/HPL_pdtest.cpp
index 94a0d3f..3135763 100644
--- a/src/HPL_pdtest.cpp
+++ b/src/HPL_pdtest.cpp
@@ -212,7 +212,7 @@ void HPL_pdtest(HPL_T_test* TEST,
ctime(&current_time_end));
}
#ifdef HPL_PROGRESS_REPORT
- printf("Final Score: %7.4e GFLOPS \n", Gflops);
+ printf("Final Score: %7.9e GFLOPS \n", Gflops);
#endif
}
#ifdef HPL_DETAILED_TIMING
diff --git a/src/pgesv/HPL_pdgesv.cpp b/src/pgesv/HPL_pdgesv.cpp
index d6c99c3..280a9a5 100644
--- a/src/pgesv/HPL_pdgesv.cpp
+++ b/src/pgesv/HPL_pdgesv.cpp
@@ -336,7 +336,7 @@ void HPL_pdgesv(HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_pmat* A) {
printf(" %9.3e |", step_gflops);
#endif
- printf(" %9.3e \n", gflops);
+ printf(" %9.9e \n", gflops);
}
#endif
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b80b24..563122a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,7 +83,7 @@ foreach(i ${rochpl_device_source})
endforeach()
# HIP flags workaround while target_compile_options does not work
-list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp")
+list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -Wno-deprecated-declarations -fPIE -fopenmp --gpu-max-threads-per-block=1024")
list(APPEND CMAKE_HOST_FLAGS "-Wno-deprecated-declarations")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -94,25 +94,51 @@ else()
list(APPEND CMAKE_HOST_FLAGS "-O3;-march=native")
endif()
-# GPU arch targets
-set(TARGETS "gfx900;gfx906")
-if(HIP_VERSION VERSION_GREATER_EQUAL "3.7")
- set(TARGETS "${TARGETS};gfx908")
-endif()
-if(HIP_VERSION VERSION_GREATER_EQUAL "4.3")
- set(TARGETS "${TARGETS};gfx90a")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "5.7")
- set(TARGETS "${TARGETS};gfx942")
-endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "6.5")
- set(TARGETS "${TARGETS};gfx950;gfx1100")
+set(ARCHS "") # use plural to indicate list
+if(DEFINED HPL_BUILD_ARCH AND NOT HPL_BUILD_ARCH STREQUAL "")
+ string(REPLACE "," ";" ARCHS "${HPL_BUILD_ARCH}")
+ list(TRANSFORM ARCHS STRIP)
+ list(REMOVE_DUPLICATES ARCHS)
+ message(STATUS "Using manually specified GPU targets: ${ARCHS}")
+else()
+ message(STATUS "Detecting available architecture")
+ ############ Find using rocminfo #####################
+ find_program(ROCMINFO_EXECUTABLE rocminfo)
+ if(ROCMINFO_EXECUTABLE)
+ execute_process(
+ COMMAND ${ROCMINFO_EXECUTABLE}
+ OUTPUT_VARIABLE ROCMINFO_OUTPUT
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+ # 1) Only match lines where the token follows "Name:"
+ string(REGEX MATCHALL "Name:[ \t]+gfx[0-9a-z]+" ARCH_MATCHES "${ROCMINFO_OUTPUT}")
+
+ # 2) Strip the leading "Name: " to keep just gfx tokens
+ string(REGEX REPLACE "Name:[ \t]+" "" ARCHS "${ARCH_MATCHES}")
+
+ # 3) Remove duplicates
+ list(REMOVE_DUPLICATES ARCHS)
+
+ foreach(match ${ARCHS})
+ string(REGEX REPLACE "Name:\\s+" "" arch "${match}")
+ list(APPEND ARCH "${arch}")
+ endforeach()
+ endif()
endif()
if (HIP_VERSION VERSION_GREATER_EQUAL "7.0")
set(TARGETS "${TARGETS};gfx1201")
endif()
-foreach(target ${TARGETS})
+if(ARCHS STREQUAL "")
+ message(FATAL_ERROR "No GPU architectures detected via rocminfo and no HPL_BUILD_ARCH specified. Use ./install.sh --arch=gfxXXX")
+endif()
+
+message(STATUS "Building for GPU architecture: ${ARCHS}")
+
+# Generate HIP_HIPCC_FLAGS
+foreach(target ${ARCHS})
list(APPEND HIP_HIPCC_FLAGS "--offload-arch=${target}")
endforeach()
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 6d6be5d..468420e 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -101,7 +101,8 @@ if(NOT ROCM_FOUND)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
- find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ # find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ set(CMAKE_MODULE_PATH "${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}/share/rocm/cmake;${CMAKE_MODULE_PATH}")
endif()
include(ROCMSetupVersion)
diff --git a/install.sh b/install.sh
index b30a3fb..245341e 100755
--- a/install.sh
+++ b/install.sh
@@ -17,6 +17,7 @@ function display_help()
echo " [--with-rocm=<dir>] Path to ROCm install (Default: /opt/rocm)"
echo " [--with-rocblas=<dir>] Path to rocBLAS library (Default: /opt/rocm/rocblas)"
echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI)"
+ echo " [--arch=<list>] Comma separated GPU architecture list to build, e.g. gfx90a,gfx942 (Default: detect with rocminfo)"
echo " [--with-mpi-gtl=<dir>] Path to external MPI-GTL install (Optional: defaults to no gtl support)"
echo " [--verbose-print] Verbose output during HPL setup (Default: true)"
echo " [--progress-report] Print progress report to terminal during HPL run (Default: true)"
@@ -33,7 +34,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -68,11 +69,11 @@ exit_with_error( )
printf "sudo apt install -y ${library_dependencies_ubuntu[*]}\n"
;;
- centos|rhel|tencentos)
+ centos|rhel|tencentos|kylin)
printf "sudo yum -y --nogpgcheck install ${library_dependencies_centos[*]}\n"
;;
- fedora)
+ fedora|rocky)
printf "sudo dnf install -y ${library_dependencies_fedora[*]}\n"
;;
@@ -152,6 +153,11 @@ install_openmpi( )
exit 3
fi
+ UCX_ROOT="$(pwd)/tpl/ucx"
+ export LD_LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LD_LIBRARY_PATH}"
+ export LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LIBRARY_PATH}"
+ export CPATH="${UCX_ROOT}/include:${CPATH}"
+
if [ ! -d "./tpl/openmpi" ]; then
mkdir -p tpl && cd tpl
git clone --branch v5.0.7 --recursive https://github.com/open-mpi/ompi.git openmpi
@@ -184,6 +190,12 @@ install_openmpi( )
echo "Error: OpenMPI install unsuccessful."
exit_with_error 2
fi
+
+ OPENMPI_ROOT="$(pwd)/tpl/openmpi"
+ export LD_LIBRARY_PATH="${OPENMPI_ROOT}/lib:${LD_LIBRARY_PATH}"
+ export LIBRARY_PATH="${OPENMPI_ROOT}/lib:${LIBRARY_PATH}"
+ export CPATH="${OPENMPI_ROOT}/include:${CPATH}"
+ export OPAL_PREFIX=${OPENMPI_ROOT}
}
# #################################################
@@ -232,7 +244,7 @@ enable_tracing=false
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
- GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
+ GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-mpi-gtl:,with-rocblas:,verbose-print:,arch:,progress-report:,detailed-timing:,enable-tracing: --options hg -- "$@")
else
echo "Need a new version of getopt"
exit_with_error 1
@@ -263,6 +275,9 @@ while true; do
--with-mpi)
with_mpi=${2}
shift 2 ;;
+ --arch)
+ arch=${2}
+ shift 2 ;;
--with-mpi-gtl)
with_mpi_gtl=${2}
shift 2 ;;
@@ -347,6 +362,9 @@ pushd .
if [[ "${enable_tracing}" == on || "${enable_tracing}" == true || "${enable_tracing}" == 1 || "${enable_tracing}" == enabled ]]; then
cmake_common_options="${cmake_common_options} -DHPL_TRACING=ON"
fi
+ if [[ -n "${arch}" ]]; then
+ cmake_common_options="${cmake_common_options} -DHPL_BUILD_ARCH=${arch}"
+ fi
shopt -u nocasematch
# Build library with AMD toolchain because of existence of device kernels
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..3227f7d 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
inputfile=false
cmdrun=false
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
devices=
# #################################################
@@ -117,12 +124,13 @@ mpi_args=
#Check if using OpenMPI
if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
mpi_args+=" --map-by node --rank-by slot --bind-to none "
+ #mpi_args+=" --map-by numa:PE=16 --bind-to core --report-bindings "
#Check if this is OpenMPI+UCX
ompi_info=$(dirname ${mpi_bin})/ompi_info
if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
# ucx-specific args
- mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
+ mpi_args="--mca pml ucx ${mpi_args}"
fi
fi
@@ -153,7 +161,7 @@ while true; do
exit 0
;;
--version)
- ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+ ${mpi_bin} --allow-run-as-root -np 1 ${mpi_args} ${rochpl_runscript} --version
exit 0
;;
-P)
@@ -218,5 +226,24 @@ if [ ! -z "${devices}" ]; then
rochpl_args+=" --devices=${devices}"
fi
+echo "Copying files..."
+scp -P 3333 $0 node02:/workspace/build/
+scp -P 3333 $(dirname $(readlink -f "$0"))/run_rochpl node02:/workspace/build/
+
#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root \
+ --prefix ${ompi_prefix} \
+ ${mpi_args} \
+ --mca btl ^openib \
+ --mca btl_tcp_if_include p14p2 \
+ --mca plm_rsh_args "-p 3333" \
+ --mca coll_hcoll_enable 0 \
+ -x UCX_TLS=self,sm,rocm_ipc,rocm_copy,rc_mlx5,ud_mlx5 \
+ -x UCX_MEMTYPE_CACHE=n \
+ -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+ -x UCX_NET_DEVICES=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1 \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -x UCX_WARN_UNUSED_ENV_VARS=n \
+ -np ${np} \
+ -H node01:8,node02:8 \
+ ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..75aca98 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
devices=
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
@@ -274,7 +276,7 @@ myq=$((rank/p))
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -361,9 +363,24 @@ export OMP_NUM_THREADS=${omp_num_threads}
export OMP_PLACES=${omp_places}
export OMP_PROC_BIND=true
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+ "mlx5_1:1" # GPU 0 -> IB on NUMA 3
+ "mlx5_2:1" # GPU 1 -> IB on NUMA 3
+ "mlx5_3:1" # GPU 2 -> IB on NUMA 0
+ "mlx5_4:1" # GPU 3 -> IB on NUMA 0
+ "mlx5_7:1" # GPU 4 -> IB on NUMA 7
+ "mlx5_8:1" # GPU 5 -> IB on NUMA 7
+ "mlx5_9:1" # GPU 6 -> IB on NUMA 4
+ "mlx5_10:1" # GPU 7 -> IB on NUMA 4
+)
+
+myib=${IB_MAP[$rank]}
+export UCX_NET_DEVICES=${myib}
if [[ $globalRank -lt $size ]]; then
- echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+ echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
+# echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma"
fi
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +391,5 @@ else
fi
#run
-${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args}
+numactl -N ${mynuma} -m ${mynuma} ${rochpl_bin} ${rochpl_args}
diff --git a/src/HPL_pdtest.cpp b/src/HPL_pdtest.cpp
index 94a0d3f..3135763 100644
--- a/src/HPL_pdtest.cpp
+++ b/src/HPL_pdtest.cpp
@@ -212,7 +212,7 @@ void HPL_pdtest(HPL_T_test* TEST,
ctime(&current_time_end));
}
#ifdef HPL_PROGRESS_REPORT
- printf("Final Score: %7.4e GFLOPS \n", Gflops);
+ printf("Final Score: %7.9e GFLOPS \n", Gflops);
#endif
}
#ifdef HPL_DETAILED_TIMING
diff --git a/src/pgesv/HPL_pdgesv.cpp b/src/pgesv/HPL_pdgesv.cpp
index d6c99c3..280a9a5 100644
--- a/src/pgesv/HPL_pdgesv.cpp
+++ b/src/pgesv/HPL_pdgesv.cpp
@@ -336,7 +336,7 @@ void HPL_pdgesv(HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_pmat* A) {
printf(" %9.3e |", step_gflops);
#endif
- printf(" %9.3e \n", gflops);
+ printf(" %9.9e \n", gflops);
}
#endif
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91afcc4..b1c3ef6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,7 +88,7 @@ foreach(i ${rochplmxp_device_source})
endforeach()
# HIP flags workaround while target_compile_options does not work
-list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -fPIE")
+list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -fPIE --gpu-max-threads-per-block=1024")
list(APPEND CMAKE_HOST_FLAGS "")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -99,21 +99,50 @@ else()
list(APPEND CMAKE_HOST_FLAGS "-O3;-march=native;-Wno-deprecated-declarations")
endif()
-# GPU arch targets
-set(TARGETS "gfx900;gfx906")
-if(HIP_VERSION VERSION_GREATER_EQUAL "3.7")
- set(TARGETS "${TARGETS};gfx908")
-endif()
-if(HIP_VERSION VERSION_GREATER_EQUAL "4.3")
- set(TARGETS "${TARGETS};gfx90a")
+set(ARCHS "") # use plural to indicate list
+if(DEFINED HPL_BUILD_ARCH AND NOT HPL_BUILD_ARCH STREQUAL "")
+ string(REPLACE "," ";" ARCHS "${HPL_BUILD_ARCH}")
+ list(TRANSFORM ARCHS STRIP)
+ list(REMOVE_DUPLICATES ARCHS)
+ message(STATUS "Using manually specified GPU targets: ${ARCHS}")
+else()
+ message(STATUS "Detecting available architecture")
+ ############ Find using rocminfo #####################
+ find_program(ROCMINFO_EXECUTABLE rocminfo)
+ if(ROCMINFO_EXECUTABLE)
+ execute_process(
+ COMMAND ${ROCMINFO_EXECUTABLE}
+ OUTPUT_VARIABLE ROCMINFO_OUTPUT
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+ # 1) Only match lines where the token follows "Name:"
+ string(REGEX MATCHALL "Name:[ \t]+gfx[0-9a-z]+" ARCH_MATCHES "${ROCMINFO_OUTPUT}")
+
+ # 2) Strip the leading "Name: " to keep just gfx tokens
+ string(REGEX REPLACE "Name:[ \t]+" "" ARCHS "${ARCH_MATCHES}")
+
+ # 3) Remove duplicates
+ list(REMOVE_DUPLICATES ARCHS)
+
+ foreach(match ${ARCHS})
+ string(REGEX REPLACE "Name:\\s+" "" arch "${match}")
+ list(APPEND ARCH "${arch}")
+ endforeach()
+ endif()
endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "5.7")
- set(TARGETS "${TARGETS};gfx942")
+if (HIP_VERSION VERSION_GREATER_EQUAL "7.0")
+ set(TARGETS "${TARGETS};gfx1201")
endif()
-if (HIP_VERSION VERSION_GREATER_EQUAL "6.5")
- set(TARGETS "${TARGETS};gfx950")
+
+if(ARCHS STREQUAL "")
+ message(FATAL_ERROR "No GPU architectures detected via rocminfo and no HPL_BUILD_ARCH specified. Use ./install.sh --arch=gfxXXX")
endif()
+message(STATUS "Building for GPU architecture: ${ARCHS}")
+
+# Generate HIP_HIPCC_FLAGS
-foreach(target ${TARGETS})
+foreach(target ${ARCHS})
list(APPEND HIP_HIPCC_FLAGS "--offload-arch=${target}")
endforeach()
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 164d06d..78cc857 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -109,7 +109,8 @@ if(NOT ROCM_FOUND)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
- find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ # find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
+ set(CMAKE_MODULE_PATH "${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}/share/rocm/cmake;${CMAKE_MODULE_PATH}")
endif()
include(ROCMSetupVersion)
diff --git a/install.sh b/install.sh
index de72a20..6542e7f 100755
--- a/install.sh
+++ b/install.sh
@@ -18,6 +18,7 @@ function display_help()
echo " [--with-rocblas=<dir>] Path to rocBLAS library (Default: /opt/rocm/rocblas)"
echo " [--with-rocsolver=<dir>] Path to rocSOLVER library (Default: /opt/rocm/rocsolver)"
echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI)"
+ echo " [--arch=<list>] Comma separated GPU architecture list to build, e.g. gfx90a,gfx942 (Default: detect with rocminfo)"
echo " [--verbose-print] Verbose output during HPL setup (Default: true)"
echo " [--enable-tracing] Annotate profiler traces with rocTX markers (Default: false)"
echo " [--progress-report] Print progress report to terminal during HPL run (Default: true)"
@@ -33,7 +34,7 @@ supported_distro( )
fi
case "${ID}" in
- ubuntu|centos|rhel|fedora|sles)
+ ubuntu|centos|rhel|fedora|sles|kylin|rocky)
true
;;
*) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -68,11 +69,11 @@ exit_with_error( )
printf "sudo apt install -y ${library_dependencies_ubuntu[*]}\n"
;;
- centos|rhel)
+ centos|rhel|kylin)
printf "sudo yum -y --nogpgcheck install ${library_dependencies_centos[*]}\n"
;;
- fedora)
+ fedora|rocky)
printf "sudo dnf install -y ${library_dependencies_fedora[*]}\n"
;;
@@ -145,6 +146,11 @@ install_openmpi( )
exit 3
fi
+ UCX_ROOT="$(pwd)/tpl/ucx"
+ export LD_LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LD_LIBRARY_PATH}"
+ export LIBRARY_PATH="${UCX_ROOT}/lib:${UCX_ROOT}/lib64:${LIBRARY_PATH}"
+ export CPATH="${UCX_ROOT}/include:${CPATH}"
+
if [ ! -d "./tpl/openmpi" ]; then
mkdir -p tpl && cd tpl
git clone --branch v5.0.7 --recursive https://github.com/open-mpi/ompi.git openmpi
@@ -225,7 +231,7 @@ detailed_timing=true
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
- GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-rocblas:,with-rocsolver:,verbose-print:,enable-tracing:,progress-report:,detailed-timing: --options hg -- "$@")
+ GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-rocblas:,with-rocsolver:,arch:,verbose-print:,enable-tracing:,progress-report:,detailed-timing: --options hg -- "$@")
else
echo "Need a new version of getopt"
exit_with_error 1
@@ -262,6 +268,9 @@ while true; do
--with-rocsolver)
with_rocsolver=${2}
shift 2 ;;
+ --arch)
+ arch=${2}
+ shift 2 ;;
--verbose-print)
verbose_print=${2}
shift 2 ;;
@@ -335,6 +344,9 @@ pushd .
if [[ "${enable_tracing}" == on || "${enable_tracing}" == true || "${enable_tracing}" == 1 || "${enable_tracing}" == enabled ]]; then
cmake_common_options="${cmake_common_options} -DHPLMXP_TRACING=ON"
fi
+ if [[ -n "${arch}" ]]; then
+ cmake_common_options="${cmake_common_options} -DHPL_BUILD_ARCH=${arch}"
+ fi
shopt -u nocasematch
# Build library with AMD toolchain because of existence of device kernels
diff --git a/scripts/mpirun_rochplmxp.in b/scripts/mpirun_rochplmxp.in
index 5ad6166..89bb1ab 100755
--- a/scripts/mpirun_rochplmxp.in
+++ b/scripts/mpirun_rochplmxp.in
@@ -44,7 +44,7 @@ supported_distro( )
fi
case "${ID}" in
- ubuntu|centos|rhel|fedora|sles)
+ ubuntu|centos|rhel|fedora|sles|kylin|rocky)
true
;;
*) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -98,6 +98,14 @@ filename=HPL-MxP.dat
inputfile=false
cmdrun=false
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
+
# #################################################
# MPI Args
# #################################################
@@ -113,7 +121,10 @@ if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
ompi_info=$(dirname ${mpi_bin})/ompi_info
if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
# ucx-specific args
- mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
+ mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct \
+ -x UCX_TLS=self,sm,rocm_ipc,rocm_copy,rc_mlx5 \
+ -x UCX_MEMTYPE_CACHE=n \
+ ${mpi_args}"
fi
fi
@@ -144,7 +155,7 @@ while true; do
exit 0
;;
--version)
- ${mpi_bin} -np 1 ${mpi_args} ${rochplmxp_runscript} --version
+ ${mpi_bin} --allow-run-as-root -np 1 ${mpi_args} ${rochplmxp_runscript} --version
exit 0
;;
-P)
@@ -200,4 +211,4 @@ else
fi
#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochplmxp_runscript} ${rochplmxp_args}
+${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${rochplmxp_runscript} ${rochplmxp_args}
diff --git a/scripts/run_rochplmxp.in b/scripts/run_rochplmxp.in
index 698d3c1..bf1a15a 100755
--- a/scripts/run_rochplmxp.in
+++ b/scripts/run_rochplmxp.in
@@ -44,7 +44,7 @@ supported_distro( )
fi
case "${ID}" in
- ubuntu|centos|rhel|fedora|sles)
+ ubuntu|centos|rhel|fedora|sles|kylin|rocky)
true
;;
*) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -98,7 +98,9 @@ filename=HPL-MxP.dat
inputfile=false
cmdrun=false
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
diff --git a/src/hplmxp_ptest.cpp b/src/hplmxp_ptest.cpp
index 11d0f44..e8b1eee 100644
--- a/src/hplmxp_ptest.cpp
+++ b/src/hplmxp_ptest.cpp
@@ -211,7 +211,7 @@ void HPLMXP_ptest(HPLMXP_T_test& test,
ctime(&current_time_end));
}
#ifdef HPLMXP_PROGRESS_REPORT
- printf("Final Score: %7.4e GFLOPS \n", Gflops);
+ printf("Final Score: %7.9e GFLOPS \n", Gflops);
#endif
}
#ifdef HPLMXP_DETAILED_TIMING
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment