Commit d22dbec2 authored by zhoux

Initial commit: release hytlass-0.1.0

# HYGON HYTLASS Changelog
# What's New in HYTLASS 0.1.0
HYTLASS 0.1.0 is the first release of HYTLASS. Its implementation is based on CUTLASS 3.5.0 and provides:
- Compatibility with and support for the CUTLASS 2.x implementation:
  - Support for the instruction features of the BW (GFX936) and earlier architectures, including the TensorCore mmac and ds_read_matrix primitives.
  - GEMM implementations built on these architecture-specific instruction features.
  - Convolution implementations based on implicit GEMM, built on top of the GEMM templates.
- Compatibility with and support for CUTLASS 3.x and the CuTe programming model:
  - HuTe, an adaptation of the CuTe programming model from CUTLASS 3.x to the DCU architecture, currently supporting instruction primitives up to the BW platform.
  - GEMM implementations based on the HuTe model, covering both the MMA and the Epilogue stages.
  - HuTe-based scheduling of compute tasks, including kernel schedules and tiling schedules, with multiple threadblock scheduling optimization strategies.
- More than ten compute examples:
  - GEMM, convolution, and fused-operator implementations based on the 2.x API for multiple data types (TF32/FP16/BF16/I8/U8).
  - GEMM examples using optimization algorithms such as Split-K and Stream-K.
  - Examples of custom epilogues based on the visitor pattern.
  - HuTe-based GEMM examples, including BatchedGemm and GroupGemm.
  - A TensorCore-accelerated sparse GEMM example using the Block-ELL format.
- Tooling:
  - hytlass_profiler for kernel tuning over fine-grained problem parameters.
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
cmake_policy(SET CMP0112 NEW)
find_program(HIP_COMPILER_PATH hipcc)
if(HIP_COMPILER_PATH)
get_filename_component(DCU_TOOLKIT_ROOT_DIR "${HIP_COMPILER_PATH}" DIRECTORY)
get_filename_component(DCU_TOOLKIT_ROOT_DIR "${DCU_TOOLKIT_ROOT_DIR}/.." REALPATH)
message(STATUS "DCU_TOOLKIT_ROOT_DIR is set to ${DCU_TOOLKIT_ROOT_DIR}")
else()
message(FATAL_ERROR "hipcc not found in the environment path.")
endif()
# use hipcc as default compiler
set(CMAKE_CXX_COMPILER "${HIP_COMPILER_PATH}")
# for hipcomplex support
add_definitions(-DROCM_MATHLIBS_API_USE_HIP_COMPLEX=1)
if(hytlass_LOADED)
# If HYTLASS has been previously fetched and loaded, don't do it again.
return()
else()
set(hytlass_LOADED ON)
set(HYTLASS_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE PATH "HYTLASS Repository Directory")
endif()
message(STATUS "CMake Version: ${CMAKE_VERSION}")
set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++17 if set")
# To reduce duplicate version locations, parse the version out of the
# main versions.h file and reuse it here.
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/include/hytlass/version.h VERSION_FILE_CONTENTS)
string(REGEX MATCH "#define HYTLASS_MAJOR ([0-9]+)" _HYTLASS_VERSION_MAJOR "${VERSION_FILE_CONTENTS}")
set(_HYTLASS_VERSION_MAJOR ${CMAKE_MATCH_1})
string(REGEX MATCH "#define HYTLASS_MINOR ([0-9]+)" _HYTLASS_VERSION_MINOR "${VERSION_FILE_CONTENTS}")
set(_HYTLASS_VERSION_MINOR ${CMAKE_MATCH_1})
string(REGEX MATCH "#define HYTLASS_PATCH ([0-9]+)" _HYTLASS_VERSION_PATCH "${VERSION_FILE_CONTENTS}")
set(_HYTLASS_VERSION_PATCH ${CMAKE_MATCH_1})
message(STATUS "HYTLASS ${_HYTLASS_VERSION_MAJOR}.${_HYTLASS_VERSION_MINOR}.${_HYTLASS_VERSION_PATCH}")
## HYTLASS PROJECT #############################################################
project(HYTLASS VERSION ${_HYTLASS_VERSION_MAJOR}.${_HYTLASS_VERSION_MINOR}.${_HYTLASS_VERSION_PATCH} LANGUAGES CXX)
################################################################################
include(${CMAKE_CURRENT_SOURCE_DIR}/HIP.cmake)
# enable __shfl_sync for dtk24.x
find_file(AMD_WARP_SYNC_PATH amd_warp_sync_functions.h
PATHS ${DCU_TOOLKIT_ROOT_DIR}/hip/include/hip/amd_detail
)
if(AMD_WARP_SYNC_PATH)
message(STATUS "Enable HIP_ENABLE_WARP_SYNC_BUILTINS")
add_definitions(-DHIP_ENABLE_WARP_SYNC_BUILTINS)
endif()
if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$")
execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--version" OUTPUT_VARIABLE CXX_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_STRIP_TRAILING_WHITESPACE)
string(REGEX MATCH "[A-Za-z]* ?clang version" TMP_CXX_VERSION ${CXX_OUTPUT})
string(REGEX MATCH "[A-Za-z]+" CXX_VERSION_STRING ${TMP_CXX_VERSION})
endif()
# add compiler check
if( CXX_VERSION_STRING MATCHES "clang" )
message( STATUS "Use hip-clang to build for amdgpu backend" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_HCC_COMPAT_MODE__=1" )
elseif( CXX_VERSION_STRING MATCHES "hipcc" )
message(FATAL_ERROR "Don't support for hipcc")
else()
message(FATAL_ERROR "Unsupport compiler ${CMAKE_CXX_COMPILER}. Only support for hip-clang")
endif()
find_package(Doxygen QUIET)
################################################################################
#
# HYTLASS 3.x requires C++17
#
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." FORCE)
endif()
message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")
set(HYTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive
find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)
# Install hytlass_library Python package
execute_process(
WORKING_DIRECTORY ${HYTLASS_DIR}/python
COMMAND ${Python3_EXECUTABLE} ${HYTLASS_DIR}/python/setup_library.py develop --user
RESULT_VARIABLE hytlass_lib_GENERATOR_INSTALL_RESULT
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/hytlass_library_installation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/hytlass_library_installation.log
)
if(NOT hytlass_lib_GENERATOR_INSTALL_RESULT EQUAL 0)
message(FATAL_ERROR "Error installing hytlass_library package. See ${CMAKE_CURRENT_BINARY_DIR}/hytlass_library_installation.log")
endif()
################################################################################
set(HYTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
if(HYTLASS_ENABLE_HEADERS_ONLY)
set(HYTLASS_ENABLE_EXAMPLES_INIT OFF)
set(HYTLASS_ENABLE_TOOLS_INIT ON)
set(HYTLASS_ENABLE_LIBRARY_INIT OFF)
set(HYTLASS_ENABLE_TESTS_INIT OFF)
else()
set(HYTLASS_ENABLE_EXAMPLES_INIT ON)
set(HYTLASS_ENABLE_TOOLS_INIT ON)
set(HYTLASS_ENABLE_LIBRARY_INIT ON)
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
set(HYTLASS_ENABLE_TESTS_INIT ON)
else()
set(HYTLASS_ENABLE_TESTS_INIT OFF)
endif()
set(HYTLASS_ENABLE_HIPBLAS ON)
endif()
set(HYTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
set(HYTLASS_ENABLE_EXAMPLES ${HYTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable HYTLASS Examples")
set(HYTLASS_ENABLE_TOOLS ${HYTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable HYTLASS Tools")
set(HYTLASS_ENABLE_LIBRARY ${HYTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable HYTLASS Library")
set(HYTLASS_ENABLE_PROFILER ${HYTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable HYTLASS Profiler")
set(HYTLASS_ENABLE_PERFORMANCE ${HYTLASS_ENABLE_PROFILER} CACHE BOOL "Enable HYTLASS Performance")
set(HYTLASS_ENABLE_TESTS ${HYTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable HYTLASS Tests")
set(HYTLASS_ENABLE_GTEST_UNIT_TESTS ${HYTLASS_ENABLE_TESTS} CACHE BOOL "Enable HYTLASS GTest-based Unit Tests")
set(HYTLASS_USE_SYSTEM_GOOGLETEST OFF CACHE BOOL "Use system/external installation of GTest")
################################################################################
# Enable all arch for now unless some archs were specified
if (NOT DEFINED HYTLASS_HIPCC_ARCHS_SUPPORTED)
set(HYTLASS_HIPCC_ARCHS_SUPPORTED)
if(DEFINED ENV{AMDGPU_TARGETS})
set(AMDGPU_TARGETS_LIST "$ENV{AMDGPU_TARGETS}")
foreach(target ${AMDGPU_TARGETS_LIST})
string(REGEX REPLACE "gfx([0-9]+)" "\\1" number ${target})
list(APPEND HYTLASS_HIPCC_ARCHS_SUPPORTED "${number}")
endforeach()
else()
list(APPEND HYTLASS_HIPCC_ARCHS_SUPPORTED 906 926 908 928 936)
endif()
endif()
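# Example (illustrative): with AMDGPU_TARGETS="gfx928;gfx936" set in the environment,
# HYTLASS_HIPCC_ARCHS_SUPPORTED becomes "928;936"; if the variable is unset, all
# supported architectures (906 926 908 928 936) are enabled.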
set(HYTLASS_HIPCC_ARCHS ${HYTLASS_HIPCC_ARCHS_SUPPORTED} CACHE STRING "The Gfx architectures requested.")
set(HYTLASS_HIPCC_ARCHS_ENABLED ${HYTLASS_HIPCC_ARCHS} CACHE STRING "The Gfx architectures to build code for.")
# Find unsupported and deprecated compute capabilities
if (HYTLASS_HIPCC_ARCHS_SUPPORTED)
set(HYTLASS_HIPCC_ARCHS_UNSUPPORTED ${HYTLASS_HIPCC_ARCHS})
list(REMOVE_ITEM HYTLASS_HIPCC_ARCHS_UNSUPPORTED ${HYTLASS_HIPCC_ARCHS_SUPPORTED})
if (HYTLASS_HIPCC_ARCHS_UNSUPPORTED)
message(WARNING "Using unsupported or deprecated compute capabilities ${HYTLASS_HIPCC_ARCHS_UNSUPPORTED}. Support may be removed in future versions.")
endif()
else()
message(WARNING "No supported compute capabilities")
endif()
# Special policy introduced in CMake 3.13
if (POLICY CMP0076)
cmake_policy(SET CMP0076 NEW)
endif()
include(GNUInstallDirs)
###################################################################################################
#
# Configure CMake variables
#
###################################################################################################
message(STATUS "HIP Compilation Architectures: ${HYTLASS_HIPCC_ARCHS_ENABLED}")
if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
# By default we want to build in Release mode to ensure that we're getting best performance.
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose build level" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "RelWithDebInfo" "Release")
endif()
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if (DEFINED CMAKE_DEBUG_POSTFIX)
set(HYTLASS_LIBRARY_DEBUG_POSTFIX_INIT ${CMAKE_DEBUG_POSTFIX})
else()
set(HYTLASS_LIBRARY_DEBUG_POSTFIX_INIT .debug)
endif()
set(HYTLASS_LIBRARY_DEBUG_POSTFIX ${HYTLASS_LIBRARY_DEBUG_POSTFIX_INIT} CACHE STRING "Default postfix value for debug libraries")
if(WIN32)
# On Windows we link against the shared (DLL) runtime. Change gtest settings to match this.
set(gtest_force_shared_crt ON CACHE BOOL "Use shared (DLL) run-time lib even when Google Test is built as static lib" FORCE)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHYTLASS_VERSIONS_GENERATED")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -DHYTLASS_VERSIONS_GENERATED")
if (WIN32)
# Enable more warnings. Add "-Xcompiler=/WX" to enable warnings as errors.
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/W3)
# Disable warning on Unicode characters
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/wd4819)
# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/fp:strict)
endif(WIN32)
if (${HYTLASS_HIPCC_VERBOSE})
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -v)
endif()
#
# HYTLASS NAMESPACE
#
set(HYTLASS_NAMESPACE "hytlass" CACHE STRING "Top level namespace of HYTLASS")
set(HYTLASS_HIPCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by HIPCC.")
set(HYTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.")
################################################################################
#
# HYTLASS generator cmake configuration
#
# Kernel unified filter file
set(KERNEL_FILTER_FILE "" CACHE STRING "KERNEL FILTER FILE FULL PATH")
if (KERNEL_FILTER_FILE AND NOT HYTLASS_LIBRARY_KERNELS)
# If a kernel filter file is specified, we want to generate and then
# filter on the entire kernel set, not the default kernel
# (sub)set. The user may override HYTLASS_LIBRARY_KERNELS, in which
# case the resulting kernel set will be the intersection of the two
# options differenced against HYTLASS_LIBRARY_IGNORE_KERNELS.
set(HYTLASS_LIBRARY_KERNELS_INIT "*")
else()
set(HYTLASS_LIBRARY_KERNELS_INIT "")
endif()
if (KERNEL_FILTER_FILE)
get_filename_component(KERNEL_FILTER_FILE "${KERNEL_FILTER_FILE}" ABSOLUTE)
set(KERNEL_FILTER_FILE "${KERNEL_FILTER_FILE}" CACHE STRING "KERNEL FILTER FILE FULL PATH" FORCE)
endif()
set(SELECTED_KERNEL_LIST "selected" CACHE STRING "Name of the filtered kernel list")
if(KERNEL_FILTER_FILE)
message(STATUS "Full path of filter file: ${KERNEL_FILTER_FILE}")
endif()
set(HYTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. The default 'all' enables all operations.")
set(HYTLASS_LIBRARY_KERNELS ${HYTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(HYTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")
set(HYTLASS_PROBLEM_SIZE_PATH "" CACHE STRING "Path from which problem size data is loaded")
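# Illustrative example (kernel name patterns and paths are placeholders):
#   cmake .. -DHYTLASS_LIBRARY_KERNELS=sgemm -DHYTLASS_LIBRARY_IGNORE_KERNELS=simt -DKERNEL_FILTER_FILE=/path/to/filter.list
# builds the intersection of the kernels selected by HYTLASS_LIBRARY_KERNELS and by the
# filter file, minus any kernels matching HYTLASS_LIBRARY_IGNORE_KERNELS.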
################################################################################
set(HYTLASS_TEST_ENABLE_CACHED_RESULTS OFF CACHE BOOL "Enable caching and reuse of test results in unit tests")
set_property(CACHE HYTLASS_TEST_LEVEL PROPERTY STRINGS 0 1 2)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_TEST_LEVEL=${HYTLASS_TEST_LEVEL})
if (HYTLASS_TEST_ENABLE_CACHED_RESULTS)
message(STATUS "Enable caching of reference results in conv unit tests")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_TEST_ENABLE_CACHED_RESULTS=1)
endif()
set(HYTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED ON CACHE BOOL "Enable/Disable rigorous conv problem sizes in conv unit tests")
if (HYTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
message(STATUS "Enable rigorous conv problem sizes in conv unit tests")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1)
endif()
################################################################################
# Trace levels for debugging
set(HYTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_DEBUG_TRACE_LEVEL=${HYTLASS_DEBUG_TRACE_LEVEL})
#
# NOTE: running with asan and HIP requires the following environment variable:
#
# ASAN_OPTIONS=protect_shadow_gap=0:replace_intrin=0:detect_leaks=0
#
# without the above environment setting, an error like the following may be generated:
#
# *** Error: Could not detect active GPU device ID [out of memory]
# ...
# ==9149==ERROR: LeakSanitizer: detected memory leaks
# ...
#
if(ENABLE_ASAN) # https://github.com/google/sanitizers/wiki/AddressSanitizer
list(APPEND HYTLASS_HIP_HIPCC_FLAGS --compiler-options=-fsanitize=address --compiler-options=-fno-omit-frame-pointer)
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address")
endif()
# Enable double VGPRs for grid size 512
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -mllvm -enable-num-vgprs-512=true)
###################################################################################################
#
# Configure HIP build options
#
###################################################################################################
# Warnings-as-error exceptions and warning suppressions for Clang builds
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=implicit-int-conversion ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pass-failed ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=inconsistent-missing-override ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-conversion ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type ")
endif()
if (NOT MSVC AND HYTLASS_HIPCC_KEEP)
# MSVC flow handles caching already, but for other generators we handle it here.
set(HYTLASS_HIPCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store HIPCC scratch files")
file(MAKE_DIRECTORY ${HYTLASS_HIPCC_KEEP_DIR})
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -save-temps=${HYTLASS_HIPCC_KEEP_DIR} -v)
endif()
if (HYTLASS_ENABLE_F16C AND NOT CMAKE_CROSSCOMPILING)
list(APPEND HYTLASS_HIP_FLAGS -DHYTLASS_ENABLE_F16C=1)
if ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC"))
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/arch:AVX2)
else()
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -mf16c)
endif()
endif()
if (HYTLASS_ENABLE_OPENMP_TESTS)
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS})
else()
message(WARNING "HYTLASS_ENABLE_OPENMP_TESTS set but OpenMP not found.")
endif()
endif()
if(UNIX)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Wconversion)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -fno-strict-aliasing)
endif()
# Don't leak lineinfo in release builds
if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -lineinfo)
endif()
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Wno-sign-conversion -Wno-shorten-64-to-32 -Wno-implicit-float-conversion -Wno-implicit-int-conversion -Wno-return-type)
if(HYTLASS_HIP_HIPCC_FLAGS)
message(STATUS "Using hipcc flags: ${HYTLASS_HIP_HIPCC_FLAGS}")
endif()
# Support for 128-bit integers if using HYGON C++ compiler
# if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Mint128 ")
# endif()
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
# CMake 3.18 added support for HIP_ARCHITECTURES target property. We will use this
# property for CMake 3.18+, so we request the NEW behavior for correct compatibility.
# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104
cmake_policy(SET CMP0104 NEW)
endif()
if (MSVC)
# MSVC by default does not apply the correct __cplusplus version as specified by the C++ standard
# because MSVC is not a completely compliant implementation. This option forces MSVC to use the
# appropriate value given the requested --std option. This fixes a compilation mismatch
# between GCC/Clang and MSVC.
#
# error : a constexpr function cannot have a nonliteral return type "dim3"
#
# See https://developercommunity.visualstudio.com/t/msvc-incorrectly-defines-cplusplus/139261
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Xcompiler /Zc:__cplusplus")
endif()
# Some tests require this build option in order to link.
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Xcompiler /bigobj")
endif()
function(hytlass_apply_hip_gencode_flags TARGET)
set(options)
set(oneValueArgs)
set(multiValueArgs SM_ARCHS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (__SM_ARCHS)
set(ARCHS_ENABLED ${__SM_ARCHS})
else()
set(ARCHS_ENABLED ${HYTLASS_HIPCC_ARCHS_ENABLED})
endif()
set(HIPCC_FLAGS)
foreach(ARCH ${ARCHS_ENABLED})
list(APPEND HIPCC_FLAGS --offload-arch=gfx${ARCH})
endforeach()
if (NOT __SM_ARCHS)
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${HIPCC_FLAGS}>
)
else()
list(JOIN HIPCC_FLAGS " " STR_HIPCC_FLAGS)
if(${TARGET} MATCHES ".*\\.cu$")
set_source_files_properties(${TARGET} PROPERTIES COMPILE_FLAGS ${STR_HIPCC_FLAGS})
endif()
endif()
endfunction()
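# Example usage (illustrative; target, file, and arch values are placeholders):
#   hytlass_apply_hip_gencode_flags(my_gemm_lib)                   # target: uses HYTLASS_HIPCC_ARCHS_ENABLED
#   hytlass_apply_hip_gencode_flags(gemm_f16.cu SM_ARCHS 928 936)  # single .cu source: per-file --offload-arch=gfx928/gfx936 flags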
# Cache the flags so they are available when the function below is called anywhere globally.
set(__HYTLASS_HIP_FLAGS ${HYTLASS_HIP_FLAGS} CACHE INTERNAL "")
set(__HYTLASS_HIP_FLAGS_RELEASE ${HYTLASS_HIP_FLAGS_RELEASE} CACHE INTERNAL "")
set(__HYTLASS_HIP_FLAGS_RELWITHDEBINFO ${HYTLASS_HIP_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__HYTLASS_HIP_FLAGS_DEBUG ${HYTLASS_HIP_FLAGS_DEBUG} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS ${HYTLASS_HIP_HIPCC_FLAGS} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS_RELEASE ${HYTLASS_HIP_HIPCC_FLAGS_RELEASE} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS_RELWITHDEBINFO ${HYTLASS_HIP_HIPCC_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS_DEBUG ${HYTLASS_HIP_HIPCC_FLAGS_DEBUG} CACHE INTERNAL "")
function(hytlass_apply_standard_compile_options TARGET)
set(HIP_COMPILE_LANGUAGE CXX)
set(_FLAGS ${__HYTLASS_HIP_FLAGS} ${__HYTLASS_HIP_HIPCC_FLAGS})
set(_FLAGS_RELEASE ${__HYTLASS_HIP_FLAGS_RELEASE} ${__HYTLASS_HIP_HIPCC_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${__HYTLASS_HIP_FLAGS_RELWITHDEBINFO} ${__HYTLASS_HIP_HIPCC_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${__HYTLASS_HIP_FLAGS_DEBUG} ${__HYTLASS_HIP_HIPCC_FLAGS_DEBUG})
target_link_libraries(${TARGET} PRIVATE HYTLASS)
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:${_FLAGS}>
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:$<$<CONFIG:RELEASE>:${_FLAGS_RELEASE}>>
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:$<$<CONFIG:RELWITHDEBINFO>:${_FLAGS_RELWITHDEBINFO}>>
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:$<$<CONFIG:DEBUG>:${_FLAGS_DEBUG}>>
)
endfunction()
#
# The following items should eventually be pushed into hytlass/CMakeLists.txt
#
# GLOB for HYTLASS header files. Should we use a static list instead?
file(GLOB_RECURSE HYTLASS_INCLUDE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} include/hytlass/*.h)
file(GLOB_RECURSE HYTLASS_HYTLASS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/include include/hytlass/*.h include/hytlass/*.hpp include/hytlass/*.inl)
file(GLOB_RECURSE HYTLASS_HUTE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/include include/hute/*.h*)
###################################################################################################
#
# Define build targets
#
###################################################################################################
source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR}/include REGULAR_EXPRESSION ".*\\.h")
add_library(HYTLASS INTERFACE)
add_library(hygon::hytlass::hytlass ALIAS HYTLASS)
set_target_properties(HYTLASS PROPERTIES EXPORT_NAME hytlass)
set(HYTLASS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE PATH "HYTLASS Header Library")
set(HYTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library CACHE INTERNAL "Location of generator scripts")
# The following utility directory is needed even if the tools build is disabled, so it exists here.
set(HYTLASS_TOOLS_UTIL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/util/include CACHE INTERNAL "")
include_directories(${HYTLASS_INCLUDE_DIR})
target_compile_features(HYTLASS INTERFACE cxx_std_11)
if (NOT HYTLASS_NAMESPACE STREQUAL "hytlass")
target_compile_definitions(HYTLASS INTERFACE HYTLASS_NAMESPACE=${HYTLASS_NAMESPACE})
endif()
if (NOT DEFINED HYTLASS_REVISION)
find_package(Git QUIET)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
RESULT_VARIABLE HYTLASS_REVISION_RESULT
OUTPUT_VARIABLE HYTLASS_REVISION
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if (HYTLASS_REVISION_RESULT)
message(STATUS "HYTLASS Revision: Unable to detect, Git returned code ${HYTLASS_REVISION_RESULT}.")
else()
message(STATUS "HYTLASS Revision: ${HYTLASS_REVISION}")
endif()
endif()
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_extended.h.in
${CMAKE_CURRENT_BINARY_DIR}/include/hytlass/version_extended.h
@ONLY)
target_include_directories(
HYTLASS
INTERFACE
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${HYTLASS_INCLUDE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
$<BUILD_INTERFACE:${hute_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${hute_SOURCE_DIR}/examples>
)
# Mark DTK headers as system headers to suppress warnings from them
target_include_directories(
HYTLASS
SYSTEM INTERFACE
$<BUILD_INTERFACE:${DCU_TOOLKIT_ROOT_DIR}/include>
)
install(
DIRECTORY
${HYTLASS_INCLUDE_DIR}/
${CMAKE_CURRENT_BINARY_DIR}/include/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
install(
TARGETS HYTLASS
EXPORT HygonHytlass
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
################################################################################
# Doxygen is available. Generate documentation
if (DOXYGEN_FOUND)
# DOT is available. Enable graph generation in the documentation
if (DOXYGEN_DOT_EXECUTABLE)
set(HYTLASS_ENABLE_DOXYGEN_DOT ON CACHE BOOL "Use dot to generate graphs in the doxygen documentation.")
else()
set(HYTLASS_ENABLE_DOXYGEN_DOT OFF CACHE BOOL "Use dot to generate graphs in the doxygen documentation." FORCE)
endif()
if (HYTLASS_ENABLE_DOXYGEN_DOT)
set(HAVE_DOT "YES")
else()
set(HAVE_DOT "NO")
endif()
# Add custom target for Doxygen.
add_custom_target(hytlass_docs ${CMAKE_COMMAND} -E env
"DOT_PATH=${DOXYGEN_DOT_EXECUTABLE}"
"HAVE_DOT=${HAVE_DOT}"
${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
VERBATIM
)
endif()
if(NOT WIN32)
# Add common library search paths so executables and libraries can load and run
# without LD_LIBRARY_PATH being set.
link_libraries(
"-Wl,-rpath,'$ORIGIN'"
"-Wl,-rpath,'$ORIGIN/../lib64'"
"-Wl,-rpath,'$ORIGIN/../lib'"
"-Wl,-rpath,'${DCU_TOOLKIT_ROOT_DIR}/lib64'"
"-Wl,-rpath,'${DCU_TOOLKIT_ROOT_DIR}/lib'"
)
endif()
################################################################################
include(CTest)
enable_testing()
if (HYTLASS_ENABLE_GTEST_UNIT_TESTS)
if (HYTLASS_USE_SYSTEM_GOOGLETEST)
find_package(GTest REQUIRED)
else()
# include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
add_subdirectory(googletest)
include_directories(googletest/googletest/include)
endif()
endif()
if (NOT TARGET test_all)
add_custom_target(test_all)
endif()
set(HYTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables")
set(HYTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE STRING "Environment in which to invoke unit test executables")
set(CMAKE_TEST_INSTALL_PREFIX test CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(HYTLASS_TEST_INSTALL_PREFIX ${CMAKE_TEST_INSTALL_PREFIX}/hytlass CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(HYTLASS_TEST_INSTALL_BINDIR ${HYTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(HYTLASS_TEST_INSTALL_LIBDIR ${HYTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_PREFIX})
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_BINDIR})
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_LIBDIR})
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_PREFIX}/ctest)
################################################################################
# use hipBlas
include(${CMAKE_CURRENT_SOURCE_DIR}/hipBLAS.cmake)
if (HYTLASS_ENABLE_HIPBLAS)
target_compile_definitions(HYTLASS INTERFACE HYTLASS_ENABLE_HIPBLAS=1)
endif()
################################################################################
set(HYTLASS_DEFAULT_ACTIVE_TEST_SETS "default" CACHE STRING "Default
activated test sets. In `make test` mode, this string determines the
active set of tests. In `ctest` mode, this value can be overridden
with the HYTLASS_TEST_SETS environment variable when running the ctest
executable.")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}")
set(HYTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.configure.cmake)
set(HYTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "")
function(hytlass_add_executable_tests NAME TARGET)
#
# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the
# <CMAKE_BINARY_DIR> or the <CMAKE_INSTALL_PREFIX>/<HYTLASS_TEST_INSTALL_PREFIX> after installation.
#
# NAME: The base name for the test. Can be run with `make <NAME>` or `ctest -R 'c<NAME>'`.
# TARGET: The target corresponding to the executable under test.
# DISABLE_EXECUTABLE_INSTALL_RULE: An option, if given, that disables creating an install rule for TARGET.
# DEPENDS: A list of targets or files on which this test is dependent.
# DEPENDEES: A list of targets which should depend on this test.
# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments
# to pass to the test executable. A unique test is generated for each set of
# options given. If this option is not used, a single test with no arguments is generated.
# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for
# generating the full variable name to be referenced.
# RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed
# test results to speed up test runtime.
# TEST_SETS_SUPPORTED: A list of test set names these tests support.
#
set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS TEST_SETS_SUPPORTED)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (NOT DEFINED __DISABLE_TESTS)
set(__DISABLE_TESTS OFF)
endif()
set(TEST_EXE $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR})
if (NOT DEFINED __TEST_SETS_SUPPORTED)
set(__TEST_SETS_SUPPORTED ${HYTLASS_DEFAULT_ACTIVE_TEST_SETS})
endif()
set(TEST_SETS_SUPPORTED ${__TEST_SETS_SUPPORTED})
if (__RESULT_CACHE_FILE)
add_custom_command(
TARGET ${TARGET}
POST_BUILD
COMMAND ${CMAKE_COMMAND}
ARGS -E copy ${__RESULT_CACHE_FILE} "$<TARGET_FILE_DIR:${TARGET}>"
)
endif()
if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND HYTLASS_INSTALL_TESTS)
# file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
install(
TARGETS ${TARGET}
RUNTIME DESTINATION ${HYTLASS_TEST_INSTALL_BINDIR}
)
if (__RESULT_CACHE_FILE)
install(
FILES ${__RESULT_CACHE_FILE}
DESTINATION ${HYTLASS_TEST_INSTALL_BINDIR}/
)
endif()
endif()
if (NOT __TEST_COMMAND_OPTIONS)
set(__TEST_COMMAND_OPTIONS " ")
endif()
list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT)
if (CMD_COUNT GREATER 1)
add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS})
foreach(DEPENDEE ${__DEPENDEES})
add_dependencies(${DEPENDEE} ${NAME})
endforeach()
endif()
if (HYTLASS_INSTALL_TESTS)
set(_INLINE_PER_TEST_CODE)
file(READ "${PROJECT_SOURCE_DIR}/cmake/CTestTestfile.test.configure.cmake" _INLINE_PER_TEST_CODE_TEMPLATE)
endif()
set(TEST_GROUP_NAME ${NAME})
foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS)
if (CMD_COUNT GREATER 1)
string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TEST_NAME)
else()
string(TOLOWER "${NAME}" TEST_NAME)
endif()
# The following rigmarole is needed to deal with spaces and possible quotes in
# command line arguments. The options are passed "by reference" as the actual
# variable names holding the real options. We then expand these in a way that
# preserves any quotes. Note, they have to be in this order for it to work for
# all the use cases below.
set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}})
list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(TEST_COMMAND_OPTIONS)
add_custom_target(
${TEST_NAME}
COMMAND
${HYTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
DEPENDS
${TARGET}
)
if (CMD_COUNT GREATER 1)
add_dependencies(${NAME} ${TEST_NAME})
endif()
foreach(DEPENDEE ${__DEPENDEES})
add_dependencies(${DEPENDEE} ${TEST_NAME})
endforeach()
set(TEST_NAME c${TEST_NAME})
string(CONFIGURE "${_INLINE_PER_TEST_CODE_TEMPLATE}" _TEST_CODE @ONLY)
string(APPEND _INLINE_PER_TEST_CODE "${_TEST_CODE}")
endforeach()
# To run the tests from an install package with tests enabled, we need to generate test files
# that don't rely on the current directory structure in build.
set(TEST_NAME c${NAME})
set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/ctest/${TEST_NAME})
file(MAKE_DIRECTORY ${TEST_GEN_DIR})
set(TEST_EXE_PATH $<TARGET_FILE:${TARGET}>)
set(TEST_USE_EXTENDED_FORMAT ON)
configure_file("${HYTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" @ONLY)
set(TEST_EXE_PATH $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_USE_EXTENDED_FORMAT OFF) # ctest does not support extended add_test format.
configure_file("${HYTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake.in" @ONLY)
# The following line imports the tests for immediate run via `make test`.
include(${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake)
set(HYTLASS_CTEST_GENERATED_FILES ${HYTLASS_CTEST_GENERATED_FILES};ctest/${TEST_NAME}/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")
if (HYTLASS_INSTALL_TESTS)
file(GENERATE
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake"
INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake.in"
)
install(
FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake"
DESTINATION ${HYTLASS_TEST_INSTALL_PREFIX}/ctest/${TEST_NAME}
RENAME CTestTestfile.${TEST_NAME}.cmake
)
endif()
endfunction()
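# Example usage (illustrative; the target, option variable, and arguments are placeholders):
#   set(MY_GEMM_TEST_OPTS --seed=2025)
#   hytlass_add_executable_tests(
#     test_examples_my_gemm my_gemm_example
#     DEPENDEES test_all
#     TEST_COMMAND_OPTIONS MY_GEMM_TEST_OPTS
#     )
# This registers `make test_examples_my_gemm` and a `ctest -R ctest_examples_my_gemm` entry,
# both of which run my_gemm_example with the arguments stored in MY_GEMM_TEST_OPTS.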
if (HYTLASS_ENABLE_TOOLS)
add_subdirectory(tools)
if (HYTLASS_ENABLE_PROFILER)
add_dependencies(test_all test_profiler)
endif()
endif()
if (HYTLASS_ENABLE_EXAMPLES)
add_subdirectory(examples)
add_dependencies(test_all test_examples)
endif()
if (HYTLASS_ENABLE_TESTS)
add_subdirectory(test)
if (HYTLASS_ENABLE_GTEST_UNIT_TESTS)
add_dependencies(test_all test_unit)
endif()
endif()
if (HYTLASS_INSTALL_TESTS)
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/ctest")
file(WRITE "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "# Generated File\n\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "cmake_policy(SET CMP0057 NEW) # Allow IN_LIST for if()\n\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "if (NOT DEFINED ENV{HYTLASS_TEST_SETS})\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" " set(ENV{HYTLASS_TEST_SETS} ${HYTLASS_DEFAULT_ACTIVE_TEST_SETS})\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "endif()\n\n")
foreach(GENERATED_FILE ${HYTLASS_CTEST_GENERATED_FILES})
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "include(${GENERATED_FILE})\n")
endforeach()
install(
FILES "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake"
DESTINATION "${HYTLASS_TEST_INSTALL_PREFIX}/"
)
endif()
################################################################################
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfigVersion.cmake
COMPATIBILITY AnyNewerVersion)
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/HygonHytlassConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfig.cmake
@ONLY
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfigVersion.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/HygonHytlass/
)
install(
EXPORT HygonHytlass
NAMESPACE hygon::hytlass::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/HygonHytlass/
FILE HygonHytlassTargets.cmake
)
################################################################################
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/HygonHytlassPackageConfig.cmake)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
find_package(HIP REQUIRED CONFIG PATHS ${DCU_TOOLKIT_ROOT_DIR})
################# RT #########################
find_library(
GALAXY_HIP galaxyhip
${DCU_TOOLKIT_ROOT_DIR}/lib
NO_DEFAULT_PATH
)
if(NOT TARGET hip::galaxyhip AND GALAXY_HIP)
message(STATUS "Found galaxyhip: True")
add_library(galaxyhip SHARED IMPORTED GLOBAL)
add_library(hip::galaxyhip ALIAS galaxyhip)
set_property(
TARGET galaxyhip
PROPERTY IMPORTED_LOCATION
${GALAXY_HIP}
)
elseif(TARGET hip::galaxyhip)
message(STATUS "Found galaxyhip: True")
else()
message(STATUS "Found galaxyhip: True")
endif()
find_library(
HIPRTC_LIBRARY hiprtc
PATHS
${DCU_TOOLKIT_ROOT_DIR}/lib
NO_DEFAULT_PATH
)
if(NOT TARGET hiprtc AND HIPRTC_LIBRARY)
message(STATUS "Found hiprtc: True")
add_library(hiprtc SHARED IMPORTED GLOBAL)
add_library(hip::hiprtc ALIAS hiprtc)
set_property(
TARGET hiprtc
PROPERTY IMPORTED_LOCATION
${HIPRTC_LIBRARY}
)
elseif(TARGET hiprtc)
message(STATUS "Found hiprtc: True")
else()
message(STATUS "Found hiprtc: False")
endif()
include_directories(SYSTEM "${DCU_TOOLKIT_ROOT_DIR}/include")
# Treat *.cu sources as C++ so they are compiled by hip-clang
function(hytlass_correct_source_file_language_property)
foreach(File ${ARGN})
# add compile option -xhip while using clang++
if(File MATCHES ".*\\.cu$")
# set_source_files_properties(${File} PROPERTIES COMPILE_FLAGS "-x hip")
set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
endif()
endforeach()
endfunction()
set(HYTLASS_UNITY_BUILD_ENABLED_INIT OFF)
set(HYTLASS_UNITY_BUILD_ENABLED ${HYTLASS_UNITY_BUILD_ENABLED_INIT} CACHE BOOL "Enable combined source compilation")
set(HYTLASS_UNITY_BUILD_BATCH_SIZE_INIT 16)
set(HYTLASS_UNITY_BUILD_BATCH_SIZE ${HYTLASS_UNITY_BUILD_BATCH_SIZE_INIT} CACHE STRING "Batch size for unified source files")
# Unity build: merge batches of .cu sources into combined translation units
function(hytlass_unify_source_files TARGET_ARGS_VAR)
set(options)
set(oneValueArgs BATCH_SOURCES BATCH_SIZE)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (NOT DEFINED TARGET_ARGS_VAR)
message(FATAL_ERROR "TARGET_ARGS_VAR parameter is required")
endif()
if (__BATCH_SOURCES AND NOT DEFINED __BATCH_SIZE)
set(__BATCH_SIZE ${HYTLASS_UNITY_BUILD_BATCH_SIZE})
endif()
if (HYTLASS_UNITY_BUILD_ENABLED AND DEFINED __BATCH_SIZE AND __BATCH_SIZE GREATER 1)
set(HIP_FILE_ARGS)
set(TARGET_SOURCE_ARGS)
foreach(ARG ${__UNPARSED_ARGUMENTS})
if(${ARG} MATCHES ".*\\.cu$")
list(APPEND HIP_FILE_ARGS ${ARG})
else()
list(APPEND TARGET_SOURCE_ARGS ${ARG})
endif()
endforeach()
list(LENGTH HIP_FILE_ARGS NUM_HIP_FILE_ARGS)
while(NUM_HIP_FILE_ARGS GREATER 0)
list(SUBLIST HIP_FILE_ARGS 0 ${__BATCH_SIZE} HIP_FILE_BATCH)
string(SHA256 HIP_FILE_BATCH_HASH "${HIP_FILE_BATCH}")
string(SUBSTRING ${HIP_FILE_BATCH_HASH} 0 12 HIP_FILE_BATCH_HASH)
set(BATCH_FILE ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.unity.${HIP_FILE_BATCH_HASH}.cu)
message(STATUS "Generating ${BATCH_FILE}")
file(WRITE ${BATCH_FILE} "// Unity File - Auto Generated!\n")
foreach(HIP_FILE ${HIP_FILE_BATCH})
get_filename_component(HIP_FILE_ABS_PATH ${HIP_FILE} ABSOLUTE)
file(APPEND ${BATCH_FILE} "#include \"${HIP_FILE_ABS_PATH}\"\n")
endforeach()
list(APPEND TARGET_SOURCE_ARGS ${BATCH_FILE})
if (NUM_HIP_FILE_ARGS LESS_EQUAL __BATCH_SIZE)
break()
endif()
list(SUBLIST HIP_FILE_ARGS ${__BATCH_SIZE} -1 HIP_FILE_ARGS)
list(LENGTH HIP_FILE_ARGS NUM_HIP_FILE_ARGS)
endwhile()
else()
set(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
endif()
set(${TARGET_ARGS_VAR} ${TARGET_SOURCE_ARGS} PARENT_SCOPE)
endfunction()
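# Example (illustrative; file names are placeholders): with HYTLASS_UNITY_BUILD_ENABLED=ON,
#   hytlass_unify_source_files(MY_SOURCES BATCH_SOURCES ON BATCH_SIZE 2 a.cu b.cu c.cu host.cpp)
# sets MY_SOURCES to host.cpp plus generated <name>.unity.<hash>.cu files, each of which
# #includes up to two of the original .cu sources. hytlass_add_library and
# hytlass_add_executable below route their sources through this function.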
# unify -> set property -> add library
function(hytlass_add_library NAME)
set(options SKIP_GENCODE_FLAGS)
set(oneValueArgs EXPORT_NAME)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hytlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
hytlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
hytlass_apply_standard_compile_options(${NAME})
if (NOT __SKIP_GENCODE_FLAGS)
hytlass_apply_hip_gencode_flags(${NAME})
endif()
target_compile_features(
${NAME}
INTERFACE
cxx_std_11
)
if(__EXPORT_NAME)
add_library(hygon::hytlass::${__EXPORT_NAME} ALIAS ${NAME})
set_target_properties(${NAME} PROPERTIES EXPORT_NAME ${__EXPORT_NAME})
endif()
endfunction()
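# Example usage (illustrative; target and source names are placeholders):
#   hytlass_add_library(my_gemm_lib EXPORT_NAME my_gemm kernels/gemm_f16.cu kernels/gemm_f32.cu)
# creates the target my_gemm_lib with the standard HYTLASS compile options and
# --offload-arch flags, plus the alias hygon::hytlass::my_gemm.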
function(hytlass_add_executable NAME)
set(options)
set(oneValueArgs)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hytlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
hytlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
add_executable(${NAME} ${TARGET_SOURCE_ARGS})
hytlass_apply_standard_compile_options(${NAME})
hytlass_apply_hip_gencode_flags(${NAME})
target_compile_features(
${NAME}
INTERFACE
cxx_std_11
)
endfunction()
function(hytlass_target_sources NAME)
set(options)
set(oneValueArgs)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hytlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
hytlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
target_sources(${NAME} ${TARGET_SOURCE_ARGS})
endfunction()
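# Example usage (illustrative; target and source names are placeholders):
#   hytlass_add_executable(my_gemm_example examples/my_gemm.cu)
#   hytlass_target_sources(my_gemm_example PRIVATE examples/my_gemm_extra.cu)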
Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------------
The following copyright statements and licenses apply to various open source software
packages (or portions thereof) that are distributed with HYTLASS. The HYTLASS distribution that
includes this file does not necessarily use all the open source software packages referred
to below and may use only portions of a given package. Some open source software
packages referred to below may have been modified by Hygon Information Technology Co., Ltd.
-------------------------------------------------------------------------
cutlass
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------------
googletest
Copyright 2008, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# HYTLASS 0.1.0
_HYTLASS 0.1.0 - December 2025_
HYTLASS (HYGON DCU Templates for Linear Algebra Subroutines) is a C++ template library for implementing high-performance matrix multiplication (GEMM) and its derived computations on the HYGON DCU architecture. Its design follows CUTLASS: the moving parts, such as data movement and the hierarchical structure, are decomposed into reusable, modular software components
abstracted as C++ template classes.
HYTLASS is compatible with both the C++ template-based GEMM and convolution implementations of CUTLASS 2.x and the CuTe programming model introduced in CUTLASS 3.x.
See the [Quick Start Guide](media/docs/quickstart.md) to get started with HYTLASS.
# What's New in HYTLASS 0.1.0
HYTLASS 0.1.0 is the first release of HYTLASS. Its implementation is based on CUTLASS 3.5.0 and provides:
- Compatibility with and support for the CUTLASS 2.x implementation:
  - Support for the instruction features of the BW (GFX936) and earlier architectures, including the TensorCore mmac and ds_read_matrix primitives.
  - GEMM implementations built on these architecture-specific instruction features.
  - Convolution implementations based on implicit GEMM, built on top of the GEMM templates.
- Compatibility with and support for CUTLASS 3.x and the CuTe programming model:
  - HuTe, an adaptation of the CuTe programming model from CUTLASS 3.x to the DCU architecture, currently supporting instruction primitives up to the BW platform.
  - GEMM implementations based on the HuTe model, covering both the MMA and the Epilogue stages.
  - HuTe-based scheduling of compute tasks, including kernel schedules and tiling schedules, with multiple threadblock scheduling optimization strategies.
- More than ten compute examples (see [examples](examples)):
  - GEMM, convolution, and fused-operator implementations based on the 2.x API for multiple data types (TF32/FP16/BF16/I8/U8).
  - GEMM examples using optimization algorithms such as Split-K and Stream-K.
  - Examples of custom epilogues based on the visitor pattern.
  - HuTe-based GEMM examples, including BatchedGemm and GroupGemm.
  - A TensorCore-accelerated sparse GEMM example using the Block-ELL format.
- Tooling:
  - hytlass_profiler for kernel tuning over fine-grained problem parameters.
# Building HYTLASS
When used from another project, HYTLASS is a header-only library and does not need to be built separately; add the `include/` directory to your header search path.
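A project that has installed HYTLASS can also consume it through the exported CMake package. The sketch below is illustrative: `my_app` and `gemm_example.cpp` are placeholder names, while `HygonHytlass` and the `hygon::hytlass::hytlass` target come from the package configuration installed by this repository.
```cmake
# Minimal consumer sketch (illustrative; my_app and gemm_example.cpp are placeholders).
cmake_minimum_required(VERSION 3.19)
project(my_app LANGUAGES CXX)

# Device code must be compiled with hip-clang, e.g. configure with -DCMAKE_CXX_COMPILER=hipcc.
set(CMAKE_CXX_STANDARD 17)            # HYTLASS 3.x-style code requires C++17
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Locate an installed HYTLASS package; it exports the hygon::hytlass::hytlass interface target.
find_package(HygonHytlass REQUIRED)

add_executable(my_app gemm_example.cpp)
target_link_libraries(my_app PRIVATE hygon::hytlass::hytlass)

# Without an installed package, adding the headers directly also works:
# target_include_directories(my_app PRIVATE /path/to/hytlass/include)
```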
HYTLASS's unit tests, examples, and tools are built with CMake; the minimum required version is 3.19.
Create a build directory in the HYTLASS project root and run cmake in it. The CMake option `HYTLASS_HIPCC_ARCHS` selects the architectures to compile for.
```bash
$ mkdir build && cd build
$ cmake .. -DHYTLASS_HIPCC_ARCHS=936 # compiles for DCU BW Architecture
```
You can build and run HYTLASS's unit tests by building the `test_unit` target with make in the build directory. Use the `-j` option to run make in parallel.
```bash
$ make test_unit -j
...
...
...
[----------] Global test environment tear-down
[==========] 946 tests from 57 test cases ran. (10812 ms total)
[ PASSED ] 946 tests.
```
All tests should pass on supported hardware architectures, although the number of unit tests may vary across architectures.
You can also run all of the examples, or a single example, by building the `test_examples*` targets with make in the build directory.
```bash
$ make test_examples -j
...
...
...
[100%] Built target test_examples_xxx
[100%] Built target test_examples
```
Alternatively, after the build completes, run an individual example executable from the `build/examples` directory.
```bash
$ cd build/examples && ./00_hytlass_basic_gemm/gfx928_gemm_tensor_op
```
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Generated file
set(TEST_SETS_SUPPORTED @TEST_SETS_SUPPORTED@)
if (NOT DEFINED ENV{HYTLASS_TEST_SETS})
set(ENV{HYTLASS_TEST_SETS} @HYTLASS_DEFAULT_ACTIVE_TEST_SETS@)
endif()
foreach(TEST_SET_REQUESTED IN ITEMS $ENV{HYTLASS_TEST_SETS})
if (NOT TEST_SET_REQUESTED IN_LIST TEST_SETS_SUPPORTED)
message(STATUS "Skipping tests for @TEST_EXE_PATH@ as ${TEST_SET_REQUESTED} is not in the set of [${TEST_SETS_SUPPORTED}].")
return()
endif()
endforeach()
set(TEST_EXE_PATH @TEST_EXE_PATH@)
set(TEST_EXE_WORKING_DIRECTORY @TEST_EXE_WORKING_DIRECTORY@)
set(HYTLASS_USE_EXTENDED_ADD_TEST_FORMAT @TEST_USE_EXTENDED_FORMAT@)
if (DEFINED ENV{HYTLASS_TEST_EXECUTION_ENVIRONMENT})
set(_HYTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{HYTLASS_TEST_EXECUTION_ENVIRONMENT})
else()
set(_HYTLASS_TEST_EXECUTION_ENVIRONMENT @HYTLASS_TEST_EXECUTION_ENVIRONMENT@)
endif()
@_INLINE_PER_TEST_CODE@
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (HYTLASS_USE_EXTENDED_ADD_TEST_FORMAT)
# The longform/extended format allows generator expressions to be
# expanded properly and is useful in contexts where the files need
# to be included directly into CMake code that is being processed.
add_test(NAME @TEST_NAME@ COMMAND ${_HYTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@)
else()
add_test(@TEST_NAME@ ${_HYTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@)
endif()
if (TEST_EXE_WORKING_DIRECTORY)
set_tests_properties(@TEST_NAME@ PROPERTIES WORKING_DIRECTORY "${TEST_EXE_WORKING_DIRECTORY}")
endif()
set_tests_properties(@TEST_NAME@ PROPERTIES DISABLED @__DISABLE_TESTS@)
get_filename_component(HygonHytlass_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
include(CMakeFindDependencyMacro)
if(TARGET hygon::hytlass::HYTLASS)
return()
endif()
include("${HygonHytlass_CMAKE_DIR}/HygonHytlassTargets.cmake")
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
set(CPACK_PACKAGE_NAME HygonHytlass)
set(CPACK_PACKAGE_VENDOR HYGON)
set(CPACK_PACKAGE_CONTACT info@hygon.com)
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HYTLASS HIP C++ Template Linear Algebra Library for HYGON DCU")
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${CPACK_PACKAGE_NAME})
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
set(CPACK_VERBATIM_VARIABLES YES)
# set(CPACK_PACKAGE_DESCRIPTION_FILE ${CMAKE_CURRENT_LIST_DIR}/Description.txt)
# set(CPACK_RESOURCE_FILE_WELCOME ${CMAKE_CURRENT_LIST_DIR}/Welcome.txt)
# set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_LIST_DIR}/License.txt)
# set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_LIST_DIR}/Readme.txt)
include(CPack)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(FetchContent)
set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against")
if(GOOGLETEST_DIR)
set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override")
endif()
FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
)
FetchContent_GetProperties(googletest)
if(NOT googletest_POPULATED)
FetchContent_Populate(googletest)
if (MSVC)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
endif()
add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Basic HIP file for testing compiler flags.
*/
__device__ int inner()
{
return -1;
}
__global__ void test()
{
inner();
}
int main()
{
test<<<1,1>>>();
return 0;
}
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#define HYTLASS_BUILD @HYTLASS_VERSION_BUILD@
#define HYTLASS_REVISION "@HYTLASS_REVISION@"
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_gemm_tensor_op
gfx928_gemm_tensor_op.cu
)
hytlass_example_add_executable(
gfx928_gemm_tensor_op_mixed
gfx928_gemm_tensor_op_mixed.cu
)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run matrix multiplication kernels on tensor cores using functions and
data structures provided by HYTLASS.
Writing a single high-performance matrix multiplication kernel is hard but doable, whereas writing
high-performance kernels that scale across many problem sizes behind good abstractions is much
harder. HYTLASS solves this problem by providing simplified abstractions for composing the sections
of a GEMM kernel. When used properly, the kernels can approach the peak performance of the GPU.
HYTLASS divides a kernel into hierarchical, composable sections: at the thread, warp and
thread-block level, each computes its own tile size, with higher-level tile sizes composed from
lower-level ones. Multiple thread-tiles (the tile size each thread computes) form a warp-tile (the
tile size each warp computes), and multiple warp-tiles form a threadblock-tile (the tile size
computed by a thread block).
In this example, we split variable initialization into two parts:
1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set matrices will be used to compute
output of matrix multiplication.
First, we set up the data types of matrices A, B, C and D along with alpha and beta, since the GEMM
equation is D = alpha * A * B + beta * C. In HYTLASS, the kernels first compute A * B and leave the
rest of the computation to the end of the kernel, because alpha * X + beta * C is a simple
element-wise operation on X (A * B) and C. We call this the epilogue of the kernel. Hence, we set
the data type of alpha and beta to ElementComputeEpilogue = float. This example uses the MMA
instructions of the GFX928 tensor cores on bfloat16 inputs, so the elements of input matrices A and
B are bfloat16_t. Partial dot products are accumulated in float, which can represent a wider range
of numbers, and the output matrix is stored as bfloat16_t. We convey this to the HYTLASS kernel by
initializing the template parameters ElementAccumulator (float), ElementComputeEpilogue (float),
ElementInputA (bfloat16_t), ElementInputB (bfloat16_t) and ElementOutput (bfloat16_t).
Communicating just the data types is not enough: since the data is laid out linearly in memory, we
also have to convey the layout of the matrices. We do that by setting LayoutInputA and LayoutInputB
to column major and LayoutOutput to row major. Next, we set up the rules to compute
alpha * X + beta * C, the epilogue of the kernel. We initialize the template parameter EpilogueOp,
which takes the data type of the output ElementOutput (bfloat16_t), the number of elements per
vectorized memory access (kAlignmentC), the data type of the accumulator (float) and the data type
used to compute the linear combination (alpha * X + beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the
computation.
Second, we set the tile sizes for the thread block, warp and mma-op to 128x128x64, 64x64x32 and
16x16x16 (MxNxK) respectively. When these are passed to instantiate the HYTLASS GEMM kernel, it
internally deduces the number of threads needed per thread block, the amount of shared memory, how
to store data in a bank-conflict-free manner, and many other parameters required to compose,
initialize and launch a high-performance GEMM kernel. This is the beauty of HYTLASS: it relieves
the developer from understanding and coding complicated hardware optimizations, which can easily go
wrong.
HYTLASS also supports multiple MMA pipelines in a threadblock. An MMA pipeline is the whole process
of loading input data from global memory to shared memory, loading data from shared memory to
registers, doing the matrix multiplication, and storing the result to global memory. The flow below
shows a typical mma pipeline.
matrix in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
output to global memory
The problem with a single pipeline is that each stage is synchronous: each stage has to wait until
the previous one finishes executing. Some stages do not have a fixed latency, for example the loads
from global memory and shared memory. Therefore, we can add one more pipeline with a phase shift in
the mma kernel to hide the latency of the global and shared memory loads.
Finally, the pipeline in a kernel looks like
(1) matrix in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
mma -> (6) registers -> (7) output to global memory (1) <null> -> (2) <null> -> (3) matrix in global
memory -> (4) registers -> (5) tile in shared memory -> (6) registers -> (7) mma -> (8) registers ->
(9) output to global memory
This way, you can hide the latency of the second global memory load by doing computation on already
loaded input data.
A few more template parameters are initialized, such as the swizzle that decides which threadblock
computes which tile of the output matrix on an SM, and the GFX architecture of the GPU you want to
run on.
These are all put together to create a template type which describes the HYTLASS GEMM kernel, using
the hytlass::gemm::device::GemmUniversal template.
The next step is to initialize physical data, then instantiate and initialize the HYTLASS kernel and
run it. We use HYTLASS utilities to initialize, fill and compare matrices, as they are simple and do
not get in the way of learning HYTLASS.
Once all the matrices are initialized and filled with data, we create an arguments tuple to launch
the HYTLASS kernel, which takes the problem size (M = 5120, N = 4096 and K = 4096 by default), the
matrices, and alpha and beta. Along with that, we query HYTLASS for any scratch-space memory
required by the kernel we instantiated. If there is any, we allocate it and pass it along with the
other arguments to initialize the HYTLASS kernel, and then the kernel is launched.
In this example, we later launch a reference GEMM kernel (from the HYTLASS utilities) to check
whether the output of the HYTLASS kernel matches the reference GEMM kernel.
*/
#include <iostream>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
#include "hytlass/util/GPU_Clock.hpp"
#include "hytlass/gemm/device/gemm_universal.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
hytlass::gemm::GemmCoord problem_size;
int batch_count;
float alpha;
float beta;
bool reference_check;
int iterations;
Options():
help(false),
problem_size({5120, 4096, 4096}),
batch_count(1),
reference_check(true),
iterations(20),
alpha(1),
beta()
{}
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
hytlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "00_hytlass_basic_gemm example\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/00_hytlass_basic_gemm/gfx928_gemm_tensor_op --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 \n\n";
return out;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::bfloat16_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::bfloat16_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::bfloat16_t; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Column Major for Matrix B and Row Major for Matrix C
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::ColumnMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock = hytlass::gemm::GemmShape<128, 128, 64>;
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 64, 32>;
// This code section describes the size of MMA op
// Note the kAlignmentA/kAlignmentB values below: the maximum alignment length is 8 elements
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 16>;
// Alignment settings
// In OpMultiplyAddFastF16 mode 256 can be considered; MultiplyAdd uses 128
constexpr int kAlignmentA = 128 / hytlass::sizeof_bits<ElementInputA>::value;
constexpr int kAlignmentB = 128 / hytlass::sizeof_bits<ElementInputB>::value;
constexpr int kAlignmentC = 128 / hytlass::sizeof_bits<ElementOutput>::value;
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>; // block swizzle
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
kAlignmentC, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
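// Illustrative helper (added for exposition, not part of HYTLASS): the epilogue configured above
// applies, per output element, the linear combination below in ElementComputeEpilogue (float)
// before converting the result to ElementOutput (bfloat16_t).
constexpr float linear_combination_example(float alpha, float accumulator, float beta, float c) {
return alpha * accumulator + beta * c; // D = alpha * (A * B) + beta * C
}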
// Set NumStages to 1 for the single-stage path (supports warpShape::kK == InstructionShape::kK, low LDS overhead)
// Set it to 2 for the pipelined path
constexpr int NumStages = 1;
using Gemm = hytlass::gemm::device::GemmUniversal<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
kAlignmentA,
kAlignmentB>;
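// Illustrative sanity check (added for exposition, assuming hytlass::sizeof_bits reports 16 bits
// for bfloat16_t, as in CUTLASS): 128-bit vectorized accesses over 16-bit elements yield an
// alignment of 128 / 16 = 8 elements for A, B and C.
static_assert(kAlignmentA == 8 && kAlignmentB == 8 && kAlignmentC == 8,
"128-bit accesses over 16-bit elements give an 8-element alignment");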
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(2),
ElementInputA(-2),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
2,
ElementInputB(2),
ElementInputB(-2),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix B on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
hytlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
// Split the K dimension into 1 partition
int split_k_slices = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments {
hytlass::gemm::GemmUniversalMode::kGemm, // <- GemmUniversalMode
problem_size, // <- problem size of matrix multiplication
1, // <- batch count
{alpha, beta}, // <- tuple of alpha and beta
tensor_a.device_data(), // <- reference to matrix A on device
tensor_b.device_data(), // <- reference to matrix B on device
tensor_c.device_data(), // <- reference to matrix C on device
tensor_d.device_data(), // <- reference to matrix D on device
1, 1, 1, 1, // <- batch stride
tensor_a.stride(0), // <- Stride of matrix A
tensor_b.stride(0), // <- Stride of matrix B
tensor_c.stride(0), // <- Stride of matrix C
tensor_d.stride(0), // <- Stride of matrix D
nullptr, nullptr, nullptr, // <- gather a,b,d indices
};
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
tensor_d.sync_host();
// Create instantiation for device reference gemm kernel
hytlass::reference::device::Gemm<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
ElementAccumulator> gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_ref_d.sync_host();
ElementOutput eps(0.05);
const ElementOutput non_zero_floor(1e-6f);
bool passed = hytlass::reference::host::TensorRelativelyEquals(tensor_ref_d.host_view(),
tensor_d.host_view(), eps, non_zero_floor);
if (passed) {
printf("passed\n");
}
else {
printf("failed\n");
}
GPU_Clock timer;
int iterations_cnt = options.iterations;
double gflops = (2.0 * problem_size.m() * problem_size.n() * problem_size.k()) * 1e-9;
for (int i=0; i<10; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
timer.start();
for (int i=0; i<iterations_cnt; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
double hytlass_time = timer.seconds() / iterations_cnt;
printf("hytlass gemm: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / hytlass_time, hytlass_time * 1000);
return 0;
}
int main(int argc, const char **argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d tensor op Matrix Multiply\n", \
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run matrix multiplication kernels on tensor cores using functions and
data structures provided by HYTLASS.
Writing a single high-performance matrix multiplication kernel is hard but doable, whereas writing
high-performance kernels that scale across many problem sizes behind good abstractions is much
harder. HYTLASS solves this problem by providing simplified abstractions for composing the sections
of a GEMM kernel. When used properly, the kernels can approach the peak performance of the GPU.
HYTLASS divides a kernel into hierarchical, composable sections: at the thread, warp and
thread-block level, each computes its own tile size, with higher-level tile sizes composed from
lower-level ones. Multiple thread-tiles (the tile size each thread computes) form a warp-tile (the
tile size each warp computes), and multiple warp-tiles form a threadblock-tile (the tile size
computed by a thread block).
In this example, we split variable initialization into two parts:
1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set matrices will be used to compute
output of matrix multiplication.
First, we set up the data types of matrices A, B, C and D along with alpha and beta, since the GEMM
equation is D = alpha * A * B + beta * C. In HYTLASS, the kernels first compute A * B and leave the
rest of the computation to the end of the kernel, because alpha * X + beta * C is a simple
element-wise operation on X (A * B) and C. We call this the epilogue of the kernel. Hence, we set
the data type of alpha and beta to ElementComputeEpilogue = float. This example uses the MMA
instructions of the GFX928 tensor cores on tfloat32 inputs, so the elements of input matrices A and
B are tfloat32_t. Partial dot products are accumulated in float, which can represent a wider range
of numbers, while the output matrix is stored as half_t. We convey this to the HYTLASS kernel by
initializing the template parameters ElementAccumulator (float), ElementComputeEpilogue (float),
ElementInputA (tfloat32_t), ElementInputB (tfloat32_t) and ElementOutput (half_t). Communicating
just the data types is not enough: since the data is laid out linearly in memory, we also have to
convey the layout of the matrices. We do that by setting LayoutInputA to column major, LayoutInputB
to row major and LayoutOutput to row major. Next, we set up the rules to compute
alpha * X + beta * C, the epilogue of the kernel. We initialize the template parameter EpilogueOp,
which takes the data type of the output ElementOutput (half_t), the number of elements per
vectorized memory access (kAlignmentC), the data type of the accumulator (float) and the data type
used to compute the linear combination (alpha * X + beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the
computation.
Second, we set the tile sizes for the thread block, warp and mma-op to 128x128x16, 64x128x16 and
16x16x8 (MxNxK) respectively. When these are passed to instantiate the HYTLASS GEMM kernel, it
internally deduces the number of threads needed per thread block, the amount of shared memory, how
to store data in a bank-conflict-free manner, and many other parameters required to compose,
initialize and launch a high-performance GEMM kernel. This is the beauty of HYTLASS: it relieves
the developer from understanding and coding complicated hardware optimizations, which can easily go
wrong.
HYTLASS also supports multiple MMA pipelines in a threadblock. An MMA pipeline is the whole process
of loading input data from global memory to shared memory, loading data from shared memory to
registers, doing the matrix multiplication, and storing the result to global memory. The flow below
shows a typical mma pipeline.
matrix in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
output to global memory
The problem with a single pipeline is that each stage is synchronous: each stage has to wait until
the previous one finishes executing. Some stages do not have a fixed latency, for example the loads
from global memory and shared memory. Therefore, we can add one more pipeline with a phase shift in
the mma kernel to hide the latency of the global and shared memory loads.
Finally, the pipeline in a kernel looks like
(1) matrix in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
mma -> (6) registers -> (7) output to global memory (1) <null> -> (2) <null> -> (3) matrix in global
memory -> (4) registers -> (5) tile in shared memory -> (6) registers -> (7) mma -> (8) registers ->
(9) output to global memory
This way, you can hide the latency of the second global memory load by doing computation on already
loaded input data.
A few more template parameters are initialized, such as the swizzle that decides which threadblock
computes which tile of the output matrix on an SM, and the GFX architecture of the GPU you want to
run on.
These are all put together to create a template type which describes the HYTLASS GEMM kernel, using
the hytlass::gemm::device::Gemm template.
The next step is to initialize physical data, then instantiate and initialize the HYTLASS kernel and
run it. We use HYTLASS utilities to initialize, fill and compare matrices, as they are simple and do
not get in the way of learning HYTLASS.
Once all the matrices are initialized and filled with data, we create an arguments tuple to launch
the HYTLASS kernel, which takes the problem size (M = N = K = 1024 in this example), the matrices,
alpha, beta and, importantly, the split k-dimension factor. Along with that, we query HYTLASS for
any scratch-space memory required by the kernel we instantiated. If there is any, we allocate it and
pass it along with the other arguments to initialize the HYTLASS kernel, and then the kernel is
launched.
In this example, we then run a reference GEMM through hipBLAS and check whether the output of the
HYTLASS kernel matches it.
*/
#include <iostream>
#include <vector>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
#include "hytlass/util/GPU_Clock.hpp"
#include "hipblas.h"
#include "hytlass/blas3.h"
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::tfloat32_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::tfloat32_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::half_t; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Row Major for Matrix B and Row Major for Matrix C
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::RowMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock = hytlass::gemm::GemmShape<128, 128, 16>;
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 128, 16>;
// This code section describes the size of MMA op
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 8>;
constexpr int kAlignmentA = 128 / hytlass::sizeof_bits<ElementInputA>::value;
constexpr int kAlignmentB = 128 / hytlass::sizeof_bits<ElementInputB>::value;
constexpr int kAlignmentC = 128 / hytlass::sizeof_bits<ElementOutput>::value;
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- default identity threadblock swizzle
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
kAlignmentC, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
// Number of pipelines you want to use
constexpr int NumStages = 2;
using Gemm = hytlass::gemm::device::Gemm<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages, kAlignmentA, kAlignmentB>;
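// Illustrative sanity check (added for exposition, assuming hytlass::sizeof_bits mirrors CUTLASS:
// 32 bits for tfloat32_t and 16 bits for half_t): the 128-bit vector width gives an alignment of
// 128 / 32 = 4 elements for the tf32 inputs and 128 / 16 = 8 elements for the half-precision output.
static_assert(kAlignmentA == 4 && kAlignmentB == 4 && kAlignmentC == 8,
"128-bit accesses: 4 tf32 elements for A/B, 8 half elements for C/D");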
void reference_hipblas(int m, int n, int k,
const ElementInputA* a, const ElementInputB* b, float* c) {
hipblasHandle_t handle;
hipblasCreate(&handle);
ElementComputeEpilogue one = ElementComputeEpilogue(1);
ElementComputeEpilogue zero = ElementComputeEpilogue(0);
float *_a;
float *_b;
(void)hipMalloc((void **)(&_a), sizeof(float) * m * k);
(void)hipMalloc((void **)(&_b), sizeof(float) * k * n);
// a and b are device pointers, so stage them on the host, convert element-wise to float,
// then upload the converted buffers for hipblasGemmEx.
std::vector<ElementInputA> host_a(m * k);
std::vector<ElementInputB> host_b(k * n);
(void)hipMemcpy(host_a.data(), a, sizeof(ElementInputA) * m * k, hipMemcpyDeviceToHost);
(void)hipMemcpy(host_b.data(), b, sizeof(ElementInputB) * k * n, hipMemcpyDeviceToHost);
std::vector<float> host_a_f(host_a.begin(), host_a.end());
std::vector<float> host_b_f(host_b.begin(), host_b.end());
(void)hipMemcpy(_a, host_a_f.data(), sizeof(float) * m * k, hipMemcpyHostToDevice);
(void)hipMemcpy(_b, host_b_f.data(), sizeof(float) * k * n, hipMemcpyHostToDevice);
hipblasOperation_t blas_trans_a = std::is_same<LayoutInputA,
hytlass::layout::ColumnMajor>::value ?
HIPBLAS_OP_N :
HIPBLAS_OP_T;
hipblasOperation_t blas_trans_b = std::is_same<LayoutInputB,
hytlass::layout::ColumnMajor>::value ?
HIPBLAS_OP_N :
HIPBLAS_OP_T;
int lda = std::is_same<LayoutInputA,
hytlass::layout::ColumnMajor>::value ? m: k;
int ldb = std::is_same<LayoutInputB,
hytlass::layout::ColumnMajor>::value ? k: n;
auto err_ = hipblasGemmEx(handle, blas_trans_a, blas_trans_b,
m, n, k,
&one,
_a, HIPBLAS_R_32F, lda,
_b, HIPBLAS_R_32F, ldb,
&zero,
c, HIPBLAS_R_32F, m,
HIPBLAS_R_32F,
HIPBLAS_GEMM_DEFAULT);
(void)hipDeviceSynchronize();
if (err_ != 0) {
printf("error code is %d\n", err_);
exit(-1);
}
}
int run(int length_m_, int length_n_, int length_k_) {
const int length_m = length_m_;
const int length_n = length_n_;
const int length_k = length_k_;
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size(length_m, length_n, length_k);
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<float, hytlass::layout::ColumnMajor> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(8),
ElementInputA(-8),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
2,
ElementInputB(8),
ElementInputB(-8),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix B on host with uniform-distribution random data
hytlass::reference::host::TensorFill(
tensor_c.host_view());
hytlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split the K dimension into 1 partition
int split_k_slices = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments {
problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
tensor_d.sync_host();
// printf("result is :\n");
// for(int i=0;i<length_m;i++){
// for(int j=0;j<length_n;j++){
// printf("%.2f,", float(tensor_d.host_data()[i*length_n+j]));
// }
// printf("\n");
// }
// Create instantiation for device reference gemm kernel
// hytlass::reference::device::Gemm<ElementInputA,
// LayoutInputA,
// ElementInputB,
// LayoutInputB,
// ElementOutput,
// LayoutOutput,
// ElementOutput,
// ElementOutput>
// gemm_device;
// // Launch device reference gemm kernel
// gemm_device(problem_size,
// alpha,
// tensor_a.device_ref(),
// tensor_b.device_ref(),
// beta,
// tensor_c.device_ref(),
// tensor_ref_d.device_ref());
reference_hipblas(length_m, length_n, length_k,
tensor_a.device_data(),
tensor_b.device_data(),
tensor_ref_d.device_data());
// // Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_ref_d.sync_host();
// Check if output from HYTLASS kernel and reference kernel are equal or not
ElementOutput eps(1e-3f);
if (std::is_same<ElementInputA, hytlass::bfloat16_t>::value) {
eps = 0.05f;
}
float max_error_v = 1e-9;
int cnt = 0;
for (int i = 0; i < length_m; i++) {
for (int j = 0; j < length_n; j++) {
float factor = float(1);
if (tensor_ref_d.host_data()[i + length_m * j] != 0) {
factor = std::abs(tensor_ref_d.host_data()[i + length_m * j]);
}
if (std::abs(tensor_ref_d.host_data()[i + length_m * j]) <= 1) {
factor = 1;
}
if (std::abs((tensor_ref_d.host_data()[i + length_m * j]) - float(tensor_d.host_data()[i * length_n + j])) / factor > eps) {
printf("error at (%d %d) expected %f got %f abs err is %f and Relative error is %f\n", i, j,
float(tensor_ref_d.host_data()[i + length_m * j]), float(tensor_d.host_data()[i * length_n + j]),
float(std::abs((tensor_ref_d.host_data()[i + length_m * j]) - float(tensor_d.host_data()[i * length_n + j]))),
float(std::abs((tensor_ref_d.host_data()[i + length_m * j]) - float(tensor_d.host_data()[i * length_n + j])) / factor));
if (max_error_v < std::abs((tensor_ref_d.host_data()[i + length_m * j]) - (tensor_d.host_data()[i * length_n + j])) / factor){
max_error_v = (std::abs((tensor_ref_d.host_data()[i + length_m * j]) - (tensor_d.host_data()[i * length_n + j])) / factor);
}
cnt++;
}
}
}
if (cnt > 0) {
printf("faild,");
printf("%f\n", max_error_v);
return -1;
}
printf("success\n");
return 0;
}
int main() {
return run(1024, 1024, 1024);
}
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_serial_splitk_gemm
gfx928_serial_splitk_gemm.cu
)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to use serial split-k version of matrix multiplication using functions and
data structures provided by HYTLASS.
Examples:
# Runs a serial split-K GEMM with the given problem size (use split_k_slices to set number of K slices)
$ ./gfx928_serial_splitk_gemm --m=5120 --n=5120 --k=8192 --alpha=1 --beta=0 --split_k_slices=2 --iterations=10
*/
#include <fstream>
#include <iostream>
#include "hip/hip_runtime.h"
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/gemm/device/gemm_universal.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
hytlass::Status status;
hipError_t error;
bool passed;
Result(
double runtime_ms = 0,
double gflops = 0,
hytlass::Status status = hytlass::Status::kSuccess,
hipError_t error = hipSuccess)
:
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true)
{}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
hytlass::gemm::GemmCoord problem_size;
float alpha;
float beta;
bool reference_check;
int iterations;
int split_k_slices;
Options():
help(false),
problem_size({8192, 8192, 2048}),
reference_check(true),
iterations(10),
split_k_slices(1),
alpha(1),
beta()
{}
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
hytlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("split_k_slices", split_k_slices);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream &print_usage(std::ostream &out) const {
out << "01_hytlass_serial_splitk_gemm\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --split_k_slices=<int> Split-K factor to emulate\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/01_hytlass_serial_splitk_gemm/gfx928_serial_splitk_gemm --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 --split_k_slices=2 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
int64_t fmas = problem_size.product();
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
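// Worked example for the gflops() helper above (illustrative, not part of the original example):
// the default 8192 x 8192 x 2048 problem performs 2 * M * N * K = 274,877,906,944 floating point
// operations, so an average runtime of 10 ms corresponds to roughly 27.5 TFLOP/s.
static_assert(2ll * 8192 * 8192 * 2048 == 274877906944ll,
"FLOP count of the default 8192x8192x2048 problem");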
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::half_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::half_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::half_t; // <- data type of elements in output matrix D
constexpr int kAlignmentA = 128 / hytlass::sizeof_bits<ElementInputA>::value;
constexpr int kAlignmentB = 128 / hytlass::sizeof_bits<ElementInputB>::value;
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Row Major for Matrix B and Row Major for Matrix C
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::RowMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
hytlass::gemm::GemmShape<128, 128, 32>; // <- threadblock tile M = 128, N = 128, K = 32
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = 64, N = 64, K = 32
// This code section describes the size of MMA op
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 16>;
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
static bool const kSplitKSerial = true;
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
128 / hytlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
// Number of pipelines you want to use
constexpr int NumStages = 1;
using Gemm = hytlass::gemm::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
kAlignmentA,
kAlignmentB,
kSplitKSerial>;
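// Illustrative check (added for exposition, assuming GemmShape exposes kM/kN/kK as in CUTLASS):
// the 128x128x32 threadblock tile is covered by a 2x2 arrangement of 64x64x32 warp tiles, i.e.
// four warps cooperate on each threadblock tile.
static_assert(ShapeMMAThreadBlock::kM / ShapeMMAWarp::kM == 2 &&
ShapeMMAThreadBlock::kN / ShapeMMAWarp::kN == 2,
"each threadblock tile is covered by a 2x2 grid of warp tiles");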
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(4),
ElementInputA(-4),
0); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(4),
ElementInputB(-4),
0); // <- Fill matrix B on host with uniform-distribution random data
// hytlass::reference::host::TensorFillSequential(
// tensor_a.host_view()); // <- Fill matrix A on host with uniform-distribution random data
// hytlass::reference::host::TensorFillSequential(
// tensor_b.host_view()); // <- Fill matrix B on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
hytlass::reference::host::TensorFill(tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
// Split the K dimension into split_k_slices partitions
int split_k_slices = options.split_k_slices;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
// Result structure
Result result;
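  // Warm-up launches: run the kernel a few times before timing so that first-launch overhead does not skew the measured loop below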
for (int i=0; i<10; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
//
// Construct events
//
hipEvent_t events[2];
for (auto &event : events) {
result.error = hipEventCreate(&event);
if (result.error != hipSuccess) {
std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = hipEventRecord(events[0]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = hipEventRecord(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = hipEventSynchronize(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventElapsed() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)hipEventDestroy(event);
}
if (options.reference_check) {
// Create instantiation for device reference gemm kernel
hytlass::reference::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementComputeEpilogue>
gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Check if output from HYTLASS kernel and reference kernel are equal or not
ElementOutput eps(0.05);
const ElementOutput non_zero_floor(1e-6f);
result.passed = hytlass::reference::host::TensorRelativelyEquals(
tensor_ref_d.host_view(), tensor_d.host_view(), eps, non_zero_floor);
}
if (!result.passed) {
std::stringstream fname;
fname << "error_Gemm_device_" << problem_size.m() << "x" << problem_size.n() << "x"
<< problem_size.k() << "_" << ShapeMMAThreadBlock{}.kM << "_" << ShapeMMAThreadBlock{}.kN
<< "_" << ShapeMMAThreadBlock{}.kK << ".csv";
std::ofstream file(fname.str());
file << "problem: " << ' ' << problem_size.m() << "x" << problem_size.n() << "x"
<< problem_size.k() << ", alpha: " << float(alpha) << ", beta: " << float(beta) << "\n\n";
file << "A =\n"
<< tensor_a.host_view() << "\nB =\n"
<< tensor_b.host_view() << "\nC =\n"
<< tensor_c.host_view() << "\n\nReference =\n"
<< tensor_ref_d.host_view() << "\n\nComputed =\n"
<< tensor_d.host_view();
}
std::cout << (result.passed ? "Passed" : "Failed") << std::endl;
if (result.passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << "GFLOPs: " << result.gflops << std::endl;
}
return (result.passed ? 0 : -1);
}
int main(int argc, const char **argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d tensor op Matrix Multiply\n",
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_parallel_splitk_gemm
gfx928_parallel_splitk_gemm.cu
)
\ No newline at end of file
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to use split-k version of matrix multiplication using functions and data
structures provided by HYTLASS.
What is split-k?
Consider a problem size of M = 128, N = 128, K = 4096. If the thread-block tile size (a tile can be
viewed as a 2d matrix) is 128x128x4096, then we launch a single thread-block that occupies only one
of the 80 SMs present on Gfx928, so the efficiency of the computation is very low. This is where
split-k comes in: it partitions the K-dimension of the matrix multiplication and distributes the
work across multiple SMs, achieving better efficiency than a single SM can. In the above example,
we can partition the K-dimension with a split-k factor of 16, i.e., the thread-block tile size
becomes 128x128x256 and the work is launched across 16 SMs. Once each thread-block has computed its
partial inner product (1/16th of the output), the partials are accumulated into a single output matrix.
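Conceptually, parallel split-k writes each K-slice's partial product to workspace memory and then
runs a separate reduction that sums the slices and applies the epilogue. The sketch below
illustrates only that reduction step on the host; it is plain C++ written for this explanation and
is not part of the HYTLASS API (all names in it are made up for the illustration):

    #include <vector>

    // D = alpha * (sum of per-slice partial products) + beta * C, element-wise.
    void reduce_splitk(const std::vector<std::vector<float>> &partial, // one M x N buffer per K-slice
                       const std::vector<float> &C,
                       std::vector<float> &D,
                       float alpha, float beta) {
      for (size_t i = 0; i < D.size(); ++i) {
        float acc = 0.0f;
        for (size_t s = 0; s < partial.size(); ++s) {
          acc += partial[s][i];           // accumulate the partial sum contributed by every slice
        }
        D[i] = alpha * acc + beta * C[i]; // the epilogue is applied once, after the reduction
      }
    }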
Writing a single high-performance matrix multiplication kernel is hard but doable, whereas writing
high-performance kernels at scale that work for multiple problem sizes with good abstractions is
really hard. HYTLASS addresses this by providing simplified abstractions to compose the sections of
a GEMM kernel. When used properly, the kernels can come close to the peak performance of the GPU
with little effort.
HYTLASS divides a kernel into hierarchical, composable sections: at the thread, warp and
thread-block level, each computes on its own tile size, with the higher-level tile sizes composed
from the lower-level ones. Multiple thread tiles (the tile size each thread computes) form a warp
tile (the tile size each warp computes), and multiple warp tiles form a threadblock tile (the tile
size computed by a thread block).
In this example, we split variable initialization into two parts:
1. Setting up data properties: describes how the matrices are laid out in memory and how the kernel
   views them (logical-to-physical mapping).
2. Setting up computation properties: describes how the matrices set up above are used to compute
   the output of the matrix multiplication.
First, we set up the data types of matrices A, B, C and D along with alpha and beta, since the
equation for GEMM is D = alpha * A * B + beta * C. In HYTLASS, the kernels first compute A * B and
leave the rest of the computation to the end of the kernel, because alpha * X + beta * C is a
simple element-wise operation on X (= A * B) and C. We call this the epilogue of the kernel.
Hence, we set the data type of alpha and beta to ElementComputeEpilogue = float. Because this
example uses the tensor-core MMA instructions with half-precision inputs, the data type of the
elements in input matrices A and B is hytlass::half_t. The partial dot products are accumulated in
fp32, which can represent a wider range of numbers, so we use fp32 as the accumulator type, while
the output matrix elements are stored as hytlass::half_t. We convey this to the HYTLASS kernel by
initializing the template variables ElementAccumulator (float), ElementComputeEpilogue (float),
ElementInputA (hytlass::half_t), ElementInputB (hytlass::half_t) and ElementOutput
(hytlass::half_t). Communicating just the data types is not enough: since the data is laid out
linearly in memory, we also have to convey the layouts of the matrices. We do that by initializing
the template variable LayoutInputA to column major, LayoutInputB to column major and LayoutOutput
to row major. Next, we set up the rules to compute alpha * X + beta * C, which is called the
epilogue of the kernel. We initialize the template variable EpilogueOp, which takes the data type
of the output ElementOutput (hytlass::half_t), the number of elements per vectorized memory access
(128 / sizeof_bits<ElementOutput>, i.e. 8 for half precision), the data type of the accumulator
(float) and the data type used to compute the linear combination (alpha * X + beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the
computation.
Second, we create template variables for the tile sizes of the thread-block, warp and mma-op:
128x128x32, 64x64x32 and 16x16x16 (MxNxK) respectively. When these are passed to instantiate the
HYTLASS GEMM kernel, it internally deduces the number of threads needed per thread-block, the
amount of shared memory, how to store data in a bank-conflict-free manner, and the many other
variables required to compose, initialize and launch a high-performance GEMM kernel. This is the
beauty of HYTLASS: it relieves the developer from understanding and coding complicated hardware
optimizations, which can easily go wrong.
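To see how these tile sizes compose in this example: the 128x128x32 threadblock tile is covered by
a 2x2 arrangement of 64x64x32 warp tiles (128/64 = 2 along M and along N), and each warp tile is in
turn computed as 4 x 4 x 2 = 32 of the 16x16x16 mma operations (64/16 = 4 along M and N,
32/16 = 2 along K).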
A few more template variables are initialized as well, such as the mapping that decides which
threadblock tile of the output matrix is computed by which threadblock launched on an SM, and the
GFX architecture of the GPU you want to run on. These are all put together into a template
instantiation that describes the HYTLASS GEMM kernel, using the
hytlass::gemm::device::GemmSplitKParallel template.
The next step is to initialize the physical data, then instantiate, initialize and run the HYTLASS
kernel. We use HYTLASS utilities to initialize, fill and compare matrices, as they are simple and
do not get in the way of learning HYTLASS.
Once all the matrices are initialized and filled with data, we create the argument tuple used to
launch the HYTLASS kernel. It takes the problem size (M = 8192, N = 8192 and K = 2048 by default),
the matrices, alpha, beta and, importantly, the split k-dimension factor. We also query HYTLASS for
any scratch-space memory required by the instantiated kernel; if some is needed, it is allocated
and passed along with the other arguments to initialize the HYTLASS kernel, and then the kernel is
launched.
In this example, we later launch a reference GEMM kernel (from the HYTLASS utilities) to check
whether the output of the HYTLASS kernel matches that of the reference GEMM kernel.
*/
#include <fstream>
#include <iostream>
#include "hip/hip_runtime.h"
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm_splitk_parallel.h"
#include "hytlass/gemm/device/gemm_universal.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
hytlass::Status status;
hipError_t error;
bool passed;
Result(
double runtime_ms = 0,
double gflops = 0,
hytlass::Status status = hytlass::Status::kSuccess,
hipError_t error = hipSuccess)
:
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true)
{}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
hytlass::gemm::GemmCoord problem_size;
float alpha;
float beta;
bool reference_check;
int iterations;
int split_k_slices;
  Options():
    help(false),
    problem_size({8192, 8192, 2048}),
    alpha(1),
    beta(0),
    reference_check(true),
    iterations(10),
    split_k_slices(1)
  {}
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
hytlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("split_k_slices", split_k_slices);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream &print_usage(std::ostream &out) const {
out << "02_hytlass_parallel_splitk_gemm example\n\n"
<< " This example uses the HYTLASS Library to execute F32 tensorop GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --split_k_slices=<int> Split-K factor to emulate\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/02_hytlass_parallel_splitk_gemm/gfx928_parallel_splitk_gemm --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 --split_k_slices=2 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of real-valued multiply-adds
    int64_t fmas = int64_t(problem_size.m()) * problem_size.n() * problem_size.k();  // widen first so the product cannot overflow a 32-bit int
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::half_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::half_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::half_t; // <- data type of elements in output matrix D
// The code section below describes the matrix layouts of the input and output matrices: Column Major
// for Matrix A, Column Major for Matrix B and Row Major for Matrices C/D
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::ColumnMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
    hytlass::gemm::GemmShape<128, 128, 32>;  // <- threadblock tile M = 128, N = 128, K = 32
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 64, 32>;  // <- warp tile M = 64, N = 64, K = 32
// This code section describes the size of MMA op
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 16>;
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
    128 / hytlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
                                                       // memory access. For the half-precision
                                                       // output used here, it's 8 elements. This
                                                       // becomes the vector width of math
                                                       // instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
using Gemm = hytlass::gemm::device::GemmSplitKParallel<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp>;
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(4),
ElementInputA(-4),
0); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(4),
ElementInputB(-4),
0); // <- Fill matrix B on host with uniform-distribution random data
  // hytlass::reference::host::TensorFillSequential(
  //     tensor_a.host_view()); // <- Alternative: fill matrix A on host with sequentially increasing data
  // hytlass::reference::host::TensorFillSequential(
  //     tensor_b.host_view()); // <- Alternative: fill matrix B on host with sequentially increasing data
hytlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
hytlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
  // Split the K dimension into split_k_slices partitions (set from the command line)
int split_k_slices = options.split_k_slices;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
// Result structure
Result result;
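  // Warm-up launches: run the kernel a few times before timing so that first-launch overhead does not skew the measured loop below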
for (int i=0; i<10; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
//
// Construct events
//
hipEvent_t events[2];
for (auto &event : events) {
result.error = hipEventCreate(&event);
if (result.error != hipSuccess) {
std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = hipEventRecord(events[0]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = hipEventRecord(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = hipEventSynchronize(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventElapsed() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)hipEventDestroy(event);
}
if (options.reference_check) {
// Create instantiation for device reference gemm kernel
hytlass::reference::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementComputeEpilogue>
gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Check if output from HYTLASS kernel and reference kernel are equal or not
ElementOutput eps(0.05);
const ElementOutput non_zero_floor(1e-6f);
result.passed = hytlass::reference::host::TensorRelativelyEquals(tensor_ref_d.host_view(), tensor_d.host_view(), eps, non_zero_floor);
}
if (!result.passed) {
std::stringstream fname;
fname << "error_Gemm_device_"
<< problem_size.m() << "x" << problem_size.n() << "x" << problem_size.k() << "_"
<< ShapeMMAThreadBlock{}.kM << "_"
<< ShapeMMAThreadBlock{}.kN << "_"
<< ShapeMMAThreadBlock{}.kK << ".csv";
std::ofstream file(fname.str());
file
<< "problem: " << ' ' << problem_size.m() << "x" << problem_size.n() << "x" << problem_size.k()
<< ", alpha: " << float(alpha) << ", beta: " << float(beta) << "\n\n";
file
<< "A =\n"
<< tensor_a.host_view()
<< "\nB =\n"
<< tensor_b.host_view()
<< "\nC =\n"
<< tensor_c.host_view()
<< "\n\nReference =\n"
<< tensor_ref_d.host_view()
<< "\n\nComputed =\n"
<< tensor_d.host_view();
}
std::cout << (result.passed ? "Passed" : "Failed") << std::endl;
if (result.passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << "GFLOPs: " << result.gflops << std::endl;
}
return (result.passed ? 0 : -1);
}
int main(int argc, const char **argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d tensor op Matrix Multiply\n",
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_streamk_gemm
gfx928_streamk_gemm.cu
)
hytlass_example_add_executable(
gfx928_gemm_universal_streamk_broadcast
gfx928_gemm_universal_streamk_broadcast.cu
)
\ No newline at end of file