Commit d22dbec2 authored by zhoux

Initial commit: release hytlass-0.1.0

# HYGON HYTLASS Changelog
# What's New in HYTLASS 0.1.0
HYTLASS 0.1.0 is the first release of HYTLASS. Its implementation is based on CUTLASS 3.5.0 and provides:
- Compatibility with and support for the CUTLASS 2.x implementation:
  - Support for the instruction features of the BW (GFX936) and earlier architectures, including the TensorCore mmac and ds_read_matrix primitives.
  - GEMM implementations built on these architecture-specific instruction features.
  - Convolution implementations based on implicit GEMM, built on top of the GEMM templates.
- Compatibility with and support for CUTLASS 3.x and the CuTe programming model:
  - HuTe, an adaptation of the CuTe programming model from CUTLASS 3.x to the DCU architecture, currently supporting instruction primitives up to the BW platform.
  - GEMM implementations based on the HuTe model, covering both the MMA and the Epilogue stages.
  - HuTe-based scheduling of compute tasks, including kernel schedules and tiling schedules, with multiple threadblock scheduling optimization strategies.
- More than ten compute examples:
  - GEMM, convolution, and fused-operator implementations based on the 2.x API for multiple data types (TF32/FP16/BF16/I8/U8).
  - GEMM examples using optimization algorithms such as Split-K and Stream-K.
  - Examples of custom epilogues based on the visitor pattern.
  - HuTe-based GEMM examples, including BatchedGemm and GroupGemm.
  - A TensorCore-accelerated sparse GEMM example using the Block-ELL format.
- Tooling:
  - hytlass_profiler for kernel tuning over fine-grained problem parameters.
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
cmake_policy(SET CMP0112 NEW)
find_program(HIP_COMPILER_PATH hipcc)
if(HIP_COMPILER_PATH)
get_filename_component(DCU_TOOLKIT_ROOT_DIR "${HIP_COMPILER_PATH}" DIRECTORY)
get_filename_component(DCU_TOOLKIT_ROOT_DIR "${DCU_TOOLKIT_ROOT_DIR}/.." REALPATH)
message(STATUS "DCU_TOOLKIT_ROOT_DIR is set to ${DCU_TOOLKIT_ROOT_DIR}")
else()
message(FATAL_ERROR "hipcc not found in the environment path.")
endif()
# use hipcc as default compiler
set(CMAKE_CXX_COMPILER "${HIP_COMPILER_PATH}")
# for hipcomplex support
add_definitions(-DROCM_MATHLIBS_API_USE_HIP_COMPLEX=1)
if(hytlass_LOADED)
# If HYTLASS has been previously fetched and loaded, don't do it again.
return()
else()
set(hytlass_LOADED ON)
set(HYTLASS_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE PATH "HYTLASS Repository Directory")
endif()
message(STATUS "CMake Version: ${CMAKE_VERSION}")
set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++17 if set")
# To reduce duplicate version locations, parse the version out of the
# main versions.h file and reuse it here.
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/include/hytlass/version.h VERSION_FILE_CONTENTS)
string(REGEX MATCH "#define HYTLASS_MAJOR ([0-9]+)" _HYTLASS_VERSION_MAJOR "${VERSION_FILE_CONTENTS}")
set(_HYTLASS_VERSION_MAJOR ${CMAKE_MATCH_1})
string(REGEX MATCH "#define HYTLASS_MINOR ([0-9]+)" _HYTLASS_VERSION_MINOR "${VERSION_FILE_CONTENTS}")
set(_HYTLASS_VERSION_MINOR ${CMAKE_MATCH_1})
string(REGEX MATCH "#define HYTLASS_PATCH ([0-9]+)" _HYTLASS_VERSION_PATCH "${VERSION_FILE_CONTENTS}")
set(_HYTLASS_VERSION_PATCH ${CMAKE_MATCH_1})
message(STATUS "HYTLASS ${_HYTLASS_VERSION_MAJOR}.${_HYTLASS_VERSION_MINOR}.${_HYTLASS_VERSION_PATCH}")
## HYTLASS PROJECT #############################################################
project(HYTLASS VERSION ${_HYTLASS_VERSION_MAJOR}.${_HYTLASS_VERSION_MINOR}.${_HYTLASS_VERSION_PATCH} LANGUAGES CXX)
################################################################################
include(${CMAKE_CURRENT_SOURCE_DIR}/HIP.cmake)
# enable __shfl_sync for dtk24.x
find_file(AMD_WARP_SYNC_PATH amd_warp_sync_functions.h
PATHS ${DCU_TOOLKIT_ROOT_DIR}/hip/include/hip/amd_detail
)
if(AMD_WARP_SYNC_PATH)
message(STATUS "Enable HIP_ENABLE_WARP_SYNC_BUILTINS")
add_definitions(-DHIP_ENABLE_WARP_SYNC_BUILTINS)
endif()
if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$")
execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--version" OUTPUT_VARIABLE CXX_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_STRIP_TRAILING_WHITESPACE)
string(REGEX MATCH "[A-Za-z]* ?clang version" TMP_CXX_VERSION ${CXX_OUTPUT})
string(REGEX MATCH "[A-Za-z]+" CXX_VERSION_STRING ${TMP_CXX_VERSION})
endif()
# add compiler check
if( CXX_VERSION_STRING MATCHES "clang" )
message( STATUS "Use hip-clang to build for amdgpu backend" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_HCC_COMPAT_MODE__=1" )
elseif( CXX_VERSION_STRING MATCHES "hipcc" )
message(FATAL_ERROR "Don't support for hipcc")
else()
message(FATAL_ERROR "Unsupport compiler ${CMAKE_CXX_COMPILER}. Only support for hip-clang")
endif()
find_package(Doxygen QUIET)
################################################################################
#
# HYTLASS 3.x requires C++17
#
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." FORCE)
endif()
message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")
set(HYTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive
find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)
# Install hytlass_library Python package
execute_process(
WORKING_DIRECTORY ${HYTLASS_DIR}/python
COMMAND ${Python3_EXECUTABLE} ${HYTLASS_DIR}/python/setup_library.py develop --user
RESULT_VARIABLE hytlass_lib_GENERATOR_INSTALL_RESULT
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/hytlass_library_installation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/hytlass_library_installation.log
)
if(NOT hytlass_lib_GENERATOR_INSTALL_RESULT EQUAL 0)
message(FATAL_ERROR "Error installing hytlass_library package. See ${CMAKE_CURRENT_BINARY_DIR}/hytlass_library_installation.log")
endif()
################################################################################
set(HYTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
if(HYTLASS_ENABLE_HEADERS_ONLY)
set(HYTLASS_ENABLE_EXAMPLES_INIT OFF)
set(HYTLASS_ENABLE_TOOLS_INIT ON)
set(HYTLASS_ENABLE_LIBRARY_INIT OFF)
set(HYTLASS_ENABLE_TESTS_INIT OFF)
else()
set(HYTLASS_ENABLE_EXAMPLES_INIT ON)
set(HYTLASS_ENABLE_TOOLS_INIT ON)
set(HYTLASS_ENABLE_LIBRARY_INIT ON)
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
set(HYTLASS_ENABLE_TESTS_INIT ON)
else()
set(HYTLASS_ENABLE_TESTS_INIT OFF)
endif()
set(HYTLASS_ENABLE_HIPBLAS ON)
endif()
set(HYTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
set(HYTLASS_ENABLE_EXAMPLES ${HYTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable HYTLASS Examples")
set(HYTLASS_ENABLE_TOOLS ${HYTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable HYTLASS Tools")
set(HYTLASS_ENABLE_LIBRARY ${HYTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable HYTLASS Library")
set(HYTLASS_ENABLE_PROFILER ${HYTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable HYTLASS Profiler")
set(HYTLASS_ENABLE_PERFORMANCE ${HYTLASS_ENABLE_PROFILER} CACHE BOOL "Enable HYTLASS Performance")
set(HYTLASS_ENABLE_TESTS ${HYTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable HYTLASS Tests")
set(HYTLASS_ENABLE_GTEST_UNIT_TESTS ${HYTLASS_ENABLE_TESTS} CACHE BOOL "Enable HYTLASS GTest-based Unit Tests")
set(HYTLASS_USE_SYSTEM_GOOGLETEST OFF CACHE BOOL "Use system/external installation of GTest")
################################################################################
# Enable all arch for now unless some archs were specified
if (NOT DEFINED HYTLASS_HIPCC_ARCHS_SUPPORTED)
set(HYTLASS_HIPCC_ARCHS_SUPPORTED)
if(DEFINED ENV{AMDGPU_TARGETS})
set(AMDGPU_TARGETS_LIST "$ENV{AMDGPU_TARGETS}")
foreach(target ${AMDGPU_TARGETS_LIST})
string(REGEX REPLACE "gfx([0-9]+)" "\\1" number ${target})
list(APPEND HYTLASS_HIPCC_ARCHS_SUPPORTED "${number}")
endforeach()
else()
list(APPEND HYTLASS_HIPCC_ARCHS_SUPPORTED 906 926 908 928 936)
endif()
endif()
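# Example (illustrative): with AMDGPU_TARGETS="gfx928;gfx936" set in the environment,
# HYTLASS_HIPCC_ARCHS_SUPPORTED becomes "928;936"; if the variable is unset, all
# supported architectures (906 926 908 928 936) are enabled.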
set(HYTLASS_HIPCC_ARCHS ${HYTLASS_HIPCC_ARCHS_SUPPORTED} CACHE STRING "The Gfx architectures requested.")
set(HYTLASS_HIPCC_ARCHS_ENABLED ${HYTLASS_HIPCC_ARCHS} CACHE STRING "The Gfx architectures to build code for.")
# Find unsupported and deprecated compute capabilities
if (HYTLASS_HIPCC_ARCHS_SUPPORTED)
set(HYTLASS_HIPCC_ARCHS_UNSUPPORTED ${HYTLASS_HIPCC_ARCHS})
list(REMOVE_ITEM HYTLASS_HIPCC_ARCHS_UNSUPPORTED ${HYTLASS_HIPCC_ARCHS_SUPPORTED})
if (HYTLASS_HIPCC_ARCHS_UNSUPPORTED)
message(WARNING "Using unsupported or deprecated compute capabilities ${HYTLASS_HIPCC_ARCHS_UNSUPPORTED}. Support may be removed in future versions.")
endif()
else()
message(WARNING "No supported compute capabilities")
endif()
# Special policy introduced in CMake 3.13
if (POLICY CMP0076)
cmake_policy(SET CMP0076 NEW)
endif()
include(GNUInstallDirs)
###################################################################################################
#
# Configure CMake variables
#
###################################################################################################
message(STATUS "HIP Compilation Architectures: ${HYTLASS_HIPCC_ARCHS_ENABLED}")
if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
# By default we want to build in Release mode to ensure that we're getting best performance.
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose build level" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "RelWithDebInfo" "Release")
endif()
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if (DEFINED CMAKE_DEBUG_POSTFIX)
set(HYTLASS_LIBRARY_DEBUG_POSTFIX_INIT ${CMAKE_DEBUG_POSTFIX})
else()
set(HYTLASS_LIBRARY_DEBUG_POSTFIX_INIT .debug)
endif()
set(HYTLASS_LIBRARY_DEBUG_POSTFIX ${HYTLASS_LIBRARY_DEBUG_POSTFIX_INIT} CACHE STRING "Default postfix value for debug libraries")
if(WIN32)
# On Windows we link against the shared (DLL) runtime. Change gtest settings to match this.
set(gtest_force_shared_crt ON CACHE BOOL "Use shared (DLL) run-time lib even when Google Test is built as static lib" FORCE)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHYTLASS_VERSIONS_GENERATED")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -DHYTLASS_VERSIONS_GENERATED")
if (WIN32)
# Enable more warnings. Add "-Xcompiler=/WX" to enable warnings as errors.
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/W3)
# Disable warning on Unicode characters
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/wd4819)
# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/fp:strict)
endif(WIN32)
if (${HYTLASS_HIPCC_VERBOSE})
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -v)
endif()
#
# HYTLASS NAMESPACE
#
set(HYTLASS_NAMESPACE "hytlass" CACHE STRING "Top level namespace of HYTLASS")
set(HYTLASS_HIPCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by HIPCC.")
set(HYTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.")
################################################################################
#
# HYTLASS generator cmake configuration
#
# Kernel unified filter file
set(KERNEL_FILTER_FILE "" CACHE STRING "KERNEL FILTER FILE FULL PATH")
if (KERNEL_FILTER_FILE AND NOT HYTLASS_LIBRARY_KERNELS)
# If a kernel filter file is specified, we want to generate and then
# filter on the entire kernel set, not the default kernel
# (sub)set. The user may override HYTLASS_LIBRARY_KERNELS, in which
# case the resulting kernel set will be the intersection of the two
# options differenced against HYTLASS_LIBRARY_IGNORE_KERNELS.
set(HYTLASS_LIBRARY_KERNELS_INIT "*")
else()
set(HYTLASS_LIBRARY_KERNELS_INIT "")
endif()
if (KERNEL_FILTER_FILE)
get_filename_component(KERNEL_FILTER_FILE "${KERNEL_FILTER_FILE}" ABSOLUTE)
set(KERNEL_FILTER_FILE "${KERNEL_FILTER_FILE}" CACHE STRING "KERNEL FILTER FILE FULL PATH" FORCE)
endif()
set(SELECTED_KERNEL_LIST "selected" CACHE STRING "Name of the filtered kernel list")
if(KERNEL_FILTER_FILE)
message(STATUS "Full path of filter file: ${KERNEL_FILTER_FILE}")
endif()
set(HYTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. The default 'all' enables all operations.")
set(HYTLASS_LIBRARY_KERNELS ${HYTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(HYTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")
set(HYTLASS_PROBLEM_SIZE_PATH "" CACHE STRING "Path from which problem size data is loaded")
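# Illustrative example (kernel name patterns and paths are placeholders):
#   cmake .. -DHYTLASS_LIBRARY_KERNELS=sgemm -DHYTLASS_LIBRARY_IGNORE_KERNELS=simt -DKERNEL_FILTER_FILE=/path/to/filter.list
# builds the intersection of the kernels selected by HYTLASS_LIBRARY_KERNELS and by the
# filter file, minus any kernels matching HYTLASS_LIBRARY_IGNORE_KERNELS.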
################################################################################
set(HYTLASS_TEST_ENABLE_CACHED_RESULTS OFF CACHE BOOL "Enable caching and reuse of test results in unit tests")
set_property(CACHE HYTLASS_TEST_LEVEL PROPERTY STRINGS 0 1 2)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_TEST_LEVEL=${HYTLASS_TEST_LEVEL})
if (HYTLASS_TEST_ENABLE_CACHED_RESULTS)
message(STATUS "Enable caching of reference results in conv unit tests")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_TEST_ENABLE_CACHED_RESULTS=1)
endif()
set(HYTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED ON CACHE BOOL "Enable/Disable rigorous conv problem sizes in conv unit tests")
if (HYTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
message(STATUS "Enable rigorous conv problem sizes in conv unit tests")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1)
endif()
################################################################################
# Trace levels for debugging
set(HYTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -DHYTLASS_DEBUG_TRACE_LEVEL=${HYTLASS_DEBUG_TRACE_LEVEL})
#
# NOTE: running with asan and HIP requires the following environment variable:
#
# ASAN_OPTIONS=protect_shadow_gap=0:replace_intrin=0:detect_leaks=0
#
# without the above environment setting, an error like the following may be generated:
#
# *** Error: Could not detect active GPU device ID [out of memory]
# ...
# ==9149==ERROR: LeakSanitizer: detected memory leaks
# ...
#
if(ENABLE_ASAN) # https://github.com/google/sanitizers/wiki/AddressSanitizer
list(APPEND HYTLASS_HIP_HIPCC_FLAGS --compiler-options=-fsanitize=address --compiler-options=-fno-omit-frame-pointer)
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address")
endif()
# Enable double VGPRs for grid size 512
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -mllvm -enable-num-vgprs-512=true)
###################################################################################################
#
# Configure HIP build options
#
###################################################################################################
# Warnings-as-error exceptions and warning suppressions for Clang builds
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=implicit-int-conversion ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pass-failed ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=inconsistent-missing-override ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-conversion ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type ")
endif()
if (NOT MSVC AND HYTLASS_HIPCC_KEEP)
# MSVC flow handles caching already, but for other generators we handle it here.
set(HYTLASS_HIPCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store HIPCC scratch files")
file(MAKE_DIRECTORY ${HYTLASS_HIPCC_KEEP_DIR})
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -save-temps=${HYTLASS_HIPCC_KEEP_DIR} -v)
endif()
if (HYTLASS_ENABLE_F16C AND NOT CMAKE_CROSSCOMPILING)
list(APPEND HYTLASS_HIP_FLAGS -DHYTLASS_ENABLE_F16C=1)
if ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC"))
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=/arch:AVX2)
else()
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -mf16c)
endif()
endif()
if (HYTLASS_ENABLE_OPENMP_TESTS)
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS})
else()
message(WARNING "HYTLASS_ENABLE_OPENMP_TESTS set but OpenMP not found.")
endif()
endif()
if(UNIX)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Wconversion)
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -fno-strict-aliasing)
endif()
# Don't leak lineinfo in release builds
if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -lineinfo)
endif()
list(APPEND HYTLASS_HIP_HIPCC_FLAGS -Wno-sign-conversion -Wno-shorten-64-to-32 -Wno-implicit-float-conversion -Wno-implicit-int-conversion -Wno-return-type)
if(HYTLASS_HIP_HIPCC_FLAGS)
message(STATUS "Using hipcc flags: ${HYTLASS_HIP_HIPCC_FLAGS}")
endif()
# Support for 128-bit integers if using HYGON C++ compiler
# if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Mint128 ")
# endif()
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
# CMake 3.18 added support for HIP_ARCHITECTURES target property. We will use this
# property for CMake 3.18+, so we request the NEW behavior for correct compatibility.
# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104
cmake_policy(SET CMP0104 NEW)
endif()
if (MSVC)
# MSVC by default does not apply the correct __cplusplus version as specified by the C++ standard
# because MSVC is not a completely compliant implementation. This option forces MSVC to use the
# appropriate value given the requested --std option. This fixes a compilation mismatch
# between GCC/Clang and MSVC.
#
# error : a constexpr function cannot have a nonliteral return type "dim3"
#
# See https://developercommunity.visualstudio.com/t/msvc-incorrectly-defines-cplusplus/139261
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Xcompiler /Zc:__cplusplus")
endif()
# Some tests require this build option in order to link.
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Xcompiler /bigobj")
endif()
function(hytlass_apply_hip_gencode_flags TARGET)
set(options)
set(oneValueArgs)
set(multiValueArgs SM_ARCHS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (__SM_ARCHS)
set(ARCHS_ENABLED ${__SM_ARCHS})
else()
set(ARCHS_ENABLED ${HYTLASS_HIPCC_ARCHS_ENABLED})
endif()
set(HIPCC_FLAGS)
foreach(ARCH ${ARCHS_ENABLED})
list(APPEND HIPCC_FLAGS --offload-arch=gfx${ARCH})
endforeach()
if (NOT __SM_ARCHS)
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${HIPCC_FLAGS}>
)
else()
list(JOIN HIPCC_FLAGS " " STR_HIPCC_FLAGS)
if(${TARGET} MATCHES ".*\\.cu$")
set_source_files_properties(${TARGET} PROPERTIES COMPILE_FLAGS ${STR_HIPCC_FLAGS})
endif()
endif()
endfunction()
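# Example usage (illustrative; target, file, and arch values are placeholders):
#   hytlass_apply_hip_gencode_flags(my_gemm_lib)                   # target: uses HYTLASS_HIPCC_ARCHS_ENABLED
#   hytlass_apply_hip_gencode_flags(gemm_f16.cu SM_ARCHS 928 936)  # single .cu source: per-file --offload-arch=gfx928/gfx936 flags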
# Cache the flags so they are available when the function below is called anywhere globally.
set(__HYTLASS_HIP_FLAGS ${HYTLASS_HIP_FLAGS} CACHE INTERNAL "")
set(__HYTLASS_HIP_FLAGS_RELEASE ${HYTLASS_HIP_FLAGS_RELEASE} CACHE INTERNAL "")
set(__HYTLASS_HIP_FLAGS_RELWITHDEBINFO ${HYTLASS_HIP_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__HYTLASS_HIP_FLAGS_DEBUG ${HYTLASS_HIP_FLAGS_DEBUG} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS ${HYTLASS_HIP_HIPCC_FLAGS} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS_RELEASE ${HYTLASS_HIP_HIPCC_FLAGS_RELEASE} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS_RELWITHDEBINFO ${HYTLASS_HIP_HIPCC_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__HYTLASS_HIP_HIPCC_FLAGS_DEBUG ${HYTLASS_HIP_HIPCC_FLAGS_DEBUG} CACHE INTERNAL "")
function(hytlass_apply_standard_compile_options TARGET)
set(HIP_COMPILE_LANGUAGE CXX)
set(_FLAGS ${__HYTLASS_HIP_FLAGS} ${__HYTLASS_HIP_HIPCC_FLAGS})
set(_FLAGS_RELEASE ${__HYTLASS_HIP_FLAGS_RELEASE} ${__HYTLASS_HIP_HIPCC_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${__HYTLASS_HIP_FLAGS_RELWITHDEBINFO} ${__HYTLASS_HIP_HIPCC_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${__HYTLASS_HIP_FLAGS_DEBUG} ${__HYTLASS_HIP_HIPCC_FLAGS_DEBUG})
target_link_libraries(${TARGET} PRIVATE HYTLASS)
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:${_FLAGS}>
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:$<$<CONFIG:RELEASE>:${_FLAGS_RELEASE}>>
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:$<$<CONFIG:RELWITHDEBINFO>:${_FLAGS_RELWITHDEBINFO}>>
$<$<COMPILE_LANGUAGE:${HIP_COMPILE_LANGUAGE}>:$<$<CONFIG:DEBUG>:${_FLAGS_DEBUG}>>
)
endfunction()
#
# The following items should eventually be pushed into hytlass/CMakeLists.txt
#
# GLOB for HYTLASS header files. Should we use a static list instead?
file(GLOB_RECURSE HYTLASS_INCLUDE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} include/hytlass/*.h)
file(GLOB_RECURSE HYTLASS_HYTLASS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/include include/hytlass/*.h include/hytlass/*.hpp include/hytlass/*.inl)
file(GLOB_RECURSE HYTLASS_HUTE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/include include/hute/*.h*)
###################################################################################################
#
# Define build targets
#
###################################################################################################
source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR}/include REGULAR_EXPRESSION ".*\\.h")
add_library(HYTLASS INTERFACE)
add_library(hygon::hytlass::hytlass ALIAS HYTLASS)
set_target_properties(HYTLASS PROPERTIES EXPORT_NAME hytlass)
set(HYTLASS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE PATH "HYTLASS Header Library")
set(HYTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library CACHE INTERNAL "Location of generator scripts")
# The following utility directory is needed even if the tools build is disabled, so it exists here.
set(HYTLASS_TOOLS_UTIL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/util/include CACHE INTERNAL "")
include_directories(${HYTLASS_INCLUDE_DIR})
target_compile_features(HYTLASS INTERFACE cxx_std_11)
if (NOT HYTLASS_NAMESPACE STREQUAL "hytlass")
target_compile_definitions(HYTLASS INTERFACE HYTLASS_NAMESPACE=${HYTLASS_NAMESPACE})
endif()
if (NOT DEFINED HYTLASS_REVISION)
find_package(Git QUIET)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
RESULT_VARIABLE HYTLASS_REVISION_RESULT
OUTPUT_VARIABLE HYTLASS_REVISION
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if (HYTLASS_REVISION_RESULT)
message(STATUS "HYTLASS Revision: Unable to detect, Git returned code ${HYTLASS_REVISION_RESULT}.")
else()
message(STATUS "HYTLASS Revision: ${HYTLASS_REVISION}")
endif()
endif()
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_extended.h.in
${CMAKE_CURRENT_BINARY_DIR}/include/hytlass/version_extended.h
@ONLY)
target_include_directories(
HYTLASS
INTERFACE
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${HYTLASS_INCLUDE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
$<BUILD_INTERFACE:${hute_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${hute_SOURCE_DIR}/examples>
)
# Mark DTK headers as system headers to suppress warnings from them
target_include_directories(
HYTLASS
SYSTEM INTERFACE
$<BUILD_INTERFACE:${DCU_TOOLKIT_ROOT_DIR}/include>
)
install(
DIRECTORY
${HYTLASS_INCLUDE_DIR}/
${CMAKE_CURRENT_BINARY_DIR}/include/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
install(
TARGETS HYTLASS
EXPORT HygonHytlass
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
################################################################################
# Doxygen is available. Generate documentation
if (DOXYGEN_FOUND)
# DOT is available. Enable graph generation in the documentation
if (DOXYGEN_DOT_EXECUTABLE)
set(HYTLASS_ENABLE_DOXYGEN_DOT ON CACHE BOOL "Use dot to generate graphs in the doxygen documentation.")
else()
set(HYTLASS_ENABLE_DOXYGEN_DOT OFF CACHE BOOL "Use dot to generate graphs in the doxygen documentation." FORCE)
endif()
if (HYTLASS_ENABLE_DOXYGEN_DOT)
set(HAVE_DOT "YES")
else()
set(HAVE_DOT "NO")
endif()
# Add custom target for Doxygen.
add_custom_target(hytlass_docs ${CMAKE_COMMAND} -E env
"DOT_PATH=${DOXYGEN_DOT_EXECUTABLE}"
"HAVE_DOT=${HAVE_DOT}"
${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
VERBATIM
)
endif()
if(NOT WIN32)
# Add common library search paths so executables and libraries can load and run
# without LD_LIBRARY_PATH being set.
link_libraries(
"-Wl,-rpath,'$ORIGIN'"
"-Wl,-rpath,'$ORIGIN/../lib64'"
"-Wl,-rpath,'$ORIGIN/../lib'"
"-Wl,-rpath,'${DCU_TOOLKIT_ROOT_DIR}/lib64'"
"-Wl,-rpath,'${DCU_TOOLKIT_ROOT_DIR}/lib'"
)
endif()
################################################################################
include(CTest)
enable_testing()
if (HYTLASS_ENABLE_GTEST_UNIT_TESTS)
if (HYTLASS_USE_SYSTEM_GOOGLETEST)
find_package(GTest REQUIRED)
else()
# include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
add_subdirectory(googletest)
include_directories(googletest/googletest/include)
endif()
endif()
if (NOT TARGET test_all)
add_custom_target(test_all)
endif()
set(HYTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables")
set(HYTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE STRING "Environment in which to invoke unit test executables")
set(CMAKE_TEST_INSTALL_PREFIX test CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(HYTLASS_TEST_INSTALL_PREFIX ${CMAKE_TEST_INSTALL_PREFIX}/hytlass CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(HYTLASS_TEST_INSTALL_BINDIR ${HYTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(HYTLASS_TEST_INSTALL_LIBDIR ${HYTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_PREFIX})
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_BINDIR})
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_LIBDIR})
install(DIRECTORY DESTINATION ${HYTLASS_TEST_INSTALL_PREFIX}/ctest)
################################################################################
# use hipBlas
include(${CMAKE_CURRENT_SOURCE_DIR}/hipBLAS.cmake)
if (HYTLASS_ENABLE_HIPBLAS)
target_compile_definitions(HYTLASS INTERFACE HYTLASS_ENABLE_HIPBLAS=1)
endif()
################################################################################
set(HYTLASS_DEFAULT_ACTIVE_TEST_SETS "default" CACHE STRING "Default
activated test sets. In `make test` mode, this string determines the
active set of tests. In `ctest` mode, this value can be overridden
with the HYTLASS_TEST_SETS environment variable when running the ctest
executable.")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}")
set(HYTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.configure.cmake)
set(HYTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "")
function(hytlass_add_executable_tests NAME TARGET)
#
# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the
# <CMAKE_BINARY_DIR> or the <CMAKE_INSTALL_PREFIX>/<HYTLASS_TEST_INSTALL_PREFIX> after installation.
#
# NAME: The base name for the test. Can be run with `make <NAME>` or `ctest -R 'c<NAME>'`.
# TARGET: The target corresponding to the executable under test.
# DISABLE_EXECUTABLE_INSTALL_RULE: An option, if given, that disables creating an install rule for TARGET.
# DEPENDS: A list of targets or files on which this test is dependent.
# DEPENDEES: A list of targets which should depend on this test.
# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments
# to pass to the test executable. A unique test is generated for each set of
# options given. If this option is not used, a single test with no arguments is generated.
# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for
# generating the full variable name to be referenced.
# RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed
# test results to speed up test runtime.
# TEST_SETS_SUPPORTED: A list of test set names these tests support.
#
set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS TEST_SETS_SUPPORTED)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (NOT DEFINED __DISABLE_TESTS)
set(__DISABLE_TESTS OFF)
endif()
set(TEST_EXE $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR})
if (NOT DEFINED __TEST_SETS_SUPPORTED)
set(__TEST_SETS_SUPPORTED ${HYTLASS_DEFAULT_ACTIVE_TEST_SETS})
endif()
set(TEST_SETS_SUPPORTED ${__TEST_SETS_SUPPORTED})
if (__RESULT_CACHE_FILE)
add_custom_command(
TARGET ${TARGET}
POST_BUILD
COMMAND ${CMAKE_COMMAND}
ARGS -E copy ${__RESULT_CACHE_FILE} "$<TARGET_FILE_DIR:${TARGET}>"
)
endif()
if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND HYTLASS_INSTALL_TESTS)
# file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
install(
TARGETS ${TARGET}
RUNTIME DESTINATION ${HYTLASS_TEST_INSTALL_BINDIR}
)
if (__RESULT_CACHE_FILE)
install(
FILES ${__RESULT_CACHE_FILE}
DESTINATION ${HYTLASS_TEST_INSTALL_BINDIR}/
)
endif()
endif()
if (NOT __TEST_COMMAND_OPTIONS)
set(__TEST_COMMAND_OPTIONS " ")
endif()
list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT)
if (CMD_COUNT GREATER 1)
add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS})
foreach(DEPENDEE ${__DEPENDEES})
add_dependencies(${DEPENDEE} ${NAME})
endforeach()
endif()
if (HYTLASS_INSTALL_TESTS)
set(_INLINE_PER_TEST_CODE)
file(READ "${PROJECT_SOURCE_DIR}/cmake/CTestTestfile.test.configure.cmake" _INLINE_PER_TEST_CODE_TEMPLATE)
endif()
set(TEST_GROUP_NAME ${NAME})
foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS)
if (CMD_COUNT GREATER 1)
string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TEST_NAME)
else()
string(TOLOWER "${NAME}" TEST_NAME)
endif()
# The following rigmarole is needed to deal with spaces and possible quotes in
# command line arguments. The options are passed "by reference" as the actual
# variable names holding the real options. We then expand these in a way that
# preserves any quotes. Note, they have to be in this order for it to work for
# all the use cases below.
set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}})
list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(TEST_COMMAND_OPTIONS)
add_custom_target(
${TEST_NAME}
COMMAND
${HYTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
DEPENDS
${TARGET}
)
if (CMD_COUNT GREATER 1)
add_dependencies(${NAME} ${TEST_NAME})
endif()
foreach(DEPENDEE ${__DEPENDEES})
add_dependencies(${DEPENDEE} ${TEST_NAME})
endforeach()
set(TEST_NAME c${TEST_NAME})
string(CONFIGURE "${_INLINE_PER_TEST_CODE_TEMPLATE}" _TEST_CODE @ONLY)
string(APPEND _INLINE_PER_TEST_CODE "${_TEST_CODE}")
endforeach()
# To run the tests from an install package with tests enabled, we need to generate test files
# that don't rely on the current directory structure in build.
set(TEST_NAME c${NAME})
set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/ctest/${TEST_NAME})
file(MAKE_DIRECTORY ${TEST_GEN_DIR})
set(TEST_EXE_PATH $<TARGET_FILE:${TARGET}>)
set(TEST_USE_EXTENDED_FORMAT ON)
configure_file("${HYTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" @ONLY)
set(TEST_EXE_PATH $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_USE_EXTENDED_FORMAT OFF) # ctest does not support extended add_test format.
configure_file("${HYTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake.in" @ONLY)
# The following line imports the tests for immediate run via `make test`.
include(${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake)
set(HYTLASS_CTEST_GENERATED_FILES ${HYTLASS_CTEST_GENERATED_FILES};ctest/${TEST_NAME}/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")
if (HYTLASS_INSTALL_TESTS)
file(GENERATE
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake"
INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake.in"
)
install(
FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake"
DESTINATION ${HYTLASS_TEST_INSTALL_PREFIX}/ctest/${TEST_NAME}
RENAME CTestTestfile.${TEST_NAME}.cmake
)
endif()
endfunction()
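# Example usage (illustrative; the target, option variable, and arguments are placeholders):
#   set(MY_GEMM_TEST_OPTS --seed=2025)
#   hytlass_add_executable_tests(
#     test_examples_my_gemm my_gemm_example
#     DEPENDEES test_all
#     TEST_COMMAND_OPTIONS MY_GEMM_TEST_OPTS
#     )
# This registers `make test_examples_my_gemm` and a `ctest -R ctest_examples_my_gemm` entry,
# both of which run my_gemm_example with the arguments stored in MY_GEMM_TEST_OPTS.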
if (HYTLASS_ENABLE_TOOLS)
add_subdirectory(tools)
if (HYTLASS_ENABLE_PROFILER)
add_dependencies(test_all test_profiler)
endif()
endif()
if (HYTLASS_ENABLE_EXAMPLES)
add_subdirectory(examples)
add_dependencies(test_all test_examples)
endif()
if (HYTLASS_ENABLE_TESTS)
add_subdirectory(test)
if (HYTLASS_ENABLE_GTEST_UNIT_TESTS)
add_dependencies(test_all test_unit)
endif()
endif()
if (HYTLASS_INSTALL_TESTS)
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/ctest")
file(WRITE "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "# Generated File\n\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "cmake_policy(SET CMP0057 NEW) # Allow IN_LIST for if()\n\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "if (NOT DEFINED ENV{HYTLASS_TEST_SETS})\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" " set(ENV{HYTLASS_TEST_SETS} ${HYTLASS_DEFAULT_ACTIVE_TEST_SETS})\n")
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "endif()\n\n")
foreach(GENERATED_FILE ${HYTLASS_CTEST_GENERATED_FILES})
file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "include(${GENERATED_FILE})\n")
endforeach()
install(
FILES "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake"
DESTINATION "${HYTLASS_TEST_INSTALL_PREFIX}/"
)
endif()
################################################################################
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfigVersion.cmake
COMPATIBILITY AnyNewerVersion)
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/HygonHytlassConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfig.cmake
@ONLY
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/HygonHytlassConfigVersion.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/HygonHytlass/
)
install(
EXPORT HygonHytlass
NAMESPACE hygon::hytlass::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/HygonHytlass/
FILE HygonHytlassTargets.cmake
)
################################################################################
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/HygonHytlassPackageConfig.cmake)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
find_package(HIP REQUIRED CONFIG PATHS ${DCU_TOOLKIT_ROOT_DIR})
################# RT #########################
find_library(
GALAXY_HIP galaxyhip
${DCU_TOOLKIT_ROOT_DIR}/lib
NO_DEFAULT_PATH
)
if(NOT TARGET hip::galaxyhip AND GALAXY_HIP)
message(STATUS "Found galaxyhip: True")
add_library(galaxyhip SHARED IMPORTED GLOBAL)
add_library(hip::galaxyhip ALIAS galaxyhip)
set_property(
TARGET galaxyhip
PROPERTY IMPORTED_LOCATION
${GALAXY_HIP}
)
elseif(TARGET hip::galaxyhip)
message(STATUS "Found galaxyhip: True")
else()
message(STATUS "Found galaxyhip: True")
endif()
find_library(
HIPRTC_LIBRARY hiprtc
PATHS
${DCU_TOOLKIT_ROOT_DIR}/lib
NO_DEFAULT_PATH
)
if(NOT TARGET hiprtc AND HIPRTC_LIBRARY)
message(STATUS "Found hiprtc: True")
add_library(hiprtc SHARED IMPORTED GLOBAL)
add_library(hip::hiprtc ALIAS hiprtc)
set_property(
TARGET hiprtc
PROPERTY IMPORTED_LOCATION
${HIPRTC_LIBRARY}
)
elseif(TARGET hiprtc)
message(STATUS "Found hiprtc: True")
else()
message(STATUS "Found hiprtc: False")
endif()
include_directories(SYSTEM "${DCU_TOOLKIT_ROOT_DIR}/include")
# Treat *.cu sources as C++ so they are compiled by hip-clang
function(hytlass_correct_source_file_language_property)
foreach(File ${ARGN})
# add compile option -xhip while using clang++
if(File MATCHES ".*\\.cu$")
# set_source_files_properties(${File} PROPERTIES COMPILE_FLAGS "-x hip")
set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
endif()
endforeach()
endfunction()
set(HYTLASS_UNITY_BUILD_ENABLED_INIT OFF)
set(HYTLASS_UNITY_BUILD_ENABLED ${HYTLASS_UNITY_BUILD_ENABLED_INIT} CACHE BOOL "Enable combined source compilation")
set(HYTLASS_UNITY_BUILD_BATCH_SIZE_INIT 16)
set(HYTLASS_UNITY_BUILD_BATCH_SIZE ${HYTLASS_UNITY_BUILD_BATCH_SIZE_INIT} CACHE STRING "Batch size for unified source files")
# Unity build: merge batches of .cu sources into combined translation units
function(hytlass_unify_source_files TARGET_ARGS_VAR)
set(options)
set(oneValueArgs BATCH_SOURCES BATCH_SIZE)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (NOT DEFINED TARGET_ARGS_VAR)
message(FATAL_ERROR "TARGET_ARGS_VAR parameter is required")
endif()
if (__BATCH_SOURCES AND NOT DEFINED __BATCH_SIZE)
set(__BATCH_SIZE ${HYTLASS_UNITY_BUILD_BATCH_SIZE})
endif()
if (HYTLASS_UNITY_BUILD_ENABLED AND DEFINED __BATCH_SIZE AND __BATCH_SIZE GREATER 1)
set(HIP_FILE_ARGS)
set(TARGET_SOURCE_ARGS)
foreach(ARG ${__UNPARSED_ARGUMENTS})
if(${ARG} MATCHES ".*\\.cu$")
list(APPEND HIP_FILE_ARGS ${ARG})
else()
list(APPEND TARGET_SOURCE_ARGS ${ARG})
endif()
endforeach()
list(LENGTH HIP_FILE_ARGS NUM_HIP_FILE_ARGS)
while(NUM_HIP_FILE_ARGS GREATER 0)
list(SUBLIST HIP_FILE_ARGS 0 ${__BATCH_SIZE} HIP_FILE_BATCH)
string(SHA256 HIP_FILE_BATCH_HASH "${HIP_FILE_BATCH}")
string(SUBSTRING ${HIP_FILE_BATCH_HASH} 0 12 HIP_FILE_BATCH_HASH)
set(BATCH_FILE ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.unity.${HIP_FILE_BATCH_HASH}.cu)
message(STATUS "Generating ${BATCH_FILE}")
file(WRITE ${BATCH_FILE} "// Unity File - Auto Generated!\n")
foreach(HIP_FILE ${HIP_FILE_BATCH})
get_filename_component(HIP_FILE_ABS_PATH ${HIP_FILE} ABSOLUTE)
file(APPEND ${BATCH_FILE} "#include \"${HIP_FILE_ABS_PATH}\"\n")
endforeach()
list(APPEND TARGET_SOURCE_ARGS ${BATCH_FILE})
if (NUM_HIP_FILE_ARGS LESS_EQUAL __BATCH_SIZE)
break()
endif()
list(SUBLIST HIP_FILE_ARGS ${__BATCH_SIZE} -1 HIP_FILE_ARGS)
list(LENGTH HIP_FILE_ARGS NUM_HIP_FILE_ARGS)
endwhile()
else()
set(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
endif()
set(${TARGET_ARGS_VAR} ${TARGET_SOURCE_ARGS} PARENT_SCOPE)
endfunction()
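# Example (illustrative; file names are placeholders): with HYTLASS_UNITY_BUILD_ENABLED=ON,
#   hytlass_unify_source_files(MY_SOURCES BATCH_SOURCES ON BATCH_SIZE 2 a.cu b.cu c.cu host.cpp)
# sets MY_SOURCES to host.cpp plus generated <name>.unity.<hash>.cu files, each of which
# #includes up to two of the original .cu sources. hytlass_add_library and
# hytlass_add_executable below route their sources through this function.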
# unify -> set property -> add library
function(hytlass_add_library NAME)
set(options SKIP_GENCODE_FLAGS)
set(oneValueArgs EXPORT_NAME)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hytlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
hytlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
hytlass_apply_standard_compile_options(${NAME})
if (NOT __SKIP_GENCODE_FLAGS)
hytlass_apply_hip_gencode_flags(${NAME})
endif()
target_compile_features(
${NAME}
INTERFACE
cxx_std_11
)
if(__EXPORT_NAME)
add_library(hygon::hytlass::${__EXPORT_NAME} ALIAS ${NAME})
set_target_properties(${NAME} PROPERTIES EXPORT_NAME ${__EXPORT_NAME})
endif()
endfunction()
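# Example usage (illustrative; target and source names are placeholders):
#   hytlass_add_library(my_gemm_lib EXPORT_NAME my_gemm kernels/gemm_f16.cu kernels/gemm_f32.cu)
# creates the target my_gemm_lib with the standard HYTLASS compile options and
# --offload-arch flags, plus the alias hygon::hytlass::my_gemm.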
function(hytlass_add_executable NAME)
set(options)
set(oneValueArgs)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hytlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
hytlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
add_executable(${NAME} ${TARGET_SOURCE_ARGS})
hytlass_apply_standard_compile_options(${NAME})
hytlass_apply_hip_gencode_flags(${NAME})
target_compile_features(
${NAME}
INTERFACE
cxx_std_11
)
endfunction()
function(hytlass_target_sources NAME)
set(options)
set(oneValueArgs)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hytlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
hytlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
target_sources(${NAME} ${TARGET_SOURCE_ARGS})
endfunction()
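# Example usage (illustrative; target and source names are placeholders):
#   hytlass_add_executable(my_gemm_example examples/my_gemm.cu)
#   hytlass_target_sources(my_gemm_example PRIVATE examples/my_gemm_extra.cu)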
Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------------
The following copyright statements and licenses apply to various open source software
packages (or portions thereof) that are distributed with HYTLASS. The HYTLASS distribution that
includes this file does not necessarily use all the open source software packages referred
to below and may use only portions of a given package. Some open source software
packages referred to below may have been modified by Hygon Information Technology Co., Ltd.
-------------------------------------------------------------------------
cutlass
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------------
googletest
Copyright 2008, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# HYTLASS 0.1.0
_HYTLASS 0.1.0 - December 2025_
HYTLASS (HYGON DCU Templates for Linear Algebra Subroutines) is a C++ template library for implementing high-performance matrix multiplication (GEMM) and its derived computations on the HYGON DCU architecture. Its design follows CUTLASS: the moving parts, such as data movement and the hierarchical structure, are decomposed into reusable, modular software components
abstracted as C++ template classes.
HYTLASS is compatible with both the C++ template-based GEMM and convolution implementations of CUTLASS 2.x and the CuTe programming model introduced in CUTLASS 3.x.
See the [Quick Start Guide](media/docs/quickstart.md) to get started with HYTLASS.
# What's New in HYTLASS 0.1.0
HYTLASS 0.1.0 is the first release of HYTLASS. Its implementation is based on CUTLASS 3.5.0 and provides:
- Compatibility with and support for the CUTLASS 2.x implementation:
  - Support for the instruction features of the BW (GFX936) and earlier architectures, including the TensorCore mmac and ds_read_matrix primitives.
  - GEMM implementations built on these architecture-specific instruction features.
  - Convolution implementations based on implicit GEMM, built on top of the GEMM templates.
- Compatibility with and support for CUTLASS 3.x and the CuTe programming model:
  - HuTe, an adaptation of the CuTe programming model from CUTLASS 3.x to the DCU architecture, currently supporting instruction primitives up to the BW platform.
  - GEMM implementations based on the HuTe model, covering both the MMA and the Epilogue stages.
  - HuTe-based scheduling of compute tasks, including kernel schedules and tiling schedules, with multiple threadblock scheduling optimization strategies.
- More than ten compute examples (see [examples](examples)):
  - GEMM, convolution, and fused-operator implementations based on the 2.x API for multiple data types (TF32/FP16/BF16/I8/U8).
  - GEMM examples using optimization algorithms such as Split-K and Stream-K.
  - Examples of custom epilogues based on the visitor pattern.
  - HuTe-based GEMM examples, including BatchedGemm and GroupGemm.
  - A TensorCore-accelerated sparse GEMM example using the Block-ELL format.
- Tooling:
  - hytlass_profiler for kernel tuning over fine-grained problem parameters.
# Building HYTLASS
When used from another project, HYTLASS is a header-only library and does not need to be built separately; add the `include/` directory to your header search path.
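A project that has installed HYTLASS can also consume it through the exported CMake package. The sketch below is illustrative: `my_app` and `gemm_example.cpp` are placeholder names, while `HygonHytlass` and the `hygon::hytlass::hytlass` target come from the package configuration installed by this repository.
```cmake
# Minimal consumer sketch (illustrative; my_app and gemm_example.cpp are placeholders).
cmake_minimum_required(VERSION 3.19)
project(my_app LANGUAGES CXX)

# Device code must be compiled with hip-clang, e.g. configure with -DCMAKE_CXX_COMPILER=hipcc.
set(CMAKE_CXX_STANDARD 17)            # HYTLASS 3.x-style code requires C++17
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Locate an installed HYTLASS package; it exports the hygon::hytlass::hytlass interface target.
find_package(HygonHytlass REQUIRED)

add_executable(my_app gemm_example.cpp)
target_link_libraries(my_app PRIVATE hygon::hytlass::hytlass)

# Without an installed package, adding the headers directly also works:
# target_include_directories(my_app PRIVATE /path/to/hytlass/include)
```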
HYTLASS's unit tests, examples, and tools are built with CMake; the minimum required version is 3.19.
Create a build directory in the HYTLASS project root and run cmake in it. The CMake option `HYTLASS_HIPCC_ARCHS` selects the architectures to compile for.
```bash
$ mkdir build && cd build
$ cmake .. -DHYTLASS_HIPCC_ARCHS=936 # compiles for DCU BW Architecture
```
You can build and run HYTLASS's unit tests by building the `test_unit` target with make in the build directory. Use the `-j` option to run make in parallel.
```bash
$ make test_unit -j
...
...
...
[----------] Global test environment tear-down
[==========] 946 tests from 57 test cases ran. (10812 ms total)
[ PASSED ] 946 tests.
```
All tests should pass on supported hardware architectures, although the number of unit tests may vary across architectures.
You can also run all of the examples, or a single example, by building the `test_examples*` targets with make in the build directory.
```bash
$ make test_examples -j
...
...
...
[100%] Built target test_examples_xxx
[100%] Built target test_examples
```
Alternatively, after the build completes, run an individual example executable from the `build/examples` directory.
```bash
$ cd build/examples && ./00_hytlass_basic_gemm/gfx928_gemm_tensor_op
```
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Generated file
set(TEST_SETS_SUPPORTED @TEST_SETS_SUPPORTED@)
if (NOT DEFINED ENV{HYTLASS_TEST_SETS})
set(ENV{HYTLASS_TEST_SETS} @HYTLASS_DEFAULT_ACTIVE_TEST_SETS@)
endif()
foreach(TEST_SET_REQUESTED IN ITEMS $ENV{HYTLASS_TEST_SETS})
if (NOT TEST_SET_REQUESTED IN_LIST TEST_SETS_SUPPORTED)
message(STATUS "Skipping tests for @TEST_EXE_PATH@ as ${TEST_SET_REQUESTED} is not in the set of [${TEST_SETS_SUPPORTED}].")
return()
endif()
endforeach()
set(TEST_EXE_PATH @TEST_EXE_PATH@)
set(TEST_EXE_WORKING_DIRECTORY @TEST_EXE_WORKING_DIRECTORY@)
set(HYTLASS_USE_EXTENDED_ADD_TEST_FORMAT @TEST_USE_EXTENDED_FORMAT@)
if (DEFINED ENV{HYTLASS_TEST_EXECUTION_ENVIRONMENT})
set(_HYTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{HYTLASS_TEST_EXECUTION_ENVIRONMENT})
else()
set(_HYTLASS_TEST_EXECUTION_ENVIRONMENT @HYTLASS_TEST_EXECUTION_ENVIRONMENT@)
endif()
@_INLINE_PER_TEST_CODE@
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if (HYTLASS_USE_EXTENDED_ADD_TEST_FORMAT)
# The longform/extended format allows generator expressions to be
# expanded properly and is useful in contexts where the files need
# to be included directly into CMake code that is being processed.
add_test(NAME @TEST_NAME@ COMMAND ${_HYTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@)
else()
add_test(@TEST_NAME@ ${_HYTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@)
endif()
if (TEST_EXE_WORKING_DIRECTORY)
set_tests_properties(@TEST_NAME@ PROPERTIES WORKING_DIRECTORY "${TEST_EXE_WORKING_DIRECTORY}")
endif()
set_tests_properties(@TEST_NAME@ PROPERTIES DISABLED @__DISABLE_TESTS@)
get_filename_component(HygonHytlass_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
include(CMakeFindDependencyMacro)
if(TARGET hygon::hytlass::HYTLASS)
return()
endif()
include("${HygonHytlass_CMAKE_DIR}/HygonHytlassTargets.cmake")
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
set(CPACK_PACKAGE_NAME HygonHytlass)
set(CPACK_PACKAGE_VENDOR HYGON)
set(CPACK_PACKAGE_CONTACT info@hygon.com)
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HYTLASS HIP C++ Template Linear Algebra Library for HYGON DCU")
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${CPACK_PACKAGE_NAME})
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
set(CPACK_VERBATIM_VARIABLES YES)
# set(CPACK_PACKAGE_DESCRIPTION_FILE ${CMAKE_CURRENT_LIST_DIR}/Description.txt)
# set(CPACK_RESOURCE_FILE_WELCOME ${CMAKE_CURRENT_LIST_DIR}/Welcome.txt)
# set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_LIST_DIR}/License.txt)
# set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_LIST_DIR}/Readme.txt)
include(CPack)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(FetchContent)
set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against")
if(GOOGLETEST_DIR)
set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override")
endif()
FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
)
FetchContent_GetProperties(googletest)
if(NOT googletest_POPULATED)
FetchContent_Populate(googletest)
if (MSVC)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
endif()
add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Basic HIP file for testing compiler flags.
*/
__device__ int inner()
{
return -1;
}
__global__ void test()
{
inner();
}
int main()
{
test<<<1,1>>>();
return 0;
}
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#define HYTLASS_BUILD @HYTLASS_VERSION_BUILD@
#define HYTLASS_REVISION "@HYTLASS_REVISION@"
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_gemm_tensor_op
gfx928_gemm_tensor_op.cu
)
hytlass_example_add_executable(
gfx928_gemm_tensor_op_mixed
gfx928_gemm_tensor_op_mixed.cu
)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run matrix multiplication kernels on tensor cores using functions and
data structures provided by HYTLASS.
Writing a single high-performance matrix multiplication kernel is hard but doable, whereas writing
high-performance kernels that scale across many problem sizes behind good abstractions is much
harder. HYTLASS solves this problem by providing simplified abstractions for composing the sections
of a GEMM kernel. When used properly, the kernels can approach the peak performance of the GPU.
HYTLASS divides a kernel into hierarchical, composable sections: at the thread, warp and
thread-block level, each computes its own tile size, with higher-level tile sizes composed from
lower-level ones. Multiple thread-tiles (the tile size each thread computes) form a warp-tile (the
tile size each warp computes), and multiple warp-tiles form a threadblock-tile (the tile size
computed by a thread block).
In this example, we split variable initialization into two parts:
1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set matrices will be used to compute
output of matrix multiplication.
First, we set up the data types of matrices A, B, C and D along with alpha and beta, since the GEMM
equation is D = alpha * A * B + beta * C. In HYTLASS, the kernels first compute A * B and leave the
rest of the computation to the end of the kernel, because alpha * X + beta * C is a simple
element-wise operation on X (A * B) and C. We call this the epilogue of the kernel. Hence, we set
the data type of alpha and beta to ElementComputeEpilogue = float. This example uses the MMA
instructions of the GFX928 tensor cores on bfloat16 inputs, so the elements of input matrices A and
B are bfloat16_t. Partial dot products are accumulated in float, which can represent a wider range
of numbers, and the output matrix is stored as bfloat16_t. We convey this to the HYTLASS kernel by
initializing the template parameters ElementAccumulator (float), ElementComputeEpilogue (float),
ElementInputA (bfloat16_t), ElementInputB (bfloat16_t) and ElementOutput (bfloat16_t).
Communicating just the data types is not enough: since the data is laid out linearly in memory, we
also have to convey the layout of the matrices. We do that by setting LayoutInputA and LayoutInputB
to column major and LayoutOutput to row major. Next, we set up the rules to compute
alpha * X + beta * C, the epilogue of the kernel. We initialize the template parameter EpilogueOp,
which takes the data type of the output ElementOutput (bfloat16_t), the number of elements per
vectorized memory access (kAlignmentC), the data type of the accumulator (float) and the data type
used to compute the linear combination (alpha * X + beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the
computation.
Second, we set the tile sizes for the thread block, warp and mma-op to 128x128x64, 64x64x32 and
16x16x16 (MxNxK) respectively. When these are passed to instantiate the HYTLASS GEMM kernel, it
internally deduces the number of threads needed per thread block, the amount of shared memory, how
to store data in a bank-conflict-free manner, and many other parameters required to compose,
initialize and launch a high-performance GEMM kernel. This is the beauty of HYTLASS: it relieves
the developer from understanding and coding complicated hardware optimizations, which can easily go
wrong.
HYTLASS also supports multiple MMA pipelines in a threadblock. An MMA pipeline is the whole process
of loading input data from global memory to shared memory, loading data from shared memory to
registers, doing the matrix multiplication, and storing the result to global memory. The flow below
shows a typical mma pipeline.
matrix in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
output to global memory
The problem with a single pipeline is that each stage is synchronous: each stage has to wait until
the previous one finishes executing. Some stages do not have a fixed latency, for example the loads
from global memory and shared memory. Therefore, we can add one more pipeline with a phase shift in
the mma kernel to hide the latency of the global and shared memory loads.
Finally, the pipeline in a kernel looks like
(1) matrix in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
mma -> (6) registers -> (7) output to global memory (1) <null> -> (2) <null> -> (3) matrix in global
memory -> (4) registers -> (5) tile in shared memory -> (6) registers -> (7) mma -> (8) registers ->
(9) output to global memory
This way, you can hide the latency of the second global memory load by doing computation on already
loaded input data.
A few more template parameters are initialized, such as the swizzle that decides which threadblock
computes which tile of the output matrix on an SM, and the GFX architecture of the GPU you want to
run on.
These are all put together to create a template type which describes the HYTLASS GEMM kernel, using
the hytlass::gemm::device::GemmUniversal template.
The next step is to initialize physical data, then instantiate and initialize the HYTLASS kernel and
run it. We use HYTLASS utilities to initialize, fill and compare matrices, as they are simple and do
not get in the way of learning HYTLASS.
Once all the matrices are initialized and filled with data, we create an arguments tuple to launch
the HYTLASS kernel, which takes the problem size (M = 5120, N = 4096 and K = 4096 by default), the
matrices, and alpha and beta. Along with that, we query HYTLASS for any scratch-space memory
required by the kernel we instantiated. If there is any, we allocate it and pass it along with the
other arguments to initialize the HYTLASS kernel, and then the kernel is launched.
In this example, we later launch a reference GEMM kernel (from the HYTLASS utilities) to check
whether the output of the HYTLASS kernel matches the reference GEMM kernel.
*/
#include <iostream>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
#include "hytlass/util/GPU_Clock.hpp"
#include "hytlass/gemm/device/gemm_universal.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
hytlass::gemm::GemmCoord problem_size;
int batch_count;
float alpha;
float beta;
bool reference_check;
int iterations;
Options():
help(false),
problem_size({5120, 4096, 4096}),
batch_count(1),
reference_check(true),
iterations(20),
alpha(1),
beta()
{}
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
hytlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "00_hytlass_basic_gemm example\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/00_hytlass_basic_gemm/gfx928_gemm_tensor_op --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 \n\n";
return out;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::bfloat16_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::bfloat16_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::bfloat16_t; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Column Major for Matrix B and Row Major for Matrix C
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::ColumnMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock = hytlass::gemm::GemmShape<128, 128, 64>;
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 64, 32>;
// This code section describes the size of MMA op
// Note the kAlignmentA/kAlignmentB values below: the maximum alignment length is 8 elements
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 16>;
// Alignment settings
// In OpMultiplyAddFastF16 mode 256 can be considered; MultiplyAdd uses 128
constexpr int kAlignmentA = 128 / hytlass::sizeof_bits<ElementInputA>::value;
constexpr int kAlignmentB = 128 / hytlass::sizeof_bits<ElementInputB>::value;
constexpr int kAlignmentC = 128 / hytlass::sizeof_bits<ElementOutput>::value;
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>; // block swizzle
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
kAlignmentC, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
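// Illustrative helper (added for exposition, not part of HYTLASS): the epilogue configured above
// applies, per output element, the linear combination below in ElementComputeEpilogue (float)
// before converting the result to ElementOutput (bfloat16_t).
constexpr float linear_combination_example(float alpha, float accumulator, float beta, float c) {
return alpha * accumulator + beta * c; // D = alpha * (A * B) + beta * C
}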
// Set NumStages to 1 for the single-stage path (supports warpShape::kK == InstructionShape::kK, low LDS overhead)
// Set it to 2 for the pipelined path
constexpr int NumStages = 1;
using Gemm = hytlass::gemm::device::GemmUniversal<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
kAlignmentA,
kAlignmentB>;
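// Illustrative sanity check (added for exposition, assuming hytlass::sizeof_bits reports 16 bits
// for bfloat16_t, as in CUTLASS): 128-bit vectorized accesses over 16-bit elements yield an
// alignment of 128 / 16 = 8 elements for A, B and C.
static_assert(kAlignmentA == 8 && kAlignmentB == 8 && kAlignmentC == 8,
"128-bit accesses over 16-bit elements give an 8-element alignment");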
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(2),
ElementInputA(-2),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
2,
ElementInputB(2),
ElementInputB(-2),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix B on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
hytlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
// Split the K dimension into 1 partition
int split_k_slices = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments {
hytlass::gemm::GemmUniversalMode::kGemm, // <- GemmUniversalMode
problem_size, // <- problem size of matrix multiplication
1, // <- batch count
{alpha, beta}, // <- tuple of alpha and beta
tensor_a.device_data(), // <- reference to matrix A on device
tensor_b.device_data(), // <- reference to matrix B on device
tensor_c.device_data(), // <- reference to matrix C on device
tensor_d.device_data(), // <- reference to matrix D on device
1, 1, 1, 1, // <- batch stride
tensor_a.stride(0), // <- Stride of matrix A
tensor_b.stride(0), // <- Stride of matrix B
tensor_c.stride(0), // <- Stride of matrix C
tensor_d.stride(0), // <- Stride of matrix D
nullptr, nullptr, nullptr, // <- gather a,b,d indices
};
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
tensor_d.sync_host();
// Create instantiation for device reference gemm kernel
hytlass::reference::device::Gemm<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
ElementAccumulator> gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_ref_d.sync_host();
ElementOutput eps(0.05);
const ElementOutput non_zero_floor(1e-6f);
bool passed = hytlass::reference::host::TensorRelativelyEquals(tensor_ref_d.host_view(),
tensor_d.host_view(), eps, non_zero_floor);
if (passed) {
printf("passed\n");
}
else {
printf("failed\n");
}
GPU_Clock timer;
int iterations_cnt = options.iterations;
double gflops = (2.0 * problem_size.m() * problem_size.n() * problem_size.k()) * 1e-9;
for (int i=0; i<10; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
timer.start();
for (int i=0; i<iterations_cnt; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
double hytlass_time = timer.seconds() / iterations_cnt;
printf("hytlass gemm: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / hytlass_time, hytlass_time * 1000);
return 0;
}
int main(int argc, const char **argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d tensor op Matrix Multiply\n", \
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run matrix multiplication kernels on tensor cores using functions and
data structures provided by HYTLASS.
Writing a single high-performance matrix multiplication kernel is hard but doable, whereas writing
high-performance kernels that scale across many problem sizes behind good abstractions is much
harder. HYTLASS solves this problem by providing simplified abstractions for composing the sections
of a GEMM kernel. When used properly, the kernels can approach the peak performance of the GPU.
HYTLASS divides a kernel into hierarchical, composable sections: at the thread, warp and
thread-block level, each computes its own tile size, with higher-level tile sizes composed from
lower-level ones. Multiple thread-tiles (the tile size each thread computes) form a warp-tile (the
tile size each warp computes), and multiple warp-tiles form a threadblock-tile (the tile size
computed by a thread block).
In this example, we split variable initialization into two parts:
1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set matrices will be used to compute
output of matrix multiplication.
First, we set up the data types of matrices A, B, C and D along with alpha and beta, since the GEMM
equation is D = alpha * A * B + beta * C. In HYTLASS, the kernels first compute A * B and leave the
rest of the computation to the end of the kernel, because alpha * X + beta * C is a simple
element-wise operation on X (A * B) and C. We call this the epilogue of the kernel. Hence, we set
the data type of alpha and beta to ElementComputeEpilogue = float. This example uses the MMA
instructions of the GFX928 tensor cores on tfloat32 inputs, so the elements of input matrices A and
B are tfloat32_t. Partial dot products are accumulated in float, which can represent a wider range
of numbers, while the output matrix is stored as half_t. We convey this to the HYTLASS kernel by
initializing the template parameters ElementAccumulator (float), ElementComputeEpilogue (float),
ElementInputA (tfloat32_t), ElementInputB (tfloat32_t) and ElementOutput (half_t). Communicating
just the data types is not enough: since the data is laid out linearly in memory, we also have to
convey the layout of the matrices. We do that by setting LayoutInputA to column major, LayoutInputB
to row major and LayoutOutput to row major. Next, we set up the rules to compute
alpha * X + beta * C, the epilogue of the kernel. We initialize the template parameter EpilogueOp,
which takes the data type of the output ElementOutput (half_t), the number of elements per
vectorized memory access (kAlignmentC), the data type of the accumulator (float) and the data type
used to compute the linear combination (alpha * X + beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the
computation.
Second, we set the tile sizes for the thread block, warp and mma-op to 128x128x16, 64x128x16 and
16x16x8 (MxNxK) respectively. When these are passed to instantiate the HYTLASS GEMM kernel, it
internally deduces the number of threads needed per thread block, the amount of shared memory, how
to store data in a bank-conflict-free manner, and many other parameters required to compose,
initialize and launch a high-performance GEMM kernel. This is the beauty of HYTLASS: it relieves
the developer from understanding and coding complicated hardware optimizations, which can easily go
wrong.
HYTLASS also supports multiple MMA pipelines in a threadblock. An MMA pipeline is the whole process
of loading input data from global memory to shared memory, loading data from shared memory to
registers, doing the matrix multiplication, and storing the result to global memory. The flow below
shows a typical mma pipeline.
matrix in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
output to global memory
The problem with a single pipeline is that each stage is synchronous: each stage has to wait until
the previous one finishes executing. Some stages do not have a fixed latency, for example the loads
from global memory and shared memory. Therefore, we can add one more pipeline with a phase shift in
the mma kernel to hide the latency of the global and shared memory loads.
Finally, the pipeline in a kernel looks like
(1) matrix in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
mma -> (6) registers -> (7) output to global memory (1) <null> -> (2) <null> -> (3) matrix in global
memory -> (4) registers -> (5) tile in shared memory -> (6) registers -> (7) mma -> (8) registers ->
(9) output to global memory
This way, you can hide the latency of the second global memory load by doing computation on already
loaded input data.
A few more template parameters are initialized, such as the swizzle that decides which threadblock
computes which tile of the output matrix on an SM, and the GFX architecture of the GPU you want to
run on.
These are all put together to create a template type which describes the HYTLASS GEMM kernel, using
the hytlass::gemm::device::Gemm template.
The next step is to initialize physical data, then instantiate and initialize the HYTLASS kernel and
run it. We use HYTLASS utilities to initialize, fill and compare matrices, as they are simple and do
not get in the way of learning HYTLASS.
Once all the matrices are initialized and filled with data, we create an arguments tuple to launch
the HYTLASS kernel, which takes the problem size (M = N = K = 1024 in this example), the matrices,
alpha, beta and, importantly, the split k-dimension factor. Along with that, we query HYTLASS for
any scratch-space memory required by the kernel we instantiated. If there is any, we allocate it and
pass it along with the other arguments to initialize the HYTLASS kernel, and then the kernel is
launched.
In this example, we then run a reference GEMM through hipBLAS and check whether the output of the
HYTLASS kernel matches it.
*/
#include <iostream>
#include <vector>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
#include "hytlass/util/GPU_Clock.hpp"
#include "hipblas.h"
#include "hytlass/blas3.h"
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::tfloat32_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::tfloat32_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::half_t; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Row Major for Matrix B and Row Major for Matrix C
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::RowMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock = hytlass::gemm::GemmShape<128, 128, 16>;
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 128, 16>;
// This code section describes the size of MMA op
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 8>;
constexpr int kAlignmentA = 128 / hytlass::sizeof_bits<ElementInputA>::value;
constexpr int kAlignmentB = 128 / hytlass::sizeof_bits<ElementInputB>::value;
constexpr int kAlignmentC = 128 / hytlass::sizeof_bits<ElementOutput>::value;
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- default identity threadblock swizzle
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
kAlignmentC, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
// Number of pipelines you want to use
constexpr int NumStages = 2;
using Gemm = hytlass::gemm::device::Gemm<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages, kAlignmentA, kAlignmentB>;
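// Illustrative sanity check (added for exposition, assuming hytlass::sizeof_bits mirrors CUTLASS:
// 32 bits for tfloat32_t and 16 bits for half_t): the 128-bit vector width gives an alignment of
// 128 / 32 = 4 elements for the tf32 inputs and 128 / 16 = 8 elements for the half-precision output.
static_assert(kAlignmentA == 4 && kAlignmentB == 4 && kAlignmentC == 8,
"128-bit accesses: 4 tf32 elements for A/B, 8 half elements for C/D");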
void reference_hipblas(int m, int n, int k,
const ElementInputA* a, const ElementInputB* b, float* c) {
hipblasHandle_t handle;
hipblasCreate(&handle);
ElementComputeEpilogue one = ElementComputeEpilogue(1);
ElementComputeEpilogue zero = ElementComputeEpilogue(0);
float *_a;
float *_b;
(void)hipMalloc((void **)(&_a), sizeof(float) * m * k);
(void)hipMalloc((void **)(&_b), sizeof(float) * k * n);
// a and b are device pointers, so stage them on the host, convert element-wise to float,
// then upload the converted buffers for hipblasGemmEx.
std::vector<ElementInputA> host_a(m * k);
std::vector<ElementInputB> host_b(k * n);
(void)hipMemcpy(host_a.data(), a, sizeof(ElementInputA) * m * k, hipMemcpyDeviceToHost);
(void)hipMemcpy(host_b.data(), b, sizeof(ElementInputB) * k * n, hipMemcpyDeviceToHost);
std::vector<float> host_a_f(host_a.begin(), host_a.end());
std::vector<float> host_b_f(host_b.begin(), host_b.end());
(void)hipMemcpy(_a, host_a_f.data(), sizeof(float) * m * k, hipMemcpyHostToDevice);
(void)hipMemcpy(_b, host_b_f.data(), sizeof(float) * k * n, hipMemcpyHostToDevice);
hipblasOperation_t blas_trans_a = std::is_same<LayoutInputA,
hytlass::layout::ColumnMajor>::value ?
HIPBLAS_OP_N :
HIPBLAS_OP_T;
hipblasOperation_t blas_trans_b = std::is_same<LayoutInputB,
hytlass::layout::ColumnMajor>::value ?
HIPBLAS_OP_N :
HIPBLAS_OP_T;
int lda = std::is_same<LayoutInputA,
hytlass::layout::ColumnMajor>::value ? m: k;
int ldb = std::is_same<LayoutInputB,
hytlass::layout::ColumnMajor>::value ? k: n;
auto err_ = hipblasGemmEx(handle, blas_trans_a, blas_trans_b,
m, n, k,
&one,
_a, HIPBLAS_R_32F, lda,
_b, HIPBLAS_R_32F, ldb,
&zero,
c, HIPBLAS_R_32F, m,
HIPBLAS_R_32F,
HIPBLAS_GEMM_DEFAULT);
(void)hipDeviceSynchronize();
if (err_ != 0) {
printf("error code is %d\n", err_);
exit(-1);
}
}
int run(int length_m_, int length_n_, int length_k_) {
const int length_m = length_m_;
const int length_n = length_n_;
const int length_k = length_k_;
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size(length_m, length_n, length_k);
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<float, hytlass::layout::ColumnMajor> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(8),
ElementInputA(-8),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
2,
ElementInputB(8),
ElementInputB(-8),
hytlass::MantissaInBits<ElementOutput>::bits); // <- Fill matrix B on host with uniform-distribution random data
hytlass::reference::host::TensorFill(
tensor_c.host_view());
hytlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split the K dimension into 1 partition
int split_k_slices = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments {
problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
tensor_d.sync_host();
// printf("result is :\n");
// for(int i=0;i<length_m;i++){
// for(int j=0;j<length_n;j++){
// printf("%.2f,", float(tensor_d.host_data()[i*length_n+j]));
// }
// printf("\n");
// }
// Create instantiation for device reference gemm kernel
// hytlass::reference::device::Gemm<ElementInputA,
// LayoutInputA,
// ElementInputB,
// LayoutInputB,
// ElementOutput,
// LayoutOutput,
// ElementOutput,
// ElementOutput>
// gemm_device;
// // Launch device reference gemm kernel
// gemm_device(problem_size,
// alpha,
// tensor_a.device_ref(),
// tensor_b.device_ref(),
// beta,
// tensor_c.device_ref(),
// tensor_ref_d.device_ref());
reference_hipblas(length_m, length_n, length_k,
tensor_a.device_data(),
tensor_b.device_data(),
tensor_ref_d.device_data());
// // Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_ref_d.sync_host();
// Check if output from HYTLASS kernel and reference kernel are equal or not
ElementOutput eps(1e-3f);
if (std::is_same<ElementInputA, hytlass::bfloat16_t>::value) {
eps = 0.05f;
}
float max_error_v = 1e-9;
int cnt = 0;
for (int i = 0; i < length_m; i++) {
for (int j = 0; j < length_n; j++) {
float factor = float(1);
if (tensor_ref_d.host_data()[i + length_m * j] != 0) {
factor = std::abs(tensor_ref_d.host_data()[i + length_m * j]);
}
if (std::abs(tensor_ref_d.host_data()[i + length_m * j]) <= 1) {
factor = 1;
}
if (std::abs((tensor_ref_d.host_data()[i + length_m * j]) - float(tensor_d.host_data()[i * length_n + j])) / factor > eps) {
printf("error at (%d %d) expected %f got %f abs err is %f and Relative error is %f\n", i, j,
float(tensor_ref_d.host_data()[i + length_m * j]), float(tensor_d.host_data()[i * length_n + j]),
float(std::abs((tensor_ref_d.host_data()[i + length_m * j]) - float(tensor_d.host_data()[i * length_n + j]))),
float(std::abs((tensor_ref_d.host_data()[i + length_m * j]) - float(tensor_d.host_data()[i * length_n + j])) / factor));
if (max_error_v < std::abs((tensor_ref_d.host_data()[i + length_m * j]) - (tensor_d.host_data()[i * length_n + j])) / factor){
max_error_v = (std::abs((tensor_ref_d.host_data()[i + length_m * j]) - (tensor_d.host_data()[i * length_n + j])) / factor);
}
cnt++;
}
}
}
if (cnt > 0) {
printf("faild,");
printf("%f\n", max_error_v);
return -1;
}
printf("success\n");
return 0;
}
int main() {
return run(1024, 1024, 1024);
}
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_serial_splitk_gemm
gfx928_serial_splitk_gemm.cu
)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to use serial split-k version of matrix multiplication using functions and
data structures provided by HYTLASS.
Examples:
# Runs a serial split-K GEMM with the given problem size (use split_k_slices to set number of K slices)
$ ./gfx928_serial_splitk_gemm --m=5120 --n=5120 --k=8192 --alpha=1 --beta=0 --split_k_slices=2 --iterations=10
*/
#include <fstream>
#include <iostream>
#include "hip/hip_runtime.h"
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/gemm/device/gemm_universal.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
hytlass::Status status;
hipError_t error;
bool passed;
Result(
double runtime_ms = 0,
double gflops = 0,
hytlass::Status status = hytlass::Status::kSuccess,
hipError_t error = hipSuccess)
:
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true)
{}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
hytlass::gemm::GemmCoord problem_size;
float alpha;
float beta;
bool reference_check;
int iterations;
int split_k_slices;
Options():
help(false),
problem_size({8192, 8192, 2048}),
reference_check(true),
iterations(10),
split_k_slices(1),
alpha(1),
beta()
{}
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
hytlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("split_k_slices", split_k_slices);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream &print_usage(std::ostream &out) const {
out << "01_hytlass_serial_splitk_gemm\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --split_k_slices=<int> Split-K factor to emulate\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/01_hytlass_serial_splitk_gemm/gfx928_serial_splitk_gemm --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 --split_k_slices=2 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
int64_t fmas = problem_size.product();
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
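// Worked example for the gflops() helper above (illustrative, not part of the original example):
// the default 8192 x 8192 x 2048 problem performs 2 * M * N * K = 274,877,906,944 floating point
// operations, so an average runtime of 10 ms corresponds to roughly 27.5 TFLOP/s.
static_assert(2ll * 8192 * 8192 * 2048 == 274877906944ll,
"FLOP count of the default 8192x8192x2048 problem");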
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::half_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::half_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::half_t; // <- data type of elements in output matrix D
constexpr int kAlignmentA = 128 / hytlass::sizeof_bits<ElementInputA>::value;
constexpr int kAlignmentB = 128 / hytlass::sizeof_bits<ElementInputB>::value;
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Row Major for Matrix B and Row Major for Matrix C
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::RowMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
hytlass::gemm::GemmShape<128, 128, 32>; // <- threadblock tile M = 128, N = 128, K = 32
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = 64, N = 64, K = 32
// This code section describes the size of MMA op
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 16>;
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
static bool const kSplitKSerial = true;
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
128 / hytlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
// Number of pipelines you want to use
constexpr int NumStages = 1;
using Gemm = hytlass::gemm::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
kAlignmentA,
kAlignmentB,
kSplitKSerial>;
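// Illustrative check (added for exposition, assuming GemmShape exposes kM/kN/kK as in CUTLASS):
// the 128x128x32 threadblock tile is covered by a 2x2 arrangement of 64x64x32 warp tiles, i.e.
// four warps cooperate on each threadblock tile.
static_assert(ShapeMMAThreadBlock::kM / ShapeMMAWarp::kM == 2 &&
ShapeMMAThreadBlock::kN / ShapeMMAWarp::kN == 2,
"each threadblock tile is covered by a 2x2 grid of warp tiles");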
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(4),
ElementInputA(-4),
0); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(4),
ElementInputB(-4),
0); // <- Fill matrix B on host with uniform-distribution random data
// hytlass::reference::host::TensorFillSequential(
// tensor_a.host_view()); // <- Fill matrix A on host with uniform-distribution random data
// hytlass::reference::host::TensorFillSequential(
// tensor_b.host_view()); // <- Fill matrix B on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
hytlass::reference::host::TensorFill(tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
// Split the K dimension into split_k_slices partitions
int split_k_slices = options.split_k_slices;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
// Result structure
Result result;
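  // Warm-up launches: run the kernel a few times before timing so that first-launch overhead does not skew the measured loop below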
for (int i=0; i<10; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
//
// Construct events
//
hipEvent_t events[2];
for (auto &event : events) {
result.error = hipEventCreate(&event);
if (result.error != hipSuccess) {
std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = hipEventRecord(events[0]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = hipEventRecord(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = hipEventSynchronize(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventElapsed() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)hipEventDestroy(event);
}
if (options.reference_check) {
// Create instantiation for device reference gemm kernel
hytlass::reference::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementComputeEpilogue>
gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Check if output from HYTLASS kernel and reference kernel are equal or not
ElementOutput eps(0.05);
const ElementOutput non_zero_floor(1e-6f);
result.passed = hytlass::reference::host::TensorRelativelyEquals(
tensor_ref_d.host_view(), tensor_d.host_view(), eps, non_zero_floor);
}
if (!result.passed) {
std::stringstream fname;
fname << "error_Gemm_device_" << problem_size.m() << "x" << problem_size.n() << "x"
<< problem_size.k() << "_" << ShapeMMAThreadBlock{}.kM << "_" << ShapeMMAThreadBlock{}.kN
<< "_" << ShapeMMAThreadBlock{}.kK << ".csv";
std::ofstream file(fname.str());
file << "problem: " << ' ' << problem_size.m() << "x" << problem_size.n() << "x"
<< problem_size.k() << ", alpha: " << float(alpha) << ", beta: " << float(beta) << "\n\n";
file << "A =\n"
<< tensor_a.host_view() << "\nB =\n"
<< tensor_b.host_view() << "\nC =\n"
<< tensor_c.host_view() << "\n\nReference =\n"
<< tensor_ref_d.host_view() << "\n\nComputed =\n"
<< tensor_d.host_view();
}
std::cout << (result.passed ? "Passed" : "Failed") << std::endl;
if (result.passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << "GFLOPs: " << result.gflops << std::endl;
}
return (result.passed ? 0 : -1);
}
int main(int argc, const char **argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d tensor op Matrix Multiply\n",
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_parallel_splitk_gemm
gfx928_parallel_splitk_gemm.cu
)
\ No newline at end of file
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to use split-k version of matrix multiplication using functions and data
structures provided by HYTLASS.
What is split-k?
Consider a problem size of M = 128, N = 128, K = 4096. If the thread-block tile size (a tile can be
viewed as a 2d matrix) is 128x128x4096, then we launch a single thread-block that occupies only one
of the 80 SMs present on Gfx928, so the efficiency of the computation is very low. This is where
split-k comes in: it partitions the K-dimension of the matrix multiplication and distributes the
work across multiple SMs, achieving better efficiency than a single SM can. In the above example,
we can partition the K-dimension with a split-k factor of 16, i.e., the thread-block tile size
becomes 128x128x256 and the work is launched across 16 SMs. Once each thread-block has computed its
partial inner product (1/16th of the output), the partials are accumulated into a single output matrix.
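Conceptually, parallel split-k writes each K-slice's partial product to workspace memory and then
runs a separate reduction that sums the slices and applies the epilogue. The sketch below
illustrates only that reduction step on the host; it is plain C++ written for this explanation and
is not part of the HYTLASS API (all names in it are made up for the illustration):

    #include <vector>

    // D = alpha * (sum of per-slice partial products) + beta * C, element-wise.
    void reduce_splitk(const std::vector<std::vector<float>> &partial, // one M x N buffer per K-slice
                       const std::vector<float> &C,
                       std::vector<float> &D,
                       float alpha, float beta) {
      for (size_t i = 0; i < D.size(); ++i) {
        float acc = 0.0f;
        for (size_t s = 0; s < partial.size(); ++s) {
          acc += partial[s][i];           // accumulate the partial sum contributed by every slice
        }
        D[i] = alpha * acc + beta * C[i]; // the epilogue is applied once, after the reduction
      }
    }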
Writing a single high-performance matrix multiplication kernel is hard but doable, whereas writing
high-performance kernels at scale that work for multiple problem sizes with good abstractions is
really hard. HYTLASS addresses this by providing simplified abstractions to compose the sections of
a GEMM kernel. When used properly, the kernels can come close to the peak performance of the GPU
with little effort.
HYTLASS divides a kernel into hierarchical, composable sections: at the thread, warp and
thread-block level, each computes on its own tile size, with the higher-level tile sizes composed
from the lower-level ones. Multiple thread tiles (the tile size each thread computes) form a warp
tile (the tile size each warp computes), and multiple warp tiles form a threadblock tile (the tile
size computed by a thread block).
In this example, we split variable initialization into two parts:
1. Setting up data properties: describes how the matrices are laid out in memory and how the kernel
   views them (logical-to-physical mapping).
2. Setting up computation properties: describes how the matrices set up above are used to compute
   the output of the matrix multiplication.
First, we set up the data types of matrices A, B, C and D along with alpha and beta, since the
equation for GEMM is D = alpha * A * B + beta * C. In HYTLASS, the kernels first compute A * B and
leave the rest of the computation to the end of the kernel, because alpha * X + beta * C is a
simple element-wise operation on X (= A * B) and C. We call this the epilogue of the kernel.
Hence, we set the data type of alpha and beta to ElementComputeEpilogue = float. Because this
example uses the tensor-core MMA instructions with half-precision inputs, the data type of the
elements in input matrices A and B is hytlass::half_t. The partial dot products are accumulated in
fp32, which can represent a wider range of numbers, so we use fp32 as the accumulator type, while
the output matrix elements are stored as hytlass::half_t. We convey this to the HYTLASS kernel by
initializing the template variables ElementAccumulator (float), ElementComputeEpilogue (float),
ElementInputA (hytlass::half_t), ElementInputB (hytlass::half_t) and ElementOutput
(hytlass::half_t). Communicating just the data types is not enough: since the data is laid out
linearly in memory, we also have to convey the layouts of the matrices. We do that by initializing
the template variable LayoutInputA to column major, LayoutInputB to column major and LayoutOutput
to row major. Next, we set up the rules to compute alpha * X + beta * C, which is called the
epilogue of the kernel. We initialize the template variable EpilogueOp, which takes the data type
of the output ElementOutput (hytlass::half_t), the number of elements per vectorized memory access
(128 / sizeof_bits<ElementOutput>, i.e. 8 for half precision), the data type of the accumulator
(float) and the data type used to compute the linear combination (alpha * X + beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the
computation.
Second, we create template variables for the tile sizes of the thread-block, warp and mma-op:
128x128x32, 64x64x32 and 16x16x16 (MxNxK) respectively. When these are passed to instantiate the
HYTLASS GEMM kernel, it internally deduces the number of threads needed per thread-block, the
amount of shared memory, how to store data in a bank-conflict-free manner, and the many other
variables required to compose, initialize and launch a high-performance GEMM kernel. This is the
beauty of HYTLASS: it relieves the developer from understanding and coding complicated hardware
optimizations, which can easily go wrong.
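To see how these tile sizes compose in this example: the 128x128x32 threadblock tile is covered by
a 2x2 arrangement of 64x64x32 warp tiles (128/64 = 2 along M and along N), and each warp tile is in
turn computed as 4 x 4 x 2 = 32 of the 16x16x16 mma operations (64/16 = 4 along M and N,
32/16 = 2 along K).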
A few more template variables are initialized as well, such as the mapping that decides which
threadblock tile of the output matrix is computed by which threadblock launched on an SM, and the
GFX architecture of the GPU you want to run on. These are all put together into a template
instantiation that describes the HYTLASS GEMM kernel, using the
hytlass::gemm::device::GemmSplitKParallel template.
The next step is to initialize the physical data, then instantiate, initialize and run the HYTLASS
kernel. We use HYTLASS utilities to initialize, fill and compare matrices, as they are simple and
do not get in the way of learning HYTLASS.
Once all the matrices are initialized and filled with data, we create the argument tuple used to
launch the HYTLASS kernel. It takes the problem size (M = 8192, N = 8192 and K = 2048 by default),
the matrices, alpha, beta and, importantly, the split k-dimension factor. We also query HYTLASS for
any scratch-space memory required by the instantiated kernel; if some is needed, it is allocated
and passed along with the other arguments to initialize the HYTLASS kernel, and then the kernel is
launched.
In this example, we later launch a reference GEMM kernel (from the HYTLASS utilities) to check
whether the output of the HYTLASS kernel matches that of the reference GEMM kernel.
*/
#include <fstream>
#include <iostream>
#include "hip/hip_runtime.h"
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm_splitk_parallel.h"
#include "hytlass/gemm/device/gemm_universal.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
hytlass::Status status;
hipError_t error;
bool passed;
Result(
double runtime_ms = 0,
double gflops = 0,
hytlass::Status status = hytlass::Status::kSuccess,
hipError_t error = hipSuccess)
:
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true)
{}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
hytlass::gemm::GemmCoord problem_size;
float alpha;
float beta;
bool reference_check;
int iterations;
int split_k_slices;
  Options():
    help(false),
    problem_size({8192, 8192, 2048}),
    alpha(1),
    beta(0),
    reference_check(true),
    iterations(10),
    split_k_slices(1)
  {}
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
hytlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("split_k_slices", split_k_slices);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream &print_usage(std::ostream &out) const {
out << "02_hytlass_parallel_splitk_gemm example\n\n"
<< " This example uses the HYTLASS Library to execute F32 tensorop GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --alpha=<f32> Epilogue scalar alpha\n"
<< " --beta=<f32> Epilogue scalar beta\n\n"
<< " --split_k_slices=<int> Split-K factor to emulate\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/02_hytlass_parallel_splitk_gemm/gfx928_parallel_splitk_gemm --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 --split_k_slices=2 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of real-valued multiply-adds
    int64_t fmas = int64_t(problem_size.m()) * problem_size.n() * problem_size.k();  // widen first so the product cannot overflow a 32-bit int
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::half_t; // <- data type of elements in input matrix A
using ElementInputB = hytlass::half_t; // <- data type of elements in input matrix B
using ElementOutput = hytlass::half_t; // <- data type of elements in output matrix D
// The code section below describes the matrix layouts of the input and output matrices: Column Major
// for Matrix A, Column Major for Matrix B and Row Major for Matrices C/D
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::ColumnMajor;
using LayoutOutput = hytlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;
// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
    hytlass::gemm::GemmShape<128, 128, 32>;  // <- threadblock tile M = 128, N = 128, K = 32
// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 64, 32>;  // <- warp tile M = 64, N = 64, K = 32
// This code section describes the size of MMA op
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 16>;
// This code section describes the epilogue part of the kernel
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
    128 / hytlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
                                                       // memory access. For the half-precision
                                                       // output used here, it's 8 elements. This
                                                       // becomes the vector width of math
                                                       // instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
using Gemm = hytlass::gemm::device::GemmSplitKParallel<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp>;
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
hytlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using HYTLASS helper functions
hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// HYTLASS kernel
hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using HYTLASS helper functions
hytlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(4),
ElementInputA(-4),
0); // <- Fill matrix A on host with uniform-distribution random data
hytlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(4),
ElementInputB(-4),
0); // <- Fill matrix B on host with uniform-distribution random data
  // hytlass::reference::host::TensorFillSequential(
  //     tensor_a.host_view()); // <- Alternative: fill matrix A on host with sequentially increasing data
  // hytlass::reference::host::TensorFillSequential(
  //     tensor_b.host_view()); // <- Alternative: fill matrix B on host with sequentially increasing data
hytlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
hytlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
hytlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
  // Split the K dimension into split_k_slices partitions (set from the command line)
int split_k_slices = options.split_k_slices;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated HYTLASS kernel
typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate HYTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
hytlass::Status status = gemm_op.can_implement(arguments);
HYTLASS_CHECK(status);
// Initialize HYTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
HYTLASS_CHECK(status);
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
// Result structure
Result result;
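  // Warm-up launches: run the kernel a few times before timing so that first-launch overhead does not skew the measured loop below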
for (int i=0; i<10; i++) {
status = gemm_op();
}
HYTLASS_CHECK(status);
//
// Construct events
//
hipEvent_t events[2];
for (auto &event : events) {
result.error = hipEventCreate(&event);
if (result.error != hipSuccess) {
std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = hipEventRecord(events[0]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized HYTLASS kernel
status = gemm_op();
HYTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = hipEventRecord(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = hipEventSynchronize(events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != hipSuccess) {
std::cerr << "hipEventElapsed() failed: " << hipGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)hipEventDestroy(event);
}
if (options.reference_check) {
// Create instantiation for device reference gemm kernel
hytlass::reference::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementComputeEpilogue>
gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
(void)hipDeviceSynchronize();
// Copy output data from HYTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Check if output from HYTLASS kernel and reference kernel are equal or not
ElementOutput eps(0.05);
const ElementOutput non_zero_floor(1e-6f);
result.passed = hytlass::reference::host::TensorRelativelyEquals(tensor_ref_d.host_view(), tensor_d.host_view(), eps, non_zero_floor);
}
if (!result.passed) {
std::stringstream fname;
fname << "error_Gemm_device_"
<< problem_size.m() << "x" << problem_size.n() << "x" << problem_size.k() << "_"
<< ShapeMMAThreadBlock{}.kM << "_"
<< ShapeMMAThreadBlock{}.kN << "_"
<< ShapeMMAThreadBlock{}.kK << ".csv";
std::ofstream file(fname.str());
file
<< "problem: " << ' ' << problem_size.m() << "x" << problem_size.n() << "x" << problem_size.k()
<< ", alpha: " << float(alpha) << ", beta: " << float(beta) << "\n\n";
file
<< "A =\n"
<< tensor_a.host_view()
<< "\nB =\n"
<< tensor_b.host_view()
<< "\nC =\n"
<< tensor_c.host_view()
<< "\n\nReference =\n"
<< tensor_ref_d.host_view()
<< "\n\nComputed =\n"
<< tensor_d.host_view();
}
std::cout << (result.passed ? "Passed" : "Failed") << std::endl;
if (result.passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << "GFLOPs: " << result.gflops << std::endl;
}
return (result.passed ? 0 : -1);
}
int main(int argc, const char **argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d tensor op Matrix Multiply\n",
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
gfx928_streamk_gemm
gfx928_streamk_gemm.cu
)
hytlass_example_add_executable(
gfx928_gemm_universal_streamk_broadcast
gfx928_gemm_universal_streamk_broadcast.cu
)
\ No newline at end of file