Commit dc1c2bf8 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into letaoqin/update_layernorm

parents 5cfd751b a285d6f9
@@ -202,6 +202,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
        add_compile_options(-fno-offload-uniform-block)
    endif()
endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
+    check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
+    if(HAS_LSR_DROP_SOLUTION)
+        message("Adding the lsr-drop-solution=1 compiler flag")
+        add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
+    endif()
+endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
    check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
    if(HAS_ENABLE_POST_MISCHED)
...
@@ -735,11 +735,11 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
-0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
+0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
+0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
-0 13 * * * % BUILD_LEGACY_OS=true ''' : ""
+0 13 * * * % BUILD_LEGACY_OS=true''' : ""
pipeline {
    agent none
@@ -806,6 +806,10 @@ pipeline {
        name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
        defaultValue: false,
        description: "Run the grouped conv large cases tests (default: OFF)")
+    booleanParam(
+        name: "RUN_CODEGEN_TESTS",
+        defaultValue: false,
+        description: "Run codegen tests (default: OFF)")
    booleanParam(
        name: "RUN_CK_TILE_FMHA_TESTS",
        defaultValue: false,
@@ -926,7 +930,30 @@ pipeline {
    execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
                       make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
                       ./bin/test_grouped_convnd_fwd_large_cases_xdl"""
}
+steps{
+    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+    cleanWs()
+}
+}
+}
+}
+stage("Run Codegen Tests")
+{
+    parallel
+    {
+        stage("Run Codegen Tests on gfx90a")
+        {
+            when {
+                beforeAgent true
+                expression { params.RUN_CODEGEN_TESTS.toBoolean() }
+            }
+            agent{ label rocmnode("gfx90a")}
+            environment{
+                setup_args = "NO_CK_BUILD"
+                execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
+                                   make -j64 check"""
+            }
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -951,7 +978,7 @@ pipeline {
    make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
    cd ../ &&
    example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -970,7 +997,7 @@ pipeline {
    make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
    cd ../ &&
    example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -995,7 +1022,7 @@ pipeline {
    make -j64 tile_example_gemm_basic && \
    cd ../ &&
    example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -1014,7 +1041,7 @@ pipeline {
    make -j64 tile_example_gemm_basic && \
    cd ../ &&
    example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -1040,7 +1067,7 @@ pipeline {
    -DCMAKE_CXX_FLAGS=" -O3 " \
    -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
    execute_args = " "
}
steps{
    Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
    cleanWs()
@@ -1059,7 +1086,7 @@ pipeline {
    -DCMAKE_CXX_FLAGS=" -O3 " \
    -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
    execute_args = " "
}
steps{
    Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
    cleanWs()
@@ -1140,7 +1167,7 @@ pipeline {
    -D CMAKE_BUILD_TYPE=Release \
    -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
    -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
}
steps{
    buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
...
+cmake_minimum_required(VERSION 3.16)
+project(composable_kernel_host)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
@@ -5,56 +8,51 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
-add_compile_options(-std=c++17)
-find_package(hip)
-add_custom_target(codegen)
-# add include directories
-include_directories(BEFORE
-    ${PROJECT_BINARY_DIR}/include
-    ${PROJECT_SOURCE_DIR}/include
-    ${PROJECT_SOURCE_DIR}/library/include
-    ${HIP_INCLUDE_DIRS}
-)
+find_package(ROCM)
+include(ROCMInstallTargets)
+include(ROCMTest)
+rocm_setup_version(VERSION 1.0)
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
    ${CK_ROOT}/include/ck/*.hpp)
-#printouts fot debug purposes
-#message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
-#message(STATUS "RELATIVE: ${CK_ROOT}/include")
+# printouts for debug purposes
+# message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
+# message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
-file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
-##message(STATUS "SOURCE_FILES: ${SOURCES}")
+add_compile_options(-std=c++17)
+file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers)
set_target_properties(ck_host PROPERTIES
    LINKER_LANGUAGE CXX
    POSITION_INDEPENDENT_CODE ON)
-target_include_directories(ck_host PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-    $<INSTALL_INTERFACE:include>
-)
+# target_include_directories(ck_host PUBLIC
+#     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
# )
add_executable(ck-template-driver driver/main.cpp)
target_link_libraries(ck-template-driver ck_host)
-rocm_install(
-    TARGETS ck_host ck_headers
-    EXPORT ck_hostTargets
-    PRIVATE
-)
-rocm_install(EXPORT ck_hostTargets
-    FILE composable_kernelck_hostTargets.cmake
-    NAMESPACE composable_kernel::
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
-rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+rocm_install_targets(
+    TARGETS ck_host ck_headers
+    EXPORT ck_host_targets
+    INCLUDE include
+)
+rocm_export_targets(
+    EXPORT ck_host_targets
+    NAMESPACE composable_kernel::
+)
if(BUILD_TESTING)
    add_subdirectory(test)
endif()
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc)
...
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
-# do not build the tests when we build the library for various targets
-if(NOT GPU_ARCHS)
-    foreach(TEST_SRC ${TEST_SRCS})
-        set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
-        get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
-        add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
-        if(CK_USE_ALTERNATIVE_PYTHON)
-            target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs)
-        endif()
-        add_dependencies(codegen codegen_test_${BASE_NAME})
-        add_dependencies(tests codegen_test_${BASE_NAME})
-        add_dependencies(check codegen_test_${BASE_NAME})
-        add_test(NAME codegen_test_${BASE_NAME} COMMAND codegen_test_${BASE_NAME})
-        message("adding test codegen_test_${BASE_NAME}")
+# TODO: These tests need to be refactored to remove dependency on main ck
+# headers and device compilation.
+set(TESTS_REQUIRE_DEVICE_COMPILE
+    grouped_conv_fwd_multiple_d_v1
+    grouped_conv_fwd_multiple_d_v2
+    grouped_conv_fwd_multiple_d_v3
+    grouped_conv_fwd_multiple_d_v4
+)
+find_package(hip)
+foreach(TEST_SRC ${TEST_SRCS})
+    get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
+    rocm_add_test_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
    target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
-    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/codegen/test/include)
+    target_include_directories(codegen_test_${BASE_NAME} PUBLIC include)
+    if(BASE_NAME IN_LIST TESTS_REQUIRE_DEVICE_COMPILE)
+        target_link_libraries(codegen_test_${BASE_NAME} hip::device)
        target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
        target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
-    endforeach()
-endif()
-find_package(hip)
+    endif()
+endforeach()
...
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
add_library(ck_rtc ${RTC_SOURCES})
target_include_directories(ck_rtc PUBLIC include)
target_link_libraries(ck_rtc PUBLIC hip::host)
+target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
...
@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>
#include <string>
namespace rtc {
struct src_file
{
-    CK::fs::path path;
+    fs::path path;
    std::string_view content;
};
...
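Taken together, the header and namespace renames above make the `rtc` API self-contained. For reference, a minimal caller sketch, assuming only what is visible in this diff — the `src_file` members, the `compile_kernel(const std::vector<src_file>&, compile_options)` signature shown in the hunk below, and the `options.flags` field; the source string and flags are illustrative:

```cpp
// Hypothetical caller of rtc::compile_kernel; src_file's members,
// compile_options::flags, and the compile_kernel signature come from the
// diff, everything else is an assumption.
#include <rtc/compile_kernel.hpp>
#include <vector>

int main()
{
    rtc::src_file src;
    src.path    = "kernels/noop.cpp";             // recreated inside the tmp_dir
    src.content = "__global__ void noop() {}";    // source text to write out

    rtc::compile_options options;
    options.flags = " -std=c++17 --offload-arch=gfx90a"; // appended to the compiler command

    // Writes the sources into a temp dir, invokes the compiler, loads the code object.
    rtc::kernel k = rtc::compile_kernel({src}, options);
    (void)k;
}
```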
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define RTC_HAS_FILESYSTEM 1
#else
#define RTC_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if RTC_HAS_FILESYSTEM
#include <filesystem>
#elif RTC_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace rtc {
#if RTC_HAS_FILESYSTEM
namespace fs = ::std::filesystem;
#elif RTC_HAS_FILESYSTEM_TS
namespace fs = ::std::experimental::filesystem;
#endif
} // namespace rtc
#endif // GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
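The header above is a classic feature-detection shim: it prefers `<filesystem>` under C++17, falls back to `<experimental/filesystem>` on older toolchains (which is why `ck_rtc` now links `-lstdc++fs` in the CMake change above), and fails the build if neither exists. A small self-contained sketch exercising the same `rtc::fs` calls this diff itself uses:

```cpp
#include <rtc/filesystem.hpp> // the shim above; resolves rtc::fs
#include <iostream>

int main()
{
    // Same calls tmp_dir uses below: create, inspect, and remove a directory.
    rtc::fs::path dir = rtc::fs::temp_directory_path() / "ck-rtc-demo";
    rtc::fs::create_directories(dir);
    std::cout << dir.string() << " exists: " << rtc::fs::exists(dir) << '\n';
    rtc::fs::remove_all(dir);
}
```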
@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>
namespace rtc {
struct tmp_dir
{
-    CK::fs::path path;
+    fs::path path;
    tmp_dir(const std::string& prefix = "");
    void execute(const std::string& cmd) const;
...
#include "rtc/hip.hpp" #include <rtc/hip.hpp>
#include <rtc/compile_kernel.hpp> #include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp> #include <rtc/tmp_dir.hpp>
#include <stdexcept> #include <stdexcept>
...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for(const auto& src : srcs) for(const auto& src : srcs)
{ {
CK::fs::path full_path = td.path / src.path; fs::path full_path = td.path / src.path;
CK::fs::path parent_path = full_path.parent_path(); fs::path parent_path = full_path.parent_path();
CK::fs::create_directories(parent_path); fs::create_directories(parent_path);
write_string(full_path.string(), src.content); write_string(full_path.string(), src.content);
if(src.path.extension().string() == ".cpp") if(src.path.extension().string() == ".cpp")
{ {
...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td.execute(compiler() + options.flags); td.execute(compiler() + options.flags);
auto out_path = td.path / out; auto out_path = td.path / out;
if(not CK::fs::exists(out_path)) if(not fs::exists(out_path))
throw std::runtime_error("Output file missing: " + out); throw std::runtime_error("Output file missing: " + out);
auto obj = read_buffer(out_path.string()); auto obj = read_buffer(out_path.string());
......
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
}
tmp_dir::tmp_dir(const std::string& prefix)
-    : path(CK::fs::temp_directory_path() /
+    : path(fs::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
-    CK::fs::create_directories(this->path);
+    fs::create_directories(this->path);
}
void tmp_dir::execute(const std::string& cmd) const
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std::system(s.c_str());
}
-tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
+tmp_dir::~tmp_dir() { fs::remove_all(this->path); }
} // namespace rtc
@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
+add_example_executable(elementwise_scale_permute_amax_2D_fp16_fp8 elementwise_scale_permute_amax_2D_fp16_fp8.cpp)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/utility/reduction_enums.hpp"
using F16 = ck::half_t;
using F32 = float;
using F8 = ck::f8_t;
using InputDataType = F16;
using ScaleDataType = F32;
using OutputDataType = F8;
static constexpr ck::index_t NumDim = 2;
constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;
using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
struct ScalePassThrough
{
ScalePassThrough(const float alpha = 1.f) : alpha_(alpha) {}
__host__ __device__ constexpr void
operator()(OutputDataType& y0, OutputDataType& y1, const InputDataType& x0) const
{
y0 = ck::type_convert<OutputDataType>(ck::type_convert<ScaleDataType>(x0) * alpha_);
y1 = y0;
}
const ScaleDataType alpha_;
};
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryAbs = ck::tensor_operation::element_wise::UnaryAbs;
using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
ck::Tuple<InputDataType>, // InDataTypeTuple
ck::Tuple<OutputDataType, OutputDataType>, // OutDataTypeTuple
ScalePassThrough, // Elementwise
NumDim, // NumDim
256, // BlockSize
128, // M0PerBlock
128, // M1PerBlock
8, // M0PerThread
8, // M1PerThread
ck::Sequence<1, 0>, // ThreadClusterArrangeOrder
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<8, 1>>; // OutScalarPerVectorSeq
using DeviceReduceInstance =
ck::tensor_operation::device::DeviceReduceMultiBlock<OutputDataType,
OutputDataType,
OutputDataType,
NumDim,
NumDim,
ReduceOperation,
UnaryAbs,
PassThrough,
ck::InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
1024, // BlockSize
1, // MThreadClusterSize
1024, // KThreadClusterSize
1, // MThreadSliceSize
16, // KThreadSliceSize
1, // InSrcVectorDim
16, // InSrcVectorSize
1>; // OutDstVectorSize
void reference_scale_permute_amax(Tensor<InputDataType>& input,
Tensor<OutputDataType>& host_output_scaled_casted_transposed,
Tensor<OutputDataType>& host_output_scaled_casted,
Tensor<OutputDataType>& host_output_amax,
const float scale)
{
ScalePassThrough out_element_op(scale);
const ck::index_t M = input.GetLengths()[0];
const ck::index_t K = input.GetLengths()[1];
for(ck::index_t m = 0; m < M; m++)
{
for(ck::index_t k = 0; k < K; k++)
{
OutputDataType y0, y1;
out_element_op(y0, y1, input(m, k));
host_output_scaled_casted(m, k) = y0;
host_output_scaled_casted_transposed(m, k) = y1;
const OutputDataType y_fabs =
ck::type_convert<OutputDataType>(ck::math::abs(ck::type_convert<float>(y0)));
host_output_amax(0) = ck::math::max(y_fabs, host_output_amax(0));
}
}
}
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
const float scale = 2.f;
ck::index_t M = 1024;
ck::index_t K = 1024;
if(argc == 3)
{
M = std::stoi(argv[1]);
K = std::stoi(argv[2]);
}
std::array<ck::index_t, 2> dims = {M, K};
std::array<ck::index_t, 2> in_strides = {K, 1};
std::array<ck::index_t, 2> out_strides = {1, M};
Tensor<InputDataType> input(dims, in_strides);
Tensor<OutputDataType> output_scaled_casted_transposed(dims, out_strides);
Tensor<OutputDataType> output_scaled_casted(dims, in_strides);
Tensor<OutputDataType> output_amax({1});
input.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
DeviceMem input_dev_buf(sizeof(InputDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem output_scaled_casted_transposed_dev_buf(
sizeof(OutputDataType) * output_scaled_casted_transposed.mDesc.GetElementSpaceSize());
DeviceMem output_scaled_casted_dev_buf(sizeof(OutputDataType) *
output_scaled_casted.mDesc.GetElementSpaceSize());
DeviceMem output_amax_dev_buf(sizeof(OutputDataType) * output_amax.mDesc.GetElementSpaceSize());
input_dev_buf.ToDevice(input.mData.data());
std::array<const void*, 1> inputs = {input_dev_buf.GetDeviceBuffer()};
std::array<void*, 2> outputs = {output_scaled_casted_transposed_dev_buf.GetDeviceBuffer(),
output_scaled_casted_dev_buf.GetDeviceBuffer()};
std::cout << "Input: " << input.mDesc << std::endl;
std::cout << "Scale: " << scale << std::endl;
std::cout << "Output scaled casted transposed: " << output_scaled_casted_transposed.mDesc
<< std::endl;
std::cout << "Output scaled casted: " << output_scaled_casted.mDesc << std::endl;
std::cout << "Output amax: " << output_amax.mDesc << std::endl;
auto launch_transpose_scale = [&]() {
auto transposeScale = DeviceElementwisePermuteInstance{};
auto argument = transposeScale.MakeArgumentPointer(dims,
{in_strides},
{out_strides, in_strides},
inputs,
outputs,
ScalePassThrough{scale});
if(!transposeScale.IsSupportedArgument(argument.get()))
{
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto transposeScale_invoker_ptr = transposeScale.MakeInvokerPointer();
return transposeScale_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
};
auto launch_reduce = [&]() {
auto reduce = DeviceReduceInstance{};
auto reduce_argument_ptr =
reduce.MakeArgumentPointer(dims,
in_strides,
{1}, // Output Lengths
{1}, // Output Strides
{0, 1}, // Reduce Dims
static_cast<double>(1.f),
static_cast<double>(0.f),
output_scaled_casted_dev_buf.GetDeviceBuffer(),
nullptr,
output_amax_dev_buf.GetDeviceBuffer(),
nullptr,
UnaryAbs{},
PassThrough{});
if(!reduce.IsSupportedArgument(reduce_argument_ptr.get()))
{
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto invoker_ptr = reduce.MakeInvokerPointer();
return invoker_ptr->Run(reduce_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
};
float ave_time = launch_transpose_scale();
ave_time += launch_reduce();
std::cout << "Perf: " << ave_time << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
Tensor<OutputDataType> host_output_scaled_casted_transposed(dims, out_strides);
Tensor<OutputDataType> host_output_scaled_casted(dims, in_strides);
Tensor<OutputDataType> host_output_amax({1});
reference_scale_permute_amax(input,
host_output_scaled_casted_transposed,
host_output_scaled_casted,
host_output_amax,
scale);
output_scaled_casted_transposed_dev_buf.FromDevice(
output_scaled_casted_transposed.mData.data());
output_scaled_casted_dev_buf.FromDevice(output_scaled_casted.mData.data());
output_amax_dev_buf.FromDevice(output_amax.mData.data());
pass &= ck::utils::check_err(output_scaled_casted_transposed.mData,
host_output_scaled_casted_transposed.mData,
"Error: Incorrect results scaled transposed",
1e-3,
1e-3);
pass &= ck::utils::check_err(output_scaled_casted.mData,
host_output_scaled_casted.mData,
"Error: Incorrect results scaled",
1e-3,
1e-3);
pass &= ck::utils::check_err(
output_amax.mData, host_output_amax.mData, "Error: Incorrect results amax", 1e-3, 1e-3);
}
return pass ? 0 : 1;
}
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
@@ -282,7 +281,11 @@ int main(int argc, char* argv[])
using CodegenPipelineProblem = ck_tile::
    GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+using CodegenGemmPolicy = ck_tile::
+    UniversalGemmPipelineAgBgCrPolicy<matrix_a_layout, matrix_b_layout, matrix_c_layout>;
+using CodegenGemmPipeline =
+    ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
invoke_gemm<ck_tile::half_t,
            matrix_a_layout,
...
@@ -419,6 +419,12 @@ struct UnaryAbs
        y = ck::math::abs(x);
    };
+    template <>
+    __host__ __device__ void operator()(f8_t& y, const f8_t& x) const
+    {
+        y = ck::type_convert<f8_t>(ck::math::abs(ck::type_convert<float>(x)));
+    };
};
struct UnarySqrt
...
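The new `UnaryAbs` overload does not touch the fp8 sign bit directly; it promotes to `float`, takes the absolute value there, and converts back, which stays correct regardless of how the 8-bit type encodes sign and NaN. A self-contained sketch of that promote-abs-demote pattern; the 8-bit codec here is a toy sign-magnitude fixed-point stand-in, not the real fp8 encoding:

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Toy 8-bit sign-magnitude code standing in for ck::f8_t (NOT the real fp8
// encoding): bit 7 = sign, bits 0-6 = magnitude scaled by 1/16.
using f8_mock = std::uint8_t;

float to_float(f8_mock x)
{
    float mag = (x & 0x7F) / 16.0f;
    return (x & 0x80) ? -mag : mag;
}

f8_mock from_float(float x)
{
    float   clamped = std::fmin(std::fabs(x), 127.0f / 16.0f);
    f8_mock mag     = static_cast<f8_mock>(std::lround(clamped * 16.0f));
    return std::signbit(x) ? (0x80 | mag) : mag;
}

// The pattern the diff adds to UnaryAbs: promote to float, abs, demote.
f8_mock abs_f8(f8_mock x) { return from_float(std::fabs(to_float(x))); }

int main()
{
    f8_mock neg = from_float(-2.5f);
    std::cout << to_float(abs_f8(neg)) << '\n'; // prints 2.5
}
```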
@@ -80,6 +80,8 @@ static inline __host__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
};
+static inline __host__ bool isnan(f8_t x) { return (x & 0x80); };
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
static inline __host__ bool isnan(int4_t x)
{
@@ -529,6 +531,8 @@ static inline __device__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
};
+static inline __device__ bool isnan(f8_t x) { return (x & 0x80); };
static inline __device__ half_t sqrt(half_t x)
{
    return static_cast<half_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x)));
...
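A note on the new `isnan(f8_t)`: in an fnuz-style fp8 encoding the single NaN is the bit pattern 0x80, so a bare `x & 0x80` also fires on every ordinary negative value. That is harmless in the amax reduction above, where inputs have already passed through `UnaryAbs`, but an exact comparison is the conservative form. A sketch under that fnuz assumption:

```cpp
#include <cstdint>
#include <iostream>

// Sketch of the check the diff adds. Assumption: fnuz-style f8_t, whose
// only NaN is 0x80 (the would-be negative zero). The sign-bit test is
// only a safe NaN test when values are known non-negative, as after abs.
using f8_bits = std::uint8_t;

bool isnan_signbit(f8_bits x) { return (x & 0x80) != 0; } // matches the diff
bool isnan_exact(f8_bits x) { return x == 0x80; }         // exact-pattern form

int main()
{
    f8_bits nan = 0x80, negative = 0xB8; // 0xB8: some negative value
    std::cout << isnan_signbit(nan) << isnan_exact(nan) << '\n';           // 11
    std::cout << isnan_signbit(negative) << isnan_exact(negative) << '\n'; // 10
}
```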
@@ -159,8 +159,11 @@
#endif
#endif
+// workaround for ROCm 6.2 and later
#ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
-#if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133
+#if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
+    (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
+    (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
#else
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
...
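The widened guard keeps the scratch-memory workaround enabled for ROCm 6.2 (from the given patch level), 6.3 (likewise), and everything after 6.3, while the outer `#ifndef` still lets a build define the macro explicitly to override detection. A self-contained sketch of that pattern with made-up `MYLIB_*` version macros standing in for the HIP ones:

```cpp
// Same gating pattern with hypothetical MYLIB_* macros so it can be tried
// off-line; CK's real knob is CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE.
#include <iostream>

#define MYLIB_VER_MAJOR 6
#define MYLIB_VER_MINOR 3
#define MYLIB_VER_PATCH 42131

// Leave the knob overridable: -DMYLIB_WORKAROUND=0 wins over the detection.
#ifndef MYLIB_WORKAROUND
#if (MYLIB_VER_MAJOR == 6 && MYLIB_VER_MINOR == 2 && MYLIB_VER_PATCH >= 41133) || \
    (MYLIB_VER_MAJOR == 6 && MYLIB_VER_MINOR == 3 && MYLIB_VER_PATCH >= 42131) || \
    (MYLIB_VER_MAJOR == 6 && MYLIB_VER_MINOR > 3)
#define MYLIB_WORKAROUND 1
#else
#define MYLIB_WORKAROUND 0
#endif
#endif

int main() { std::cout << "workaround enabled: " << MYLIB_WORKAROUND << '\n'; } // 1 here
```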
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    k_lds_ptr, Policy::template MakeKLdsWriteBlockDescriptor<Problem>());
auto k_lds_write_window =
-    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto k_lds_read_window =
    make_tile_window(k_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK0>{}),
                     k_lds_write_window.get_window_origin(),
-                     Policy::template MakeKRegSliceBlockDescriptor<Problem>());
+                     Policy::template MakeKRegBlockDescriptor<Problem>());
auto k_reg_tensor = make_static_distributed_tensor<KDataType>(
    Policy::template MakeKRegBlockDescriptor<Problem>());
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    v_lds_ptr, Policy::template MakeVLdsWriteBlockDescriptor<Problem>());
auto v_lds_write_window =
-    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kVHeaddim>{}), {0, 0});
auto v_lds_read_window =
    make_tile_window(v_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK2>{}),
                     v_lds_write_window.get_window_origin(),
-                     Policy::template MakeVRegSliceBlockDescriptor<Problem>());
-auto v_reg_tensor = make_static_distributed_tensor<VDataType>(
-    Policy::template MakeVRegBlockDescriptor<Problem>());
+                     Policy::template MakeVRegBlockDescriptor<Problem>());
//------------------------------------------------------------------
// KT, Reg ->LDS ->Reg
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    kt_lds_ptr, Policy::template MakeShuffledKLdsWriteBlockDescriptor<Problem>());
auto shuffled_k_lds_write_window = make_tile_window(
-    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto kt_lds_read = make_tensor_view<address_space_enum::lds>(
    kt_lds_ptr, Policy::template MakeKTLdsReadBlockDescriptor<Problem>());
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
block_sync_lds();
-v_reg_tensor = load_tile(v_lds_read_window);
+auto v_reg_tensor = load_tile(v_lds_read_window);
block_sync_lds();
//---------------------------- Loop Load in ----------------------------//
// Q: HBM ->Reg ->LDS
@@ -276,7 +273,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    q_lds_ptr, Policy::template MakeQLdsBlockDescriptor<Problem>());
auto q_lds_window =
-    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto q_lds_read_window =
    make_tile_window(q_lds_window.get_bottom_tensor_view(),
@@ -297,7 +294,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    qt_lds_ptr, Policy::template MakeShuffledQLdsWriteBlockDescriptor<Problem>());
auto shuffled_q_lds_write_window = make_tile_window(
-    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto qt_lds_read = make_tensor_view<address_space_enum::lds>(
    qt_lds_ptr, Policy::template MakeQTLdsReadBlockDescriptor<Problem>());
@@ -322,7 +319,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    do_lds_ptr, Policy::template MakeOGradLdsBlockDescriptor<Problem>());
auto do_lds_window =
-    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto do_lds_read_window =
    make_tile_window(do_lds_window.get_bottom_tensor_view(),
@@ -341,7 +338,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    dot_lds_ptr, Policy::template MakeShuffledOGradLdsWriteBlockDescriptor<Problem>());
auto shuffled_do_lds_write_window = make_tile_window(
-    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto dot_read_lds = make_tensor_view<address_space_enum::lds>(
    dot_lds_ptr, Policy::template MakeOGradTLdsReadBlockDescriptor<Problem>());
@@ -483,9 +480,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
index_t i_total_loops = 0;
index_t seqlen_q_step = seqlen_q_start;
-static_assert(kQKHeaddim == kK0, "kQKHeaddim should equal to kK0");
+static_assert(kQKHeaddim >= kK0, "kQKHeaddim should be equal or greater than kK0");
static_assert(kM0 == kK1, "kM0 should equal to kK1");
-static_assert(kVHeaddim == kK2, "kVHeaddim should equal to kK2");
+static_assert(kVHeaddim >= kK2, "kVHeaddim should be equal or greater than kK2");
static_assert(kM0 == kK3, "kM0 should equal to kK3");
constexpr index_t k4_loops = kN0 / kK4;
...
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    k_lds_ptr, Policy::template MakeKLdsWriteBlockDescriptor<Problem>());
auto k_lds_write_window =
-    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto k_lds_read_window =
    make_tile_window(k_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK0>{}),
                     k_lds_write_window.get_window_origin(),
-                     Policy::template MakeKRegSliceBlockDescriptor<Problem>());
+                     Policy::template MakeKRegBlockDescriptor<Problem>());
auto k_reg_tensor = make_static_distributed_tensor<KDataType>(
    Policy::template MakeKRegBlockDescriptor<Problem>());
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    v_lds_ptr, Policy::template MakeVLdsWriteBlockDescriptor<Problem>());
auto v_lds_write_window =
-    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kVHeaddim>{}), {0, 0});
auto v_lds_read_window =
    make_tile_window(v_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK2>{}),
                     v_lds_write_window.get_window_origin(),
-                     Policy::template MakeVRegSliceBlockDescriptor<Problem>());
-auto v_reg_tensor = make_static_distributed_tensor<VDataType>(
-    Policy::template MakeVRegBlockDescriptor<Problem>());
+                     Policy::template MakeVRegBlockDescriptor<Problem>());
//------------------------------------------------------------------
// KT, Reg ->LDS ->Reg
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    kt_lds_ptr, Policy::template MakeShuffledKLdsWriteBlockDescriptor<Problem>());
auto shuffled_k_lds_write_window = make_tile_window(
-    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto kt_lds_read = make_tensor_view<address_space_enum::lds>(
    kt_lds_ptr, Policy::template MakeKTLdsReadBlockDescriptor<Problem>());
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
block_sync_lds();
-v_reg_tensor = load_tile(v_lds_read_window);
+auto v_reg_tensor = load_tile(v_lds_read_window);
//---------------------------- Loop Load in ----------------------------//
// Q: HBM ->Reg ->LDS
auto q_dram_window =
@@ -275,7 +272,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    q_lds_ptr, Policy::template MakeQLdsBlockDescriptor<Problem>());
auto q_lds_window =
-    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto q_lds_read_window =
    make_tile_window(q_lds_window.get_bottom_tensor_view(),
@@ -296,7 +293,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    qt_lds_ptr, Policy::template MakeShuffledQLdsWriteBlockDescriptor<Problem>());
auto shuffled_q_lds_write_window = make_tile_window(
-    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto qt_lds_read = make_tensor_view<address_space_enum::lds>(
    qt_lds_ptr, Policy::template MakeQTLdsReadBlockDescriptor<Problem>());
@@ -321,7 +318,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    do_lds_ptr, Policy::template MakeOGradLdsBlockDescriptor<Problem>());
auto do_lds_window =
-    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto do_lds_read_window =
    make_tile_window(do_lds_window.get_bottom_tensor_view(),
@@ -340,7 +337,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    dot_lds_ptr, Policy::template MakeShuffledOGradLdsWriteBlockDescriptor<Problem>());
auto shuffled_do_lds_write_window = make_tile_window(
-    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto dot_read_lds = make_tensor_view<address_space_enum::lds>(
    dot_lds_ptr, Policy::template MakeOGradTLdsReadBlockDescriptor<Problem>());
@@ -482,9 +479,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
index_t i_total_loops = 0;
index_t seqlen_q_step = seqlen_q_start;
-static_assert(kQKHeaddim == kK0, "kQKHeaddim should equal to kK0");
+static_assert(kQKHeaddim >= kK0, "kQKHeaddim should be equal or greater than kK0");
static_assert(kM0 == kK1, "kM0 should equal to kK1");
-static_assert(kVHeaddim == kK2, "kVHeaddim should equal to kK2");
+static_assert(kVHeaddim >= kK2, "kVHeaddim should be equal or greater than kK2");
static_assert(kM0 == kK3, "kM0 should equal to kK3");
constexpr index_t k4_loops = kN0 / kK4;
...