Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
@@ -45,7 +45,6 @@ message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})
@@ -71,17 +70,17 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
## half
#find_path(HALF_INCLUDE_DIR half.hpp)
set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/half/include")
message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
rocm_create_package(
NAME CK-${CK_BACKEND}
DESCRIPTION "High Performance Composable Kernels for AMD GPUs"
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
LDCONFIG
)
## half
set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/include/half")
message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
## tidy
include(EnableCompilerWarnings)
set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
@@ -184,7 +183,6 @@ enable_clang_tidy(
-cppcoreguidelines-narrowing-conversions
-altera-struct-pack-align
-cppcoreguidelines-prefer-member-initializer
${CK_TIDY_CHECKS}
${CK_TIDY_ERRORS}
HEADER_FILTER
@@ -214,70 +212,36 @@ enable_cppcheck(
unmatchedSuppression
FORCE
SOURCES
host/host_tensor/src
host/driver_offline/src
composable_kernel/src/kernel_wrapper
library/src
INCLUDE
host/host_tensor/include
host/device/include
host/solver/include
host/driver_offline/include
composable_kernel/include/*
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/library/include
DEFINE
CPPCHECK=1
__linux__=1
)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
file(GLOB_RECURSE COMPOSABLE_KERNEL_HEADERS "composable_kernel/include/*/*.hpp")
file(GLOB_RECURSE DEVICE_OPS_HEADERS "device_operation/include/*.hpp")
file(GLOB_RECURSE DEVICE_OPS_SOURCE "device_operation/*.cpp")
configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
set(CK_HEADERS ${COMPOSABLE_KERNEL_HEADERS} ${DEVICE_OPS_HEADERS})
set(CK_SOURCE ${DEVICE_OPS_SOURCE})
add_library(composable_kernel
${CK_SOURCE}
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/device_operation/include>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/host/include>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/host/host_tensor/include>
)
# The following should eventually be removed
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include/utility>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description>
)
# clang_tidy_check(composable_kernel)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
target_compile_options(composable_kernel PRIVATE -Werror)
target_compile_options(composable_kernel PRIVATE -Weverything)
add_compile_options(-Werror)
add_compile_options(-Weverything)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/hip_version.hpp")
add_subdirectory(host)
add_subdirectory(device_operation)
add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(profiler)
add_subdirectory(test)
add_subdirectory(profiler)
FROM ubuntu:18.04
ARG ROCMVERSION=4.5
ARG ROCMVERSION=5.0
ARG OSDB_BKC_VERSION
RUN set -xe
@@ -17,7 +17,7 @@ def cmake_build(Map conf=[:]){
def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
def config_targets = conf.get("config_targets","check")
def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "")
def build_envs = "CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 " + conf.get("build_env","")
def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","")
def prefixpath = conf.get("prefixpath","/opt/rocm")
def setup_args = conf.get("setup_args","")
@@ -60,7 +60,8 @@ def cmake_build(Map conf=[:]){
cd build
"""
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(nproc) ${config_targets}")
// reduce parallelism when compiling; clang uses too much memory
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}")
def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """
@@ -177,15 +178,27 @@ pipeline {
// buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
// }
// }
stage('Build Profiler: gfx908')
stage('Build Profiler: Release, gfx908')
{
agent { label rocmnode("gfx908")}
agent { label rocmnode("nogpu")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
build_cmd = "make -j\$(nproc) -k ckProfiler"
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, build_cmd:build_cmd, no_reboot:true, build_type: 'Release')
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
}
}
stage('Build Profiler: Debug, gfx908')
{
agent { label rocmnode("nogpu")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
}
steps{
// allow failure until the debug build is stabilized (currently hits compiler crashes)
catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug')
}
}
}
stage('Clang Format') {
@@ -207,6 +220,24 @@ pipeline {
}
}
}
stage("Tests")
{
parallel
{
stage("Run Tests: gfx908")
{
agent{ label rocmnode("gfx908")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
}
}
}
}
// enable after the cmake file supports packaging
// stage("Packages") {
// when {
@@ -222,4 +253,4 @@ pipeline {
// }
// }
}
}
\ No newline at end of file
}
#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
template <typename GridwiseOp, typename... Xs>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
run_gridwise_operation(Xs... xs)
{
GridwiseOp{}.Run(xs...);
}
#endif
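For reference, run_gridwise_operation simply forwards its arguments to a default-constructed GridwiseOp, so it is launched like any other HIP kernel. Below is a minimal, self-contained sketch of such a launch; the ToyGridwiseOp functor and the grid/block sizes are illustrative assumptions and are not part of this commit.

#include <hip/hip_runtime.h>

// Illustrative (hypothetical) functor: each thread writes its global id.
// Any type with a __device__ Run(...) method can be dispatched through
// run_gridwise_operation in the same way.
struct ToyGridwiseOp
{
    __device__ void Run(int* out) const
    {
        const int gid = blockIdx.x * blockDim.x + threadIdx.x;
        out[gid] = gid;
    }
};

void launch_toy_op(int* d_out, int num_blocks, int block_size)
{
    // run_gridwise_operation<GridwiseOp, Xs...> is the kernel defined above.
    hipLaunchKernelGGL((run_gridwise_operation<ToyGridwiseOp, int*>),
                       dim3(num_blocks),
                       dim3(block_size),
                       0,       // dynamic LDS size in bytes
                       nullptr, // default stream
                       d_out);
}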
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_blockwise
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
using blockwise_reduce =
BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
(void)indices_global;
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto in_block_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const posUnaryOpType posUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
constexpr auto in_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
__syncthreads();
// do element-wise pre-reduction operation
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
}
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
__shared__ int block_indices_buffer[BlockBufferSize];
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
auto in_block_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
auto in_block_idx_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
constexpr auto in_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
int indexOffset = 0;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
// load block data from global memory to LDS; double buffering is not used yet (to be improved)
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
__syncthreads();
// construct the indices for the current toReduce blocks
blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
// apply the unary operation before reducing (needed by AMAX); for MIN/MAX, nothing is
// actually done here
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce2(in_block_val_buf,
in_block_idx_buf,
BlocksInOneOp,
accuValue_buf(I0),
accuIndex_buf(I0));
indexOffset += BlockBufferSize;
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ ws_values_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)origReduceLen;
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
__shared__ int block_indices_buffer[BlockBufferSize];
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, src2dDesc.GetElementSpaceSize());
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
auto in_block_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
auto in_block_idx_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
constexpr auto in_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_val_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
auto blockwise_src_idx_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
int,
int,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
// load block data from global memory to LDS; double buffering is not used yet (to be improved)
blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf);
blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf);
blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf);
blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf);
__syncthreads();
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce2(in_block_val_buf,
in_block_idx_buf,
BlocksInOneOp,
accuValue_buf(I0),
accuIndex_buf(I0));
blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(block_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(block_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(block_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
};
} // namespace ck
#endif
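A reading aid for the loop structure above: each block reduces one row of the 2D source in LDS tiles of BlockBufferSize = GredAccessesPerThreadInBlock * BlockSize elements, and the final iteration reduces only the leftover sub-tiles. The host-side C++ sketch below (function and parameter names chosen for illustration only) reproduces the toReduceBlocks / BlocksInOneOp arithmetic so it can be checked in isolation.

#include <cassert>
#include <cstdio>

// Mirrors the tiling arithmetic in GridwiseReduction_xy_to_x_blockwise::Run:
// a row of toReduceLength elements is consumed in tiles of
// BlockSize * GredAccessesPerThreadInBlock elements, and the last iteration
// reduces only the remaining sub-tiles (BlocksInOneOp).
void print_tiling(int toReduceLength, int BlockSize, int GredAccessesPerThreadInBlock)
{
    const int toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

    for(int reducedBlocks = 0; reducedBlocks < toReduceBlocks;
        reducedBlocks += GredAccessesPerThreadInBlock)
    {
        const int BlocksInOneOp =
            (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                ? GredAccessesPerThreadInBlock
                : toReduceBlocks - reducedBlocks;

        assert(BlocksInOneOp >= 1 && BlocksInOneOp <= GredAccessesPerThreadInBlock);
        std::printf("iteration at block %d reduces %d sub-tile(s)\n",
                    reducedBlocks, BlocksInOneOp);
    }
}

int main()
{
    print_tiling(3000, /*BlockSize=*/256, /*GredAccessesPerThreadInBlock=*/4);
    return 0;
}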
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredThreadBufferLength>
struct GridwiseReduction_xy_to_x_direct_threadwise
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
(void)indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_buf;
using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const posUnaryOpType posUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// do element-wise pre-reduction operation
threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_buf;
using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
index_t indexStart = 0;
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// apply the unary operation before reducing (needed by AMAX); for MIN/MAX, nothing is
// actually done here
threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce2(
in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart);
indexStart += GredThreadBufferLength;
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ ws_values_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)origReduceLen;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, src2dDesc.GetElementSpaceSize());
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredThreadBufferLength, true> in_thread_idx_buf;
using threadwise_reduce = ThreadReduceWithIndicesInput<decltype(in_thread_val_buf),
decltype(in_thread_idx_buf),
opReduce,
nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
int,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_val_load.Run(src2dDesc,
src_global_val_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_val_buf);
threadwise_src_idx_load.Run(src2dDesc,
src_global_idx_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_idx_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce(
in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));
threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
};
} // namespace ck
#endif
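All Run variants above share the same alpha/beta epilogue: the accumulated value is scaled by alpha (skipped when alpha equals one), converted to dstDataType, and, when beta is non-zero, blended with the value already stored at the destination. A scalar restatement of that epilogue follows, using float for both data types purely for illustration.

#include <cstdio>

// Scalar restatement of the epilogue shared by the Run<1>/Run<2>/Run<3> variants:
// dst = dstDataType(alpha * accumulated) + beta * previous_dst,
// where the scaling steps are skipped for alpha == 1 and beta == 0.
float reduction_epilogue(float accumulated, float alpha, float beta, float previous_dst)
{
    if(alpha != 1.0f)
        accumulated *= alpha;

    float dst = accumulated; // stands in for type_convert<dstDataType>(...)

    if(beta != 0.0f)
        dst += previous_dst * beta;

    return dst;
}

int main()
{
    // reduced value 10, alpha 2, beta 0.5, previous output 4 -> 2*10 + 0.5*4 = 22
    std::printf("%f\n", reduction_epilogue(10.0f, 2.0f, 0.5f, 4.0f));
    return 0;
}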
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_warpwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredAccessesPerThreadInWarp>
struct GridwiseReduction_xy_to_x_direct_warpwise
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
(void)indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
in_thread_buf;
using warpwise_reduce =
WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const posUnaryOpType posUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
constexpr auto in_thread_copy_step =
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += warpSize * GredAccessesPerThreadInWarp)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// do element-wise pre-reduction operation
warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the warp-wise reduction on data of all thread buffers
warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the warp stores the reduced result to the global location
// representing the Warp
if(thread_inwarp_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf(I0) * beta;
}
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
}
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
in_thread_buf;
using warpwise_reduce =
WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
constexpr auto in_thread_copy_step =
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
index_t indexOffset = 0;
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += warpSize * GredAccessesPerThreadInWarp)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// apply the unary operation before reducing (needed by AMAX); for MIN/MAX, nothing is
// actually done here
warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the warp-wise reduction on data of all thread buffers
warpwise_reduce::Reduce2(
in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset);
indexOffset += warpSize * GredAccessesPerThreadInWarp;
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the warp stores the reduced result to the global location
// representing the Warp
if(thread_inwarp_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ ws_values_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)origReduceLen;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, src2dDesc.GetElementSpaceSize());
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredAccessesPerThreadInWarp, true>
in_thread_idx_buf;
using warpwise_reduce = WarpReduceWithIndicesInput<decltype(in_thread_val_buf),
decltype(in_thread_idx_buf),
BlockSize,
opReduce,
nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
int,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
constexpr auto in_thread_copy_step =
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += warpSize * GredAccessesPerThreadInWarp)
{
threadwise_src_val_load.Run(src2dDesc,
src_global_val_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_val_buf);
threadwise_src_idx_load.Run(src2dDesc,
src_global_idx_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_idx_buf);
// do the warp-wise reduction on data of all thread buffers
warpwise_reduce::Reduce(
in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));
threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the warp stores the reduced result to the global location
// representing the Warp
if(thread_inwarp_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
};
} // namespace ck
#endif
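The warp-wise variant differs from the block-wise one mainly in its indexing: every warp owns one output row, each lane starts GredAccessesPerThreadInWarp columns after the previous lane, and the whole warp advances warpSize * GredAccessesPerThreadInWarp columns per iteration. The sketch below spells out that index arithmetic; warpSize = 64 is an assumption here (gfx9-class wavefront size), whereas the kernel itself uses the built-in warpSize.

#include <cstdio>

// Index arithmetic used by GridwiseReduction_xy_to_x_direct_warpwise:
// warp_global_1d_id selects the output row, thread_inwarp_id selects the
// lane's starting column, and the slice window moves by
// warpSize * GredAccessesPerThreadInWarp columns per loop iteration.
int main()
{
    const int warpSize                    = 64; // assumed wavefront size
    const int GredAccessesPerThreadInWarp = 2;

    const int thread_ids[] = {0, 63, 64, 255};
    for(int thread_global_1d_id : thread_ids)
    {
        const int warp_global_1d_id = thread_global_1d_id / warpSize;
        const int thread_inwarp_id  = thread_global_1d_id % warpSize;

        std::printf("thread %3d -> row %d, start column %d, column step %d\n",
                    thread_global_1d_id,
                    warp_global_1d_id,
                    thread_inwarp_id * GredAccessesPerThreadInWarp,
                    warpSize * GredAccessesPerThreadInWarp);
    }
    return 0;
}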
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
          typename dstDataType, // used only for the beta argument, which this kernel ignores
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_multiblock
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;
static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
using blockwise_reduce =
BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global)
{
(void)ws_indices_global;
(void)alpha; // unused
(void)beta; // unused
const auto zeroVal = opReduce::GetReductionZeroVal();
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto in_block_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / BlkGroupSize;
const index_t block_local_id = block_global_id % BlkGroupSize;
const index_t reduceSizePerBlock =
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
BlockBufferSize) *
BlockBufferSize;
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(
src2dDesc,
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
__syncthreads();
// do element-wise pre-reduction operation
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
const auto workspace_desc =
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
auto threadwise_workspace_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
srcDataType,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
threadwise_workspace_store.Run(ReducedDataDesc,
make_tuple(I0),
accuValue_buf,
workspace_desc,
workspace_global_buf);
}
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global)
{
(void)alpha; // unused
(void)beta; // unused
const auto zeroVal = opReduce::GetReductionZeroVal();
// LDS
__shared__ compType p_in_block_values_buffer[BlockBufferSize];
__shared__ int p_in_block_indices_buffer[BlockBufferSize];
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto in_block_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_values_buffer, BlockBufferSize);
auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_in_block_indices_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / BlkGroupSize;
const index_t block_local_id = block_global_id % BlkGroupSize;
const index_t reduceSizePerBlock =
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
BlockBufferSize) *
BlockBufferSize;
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(
src2dDesc,
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
int indexOffset = block_local_id * reduceSizePerBlock;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
__syncthreads();
            // apply the unary operation before reducing; it is needed by AMAX, while for
            // MIN/MAX nothing is actually done here
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce2(in_block_val_buf,
in_block_idx_buf,
BlocksInOneOp,
accuValue_buf(I0),
accuIndex_buf(I0));
indexOffset += BlockBufferSize;
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
const auto workspace_desc =
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
auto threadwise_workspace_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
srcDataType,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
auto threadwise_workspace_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
threadwise_workspace_val_store.Run(ReducedDataDesc,
make_tuple(I0),
accuValue_buf,
workspace_desc,
workspace_global_val_buf);
threadwise_workspace_idx_store.Run(ReducedDataDesc,
make_tuple(I0),
accuIndex_buf,
workspace_desc,
workspace_global_idx_buf);
}
};
};
} // namespace ck
#endif
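// Illustrative sketch (not part of the library): how reduceSizePerBlock is derived in the
// multi-block kernel above. The reduced dimension is first split evenly over the BlkGroupSize
// blocks of a block group, and each block's share is then rounded up to a whole number of
// BlockBufferSize tiles so that the copy loop always advances in full BlockBufferSize steps.
// The function name below is an assumption for this example only.
inline int compute_reduce_size_per_block(int toReduceLength, int BlkGroupSize, int BlockBufferSize)
{
    const int perGroup = (toReduceLength + BlkGroupSize - 1) / BlkGroupSize; // ceil split over blocks
    const int numTiles = (perGroup + BlockBufferSize - 1) / BlockBufferSize; // ceil to whole tiles
    return numTiles * BlockBufferSize;
    // e.g. toReduceLength = 1000, BlkGroupSize = 4, BlockBufferSize = 256
    //      -> perGroup = 250, numTiles = 1, reduceSizePerBlock = 256
}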
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename buffer2dDescType,
bool blockIsOneRow,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct BlockwiseReduction_2d_block_buffer
{
using compType = typename opReduce::dataType;
static constexpr auto buffer2dDesc = buffer2dDescType{};
static constexpr index_t BlockSize =
blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
static constexpr index_t NumBlocks =
blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface does not accumulate on indices
template <typename BufferType>
__device__ static void
Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
{
const index_t thread_local_id = get_thread_local_1d_id();
compType lAccuData = opReduce::GetReductionZeroVal();
index_t offset;
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
compType opData = type_convert<compType>(block_buffer[offset]);
binop::calculate(lAccuData, opData);
}
offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
block_buffer(offset) = lAccuData;
__syncthreads();
for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
{
if(thread_local_id < indOffset)
{
index_t offset1 =
blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
index_t offset2 =
blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
compType opData1 = type_convert<compType>(block_buffer[offset1]);
compType opData2 = type_convert<compType>(block_buffer[offset2]);
binop::calculate(opData1, opData2);
block_buffer(offset1) = type_convert<compType>(opData1);
}
__syncthreads();
}
if(thread_local_id == 0)
{
compType tmpVal = type_convert<compType>(block_buffer[0]);
binop::calculate(accuData, tmpVal);
}
};
// This interface accumulates on both data values and indices
template <typename BufferType, typename IdxBufferType>
__device__ static void Reduce2(BufferType& block_buffer,
IdxBufferType& block_indices_buffer,
index_t toReduceBlocks,
compType& accuData,
int& accuIndex)
{
const index_t thread_local_id = get_thread_local_1d_id();
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
if constexpr(blockIsOneRow)
{
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
{
if(thread_local_id % (indOffset * 2) == 0)
{
index_t offset1 =
buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id));
index_t offset2 = buffer2dDesc.CalculateOffset(
make_tuple(otherDimInd, thread_local_id + indOffset));
compType currVal1 = type_convert<compType>(block_buffer[offset1]);
compType currVal2 = type_convert<compType>(block_buffer[offset2]);
int currIndex1 = block_indices_buffer[offset1];
int currIndex2 = block_indices_buffer[offset2];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
block_buffer(offset1) = type_convert<compType>(currVal1);
block_indices_buffer(offset1) = currIndex1;
}
__syncthreads();
}
}
if(thread_local_id == 0)
{
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));
compType tmpVal = type_convert<compType>(block_buffer[offset]);
int tmpIndex = block_indices_buffer[offset];
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
}
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
}
}
else
{
index_t offset;
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
compType currVal = type_convert<compType>(block_buffer[offset]);
int currIndex = block_indices_buffer[offset];
binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
}
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
block_buffer(offset) = lAccuData;
block_indices_buffer(offset) = lAccuIndex;
__syncthreads();
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
{
if(thread_local_id % (indOffset * 2) == 0)
{
index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
index_t offset2 =
buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
compType currVal1 = type_convert<compType>(block_buffer[offset1]);
compType currVal2 = type_convert<compType>(block_buffer[offset2]);
int currIndex1 = block_indices_buffer[offset1];
int currIndex2 = block_indices_buffer[offset2];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
block_buffer(offset1) = type_convert<compType>(currVal1);
block_indices_buffer(offset1) = currIndex1;
}
__syncthreads();
}
if(thread_local_id == 0)
{
compType tmpVal = type_convert<compType>(block_buffer[0]);
int tmpIndex = block_indices_buffer[0];
binop::calculate(accuData, tmpVal, accuIndex, tmpIndex);
}
}
};
template <typename BufferType>
__device__ static void set_buffer_value(BufferType& block_buffer, compType value)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_buffer(offset) = value;
__syncthreads();
}
};
    // Initialize the block-wise indices buffer; the index for each element in the block-wise
    // data buffer is calculated from its position in the buffer and the global starting index
template <typename IdxBufferType>
__device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_indices_buffer(offset) = offset + indexStart;
__syncthreads();
}
};
// Execute unary operation on the block buffer elements
template <typename unary_op_type, typename BufferType>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_buffer(offset) = unary_op(block_buffer[offset]);
__syncthreads();
}
};
};
}; // end of namespace ck
#endif
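// Illustrative sketch (not part of the library): the second phase of Reduce() above is a
// standard shared-memory tree reduction. Stripped of the 2d descriptor indexing it looks like
// the device function below; the library's binop additionally handles NaN propagation and,
// in Reduce2, index tracking. BLOCK_SIZE, buf and reduce_op are assumptions for this example.
template <int BLOCK_SIZE, typename T, typename Op>
__device__ void tree_reduce_in_lds(T* buf, Op reduce_op)
{
    const int tid = hipThreadIdx_x;

    for(int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2)
    {
        if(tid < stride)
            buf[tid] = reduce_op(buf[tid], buf[tid + stride]); // fold the upper half into the lower half

        __syncthreads(); // every thread must see the updated buffer before the next stride
    }
    // after the loop, buf[0] holds the reduction over all BLOCK_SIZE elements
}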
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
using compType = typename opReduce::dataType;
    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction requires a StaticBuffer!");
    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be the same as compType!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface does not accumulate on indices
__device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { binop::calculate(accuData, thread_buffer[I]); });
};
// This interface accumulates on both data values and indices and
    // is called by the Direct_ThreadWise reduction method during the first-pass reduction
__device__ static void
Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
int currIndex = I + indexStart;
binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex);
});
};
// Set the elements in the per-thread buffer to a specific value
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
};
};
template <typename BufferType,
typename IdxBufferType,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
using compType = typename opReduce::dataType;
    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction requires a StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Thread-wise reduction requires a StaticBuffer for indices!");
    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be the same as compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");
    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same size!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface accumulates on both data values and indices and
    // is called by the Direct_ThreadWise reduction method during the second-pass reduction
__device__ static void Reduce(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]);
});
};
// Set the elements in the per-thread buffer to a specific value
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
};
};
}; // end of namespace ck
#endif
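// Illustrative sketch (not part of the library): what the indexed accumulation in
// ThreadReduce::Reduce2 above amounts to for a MAX-type reduction. The real code folds
// through binop::calculate, which is selected from opReduce and also handles NaN
// propagation; the function name and float type below are assumptions for this example.
__device__ inline void accumulate_max_with_index(
    float& accuData, int& accuIndex, const float* thread_buffer, int len, int indexStart)
{
    for(int i = 0; i < len; ++i)
    {
        const int currIndex = i + indexStart; // position in the buffer plus the thread's global start

        if(thread_buffer[i] > accuData) // keep the larger value and remember where it came from
        {
            accuData  = thread_buffer[i];
            accuIndex = currIndex;
        }
    }
}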
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename BufferType, index_t BlockSize, typename opReduce, NanPropagation_t nanPropaOpt>
struct WarpReduce
{
using compType = typename opReduce::dataType;
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be a StaticBuffer!");
    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be the "
                  "same as compType!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
static constexpr bool have_builtin_shuffle =
std::is_same<compType, float>::value || std::is_same<compType, double>::value;
// This interface does not accumulate on indices
__device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
{
if constexpr(have_builtin_shuffle)
ReduceImpl1(thread_buffer, accuData);
else
ReduceImpl2(thread_buffer, accuData);
};
// This interface implementation uses HIP built-in device shuffling functions
__device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
{
compType lAccuData = opReduce::GetReductionZeroVal();
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
// synchronize among all threads in this warp
__all(1);
for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
{
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
binop::calculate(lAccuData, tmpVal);
__all(1);
}
binop::calculate(accuData, lAccuData);
};
    // This interface implementation does not use HIP built-in device shuffling functions,
    // since HIP does not provide built-in shuffling functions for fp16
__device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
{
compType lAccuData = opReduce::GetReductionZeroVal();
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
__syncthreads();
index_t thread_id = get_thread_local_1d_id();
index_t warpId = thread_id / warpSize;
index_t thread_inwarp_id = thread_id % warpSize;
__shared__ compType shuffle_buffer[BlockSize];
compType* myBuffer = &shuffle_buffer[warpId * warpSize];
myBuffer[thread_inwarp_id] = lAccuData;
__syncthreads();
for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
{
if(thread_inwarp_id < stride)
{
compType currVal1 = myBuffer[thread_inwarp_id];
compType currVal2 = myBuffer[thread_inwarp_id + stride];
binop::calculate(currVal1, currVal2);
myBuffer[thread_inwarp_id] = currVal1;
}
__syncthreads();
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, myBuffer[0]);
};
    // This interface accumulates on both data values and indices and is called by the
    // Direct_WarpWise reduction method during the first-pass reduction
__device__ static void
Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
{
if constexpr(have_builtin_shuffle)
Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
else
Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
};
// This interface implementation uses HIP built-in device shuffling functions
__device__ static void Reduce2Impl1(const BufferType& thread_buffer,
compType& accuData,
int& accuIndex,
int indexStart)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
});
// synchronize among all threads in this warp
__all(1);
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize);
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
__all(1);
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
};
    // This interface implementation does not use HIP built-in device shuffling functions,
    // since HIP does not provide built-in shuffling functions for fp16
__device__ static void Reduce2Impl2(const BufferType& thread_buffer,
compType& accuData,
int& accuIndex,
int indexStart)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
index_t thread_id = get_thread_local_1d_id();
index_t warpId = thread_id / warpSize;
index_t thread_inwarp_id = thread_id % warpSize;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
});
__shared__ compType shuffle_data_buffer[BlockSize];
__shared__ int shuffle_indices_buffer[BlockSize];
compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize];
myDataBuffer[thread_inwarp_id] = lAccuData;
myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
__syncthreads();
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType currVal1 = myDataBuffer[thread_inwarp_id];
compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
int currIndex1 = myIndicesBuffer[thread_inwarp_id];
int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
myDataBuffer[thread_inwarp_id] = currVal1;
myIndicesBuffer[thread_inwarp_id] = currIndex1;
__syncthreads();
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
};
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
__all(1);
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
__all(1);
};
};
template <typename BufferType,
typename IdxBufferType,
index_t BlockSize,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct WarpReduceWithIndicesInput
{
using compType = typename opReduce::dataType;
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be a StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Per-thread indices buffer for WarpWise reduction should be a StaticBuffer!");
    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be the "
                  "same as compType!");
    static_assert(
        std::is_same<typename IdxBufferType::type, index_t>::value,
        "Indices type of per-thread StaticBuffer for WarpWise reduction should be index_t!");
    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same size!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
static constexpr bool have_builtin_shuffle =
std::is_same<compType, float>::value || std::is_same<compType, double>::value;
    // This interface accumulates on both data values and indices and is called by the
    // Direct_WarpWise reduction method during the second-pass reduction
__device__ static void Reduce(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
if constexpr(have_builtin_shuffle)
ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
else
ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
};
// This interface implementation uses HIP built-in device shuffling functions
__device__ static void ReduceImpl1(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
});
// synchronize among all threads in this warp
__all(1);
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize);
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
__all(1);
}
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
};
    // This interface implementation does not use HIP built-in device shuffling functions,
    // since HIP does not provide built-in shuffling functions for fp16
__device__ static void ReduceImpl2(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
index_t thread_id = get_thread_local_1d_id();
index_t warpId = thread_id / warpSize;
index_t thread_inwarp_id = thread_id % warpSize;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
});
__shared__ compType shuffle_data_buffer[BlockSize];
__shared__ int shuffle_indices_buffer[BlockSize];
compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize];
myDataBuffer[thread_inwarp_id] = lAccuData;
myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
__syncthreads();
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType currVal1 = myDataBuffer[thread_inwarp_id];
compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
int currIndex1 = myIndicesBuffer[thread_inwarp_id];
int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
myDataBuffer[thread_inwarp_id] = currVal1;
myIndicesBuffer[thread_inwarp_id] = currIndex1;
__syncthreads();
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
};
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
__all(1);
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
__all(1);
};
};
}; // end of namespace ck
#endif
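// Illustrative sketch (not part of the library): ReduceImpl1 above relies on the HIP
// cross-lane __shfl_down intrinsic. A minimal float warp sum with the same shape is shown
// below; the library instead folds through binop::calculate starting from the reduction's
// zero value. The function name below is an assumption for this example only.
__device__ inline float warp_sum_shfl_down(float lane_val)
{
    for(int stride = warpSize / 2; stride > 0; stride /= 2)
        lane_val += __shfl_down(lane_val, stride, warpSize); // pull the value from the lane 'stride' ahead

    return lane_val; // lane 0 of each warp ends up holding the sum over the whole warp
}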
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t M1PerThread = CK_PARAM_M1PerThread;
constexpr index_t N1PerThread = CK_PARAM_N1PerThread;
constexpr index_t KPerThread = CK_PARAM_KPerThread;
constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10;
constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10;
constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11;
constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11;
using ABlockTransferThreadSliceLengths_K_M0_M1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1>;
using ABlockTransferThreadClusterLengths_K_M0_M1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_M1 =
CK_PARAM_ABlockTransferDstScalarPerVector_M1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K_N0_N1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1>;
using BBlockTransferThreadClusterLengths_K_N0_N1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_N1 =
CK_PARAM_BBlockTransferDstScalarPerVector_N1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HAS_MAIN_KBLOCK_LOOP);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP);
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
int wi,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k_m0_m1_grid_desc,
void* p_b_k_n0_n1_grid_desc,
void* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
void* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo));
const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW));
const auto a_k_m_grid_desc = descs[I0];
const auto b_k_n_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AKMGridDesc = decltype(a_k_m_grid_desc);
using BKNGridDesc = decltype(b_k_n_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using AGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
                                        InMemoryDataOperationEnum_t::Set, /* TODO: tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
auto c_m0_m10_m11_n0_n10_n11_grid_desc =
GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<decltype(a_k_m0_m1_grid_desc)*>(p_a_k_m0_m1_grid_desc) = a_k_m0_m1_grid_desc;
*static_cast<decltype(b_k_n0_n1_grid_desc)*>(p_b_k_n0_n1_grid_desc) = b_k_n0_n1_grid_desc;
*static_cast<decltype(c_m0_m10_m11_n0_n10_n11_grid_desc)*>(
p_c_m0_m10_m11_n0_n10_n11_grid_desc) = c_m0_m10_m11_n0_n10_n11_grid_desc;
*static_cast<decltype(cblockid_to_m0_n0_block_cluster_adaptor)*>(
p_cblockid_to_m0_n0_block_cluster_adaptor) = cblockid_to_m0_n0_block_cluster_adaptor;
};
};
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k_m0_m1_grid_desc,
const void CONSTANT* p_b_k_n0_n1_grid_desc,
const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1));
constexpr auto a_k_m_grid_desc = descs[I0];
constexpr auto b_k_n_grid_desc = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AKMGridDesc = decltype(a_k_m_grid_desc);
using BKNGridDesc = decltype(b_k_n_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using AGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
                                        InMemoryDataOperationEnum_t::Set, /* TODO: tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
constexpr auto a_k_m0_m1_grid_desc_tmp =
GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
constexpr auto b_k_n0_n1_grid_desc_tmp =
GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
constexpr auto c_m0_m10_m11_n0_n10_n11_grid_desc_tmp =
GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc_tmp);
using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc_tmp);
using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k_m0_m1_grid_desc =
*reinterpret_cast<const AKM0M1GridDesc*>((const void*)p_a_k_m0_m1_grid_desc);
const auto b_k_n0_n1_grid_desc =
*reinterpret_cast<const BKN0N1GridDesc*>((const void*)p_b_k_n0_n1_grid_desc);
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
(const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc);
const auto cblockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
cblockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
};
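// Illustrative sketch (not part of the library): the output spatial sizes computed in the
// *_prepare kernel above follow the usual convolution shape formula, where the effective
// filter extent after dilation is dilation * (filter_size - 1) + 1. The function name below
// is an assumption for this example only.
inline int conv_output_size(
    int in_size, int left_pad, int right_pad, int dilation, int filter_size, int stride)
{
    return (in_size + left_pad + right_pad - dilation * (filter_size - 1) - 1) / stride + 1;
    // e.g. hi = 28, pads = 1/1, dilation = 1, y = 3, stride = 1  ->  ho = 28
}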
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t MPerWave = CK_PARAM_MPerWave;
constexpr index_t NPerWave = CK_PARAM_NPerWave;
constexpr index_t MRepeat = CK_PARAM_MRepeat;
constexpr index_t NRepeat = CK_PARAM_NRepeat;
constexpr index_t K1 = CK_PARAM_K1;
using ABlockTransferThreadSliceLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1>;
using ABlockTransferThreadClusterLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_K1 =
CK_PARAM_ABlockTransferDstScalarPerVector_K1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1>;
using BBlockTransferThreadClusterLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_K1 =
CK_PARAM_BBlockTransferDstScalarPerVector_K1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
int wi,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k0_m_k1_grid_desc,
void* p_b_k0_n_k1_grid_desc,
void* p_c_m0_m1_m2_n_grid_desc,
void* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo));
const auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW),
Number<K1>{});
const auto a_k0_m_k1_grid_desc = descs[I0];
const auto b_k0_n_k1_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using AGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<remove_cv_t<decltype(a_k0_m_k1_grid_desc)>*>(p_a_k0_m_k1_grid_desc) =
a_k0_m_k1_grid_desc;
*static_cast<remove_cv_t<decltype(b_k0_n_k1_grid_desc)>*>(p_b_k0_n_k1_grid_desc) =
b_k0_n_k1_grid_desc;
*static_cast<decltype(c_m0_m1_m2_n_grid_desc)*>(p_c_m0_m1_m2_n_grid_desc) =
c_m0_m1_m2_n_grid_desc;
*static_cast<decltype(cblockid_to_m0_n0_block_cluster_adaptor)*>(
p_cblockid_to_m0_n0_block_cluster_adaptor) = cblockid_to_m0_n0_block_cluster_adaptor;
}
};
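// The compute kernel: recovers the descriptors prepared above through CONSTANT
// pointers, reserves the LDS workspace required by the gridwise GEMM, and runs the
// xdlops pipeline on the A/B/C grids.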
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m_k1_grid_desc,
const void CONSTANT* p_b_k0_n_k1_grid_desc,
const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
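// The fixed 256x256x28x28 / 3x3 reference shapes below never touch the GPU data
// path; they exist only so that decltype can recover the descriptor types under
// which the prepared descriptors were stored in the workspace.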
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
Number<K1>{});
constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k0_m_k1_grid_desc =
*reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
const auto b_k0_n_k1_grid_desc =
*reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
const auto c_m0_m1_m2_n_grid_desc =
*reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
const auto cblockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
cblockid_to_m0_n0_block_cluster_adaptor);
};
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t MPerWave = CK_PARAM_MPerWave;
constexpr index_t NPerWave = CK_PARAM_NPerWave;
constexpr index_t MRepeat = CK_PARAM_MRepeat;
constexpr index_t NRepeat = CK_PARAM_NRepeat;
constexpr index_t K1 = CK_PARAM_K1;
using ABlockTransferThreadSliceLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1>;
using ABlockTransferThreadClusterLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_K1 =
CK_PARAM_ABlockTransferDstScalarPerVector_K1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1>;
using BBlockTransferThreadClusterLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_K1 =
CK_PARAM_BBlockTransferDstScalarPerVector_K1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
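// NHWC/KYXC/NHWK variant of the same prepare kernel. In this layout the input
// tensor is lowered onto GEMM A (and the weight onto GEMM B), which is why the
// longer step-hack sequences now sit on the A side instead of the B side.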
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
int n,
int hi,
int wi,
int c,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k0_m_k1_grid_desc,
void* p_b_k0_n_k1_grid_desc,
void* p_c_m0_m1_m2_n_grid_desc,
void* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(n, hi, wi, c));
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(make_tuple(k, y, x, c));
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(make_tuple(n, ho, wo, k));
const auto descs = transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(
in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW),
Number<K1>{});
const auto a_k0_m_k1_grid_desc = descs[I0];
const auto b_k0_n_k1_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<remove_cv_t<decltype(a_k0_m_k1_grid_desc)>*>(p_a_k0_m_k1_grid_desc) =
a_k0_m_k1_grid_desc;
*static_cast<remove_cv_t<decltype(b_k0_n_k1_grid_desc)>*>(p_b_k0_n_k1_grid_desc) =
b_k0_n_k1_grid_desc;
*static_cast<decltype(c_m0_m1_m2_n_grid_desc)*>(p_c_m0_m1_m2_n_grid_desc) =
c_m0_m1_m2_n_grid_desc;
*static_cast<decltype(cblockid_to_m0_n0_block_cluster_adaptor)*>(
p_cblockid_to_m0_n0_block_cluster_adaptor) = cblockid_to_m0_n0_block_cluster_adaptor;
}
};
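// Compute kernel for the NHWC variant; same structure as the NCHW one: read the
// prepared descriptors from CONSTANT memory, allocate LDS, and run the gridwise
// xdlops GEMM.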
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m_k1_grid_desc,
const void CONSTANT* p_b_k0_n_k1_grid_desc,
const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto in_n_hi_wi_c_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
constexpr auto wei_k_y_x_c_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 3, 3, 256));
constexpr auto out_n_ho_wo_k_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
Number<K1>{});
constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k0_m_k1_grid_desc =
*reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
const auto b_k0_n_k1_grid_desc =
*reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
const auto c_m0_m1_m2_n_grid_desc =
*reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
const auto cblockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
cblockid_to_m0_n0_block_cluster_adaptor);
};
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr auto GN0 = Number<CK_PARAM_GN0>{};
constexpr auto GK1 = Number<CK_PARAM_GK1>{};
constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11;
constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11;
constexpr index_t GK0PerBlock = CK_PARAM_GK0PerBlock;
constexpr index_t BM1PerThreadBM11 = CK_PARAM_BM1PerThreadBM11;
constexpr index_t BN1PerThreadBN11 = CK_PARAM_BN1PerThreadBN11;
constexpr index_t BK0PerThread = CK_PARAM_BK0PerThread;
using BM10BN10ThreadClusterBM10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBM10Xs>;
using BM10BN10ThreadClusterBN10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBN10Xs>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 2, 3, 0, 4>;
using ABlockTransferSrcAccessOrder = Sequence<3, 2, 1, 0, 4>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterArrangeOrder = Sequence<0, 4, 1, 2, 3>;
using BBlockTransferSrcAccessOrder = Sequence<4, 3, 2, 0, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;
using CThreadTransferSrcDstAccessOrder = Sequence<3, 4, 5, 0, 1, 2>;
constexpr index_t CThreadTransferSrcDstVectorDim = 5;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HasMainKBlockLoop);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HasDoubleTailKBlockLoop);
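// v6r1 dlops path: the convolution is lowered to a tensor contraction instead of a
// plain GEMM. Its prepare kernel packs all four derived descriptors (A, B, C and
// the block-id-to-tile map) into a single tuple stored at p_desc_tuple.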
extern "C" __global__ void
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(int N_,
int C_,
int Hi_,
int Wi_,
int K_,
int Y_,
int X_,
int ConvStrideH_,
int ConvStrideW_,
int ConvDilationH_,
int ConvDilationW_,
int InLeftPadH_,
int InLeftPadW_,
int InRightPadH_,
int InRightPadW_,
void* p_desc_tuple)
{
index_t N = static_cast<index_t>(N_);
index_t C = static_cast<index_t>(C_);
index_t Hi = static_cast<index_t>(Hi_);
index_t Wi = static_cast<index_t>(Wi_);
index_t K = static_cast<index_t>(K_);
index_t Y = static_cast<index_t>(Y_);
index_t X = static_cast<index_t>(X_);
index_t ConvStrideH = static_cast<index_t>(ConvStrideH_);
index_t ConvStrideW = static_cast<index_t>(ConvStrideW_);
index_t ConvDilationH = static_cast<index_t>(ConvDilationH_);
index_t ConvDilationW = static_cast<index_t>(ConvDilationW_);
index_t InLeftPadH = static_cast<index_t>(InLeftPadH_);
index_t InLeftPadW = static_cast<index_t>(InLeftPadW_);
index_t InRightPadH = static_cast<index_t>(InRightPadH_);
index_t InRightPadW = static_cast<index_t>(InRightPadW_);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t Ho =
(Hi + InLeftPadH + InRightPadH - ConvDilationH * (Y - 1) - 1) / ConvStrideH + 1;
const index_t Wo =
(Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C, Hi, Wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C, Y, X));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(ConvStrideH, ConvStrideW),
make_tuple(ConvDilationH, ConvDilationW),
make_tuple(InLeftPadH, InLeftPadW),
make_tuple(InRightPadH, InRightPadW),
GN0,
GK1);
const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using CGridStepHacks = decltype(make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;
using GridwiseContraction =
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AGridDesc_GK0_GM0_GM1_GK1,
BGridDesc_GK0_GN0_GN1_GK1,
CGridDesc_GM0_GM1_GN0_GN1,
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorContiguousDimOrder,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
auto desc_tuple =
make_tuple(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
a_grid_desc_gk0_gm0_gm1_gk1),
GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
b_grid_desc_gk0_gn0_gn1_gk1),
GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
c_grid_desc_gm0_gm1_gn0_gn1),
GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
c_grid_desc_gm0_gm1_gn0_gn1));
*static_cast<decltype(desc_tuple)*>(p_desc_tuple) = desc_tuple;
}
};
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_desc_tuple)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
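// As in the xdlops wrappers, the constexpr reference shapes below only pin down the
// descriptor types; the live descriptors are read back from p_desc_tuple further down.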
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
GN0,
GK1);
constexpr auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
constexpr auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
constexpr auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using CGridStepHacks = decltype(make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;
using GridwiseContraction =
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AGridDesc_GK0_GM0_GM1_GK1,
BGridDesc_GK0_GN0_GN1_GK1,
CGridDesc_GM0_GM1_GN0_GN1,
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorContiguousDimOrder,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
using AGridDesc_GK0_GM0_GM10_GM11_GK1 =
decltype(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
a_grid_desc_gk0_gm0_gm1_gk1));
using BGridDesc_GK0_GN0_GN10_GN11_GK1 =
decltype(GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
b_grid_desc_gk0_gn0_gn1_gk1));
using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 =
decltype(GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
c_grid_desc_gm0_gm1_gn0_gn1));
using CGridBlockCluster_BlockId_To_GM10_GN10 =
decltype(GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
c_grid_desc_gm0_gm1_gn0_gn1));
using DescTuple = decltype(make_tuple(AGridDesc_GK0_GM0_GM10_GM11_GK1{},
BGridDesc_GK0_GN0_GN10_GN11_GK1{},
CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1{},
CGridBlockCluster_BlockId_To_GM10_GN10{}));
const auto desc_tuple =
*reinterpret_cast<const DescTuple*>(cast_pointer_to_generic_address_space(p_desc_tuple));
const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = desc_tuple[I0];
const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = desc_tuple[I1];
const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = desc_tuple[I2];
const auto c_grid_block_cluster_blockid_to_gm10_gn10 = desc_tuple[I3];
constexpr index_t shared_block_size =
GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseContraction::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
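// Prepare kernel for the "reduce all dimensions" case: the source tensor is merged
// into a single dimension and then unmerged into a [1, totalLength] 2-D descriptor.
// When padding is requested, the reduce length is rounded up to a multiple of
// BlockSize * GredAccessesPerThreadInBlock. The 2-D source descriptor is written at
// workspace offset 0 and the 1-D destination descriptor at offset 2048.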
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
// accurate strides are not needed here: these dummy descriptors only fix the expected reference types
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
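// The reduction kernel proper: rebuilds the descriptors from the workspace via the
// reference types above and dispatches the blockwise 2-D reduction; RunId 2 is
// selected when flattened indices are requested, RunId 1 otherwise.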
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "Not all dimensions are reduced for this kernel!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
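// Variant in which only the trailing num_toReduceDims dimensions are reduced: the
// invariant dimensions are merged into the row of src2dDesc and the reduced
// dimensions into its column, while dst1dDesc merges all output dimensions.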
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
// accurate strides are not needed here: these dummy descriptors only fix the expected reference types
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
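// Indices are only produced when the reduce operation is indexable (e.g. MIN/MAX/AMAX)
// and the caller requested them via reduceIndicesOpt.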
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
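    // Build the tensor descriptors on the device from the runtime lengths/strides and store
    // them in the workspace: src2dDesc at byte offset 0, dst1dDesc at byte offset 2048.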
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
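    // All input dimensions are reduced by this kernel instance: the destination collapses to a
    // single element and the flattened source is viewed as a 1 x totalLength 2-d descriptor.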
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
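    // Round the per-block-group share of the reduce length up to a multiple of copySliceLen;
    // the leftover is covered by the pad transform below.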
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
template <index_t srcDims>
struct get_ref_desc_types
{
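    // Compile-time stand-ins whose types match the descriptors written by the prepare kernel;
    // the dummy lengths (8) and pad amounts (2) only matter for deducing those types.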
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    // accurate strides are not needed here to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
    // used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_dst_global;
(void)indices_global;
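    // Workspace layout: descriptors occupy the first 4096 bytes (src2dDesc at 0, dst1dDesc at
    // 2048); the multi-block partial-result buffer (ws_buf1) starts at 4096, and the optional
    // indices buffer (ws_buf2) sits ws_buf2_bytes_offset bytes past ws_buf1.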
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
BlkGroupSize,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
static_cast<int* const __restrict__>(ws_buf2_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "This kernel requires at least one invariant (non-reduced) dimension!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
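    // The invariant dimensions come first, so the output lengths are presumably the leading
    // dstDims entries of srcLengths; only the output strides are passed in separately.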
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
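    // Merge the leading invariant dimensions into dimension 0 and the trailing to-reduce
    // dimensions into dimension 1, yielding the 2-d view consumed by the reduction kernel.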
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
    // accurate strides are not needed here to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
    // used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_dst_global;
(void)indices_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
BlkGroupSize,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
static_cast<int* const __restrict__>(ws_buf2_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = GredThreadBufferLength;
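    // Pad the invariant length up to GridSize * BlockSize and the reduce length up to a
    // multiple of the thread buffer length, so that every thread sees full-sized tiles.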
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize - invariantLen;
auto dst1dDesc_2 =
            transform_tensor_descriptor(dstDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
}
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    // accurate strides are not needed here to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
    // used by the DirectThreadWise and DirectWarpWise methods
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredThreadBufferLength>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};