merge develop

9dce6851 · Jing Zhang · 3cc57101 · 5d37d7bf · 3cc57101 · 3cc57101
Commit 9dce6851 authored Mar 10, 2022 by Jing Zhang
20 changed files
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr index_t srcDims = CK_PARAM_IN_DIMS;
-constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
-constexpr index_t num_toReduceDims  = CK_PARAM_NUM_TOREDUCE_DIMS;
-constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
-using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
-using toReduceDims  = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!");
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
-// helper functions using variadic template arguments
-template <index_t... Ns>
-__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
-{
-    return make_tuple(static_cast<index_t>(lengths[Ns])...);
-};
-template <index_t arraySize>
-__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
-{
-    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
-    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
-    return make_tuple_from_array_and_index_seq(lengths, index_seq);
-};
-template <index_t... Ns>
-__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
-{
-    return make_tuple(Ns...);
-};
-extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
-                                                             int BlkGroupSize,
-                                                             int inLength0,
-                                                             int inLength1,
-                                                             int inLength2,
-                                                             int inLength3,
-                                                             int inLength4,
-                                                             int inLength5,
-                                                             int inStride0,
-                                                             int inStride1,
-                                                             int inStride2,
-                                                             int inStride3,
-                                                             int inStride4,
-                                                             int inStride5,
-                                                             int outStride0,
-                                                             int outStride1,
-                                                             int outStride2,
-                                                             int outStride3,
-                                                             int outStride4,
-                                                             int outStride5,
-                                                             void* __restrict__ ws_global)
-{
-    (void)BlkGroupSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
-    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
-    const int dstStrides[6] = {
-        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
-    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
-    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
-    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
-    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
-    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
-    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
-    const auto invariantDimLengths =
-        make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
-    auto src2dDesc =
-        transform_tensor_descriptor(srcDesc,
-                                    make_tuple(make_merge_transform(invariantDimLengths),
-                                               make_merge_transform(toReduceDimLengths)),
-                                    make_tuple(invariantDims{}, toReduceDims{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-    auto dst1dDesc = transform_tensor_descriptor(
-        dstDesc,
-        make_tuple(make_merge_transform(tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
-    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
-    constexpr auto copySliceLen = GredThreadBufferLength;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad1 = GridSize * BlockSize - invariantLen;
-        const auto srcPad2 =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
-                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if constexpr(dst1d_need_padding)
-    {
-        const auto dstPad = GridSize * BlockSize - invariantLen;
-        auto dst1dDesc_2 =
-            transform_tensor_descriptor(dst1dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
-    }
-};
-template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
-struct get_ref_desc_types
-{
-    static constexpr auto ref_toReduceDimLengths =
-        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
-    static constexpr auto ref_invariantDimLengths =
-        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
-    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
-    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
-    // don't have to use accurate strides to get an expected referrence type
-    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
-        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
-    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
-        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
-    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
-        ref_srcDesc,
-        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
-                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
-        make_tuple(invariantDims{}, toReduceDims{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
-        ref_dstDesc,
-        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
-    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
-    // used by the DirectThreadWise and DirectWarpWise method
-    using refType_src2dDesc_padded_12 =
-        decltype(transform_tensor_descriptor(ref_src2dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
-                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dst1dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dst1dDesc);
-};
-using refType_src2dDesc =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
-using refType_dst1dDesc =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
-using refType_src2dDesc_padded_12 =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
-        refType_src2dDesc_padded_12;
-using refType_dst1dDesc_padded =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
-        refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
-                                                     int BlkGroupSize,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)BlkGroupSize;
-    (void)ws_buf2_bytes_offset;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
-                                                                           srcDataType,
-                                                                           dstDataType,
-                                                                           compType,
-                                                                           decltype(src2dDesc),
-                                                                           decltype(dst1dDesc),
-                                                                           op,
-                                                                           nanPropaOpt,
-                                                                           reduceIndicesOpt,
-                                                                           true,
-                                                                           true,
-                                                                           GredThreadBufferLength>;
-    constexpr int RunId = need_indices ? 2 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(p_src_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(nullptr),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr index_t srcDims = CK_PARAM_IN_DIMS;
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
-// helper functions using variadic template arguments
-template <index_t... Ns>
-__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
-{
-    return make_tuple(static_cast<index_t>(lengths[Ns])...);
-};
-template <index_t arraySize>
-__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
-{
-    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
-    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
-    return make_tuple_from_array_and_index_seq(lengths, index_seq);
-};
-template <index_t... Ns>
-__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
-{
-    return make_tuple(Ns...);
-};
-extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
-                                                             int BlkGroupSize,
-                                                             int inLength0,
-                                                             int inLength1,
-                                                             int inLength2,
-                                                             int inLength3,
-                                                             int inLength4,
-                                                             int inLength5,
-                                                             int inStride0,
-                                                             int inStride1,
-                                                             int inStride2,
-                                                             int inStride3,
-                                                             int inStride4,
-                                                             int inStride5,
-                                                             void* __restrict__ ws_global)
-{
-    (void)BlkGroupSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
-    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
-    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
-    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
-    const auto tupleDstLengths = make_tuple(1);
-    const auto tupleDstStrides = make_tuple(1);
-    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
-    auto dstDesc       = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    const auto one_dim_srcDesc = transform_tensor_descriptor(
-        srcDesc,
-        make_tuple(make_merge_transform(tupleSrcLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    auto src2dDesc = transform_tensor_descriptor(
-        one_dim_srcDesc,
-        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
-        make_tuple(Sequence<0>{}),
-        make_tuple(Sequence<0, 1>{}));
-    constexpr int invariantLen = 1;
-    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});
-    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
-        const auto srcPad2 =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
-                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if constexpr(dst1d_need_padding)
-    {
-        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
-        auto dst1dDesc_2 =
-            transform_tensor_descriptor(dstDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
-    }
-};
-template <index_t srcDims>
-struct get_ref_desc_types
-{
-    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
-    // don't have to use accurate strides to get an expected referrence type
-    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
-        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
-    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
-    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
-        ref_srcDesc,
-        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
-        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    static constexpr auto ref_src2dDesc =
-        transform_tensor_descriptor(ref_one_dim_srcDesc,
-                                    make_tuple(make_unmerge_transform(
-                                        make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
-                                    make_tuple(Sequence<0>{}),
-                                    make_tuple(Sequence<0, 1>{}));
-    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
-    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
-    // used by the DirectThreadWise and DirectWarpWise method
-    using refType_src2dDesc_padded_12 =
-        decltype(transform_tensor_descriptor(ref_src2dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
-                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dstDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dstDesc);
-};
-using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
-using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
-using refType_src2dDesc_padded_12 typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
-using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
-                                                     int BlkGroupSize,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)BlkGroupSize;
-    (void)ws_buf2_bytes_offset;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce =
-        GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
-                                                  srcDataType,
-                                                  dstDataType,
-                                                  compType,
-                                                  decltype(src2dDesc),
-                                                  decltype(dst1dDesc),
-                                                  op,
-                                                  nanPropaOpt,
-                                                  reduceIndicesOpt,
-                                                  true,
-                                                  true,
-                                                  GredAccessesPerThreadInWarp>;
-    constexpr int RunId = need_indices ? 2 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(p_src_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(nullptr),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr index_t srcDims = CK_PARAM_IN_DIMS;
-constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
-constexpr index_t num_toReduceDims  = CK_PARAM_NUM_TOREDUCE_DIMS;
-constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
-using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
-using toReduceDims  = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!");
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
-// helper functions using variadic template arguments
-template <index_t... Ns>
-__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
-{
-    return make_tuple(static_cast<index_t>(lengths[Ns])...);
-};
-template <index_t arraySize>
-__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
-{
-    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
-    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
-    return make_tuple_from_array_and_index_seq(lengths, index_seq);
-};
-template <index_t... Ns>
-__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
-{
-    return make_tuple(Ns...);
-};
-extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
-                                                             int BlkGroupSize,
-                                                             int inLength0,
-                                                             int inLength1,
-                                                             int inLength2,
-                                                             int inLength3,
-                                                             int inLength4,
-                                                             int inLength5,
-                                                             int inStride0,
-                                                             int inStride1,
-                                                             int inStride2,
-                                                             int inStride3,
-                                                             int inStride4,
-                                                             int inStride5,
-                                                             int outStride0,
-                                                             int outStride1,
-                                                             int outStride2,
-                                                             int outStride3,
-                                                             int outStride4,
-                                                             int outStride5,
-                                                             void* __restrict__ ws_global)
-{
-    (void)BlkGroupSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
-    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
-    const int dstStrides[6] = {
-        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
-    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
-    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
-    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
-    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
-    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
-    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
-    const auto invariantDimLengths =
-        make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
-    auto src2dDesc =
-        transform_tensor_descriptor(srcDesc,
-                                    make_tuple(make_merge_transform(invariantDimLengths),
-                                               make_merge_transform(toReduceDimLengths)),
-                                    make_tuple(invariantDims{}, toReduceDims{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
-    auto dst1dDesc = transform_tensor_descriptor(
-        dstDesc,
-        make_tuple(make_merge_transform(tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
-    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
-    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
-        const auto srcPad2 =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
-                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if constexpr(dst1d_need_padding)
-    {
-        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
-        auto dst1dDesc_2 =
-            transform_tensor_descriptor(dst1dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
-    }
-};
-template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
-struct get_ref_desc_types
-{
-    static constexpr auto ref_toReduceDimLengths =
-        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
-    static constexpr auto ref_invariantDimLengths =
-        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
-    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
-    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
-    // don't have to use accurate strides to get an expected referrence type
-    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
-        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
-    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
-        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
-    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
-        ref_srcDesc,
-        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
-                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
-        make_tuple(invariantDims{}, toReduceDims{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
-        ref_dstDesc,
-        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
-    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
-    // used by the DirectThreadWise and DirectWarpWise method
-    using refType_src2dDesc_padded_12 =
-        decltype(transform_tensor_descriptor(ref_src2dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
-                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dst1dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dst1dDesc);
-};
-using refType_src2dDesc =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
-using refType_dst1dDesc =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
-using refType_src2dDesc_padded_12 =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
-        refType_src2dDesc_padded_12;
-using refType_dst1dDesc_padded =
-    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
-        refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
-                                                     int BlkGroupSize,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)BlkGroupSize;
-    (void)ws_buf2_bytes_offset;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce =
-        GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
-                                                  srcDataType,
-                                                  dstDataType,
-                                                  compType,
-                                                  decltype(src2dDesc),
-                                                  decltype(dst1dDesc),
-                                                  op,
-                                                  nanPropaOpt,
-                                                  reduceIndicesOpt,
-                                                  true,
-                                                  true,
-                                                  GredAccessesPerThreadInWarp>;
-    constexpr int RunId = need_indices ? 2 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(p_src_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(nullptr),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_blockwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
-extern "C" __global__ void
-gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
-{
-    (void)GridSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const auto tupleDstLengths = make_tuple(1);
-    const auto tupleDstStrides = make_tuple(1);
-    auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    const index_t invariantLen = dstDesc.GetLength(Number<0>{});
-    const index_t toReduceLen  = BlkGroupSize;
-    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
-    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pass_through_transform(invariantLen),
-                                                   make_pad_transform(toReduceLen, 0, srcPad)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if(get_thread_local_1d_id() == 0)
-        *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
-};
-struct get_ref_desc_types
-{
-    static constexpr auto ref_tupleDstLengths = make_tuple(8);
-    static constexpr auto ref_dstDesc =
-        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
-    static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
-    static constexpr index_t ref_toReduceLen  = 8;
-    static constexpr auto ref_src2dDesc =
-        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dstDesc);
-    // used by the BlockWise and MultiBlock method
-    using refType_src2dDesc_padded_34 = decltype(
-        transform_tensor_descriptor(ref_src2dDesc,
-                                    make_tuple(make_pass_through_transform(ref_invariantLen),
-                                               make_pad_transform(ref_toReduceLen, 0, 2)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dstDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-};
-using refType_src2dDesc           = typename get_ref_desc_types::refType_src2dDesc;
-using refType_dst1dDesc           = typename get_ref_desc_types::refType_dst1dDesc;
-using refType_src2dDesc_padded_34 = typename get_ref_desc_types::refType_src2dDesc_padded_34;
-using refType_dst1dDesc_padded    = typename get_ref_desc_types::refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)p_src_global;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
-                                                                   srcDataType,
-                                                                   dstDataType,
-                                                                   compType,
-                                                                   decltype(src2dDesc),
-                                                                   decltype(dst1dDesc),
-                                                                   op,
-                                                                   nanPropaOpt,
-                                                                   reduceIndicesOpt,
-                                                                   false,
-                                                                   true,
-                                                                   GredAccessesPerThreadInBlock>;
-    void* const ws_buf2_global =
-        ws_buf2_bytes_offset > 0
-            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
-            : nullptr;
-    constexpr int RunId = need_indices ? 3 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(ws_buf2_global),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_blockwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
-// helper functions using variadic template arguments
-template <index_t... Ns>
-__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
-{
-    return make_tuple(static_cast<index_t>(lengths[Ns])...);
-};
-template <index_t arraySize>
-__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
-{
-    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
-    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
-    return make_tuple_from_array_and_index_seq(lengths, index_seq);
-};
-template <index_t... Ns>
-__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
-{
-    return make_tuple(Ns...);
-};
-extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize,
-                                                             int BlkGroupSize,
-                                                             int outLength0,
-                                                             int outLength1,
-                                                             int outLength2,
-                                                             int outLength3,
-                                                             int outLength4,
-                                                             int outLength5,
-                                                             int outStride0,
-                                                             int outStride1,
-                                                             int outStride2,
-                                                             int outStride3,
-                                                             int outStride4,
-                                                             int outStride5,
-                                                             void* __restrict__ ws_global)
-{
-    (void)GridSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const int dstLengths[6] = {
-        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
-    const int dstStrides[6] = {
-        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
-    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
-    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
-    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    auto dst1dDesc = transform_tensor_descriptor(
-        dstDesc,
-        make_tuple(make_merge_transform(tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
-    const index_t toReduceLen  = BlkGroupSize;
-    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
-    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pass_through_transform(invariantLen),
-                                                   make_pad_transform(toReduceLen, 0, srcPad)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if(get_thread_local_1d_id() == 0)
-        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
-};
-template <index_t dstDims>
-struct get_ref_desc_types
-{
-    static constexpr auto ref_tupleDstLengths =
-        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
-    static constexpr auto ref_dstDesc =
-        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
-    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
-        ref_dstDesc,
-        make_tuple(make_merge_transform(ref_tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
-    static constexpr index_t ref_toReduceLen  = 8;
-    static constexpr auto ref_src2dDesc =
-        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dst1dDesc);
-    // used by the BlockWise and MultiBlock method
-    using refType_src2dDesc_padded_34 = decltype(
-        transform_tensor_descriptor(ref_src2dDesc,
-                                    make_tuple(make_pass_through_transform(ref_invariantLen),
-                                               make_pad_transform(ref_toReduceLen, 0, 2)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dst1dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-};
-using refType_src2dDesc = typename get_ref_desc_types<dstDims>::refType_src2dDesc;
-using refType_dst1dDesc = typename get_ref_desc_types<dstDims>::refType_dst1dDesc;
-using refType_src2dDesc_padded_34 =
-    typename get_ref_desc_types<dstDims>::refType_src2dDesc_padded_34;
-using refType_dst1dDesc_padded = typename get_ref_desc_types<dstDims>::refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)p_src_global;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
-                                                                   srcDataType,
-                                                                   dstDataType,
-                                                                   compType,
-                                                                   decltype(src2dDesc),
-                                                                   decltype(dst1dDesc),
-                                                                   op,
-                                                                   nanPropaOpt,
-                                                                   reduceIndicesOpt,
-                                                                   false,
-                                                                   true,
-                                                                   GredAccessesPerThreadInBlock>;
-    void* const ws_buf2_global =
-        ws_buf2_bytes_offset > 0
-            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
-            : nullptr;
-    constexpr int RunId = need_indices ? 3 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(ws_buf2_global),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
-using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
-extern "C" __global__ void
-gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
-{
-    (void)BlkGroupSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const auto tupleDstLengths = make_tuple(1);
-    const auto tupleDstStrides = make_tuple(1);
-    auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    const index_t invariantLen = dstDesc.GetLength(Number<0>{});
-    const index_t toReduceLen  = BlkGroupSize;
-    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
-    constexpr auto copySliceLen = GredThreadBufferLength;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad1 = GridSize * BlockSize - invariantLen;
-        const auto srcPad2 =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
-                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if constexpr(dst1d_need_padding)
-    {
-        const auto dstPad = GridSize * BlockSize - invariantLen;
-        auto dst1dDesc_2 =
-            transform_tensor_descriptor(dstDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
-    }
-};
-struct get_ref_desc_types
-{
-    static constexpr auto ref_tupleDstLengths = make_tuple(8);
-    static constexpr auto ref_dstDesc =
-        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
-    static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
-    static constexpr index_t ref_toReduceLen  = 8;
-    static constexpr auto ref_src2dDesc =
-        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dstDesc);
-    // used by the DirectThreadWise and DirectWarpWise method
-    using refType_src2dDesc_padded_12 =
-        decltype(transform_tensor_descriptor(ref_src2dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
-                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dstDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-};
-using refType_src2dDesc           = typename get_ref_desc_types::refType_src2dDesc;
-using refType_dst1dDesc           = typename get_ref_desc_types::refType_dst1dDesc;
-using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12;
-using refType_dst1dDesc_padded    = typename get_ref_desc_types::refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)p_src_global;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
-                                                                           srcDataType,
-                                                                           dstDataType,
-                                                                           compType,
-                                                                           decltype(src2dDesc),
-                                                                           decltype(dst1dDesc),
-                                                                           op,
-                                                                           nanPropaOpt,
-                                                                           reduceIndicesOpt,
-                                                                           false,
-                                                                           true,
-                                                                           GredThreadBufferLength>;
-    void* const ws_buf2_global =
-        ws_buf2_bytes_offset > 0
-            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
-            : nullptr;
-    constexpr int RunId = need_indices ? 3 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(ws_buf2_global),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
-// helper functions using variadic template arguments
-template <index_t... Ns>
-__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
-{
-    return make_tuple(static_cast<index_t>(lengths[Ns])...);
-};
-template <index_t arraySize>
-__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
-{
-    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
-    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
-    return make_tuple_from_array_and_index_seq(lengths, index_seq);
-};
-template <index_t... Ns>
-__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
-{
-    return make_tuple(Ns...);
-};
-extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize,
-                                                             int BlkGroupSize,
-                                                             int outLength0,
-                                                             int outLength1,
-                                                             int outLength2,
-                                                             int outLength3,
-                                                             int outLength4,
-                                                             int outLength5,
-                                                             int outStride0,
-                                                             int outStride1,
-                                                             int outStride2,
-                                                             int outStride3,
-                                                             int outStride4,
-                                                             int outStride5,
-                                                             void* __restrict__ ws_global)
-{
-    (void)BlkGroupSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const int dstLengths[6] = {
-        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
-    const int dstStrides[6] = {
-        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
-    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
-    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
-    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    auto dst1dDesc = transform_tensor_descriptor(
-        dstDesc,
-        make_tuple(make_merge_transform(tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
-    const index_t toReduceLen  = BlkGroupSize;
-    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
-    constexpr auto copySliceLen = GredThreadBufferLength;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad1 = GridSize * BlockSize - invariantLen;
-        const auto srcPad2 =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
-                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if constexpr(dst1d_need_padding)
-    {
-        const auto dstPad = GridSize * BlockSize - invariantLen;
-        auto dst1dDesc_2 =
-            transform_tensor_descriptor(dst1dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
-    }
-};
-template <index_t dstDims>
-struct get_ref_desc_types
-{
-    static constexpr auto ref_tupleDstLengths =
-        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
-    static constexpr auto ref_dstDesc =
-        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
-    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
-        ref_dstDesc,
-        make_tuple(make_merge_transform(ref_tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
-    static constexpr index_t ref_toReduceLen  = 8;
-    static constexpr auto ref_src2dDesc =
-        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dst1dDesc);
-    // used by the DirectThreadWise and DirectWarpWise method
-    using refType_src2dDesc_padded_12 =
-        decltype(transform_tensor_descriptor(ref_src2dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
-                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dst1dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-};
-using refType_src2dDesc = typename get_ref_desc_types<dstDims>::refType_src2dDesc;
-using refType_dst1dDesc = typename get_ref_desc_types<dstDims>::refType_dst1dDesc;
-using refType_src2dDesc_padded_12 =
-    typename get_ref_desc_types<dstDims>::refType_src2dDesc_padded_12;
-using refType_dst1dDesc_padded = typename get_ref_desc_types<dstDims>::refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)p_src_global;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
-                                                                           srcDataType,
-                                                                           dstDataType,
-                                                                           compType,
-                                                                           decltype(src2dDesc),
-                                                                           decltype(dst1dDesc),
-                                                                           op,
-                                                                           nanPropaOpt,
-                                                                           reduceIndicesOpt,
-                                                                           false,
-                                                                           true,
-                                                                           GredThreadBufferLength>;
-    void* const ws_buf2_global =
-        ws_buf2_bytes_offset > 0
-            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
-            : nullptr;
-    constexpr int RunId = need_indices ? 3 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(ws_buf2_global),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
-extern "C" __global__ void
-gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
-{
-    (void)BlkGroupSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const auto tupleDstLengths = make_tuple(1);
-    const auto tupleDstStrides = make_tuple(1);
-    auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    const index_t invariantLen = dstDesc.GetLength(Number<0>{});
-    const index_t toReduceLen  = BlkGroupSize;
-    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
-    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
-        const auto srcPad2 =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
-                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if constexpr(dst1d_need_padding)
-    {
-        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
-        auto dst1dDesc_2 =
-            transform_tensor_descriptor(dstDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
-    }
-};
-struct get_ref_desc_types
-{
-    static constexpr auto ref_tupleDstLengths = make_tuple(8);
-    static constexpr auto ref_dstDesc =
-        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
-    static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
-    static constexpr index_t ref_toReduceLen  = 8;
-    static constexpr auto ref_src2dDesc =
-        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dstDesc);
-    // used by the DirectThreadWise and DirectWarpWise method
-    using refType_src2dDesc_padded_12 =
-        decltype(transform_tensor_descriptor(ref_src2dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
-                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dstDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-};
-using refType_src2dDesc           = typename get_ref_desc_types::refType_src2dDesc;
-using refType_dst1dDesc           = typename get_ref_desc_types::refType_dst1dDesc;
-using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12;
-using refType_dst1dDesc_padded    = typename get_ref_desc_types::refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)p_src_global;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce =
-        GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
-                                                  srcDataType,
-                                                  dstDataType,
-                                                  compType,
-                                                  decltype(src2dDesc),
-                                                  decltype(dst1dDesc),
-                                                  op,
-                                                  nanPropaOpt,
-                                                  reduceIndicesOpt,
-                                                  false,
-                                                  true,
-                                                  GredAccessesPerThreadInWarp>;
-    void* const ws_buf2_global =
-        ws_buf2_bytes_offset > 0
-            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
-            : nullptr;
-    constexpr int RunId = need_indices ? 3 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(ws_buf2_global),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "config.hpp"
-#include "number.hpp"
-#include "sequence.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "data_type_enum_helper.hpp"
-#include "reduction_common.hpp"
-#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
-using namespace ck;
-using srcDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
-using dstDataType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
-using compType =
-    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
-constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
-constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
-constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
-constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
-                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
-                                             : NanPropagation_t::PROPAGATE_NAN;
-constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
-                                                       ? ReduceTensorIndices_t::NO_INDICES
-                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
-constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
-constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
-constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
-constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
-// helper functions using variadic template arguments
-template <index_t... Ns>
-__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
-{
-    return make_tuple(static_cast<index_t>(lengths[Ns])...);
-};
-template <index_t arraySize>
-__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
-{
-    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
-    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
-    return make_tuple_from_array_and_index_seq(lengths, index_seq);
-};
-template <index_t... Ns>
-__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
-{
-    return make_tuple(Ns...);
-};
-extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize,
-                                                             int BlkGroupSize,
-                                                             int outLength0,
-                                                             int outLength1,
-                                                             int outLength2,
-                                                             int outLength3,
-                                                             int outLength4,
-                                                             int outLength5,
-                                                             int outStride0,
-                                                             int outStride1,
-                                                             int outStride2,
-                                                             int outStride3,
-                                                             int outStride4,
-                                                             int outStride5,
-                                                             void* __restrict__ ws_global)
-{
-    (void)BlkGroupSize;
-    void* p_src2dDesc = ws_global;
-    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
-    const int dstLengths[6] = {
-        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
-    const int dstStrides[6] = {
-        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
-    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
-    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
-    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
-    auto dst1dDesc = transform_tensor_descriptor(
-        dstDesc,
-        make_tuple(make_merge_transform(tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
-    const index_t toReduceLen  = BlkGroupSize;
-    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
-    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
-    if constexpr(src2d_need_padding)
-    {
-        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
-        const auto srcPad2 =
-            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
-        auto src2dDesc_2 =
-            transform_tensor_descriptor(src2dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
-                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
-    }
-    if constexpr(dst1d_need_padding)
-    {
-        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
-        auto dst1dDesc_2 =
-            transform_tensor_descriptor(dst1dDesc,
-                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
-    }
-    else
-    {
-        if(get_thread_local_1d_id() == 0)
-            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
-    }
-};
-template <index_t dstDims>
-struct get_ref_desc_types
-{
-    static constexpr auto ref_tupleDstLengths =
-        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
-    static constexpr auto ref_dstDesc =
-        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
-    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
-        ref_dstDesc,
-        make_tuple(make_merge_transform(ref_tupleDstLengths)),
-        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
-        make_tuple(Sequence<0>{}));
-    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
-    static constexpr index_t ref_toReduceLen  = 8;
-    static constexpr auto ref_src2dDesc =
-        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
-    using refType_src2dDesc = decltype(ref_src2dDesc);
-    using refType_dst1dDesc = decltype(ref_dst1dDesc);
-    // used by the DirectThreadWise and DirectWarpWise method
-    using refType_src2dDesc_padded_12 =
-        decltype(transform_tensor_descriptor(ref_src2dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
-                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
-    using refType_dst1dDesc_padded =
-        decltype(transform_tensor_descriptor(ref_dst1dDesc,
-                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
-                                             make_tuple(Sequence<0>{}),
-                                             make_tuple(Sequence<0>{})));
-};
-using refType_src2dDesc = typename get_ref_desc_types<dstDims>::refType_src2dDesc;
-using refType_dst1dDesc = typename get_ref_desc_types<dstDims>::refType_dst1dDesc;
-using refType_src2dDesc_padded_12 =
-    typename get_ref_desc_types<dstDims>::refType_src2dDesc_padded_12;
-using refType_dst1dDesc_padded = typename get_ref_desc_types<dstDims>::refType_dst1dDesc_padded;
-template <bool need_padding>
-static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
-    else
-        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
-};
-template <bool need_padding>
-static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
-{
-    if constexpr(need_padding)
-        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
-    else
-        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
-};
-extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
-                                                     float alpha,
-                                                     const void* __restrict__ p_src_global,
-                                                     float beta,
-                                                     void* __restrict__ p_dst_global,
-                                                     const void CONSTANT* ws_global,
-                                                     long ws_buf2_bytes_offset,
-                                                     void* __restrict__ indices_global)
-{
-    (void)p_src_global;
-    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
-    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
-    void* ws_buf1_global    = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
-    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
-    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
-    using gridwise_2d_reduce =
-        GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
-                                                  srcDataType,
-                                                  dstDataType,
-                                                  compType,
-                                                  decltype(src2dDesc),
-                                                  decltype(dst1dDesc),
-                                                  op,
-                                                  nanPropaOpt,
-                                                  reduceIndicesOpt,
-                                                  false,
-                                                  true,
-                                                  GredAccessesPerThreadInWarp>;
-    void* const ws_buf2_global =
-        ws_buf2_bytes_offset > 0
-            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
-            : nullptr;
-    constexpr int RunId = need_indices ? 3 : 1;
-    gridwise_2d_reduce::template Run<RunId>(
-        src2dDesc,
-        dst1dDesc,
-        origReduceLen,
-        alpha,
-        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
-        beta,
-        static_cast<dstDataType* const __restrict__>(p_dst_global),
-        static_cast<const int* const __restrict__>(ws_buf2_global),
-        static_cast<int* const __restrict__>(indices_global));
-};
--- a/device_operation/CMakeLists.txt
+++ b/device_operation/CMakeLists.txt
-include_directories(BEFORE
-    include
-    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
-    ${PROJECT_SOURCE_DIR}/device/include
-    ${PROJECT_SOURCE_DIR}/device_operation/include
-    ${PROJECT_SOURCE_DIR}/profiler/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
-    ${PROJECT_SOURCE_DIR}/external/rocm/include
-)
-# device_gemm_instance
-set(DEVICE_GEMM_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
-) 
-# device_gemm_bias_2d_instance
-set(DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
-)
-# device_gemm_bias_relu_instance
-set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
-)
-# device_gemm_bias_relu_add_instance
-set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
-)
-set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp;
-)
-# device_conv2d_fwd_instance
-set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp;
-)
-# device_conv1d_fwd_instance
-set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE 
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp;
-) 
-# device_conv2d_fwd_bias_relu_instance
-set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp;
-)
-# device_conv2d_fwd_bias_relu_add_instance
-set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp;
-)
-# device_conv2d_fwd_bias_relu_atomic_add_instance
-set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE
-   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
-)
-add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
-add_library(device_gemm_bias_2d_instance SHARED ${DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE})
-add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
-add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
-add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
-add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) 
-add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) 
-add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) 
-add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
-add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) 
-target_include_directories(device_gemm_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_gemm_bias_2d_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_gemm_bias_relu_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_gemm_bias_relu_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_batched_gemm_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_conv1d_fwd_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_conv2d_fwd_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_conv2d_fwd_bias_relu_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_conv2d_fwd_bias_relu_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_include_directories(device_conv2d_fwd_bias_relu_atomic_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-target_compile_features(device_gemm_instance PUBLIC)
-target_compile_features(device_gemm_bias_2d_instance PUBLIC)
-target_compile_features(device_gemm_bias_relu_instance PUBLIC)
-target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
-target_compile_features(device_batched_gemm_instance PUBLIC)
-target_compile_features(device_conv1d_fwd_instance PUBLIC)
-target_compile_features(device_conv2d_fwd_instance PUBLIC)
-target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
-target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
-target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
-set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_gemm_bias_2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
-install(TARGETS device_gemm_bias_2d_instance LIBRARY DESTINATION lib)
-install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
-install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
-install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
-install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) 
-install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib) 
-install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) 
-install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) 
-install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) 
--- a/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp"
-#include "element_wise_operation.hpp"
-#include "device_operation_instance.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_conv2d_fwd_instance {
-using F32 = float;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
-// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
-using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances = std::tuple<
-    // clang-format off
-        //################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-        //################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
-        //################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
-        //################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>
-    // clang-format on
-    >;
-using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_bf16_instances = std::tuple<
-    // clang-format off
-        //################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-        //################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
-        //################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
-        //################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>
-    // clang-format on
-    >;
-using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances = std::tuple<
-    // clang-format off
-        //################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-        //################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
-        //################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
-        //################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>,
-        DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  ushort,  ushort,  ushort,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1>
-    // clang-format on
-    >;
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
-    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances{});
-    add_device_operation_instances(instances,
-                                   device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_bf16_instances{});
-    add_device_operation_instances(instances,
-                                   device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{});
-}
-} // namespace device_conv2d_fwd_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/src/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/device_operation/src/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "device_operation_instance.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-using F16 = ck::half_t;
-using F32 = float;
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
-using device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-    // clang-format off
-        //#####################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|      Num|
-        //#####################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Prefetch|
-        //#####################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         |
-        //#####################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |         |
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8,        2>,
-        DeviceGemmXdl_C_Shuffle<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8,        2>
-    // clang-format on
-    >;
-void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances{});
-}
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "device_operation_instance.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-using F16 = ck::half_t;
-using F32 = float;
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
-using device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
-    // clang-format off
-        //#####################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-        //#####################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################|     |      |      |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   4,   4,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   2,   2,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   2,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   2,   2,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   2,   2,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   2,   2,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
-    // clang-format on
-    >;
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances{});
-}
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "device_operation_instance.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-using F16 = ck::half_t;
-using F32 = float;
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
-using device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
-    // clang-format off
-        //#####################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-        //#####################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################|     |      |      |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   2,   8,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   2,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   2,   8,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   2,   8,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   2,   8,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
-    // clang-format on
-    >;
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances{});
-}
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "device_operation_instance.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-using F16 = ck::half_t;
-using F32 = float;
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
-using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
-    // clang-format off
-        //#####################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-        //#####################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################|     |      |      |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   8,   2,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   8,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   8,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   8,   2,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
-    // clang-format on
-    >;
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances{});
-}
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/device_operation/src/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "device_operation_instance.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-using F16 = ck::half_t;
-using F32 = float;
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
-using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-    // clang-format off
-        //#####################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-        //#####################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################|     |      |      |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceGemmXdl_C_Shuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
-    // clang-format on
-    >;
-void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances{});
-}
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
+add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
+add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
+add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
--- a/example/12_grouped_gemm_xdl/README.md
+++ b/example/12_grouped_gemm_xdl/README.md
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_xdl.hpp"
+#include "device_gemm_xdl_c_shuffle.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ADataType   = BF16;
+using BDataType   = BF16;
+using CDataType   = BF16;
+using AccDataType = F32;
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle<
+    ADataType,              // ADataType
+    BDataType,              // BDataType
+    CDataType,              // CDataType
+    AccDataType,            // AccDataType
+    CDataType,              // CShuffleDataType
+    ALayout,                // ALayout
+    BLayout,                // BLayout
+    CLayout,                // CLayout
+    PassThrough,            // AElementwiseOperation
+    PassThrough,            // BElementwiseOperation
+    PassThrough,            // CElementwiseOperation
+    256,                    // BlockSize
+    256,                    // MPerBlock
+    128,                    // NPerBlock
+    32,                     // KPerBlock
+    8,                      // AK1
+    8,                      // BK1
+    32,                     // MPerXDL
+    32,                     // NPerXDL
+    4,                      // MXdlPerWave
+    2,                      // NXdlPerWave
+    S<4, 64, 1>,            // ABlockTransferThreadClusterLengths_K0_M_K1
+    S<1, 0, 2>,             // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,             // ABlockTransferSrcAccessOrder
+    2,                      // ABlockTransferSrcVectorDim
+    8,                      // ABlockTransferSrcScalarPerVector
+    8,                      // ABlockTransferDstScalarPerVector_K1
+    true,                   // ABlockLdsAddExtraM
+    S<4, 64, 1>,            // BBlockTransferThreadClusterLengths_K0_N_K1
+    S<1, 0, 2>,             // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,             // BBlockTransferSrcAccessOrder
+    2,                      // BBlockTransferSrcVectorDim
+    8,                      // BBlockTransferSrcScalarPerVector
+    8,                      // BBlockTransferDstScalarPerVector_K1
+    true,                   // BBlockLdsAddExtraN
+    1,                      // CShuffleMXdlPerWavePerShuffle
+    1,                      // CShuffleNXdlPerWavePerShuffle
+    S<1, 1, 32, 1, 1, 8>,   // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+    8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<float, float, float, PassThrough, PassThrough, PassThrough>;
+int main(int argc, char* argv[])
+{
+    bool do_verification = 0;
+    int init_method      = 0;
+    int nrepeat          = 5;
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+        exit(0);
+    }
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1, stride}));
+            }
+        };
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    }
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+    auto a_element_op = PassThrough{};
+    auto b_element_op = PassThrough{};
+    auto c_element_op = PassThrough{};
+    // do GEMM
+    auto gemm     = DeviceGemmInstance{};
+    auto invoker  = gemm.MakeInvoker();
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op);
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    float ave_time = invoker.Run(argument, nrepeat);
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+    if(do_verification)
+    {
+        Tensor<float> a_f32_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+        Tensor<float> b_f32_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+        Tensor<float> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+        Tensor<float> c_m_n_device_f32_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+        bf16_to_f32_(a_m_k, a_f32_m_k);
+        bf16_to_f32_(b_k_n, b_f32_k_n);
+        bf16_to_f32_(c_m_n_device_result, c_m_n_device_f32_result);
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_f32_m_k, b_f32_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+        ref_invoker.Run(ref_argument);
+        check_error(c_m_n_host_result, c_m_n_device_f32_result);
+    }
+    return 0;
+}
--- a/example/1_gemm_xdl/gemm_xdl.cpp
+++ b/example/1_gemm_xdl/gemm_xdl.cpp
@@ -41,8 +41,7 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmDefault   = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
-static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
 // clang-format off
 #if 0
@@ -55,11 +54,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
        <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1>;
 #elif 1
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
-//######|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+//######|AData| BData| CData| AccData| Shuffle| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//######| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+//######| Type|  Type|  Type|    Type|    Data|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//######|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|     |      |      |        |    Type|        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|     |      |      |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+//######|     |      |      |        |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
+        <  F16,   F16,   F16,     F32,     F16,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
 #elif 0
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //######| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|      Num|