Merge pull request #31 from ROCmSoftwarePlatform/miopen_downstream-dynamic_reduction_pr

[MIOpen Downstream] Dynamic Reduction PR

Merge pull request #31 from ROCmSoftwarePlatform/miopen_downstream-dynamic_reduction_pr
[MIOpen Downstream] Dynamic Reduction PR
f305bebd · Chao Liu · GitHub · f3acd251 · b725e3fc · f305bebd
Unverified Commit f305bebd authored Sep 21, 2021 by Chao Liu Committed by GitHub Sep 21, 2021
20 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
+#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
+
+#include "data_type.hpp"
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_blockwise.hpp"
+
+#include "blockwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+// Gridwise reduction of a 2-D source view into a 1-D destination view, staged through LDS.
+// Each Run<> specialization in this struct uses get_block_1d_id() both as the starting
+// coordinate along dimension 0 of src2dDesc and as the destination index in dst1dDesc, and
+// walks dimension 1 of src2dDesc in windows of BlockBufferSize elements.
+//
+// Template parameters:
+//   BlockSize                    - forwarded to the blockwise loader; also the second length of
+//                                  buffer2dDesc and of the thread cluster
+//   srcDataType / dstDataType    - element types of the global source / destination pointers
+//   compType                     - element type of the LDS staging buffer and the accumulator
+//   src2dDescType, dst1dDescType - descriptor types of the source / destination views
+//   op                           - selects the binary reduce operator and the unary operators
+//   nanPropaOpt                  - forwarded to BlockwiseReduction_2d_block_buffer
+//   reduceIndicesOpt             - not referenced inside this struct
+//   isFirstCall / isLastCall     - forwarded to reduce_unary_operator
+//   GredAccessesPerThreadInBlock - first length of buffer2dDesc, per-thread slice length of the
+//                                  blockwise load, and step of the main reduction loop
+template <index_t BlockSize,
+          typename srcDataType,
+          typename dstDataType,
+          typename compType,
+          typename src2dDescType,
+          typename dst1dDescType,
+          ReduceTensorOp_t op,
+          NanPropagation_t nanPropaOpt,
+          ReduceTensorIndices_t reduceIndicesOpt,
+          bool isFirstCall,
+          bool isLastCall,
+          index_t GredAccessesPerThreadInBlock>
+struct GridwiseReduction_xy_to_x_blockwise
+{
+    // Binary reduce operator for `op`, instantiated for compType.
+    using opReduce = typename reduce_binary_operator<compType, op>::opType;
+    // Element-wise operators; the Run<> bodies apply the "pre" one to the staged data before
+    // reducing and (in Run<1>) the "pos" one to the accumulated value after the loop.
+    using preUnaryOpType =
+        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
+    using posUnaryOpType =
+        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
+
+    // Packed (GredAccessesPerThreadInBlock x BlockSize) descriptor handed to the blockwise
+    // reduction helper as the layout of the LDS staging buffer.
+    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
+    using blockwise_reduce =
+        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
+
+    // Number of elements in the LDS staging buffer (element size of buffer2dDesc).
+    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
+
+    static constexpr auto I0 = Number<0>{};
+
+    // Primary template: declared only. RunId values 1, 2 and 3 are explicitly specialized
+    // below; all specializations share this parameter list.
+    template <int RunId>
+    __device__ static void Run(const src2dDescType& src2dDesc,
+                               const dst1dDescType& dst1dDesc,
+                               int origReduceLen,
+                               srcDataType alpha,
+                               const srcDataType* const __restrict__ p_src_global,
+                               dstDataType beta,
+                               dstDataType* const __restrict__ p_dst_global,
+                               const int* const __restrict__ ws_indices_global,
+                               int* const __restrict__ indices_global);
+
+    // Run<1>: value-only reduction (no indices are read or written).
+    //
+    // Starting at coordinate (get_block_1d_id(), 0) of src2dDesc, reduces along dimension 1
+    // and has thread 0 of the block store the result to dst1dDesc at index get_block_1d_id().
+    //
+    //   src2dDesc, dst1dDesc - descriptors of the source / destination views
+    //   origReduceLen        - passed as "divider" to the pre/post unary operators
+    //   alpha                - result is multiplied by alpha unless float_equal_one accepts it
+    //   p_src_global         - global source data
+    //   beta                 - unless float_equal_zero accepts it, the prior destination value
+    //                          times beta is added to the result
+    //   p_dst_global         - global destination data
+    //   ws_indices_global, indices_global - unused by this specialization
+    template <>
+    __device__ static void Run<1>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)ws_indices_global;
+        (void)indices_global;
+
+        // LDS
+        __shared__ compType p_in_block_buffer[BlockBufferSize];
+
+        // Initial accumulator value. It is also passed as the third argument of the source
+        // buffer (presumably the value substituted for invalid reads - TODO confirm against
+        // make_dynamic_buffer).
+        auto zeroVal = opReduce::GetZeroVal();
+
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+
+        auto in_block_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+
+        accuValue_buf(I0) = zeroVal;
+
+        // Length of the dimension being reduced (dimension 1 of the source view).
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+        const posUnaryOpType posUnaryOp(divider);
+
+        const index_t thread_local_id    = get_thread_local_1d_id();
+        const index_t block_global_1d_id = get_block_1d_id();
+
+        // For the copy, the LDS buffer is described as a 1 x BlockBufferSize tensor.
+        constexpr auto in_block_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
+
+        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
+        using ThreadClusterLengths = Sequence<1, BlockSize>;
+
+        // Global -> LDS loader (srcDataType in, compType out). The source window starts at
+        // (block_global_1d_id, 0); the LDS window starts at (0, 0).
+        auto blockwise_src_load =
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
+                                            InMemoryDataOperationEnum_t::Set,
+                                            Sequence<1, BlockBufferSize>,
+                                            ThreadSliceLengths,
+                                            ThreadClusterLengths,
+                                            Sequence<0, 1>,
+                                            srcDataType,
+                                            compType,
+                                            src2dDescType,
+                                            decltype(in_block_desc),
+                                            Sequence<0, 1>,
+                                            Sequence<0, 1>,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            false,
+                                            true>(src2dDesc,
+                                                  make_multi_index(block_global_1d_id, 0),
+                                                  in_block_desc,
+                                                  make_multi_index(0, 0));
+
+        // After every iteration the source window advances by one full LDS buffer along dim 1.
+        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
+
+        // Number of BlockSize-wide chunks needed to cover toReduceLength (rounded up).
+        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
+
+        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
+            reducedBlocks += GredAccessesPerThreadInBlock)
+        {
+            // Stage the current window: global read, then write into LDS.
+            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
+            blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
+
+            // Barrier between the LDS writes above and the block-wide operations below.
+            __syncthreads();
+
+            // do element-wise pre-reduction operation
+            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
+
+            // Chunks to reduce in this pass: a full GredAccessesPerThreadInBlock, except on the
+            // final pass where only the remaining chunks are taken.
+            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
+                                        ? GredAccessesPerThreadInBlock
+                                        : toReduceBlocks - reducedBlocks;
+            blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
+
+            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
+        }
+
+        // Element-wise post-reduction operation on the accumulated value.
+        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
+
+        // Descriptor of the single-element, thread-private result.
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // The first thread in the block stores the reduced result to the global location
+        // representing the block
+        if(thread_local_id == 0)
+        {
+            if(!float_equal_one{}(alpha))
+                accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+            if(!float_equal_zero{}(beta))
+            {
+                // Read the value currently stored at this block's destination index.
+                auto threadwise_dst_load =
+                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                     dstDataType,
+                                                     dst1dDescType,
+                                                     decltype(ReducedDataDesc),
+                                                     Sequence<1>,
+                                                     Sequence<0>,
+                                                     0,
+                                                     1,
+                                                     1,
+                                                     false>(dst1dDesc,
+                                                            make_multi_index(block_global_1d_id));
+
+                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+                threadwise_dst_load.Run(
+                    dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
+
+                // The product is formed in dstDataType and only then converted to compType.
+                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+            }
+
+            // Write the final value (compType -> dstDataType) to this block's destination index.
+            auto threadwise_dst_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   dstDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   false>(dst1dDesc,
+                                                          make_multi_index(block_global_1d_id));
+
+            threadwise_dst_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
+        }
+    };
+
+    // Run<2>: reduction that produces both a value and an index, generating the indices
+    // itself while it walks the source.
+    //
+    // Starting at coordinate (get_block_1d_id(), 0) of src2dDesc, reduces along dimension 1.
+    // Thread 0 of the block stores the reduced value to p_dst_global and the accompanying
+    // index to indices_global, both at index get_block_1d_id() of dst1dDesc.
+    //
+    //   src2dDesc, dst1dDesc - descriptors of the source / destination views
+    //   origReduceLen        - passed as "divider" to the pre unary operator
+    //   alpha                - result is multiplied by alpha unless float_equal_one accepts it
+    //   p_src_global         - global source data
+    //   beta                 - unless float_equal_zero accepts it, the prior destination value
+    //                          times beta is added to the result
+    //   p_dst_global         - global destination values
+    //   ws_indices_global    - unused by this specialization
+    //   indices_global       - global destination indices
+    template <>
+    __device__ static void Run<2>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)ws_indices_global;
+
+        // LDS
+        __shared__ compType p_in_block_buffer[BlockBufferSize];
+        __shared__ int block_indices_buffer[BlockBufferSize];
+
+        // Initial accumulator value. It is also passed as the third argument of the source
+        // buffer (presumably the value substituted for invalid reads - TODO confirm against
+        // make_dynamic_buffer).
+        auto zeroVal = opReduce::GetZeroVal();
+
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            indices_global, dst1dDesc.GetElementSpaceSize());
+
+        auto in_block_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
+        auto in_block_idx_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);
+
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
+
+        accuValue_buf(I0) = zeroVal;
+        accuIndex_buf(I0) = 0;
+
+        // Length of the dimension being reduced (dimension 1 of the source view).
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+
+        const index_t thread_local_id    = get_thread_local_1d_id();
+        const index_t block_global_1d_id = get_block_1d_id();
+
+        // For the copy, the LDS buffer is described as a 1 x BlockBufferSize tensor.
+        constexpr auto in_block_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
+
+        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
+        using ThreadClusterLengths = Sequence<1, BlockSize>;
+
+        // Global -> LDS loader. The destination element type must be compType because the
+        // loader writes into p_in_block_buffer, which is declared as compType; this matches
+        // the loaders in Run<1> and Run<3>. The source window starts at
+        // (block_global_1d_id, 0); the LDS window starts at (0, 0).
+        auto blockwise_src_load =
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
+                                            InMemoryDataOperationEnum_t::Set,
+                                            Sequence<1, BlockBufferSize>,
+                                            ThreadSliceLengths,
+                                            ThreadClusterLengths,
+                                            Sequence<0, 1>,
+                                            srcDataType,
+                                            compType,
+                                            src2dDescType,
+                                            decltype(in_block_desc),
+                                            Sequence<0, 1>,
+                                            Sequence<0, 1>,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            false,
+                                            true>(src2dDesc,
+                                                  make_multi_index(block_global_1d_id, 0),
+                                                  in_block_desc,
+                                                  make_multi_index(0, 0));
+
+        // After every iteration the source window advances by one full LDS buffer along dim 1.
+        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
+
+        // Number of BlockSize-wide chunks needed to cover toReduceLength (rounded up).
+        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
+
+        // Offset passed to init_buffer_indices for the current window; it grows by
+        // BlockBufferSize per iteration, in step with the source window.
+        int indexOffset = 0;
+
+        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
+            reducedBlocks += GredAccessesPerThreadInBlock)
+        {
+            // load block data from global to LDS, no use of double buffers (to be improved)
+            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
+            blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
+
+            // Barrier between the LDS writes above and the block-wide operations below.
+            __syncthreads();
+
+            // construct the indices for the current toReduce blocks
+            blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
+
+            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
+            // done here
+            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
+
+            // Chunks to reduce in this pass: a full GredAccessesPerThreadInBlock, except on the
+            // final pass where only the remaining chunks are taken.
+            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
+                                        ? GredAccessesPerThreadInBlock
+                                        : toReduceBlocks - reducedBlocks;
+
+            blockwise_reduce::Reduce2(in_block_val_buf,
+                                      in_block_idx_buf,
+                                      BlocksInOneOp,
+                                      accuValue_buf(I0),
+                                      accuIndex_buf(I0));
+
+            indexOffset += BlockBufferSize;
+
+            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
+        }
+
+        // Descriptor of the single-element, thread-private result.
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // The first thread in the block stores the reduced result to the global location
+        // representing the block
+        if(thread_local_id == 0)
+        {
+            if(!float_equal_one{}(alpha))
+                accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+            if(!float_equal_zero{}(beta))
+            {
+                // Read the value currently stored at this block's destination index.
+                auto threadwise_dst_load =
+                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                     dstDataType,
+                                                     dst1dDescType,
+                                                     decltype(ReducedDataDesc),
+                                                     Sequence<1>,
+                                                     Sequence<0>,
+                                                     0,
+                                                     1,
+                                                     1,
+                                                     false>(dst1dDesc,
+                                                            make_multi_index(block_global_1d_id));
+
+                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+                threadwise_dst_load.Run(dst1dDesc,
+                                        dst_global_val_buf,
+                                        ReducedDataDesc,
+                                        make_tuple(I0),
+                                        priorDstValue_buf);
+
+                // The product is formed in dstDataType and only then converted to compType.
+                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+            }
+
+            // Value store (compType -> dstDataType) and index store (int -> int); both target
+            // this block's index in dst1dDesc.
+            auto threadwise_dst_val_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   dstDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   false>(dst1dDesc,
+                                                          make_multi_index(block_global_1d_id));
+
+            auto threadwise_dst_idx_store =
+                ThreadwiseTensorSliceTransfer_v1r3<int,
+                                                   int,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   false>(dst1dDesc,
+                                                          make_multi_index(block_global_1d_id));
+
+            threadwise_dst_val_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
+            threadwise_dst_idx_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
+        }
+    };
+
+    // Run<3>: reduction that produces both a value and an index, consuming values and
+    // indices that are read from workspace buffers rather than generated here.
+    //
+    // Starting at coordinate (get_block_1d_id(), 0) of src2dDesc, reduces the (value, index)
+    // pairs along dimension 1. Thread 0 of the block stores the reduced value to p_dst_global
+    // and the accompanying index to indices_global, both at index get_block_1d_id() of
+    // dst1dDesc. No pre/post unary operator is applied in this specialization.
+    //
+    //   src2dDesc, dst1dDesc - descriptors of the source / destination views; src2dDesc is used
+    //                          for both the value and the index workspace
+    //   origReduceLen        - unused by this specialization
+    //   alpha                - result is multiplied by alpha unless float_equal_one accepts it
+    //   ws_values_global     - global workspace holding the source values
+    //   beta                 - unless float_equal_zero accepts it, the prior destination value
+    //                          times beta is added to the result
+    //   p_dst_global         - global destination values
+    //   ws_indices_global    - global workspace holding the source indices
+    //   indices_global       - global destination indices
+    template <>
+    __device__ static void Run<3>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ ws_values_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)origReduceLen;
+
+        // LDS
+        __shared__ compType p_in_block_buffer[BlockBufferSize];
+        __shared__ int block_indices_buffer[BlockBufferSize];
+
+        // Initial accumulator value. It is also passed as the third argument of the value
+        // source buffer (presumably the value substituted for invalid reads - TODO confirm
+        // against make_dynamic_buffer). The index source buffer gets no such argument.
+        auto zeroVal = opReduce::GetZeroVal();
+
+        const auto src_global_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
+                                                            src2dDesc.GetElementSpaceSize(),
+                                                            type_convert<srcDataType>{}(zeroVal));
+        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            ws_indices_global, src2dDesc.GetElementSpaceSize());
+        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            indices_global, dst1dDesc.GetElementSpaceSize());
+
+        auto in_block_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
+        auto in_block_idx_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);
+
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
+
+        accuValue_buf(I0) = zeroVal;
+        accuIndex_buf(I0) = 0;
+
+        // Length of the dimension being reduced (dimension 1 of the source view).
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+
+        const index_t thread_local_id    = get_thread_local_1d_id();
+        const index_t block_global_1d_id = get_block_1d_id();
+
+        // For the copies, each LDS buffer is described as a 1 x BlockBufferSize tensor.
+        constexpr auto in_block_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
+
+        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
+        using ThreadClusterLengths = Sequence<1, BlockSize>;
+
+        // Global -> LDS loader for the values (srcDataType in, compType out). The source
+        // window starts at (block_global_1d_id, 0); the LDS window starts at (0, 0).
+        auto blockwise_src_val_load =
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
+                                            InMemoryDataOperationEnum_t::Set,
+                                            Sequence<1, BlockBufferSize>,
+                                            ThreadSliceLengths,
+                                            ThreadClusterLengths,
+                                            Sequence<0, 1>,
+                                            srcDataType,
+                                            compType,
+                                            src2dDescType,
+                                            decltype(in_block_desc),
+                                            Sequence<0, 1>,
+                                            Sequence<0, 1>,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            false,
+                                            true>(src2dDesc,
+                                                  make_multi_index(block_global_1d_id, 0),
+                                                  in_block_desc,
+                                                  make_multi_index(0, 0));
+
+        // Global -> LDS loader for the indices (int in, int out), with the same window
+        // geometry as the value loader.
+        auto blockwise_src_idx_load =
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
+                                            InMemoryDataOperationEnum_t::Set,
+                                            Sequence<1, BlockBufferSize>,
+                                            ThreadSliceLengths,
+                                            ThreadClusterLengths,
+                                            Sequence<0, 1>,
+                                            int,
+                                            int,
+                                            src2dDescType,
+                                            decltype(in_block_desc),
+                                            Sequence<0, 1>,
+                                            Sequence<0, 1>,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            1,
+                                            false,
+                                            true>(src2dDesc,
+                                                  make_multi_index(block_global_1d_id, 0),
+                                                  in_block_desc,
+                                                  make_multi_index(0, 0));
+
+        // After every iteration both source windows advance by one full LDS buffer along dim 1.
+        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
+
+        // Number of BlockSize-wide chunks needed to cover toReduceLength (rounded up).
+        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
+
+        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
+            reducedBlocks += GredAccessesPerThreadInBlock)
+        {
+            // load block data from global to LDS, no use of double buffers (to be improved)
+            blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf);
+            blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf);
+            blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf);
+            blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf);
+
+            // Barrier between the LDS writes above and the block-wide reduction below.
+            __syncthreads();
+
+            // Chunks to reduce in this pass: a full GredAccessesPerThreadInBlock, except on the
+            // final pass where only the remaining chunks are taken.
+            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
+                                        ? GredAccessesPerThreadInBlock
+                                        : toReduceBlocks - reducedBlocks;
+
+            blockwise_reduce::Reduce2(in_block_val_buf,
+                                      in_block_idx_buf,
+                                      BlocksInOneOp,
+                                      accuValue_buf(I0),
+                                      accuIndex_buf(I0));
+
+            blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
+            blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
+        }
+
+        // Descriptor of the single-element, thread-private result.
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // The first thread in the block stores the reduced result to the global location
+        // representing the block
+        if(thread_local_id == 0)
+        {
+            if(!float_equal_one{}(alpha))
+                accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+            if(!float_equal_zero{}(beta))
+            {
+                // Read the value currently stored at this block's destination index.
+                // NOTE(review): the last template flag is `true` for the threadwise transfers
+                // in this specialization but `false` in Run<1> and Run<2>; each transfer is
+                // run only once here - confirm whether the difference is intentional.
+                auto threadwise_dst_load =
+                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                     dstDataType,
+                                                     dst1dDescType,
+                                                     decltype(ReducedDataDesc),
+                                                     Sequence<1>,
+                                                     Sequence<0>,
+                                                     0,
+                                                     1,
+                                                     1,
+                                                     true>(dst1dDesc,
+                                                           make_multi_index(block_global_1d_id));
+
+                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+                threadwise_dst_load.Run(dst1dDesc,
+                                        dst_global_val_buf,
+                                        ReducedDataDesc,
+                                        make_tuple(I0),
+                                        priorDstValue_buf);
+
+                // The product is formed in dstDataType and only then converted to compType.
+                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+            }
+
+            // Value store (compType -> dstDataType) and index store (int -> int); both target
+            // this block's index in dst1dDesc.
+            auto threadwise_dst_val_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   dstDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(dst1dDesc,
+                                                         make_multi_index(block_global_1d_id));
+
+            auto threadwise_dst_idx_store =
+                ThreadwiseTensorSliceTransfer_v1r3<int,
+                                                   int,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(dst1dDesc,
+                                                         make_multi_index(block_global_1d_id));
+
+            threadwise_dst_val_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
+            threadwise_dst_idx_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
+        }
+    };
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
+#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
+
+#include "data_type.hpp"
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_threadwise.hpp"
+
+#include "threadwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+// Gridwise reduction of a 2D source view down to a 1D destination in which every thread
+// reduces one complete row by itself: the global thread id is used both as the dimension-0
+// coordinate of the source and as the coordinate of the destination element, and the thread
+// walks along dimension 1 in chunks of GredThreadBufferLength elements.
+//
+// Template parameters:
+//   BlockSize              - number of threads per block, used to derive the global thread id
+//   srcDataType            - element type of the source (and of alpha)
+//   dstDataType            - element type of the destination (and of beta)
+//   compType               - element type used for the per-thread buffers and accumulator
+//   src2dDescType          - type of the 2D source tensor descriptor
+//   dst1dDescType          - type of the 1D destination tensor descriptor
+//   op                     - reduction operation, selects opReduce and the unary operators
+//   nanPropaOpt            - NaN propagation option forwarded to the thread-level reducers
+//   reduceIndicesOpt       - not referenced inside this struct
+//   isFirstCall/isLastCall - select the pre/post unary operators
+//   GredThreadBufferLength - number of elements each thread loads and reduces per iteration
+//
+// Run<RunId> selects the variant:
+//   Run<1> : value-only reduction
+//   Run<2> : reduction that also records the index of the selected element
+//   Run<3> : reduction over (value, index) pairs read from workspace buffers
+template <index_t BlockSize,
+          typename srcDataType,
+          typename dstDataType,
+          typename compType,
+          typename src2dDescType,
+          typename dst1dDescType,
+          ReduceTensorOp_t op,
+          NanPropagation_t nanPropaOpt,
+          ReduceTensorIndices_t reduceIndicesOpt,
+          bool isFirstCall,
+          bool isLastCall,
+          index_t GredThreadBufferLength>
+struct GridwiseReduction_xy_to_x_direct_threadwise
+{
+    // binary reduction operator and the element-wise operators applied before/after reducing
+    using opReduce = typename reduce_binary_operator<compType, op>::opType;
+    using preUnaryOpType =
+        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
+    using posUnaryOpType =
+        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
+
+    static constexpr auto I0 = Number<0>{};
+
+    // Primary declaration; only the specializations for RunId = 1, 2, 3 below are defined.
+    template <int RunId>
+    __device__ static void Run(const src2dDescType& src2dDesc,
+                               const dst1dDescType& dst1dDesc,
+                               int origReduceLen,
+                               srcDataType alpha,
+                               const srcDataType* const __restrict__ p_src_global,
+                               dstDataType beta,
+                               dstDataType* const __restrict__ p_dst_global,
+                               const int* const __restrict__ ws_indices_global,
+                               int* const __restrict__ indices_global);
+
+    // Run<1>: value-only reduction.
+    //
+    //   src2dDesc / dst1dDesc - descriptors of the 2D source and the 1D destination
+    //   origReduceLen         - passed as "divider" to the pre/post unary operators
+    //   alpha                 - scale applied to the reduced value (skipped when equal to one)
+    //   p_src_global          - source values
+    //   beta                  - scale applied to the prior destination value, which is then
+    //                           added to the result (skipped when equal to zero)
+    //   p_dst_global          - destination values
+    //   ws_indices_global, indices_global - unused by this variant
+    template <>
+    __device__ static void Run<1>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)ws_indices_global;
+        (void)indices_global;
+
+        const auto zeroVal = opReduce::GetZeroVal();
+
+        // NOTE(review): the third argument is presumably the value substituted for invalid
+        // (out-of-range) source reads -- confirm against make_dynamic_buffer
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+
+        // per-thread staging buffer for one chunk of the row, held as compType
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
+            in_thread_buf;
+
+        using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
+
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+
+        // the accumulator starts from the zero value of the reduction operator
+        accuValue_buf(I0) = zeroVal;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+        const posUnaryOpType posUnaryOp(divider);
+
+        using ThreadBufferLengths       = Sequence<1, GredThreadBufferLength>;
+        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
+
+        // one row of the source (and one destination element) per thread
+        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
+
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
+                                                                    compType,
+                                                                    src2dDescType,
+                                                                    decltype(ThreadBufferDesc),
+                                                                    ThreadBufferLengths,
+                                                                    Sequence<0, 1>,
+                                                                    1,
+                                                                    1,
+                                                                    1,
+                                                                    false>(
+            src2dDesc, make_multi_index(thread_global_1d_id, 0));
+
+        // after every chunk the source window advances along dimension 1
+        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
+
+        for(index_t reducedLength = 0; reducedLength < toReduceLength;
+            reducedLength += GredThreadBufferLength)
+        {
+            threadwise_src_load.Run(
+                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
+
+            // do element-wise pre-reduction operation
+            threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
+
+            // do the reduction on the Thread Buffer
+            threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
+
+            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+        }
+
+        // post-reduction operation on the accumulated value
+        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
+
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        if(!float_equal_one{}(alpha))
+            accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+        // blend with the value already stored in the destination only when beta is non-zero
+        if(!float_equal_zero{}(beta))
+        {
+            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                                        dstDataType,
+                                                                        dst1dDescType,
+                                                                        decltype(ReducedDataDesc),
+                                                                        Sequence<1>,
+                                                                        Sequence<0>,
+                                                                        0,
+                                                                        1,
+                                                                        1,
+                                                                        true>(
+                dst1dDesc, make_multi_index(thread_global_1d_id));
+
+            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+            threadwise_dst_load.Run(
+                dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
+
+            accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+        }
+
+        auto threadwise_dst_store =
+            ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                               dstDataType,
+                                               decltype(ReducedDataDesc),
+                                               dst1dDescType,
+                                               Sequence<1>,
+                                               Sequence<0>,
+                                               0,
+                                               1,
+                                               InMemoryDataOperationEnum_t::Set,
+                                               1,
+                                               true>(dst1dDesc,
+                                                     make_multi_index(thread_global_1d_id));
+
+        threadwise_dst_store.Run(
+            ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
+    }
+
+    // Run<2>: reduction that also records an index for the selected element.
+    //
+    // Same parameters as Run<1>, except that indices_global receives the index output and
+    // only ws_indices_global is unused. The index handed to Reduce2 starts at 0 for the
+    // first chunk and grows by GredThreadBufferLength per chunk. No post-reduction unary
+    // operator is applied in this variant.
+    template <>
+    __device__ static void Run<2>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)ws_indices_global;
+
+        const auto zeroVal = opReduce::GetZeroVal();
+
+        // NOTE(review): the third argument is presumably the value substituted for invalid
+        // (out-of-range) source reads -- confirm against make_dynamic_buffer
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            indices_global, dst1dDesc.GetElementSpaceSize());
+
+        // per-thread staging buffer for one chunk of the row, held as compType
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
+            in_thread_buf;
+
+        using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
+
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
+
+        accuValue_buf(I0) = zeroVal;
+        accuIndex_buf(I0) = 0;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+
+        using ThreadBufferLengths       = Sequence<1, GredThreadBufferLength>;
+        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
+
+        // one row of the source (and one destination element) per thread
+        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
+
+        // in_thread_buf stores compType, so the loader has to produce compType elements
+        // (same instantiation as in Run<1>); using dstDataType here is only correct when
+        // dstDataType and compType happen to be the same type
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
+                                                                    compType,
+                                                                    src2dDescType,
+                                                                    decltype(ThreadBufferDesc),
+                                                                    ThreadBufferLengths,
+                                                                    Sequence<0, 1>,
+                                                                    1,
+                                                                    1,
+                                                                    1,
+                                                                    false>(
+            src2dDesc, make_multi_index(thread_global_1d_id, 0));
+
+        // after every chunk the source window advances along dimension 1
+        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
+
+        // offset along dimension 1 of the first element of the current chunk
+        index_t indexStart = 0;
+        for(index_t reducedLength = 0; reducedLength < toReduceLength;
+            reducedLength += GredThreadBufferLength)
+        {
+            threadwise_src_load.Run(
+                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
+
+            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
+            // done here
+            threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
+
+            // do the reduction on the Thread Buffer
+            threadwise_reduce::Reduce2(
+                in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart);
+
+            indexStart += GredThreadBufferLength;
+
+            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+        }
+
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        if(!float_equal_one{}(alpha))
+            accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+        // blend with the value already stored in the destination only when beta is non-zero
+        if(!float_equal_zero{}(beta))
+        {
+            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                                        dstDataType,
+                                                                        dst1dDescType,
+                                                                        decltype(ReducedDataDesc),
+                                                                        Sequence<1>,
+                                                                        Sequence<0>,
+                                                                        0,
+                                                                        1,
+                                                                        1,
+                                                                        false>(
+                dst1dDesc, make_multi_index(thread_global_1d_id));
+
+            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+            threadwise_dst_load.Run(
+                dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
+
+            accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+        }
+
+        auto threadwise_dst_val_store =
+            ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                               dstDataType,
+                                               decltype(ReducedDataDesc),
+                                               dst1dDescType,
+                                               Sequence<1>,
+                                               Sequence<0>,
+                                               0,
+                                               1,
+                                               InMemoryDataOperationEnum_t::Set,
+                                               1,
+                                               false>(dst1dDesc,
+                                                      make_multi_index(thread_global_1d_id));
+
+        auto threadwise_dst_idx_store =
+            ThreadwiseTensorSliceTransfer_v1r3<int,
+                                               int,
+                                               decltype(ReducedDataDesc),
+                                               dst1dDescType,
+                                               Sequence<1>,
+                                               Sequence<0>,
+                                               0,
+                                               1,
+                                               InMemoryDataOperationEnum_t::Set,
+                                               1,
+                                               false>(dst1dDesc,
+                                                      make_multi_index(thread_global_1d_id));
+
+        // write the reduced value and its index to their respective destinations
+        threadwise_dst_val_store.Run(
+            ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
+        threadwise_dst_idx_store.Run(
+            ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
+    }
+
+    // Run<3>: reduction over (value, index) pairs.
+    //
+    // Values are read from ws_values_global and their indices from ws_indices_global, both
+    // addressed through src2dDesc; the reduced value goes to p_dst_global and the index that
+    // accompanies it goes to indices_global. origReduceLen is unused and no unary operators
+    // are applied. alpha and beta are handled as in Run<1>.
+    template <>
+    __device__ static void Run<3>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ ws_values_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)origReduceLen;
+
+        const auto zeroVal = opReduce::GetZeroVal();
+
+        // NOTE(review): the third argument is presumably the value substituted for invalid
+        // (out-of-range) source reads -- confirm against make_dynamic_buffer
+        const auto src_global_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
+                                                            src2dDesc.GetElementSpaceSize(),
+                                                            type_convert<srcDataType>{}(zeroVal));
+        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            ws_indices_global, src2dDesc.GetElementSpaceSize());
+        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            indices_global, dst1dDesc.GetElementSpaceSize());
+
+        // per-thread staging buffers for one chunk of values (compType) and their indices
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
+            in_thread_val_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredThreadBufferLength, true> in_thread_idx_buf;
+
+        using threadwise_reduce = ThreadReduceWithIndicesInput<decltype(in_thread_val_buf),
+                                                               decltype(in_thread_idx_buf),
+                                                               opReduce,
+                                                               nanPropaOpt>;
+
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
+
+        accuValue_buf(I0) = zeroVal;
+        accuIndex_buf(I0) = 0;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+
+        using ThreadBufferLengths       = Sequence<1, GredThreadBufferLength>;
+        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
+
+        // one row of the source (and one destination element) per thread
+        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
+
+        // in_thread_val_buf stores compType, so the loader has to produce compType elements
+        // (same instantiation as in Run<1>); using dstDataType here is only correct when
+        // dstDataType and compType happen to be the same type
+        auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
+                                                                        compType,
+                                                                        src2dDescType,
+                                                                        decltype(ThreadBufferDesc),
+                                                                        ThreadBufferLengths,
+                                                                        Sequence<0, 1>,
+                                                                        1,
+                                                                        1,
+                                                                        1,
+                                                                        false>(
+            src2dDesc, make_multi_index(thread_global_1d_id, 0));
+
+        auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
+                                                                        int,
+                                                                        src2dDescType,
+                                                                        decltype(ThreadBufferDesc),
+                                                                        ThreadBufferLengths,
+                                                                        Sequence<0, 1>,
+                                                                        1,
+                                                                        1,
+                                                                        1,
+                                                                        false>(
+            src2dDesc, make_multi_index(thread_global_1d_id, 0));
+
+        // after every chunk both source windows advance along dimension 1
+        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
+
+        for(index_t reducedLength = 0; reducedLength < toReduceLength;
+            reducedLength += GredThreadBufferLength)
+        {
+            threadwise_src_val_load.Run(src2dDesc,
+                                        src_global_val_buf,
+                                        ThreadBufferDesc,
+                                        make_tuple(I0, I0),
+                                        in_thread_val_buf);
+            threadwise_src_idx_load.Run(src2dDesc,
+                                        src_global_idx_buf,
+                                        ThreadBufferDesc,
+                                        make_tuple(I0, I0),
+                                        in_thread_idx_buf);
+
+            // do the reduction on the Thread Buffer
+            threadwise_reduce::Reduce(
+                in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));
+
+            threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+            threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+        }
+
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        if(!float_equal_one{}(alpha))
+            accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+        // blend with the value already stored in the destination only when beta is non-zero
+        if(!float_equal_zero{}(beta))
+        {
+            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                                        dstDataType,
+                                                                        dst1dDescType,
+                                                                        decltype(ReducedDataDesc),
+                                                                        Sequence<1>,
+                                                                        Sequence<0>,
+                                                                        0,
+                                                                        1,
+                                                                        1,
+                                                                        false>(
+                dst1dDesc, make_multi_index(thread_global_1d_id));
+
+            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+            threadwise_dst_load.Run(
+                dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
+
+            accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+        }
+
+        auto threadwise_dst_val_store =
+            ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                               dstDataType,
+                                               decltype(ReducedDataDesc),
+                                               dst1dDescType,
+                                               Sequence<1>,
+                                               Sequence<0>,
+                                               0,
+                                               1,
+                                               InMemoryDataOperationEnum_t::Set,
+                                               1,
+                                               false>(dst1dDesc,
+                                                      make_multi_index(thread_global_1d_id));
+
+        auto threadwise_dst_idx_store =
+            ThreadwiseTensorSliceTransfer_v1r3<int,
+                                               int,
+                                               decltype(ReducedDataDesc),
+                                               dst1dDescType,
+                                               Sequence<1>,
+                                               Sequence<0>,
+                                               0,
+                                               1,
+                                               InMemoryDataOperationEnum_t::Set,
+                                               1,
+                                               false>(dst1dDesc,
+                                                      make_multi_index(thread_global_1d_id));
+
+        // write the reduced value and its index to their respective destinations
+        threadwise_dst_val_store.Run(
+            ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
+        threadwise_dst_idx_store.Run(
+            ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
+    }
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
+#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
+
+#include "data_type.hpp"
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_warpwise.hpp"
+
+#include "threadwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename srcDataType,
+          typename dstDataType,
+          typename compType,
+          typename src2dDescType,
+          typename dst1dDescType,
+          ReduceTensorOp_t op,
+          NanPropagation_t nanPropaOpt,
+          ReduceTensorIndices_t reduceIndicesOpt,
+          bool isFirstCall,
+          bool isLastCall,
+          index_t GredAccessesPerThreadInWarp>
+struct GridwiseReduction_xy_to_x_direct_warpwise
+{
+    using opReduce = typename reduce_binary_operator<compType, op>::opType;
+    using preUnaryOpType =
+        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
+    using posUnaryOpType =
+        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
+
+    static constexpr auto I0 = Number<0>{};
+
+    template <int RunId>
+    __device__ static void Run(const src2dDescType& src2dDesc,
+                               const dst1dDescType& dst1dDesc,
+                               int origReduceLen,
+                               srcDataType alpha,
+                               const srcDataType* const __restrict__ p_src_global,
+                               dstDataType beta,
+                               dstDataType* const __restrict__ p_dst_global,
+                               const int* const __restrict__ ws_indices_global,
+                               int* const __restrict__ indices_global);
+
+    template <>
+    __device__ static void Run<1>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)ws_indices_global;
+        (void)indices_global;
+
+        auto zeroVal = opReduce::GetZeroVal();
+
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
+            in_thread_buf;
+
+        using warpwise_reduce =
+            WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
+
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+
+        accuValue_buf(I0) = zeroVal;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+        const posUnaryOpType posUnaryOp(divider);
+
+        using ThreadBufferLengths       = Sequence<1, GredAccessesPerThreadInWarp>;
+        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
+
+        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
+        index_t warp_global_1d_id   = thread_global_1d_id / warpSize;
+        index_t thread_inwarp_id    = thread_global_1d_id % warpSize;
+
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
+                                                                    compType,
+                                                                    src2dDescType,
+                                                                    decltype(ThreadBufferDesc),
+                                                                    ThreadBufferLengths,
+                                                                    Sequence<0, 1>,
+                                                                    1,
+                                                                    1,
+                                                                    1,
+                                                                    false>(
+            src2dDesc,
+            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
+
+        constexpr auto in_thread_copy_step =
+            make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
+
+        for(index_t reducedLength = 0; reducedLength < toReduceLength;
+            reducedLength += warpSize * GredAccessesPerThreadInWarp)
+        {
+            threadwise_src_load.Run(
+                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
+
+            // do element-wise pre-reduction operation
+            warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
+
+            // do the warp-wise reduction on data of all thread buffers
+            warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
+
+            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+        }
+
+        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
+
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // The first thread in the warp stores the reduced result to the global location
+        // representing the Warp
+        if(thread_inwarp_id == 0)
+        {
+            if(!float_equal_one{}(alpha))
+                accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+            if(!float_equal_zero{}(beta))
+            {
+                auto threadwise_dst_load =
+                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                     dstDataType,
+                                                     dst1dDescType,
+                                                     decltype(ReducedDataDesc),
+                                                     Sequence<1>,
+                                                     Sequence<0>,
+                                                     0,
+                                                     1,
+                                                     1,
+                                                     true>(dst1dDesc,
+                                                           make_multi_index(warp_global_1d_id));
+
+                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+                threadwise_dst_load.Run(
+                    dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
+
+                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf(I0) * beta);
+            }
+
+            auto threadwise_dst_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   dstDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(dst1dDesc,
+                                                         make_multi_index(warp_global_1d_id));
+
+            threadwise_dst_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
+        }
+    };
+
+    // Run<2>: warp-wise reduction that produces both a reduced value and an index.
+    //
+    // Each warp starts reading at row `warp_global_1d_id` of src2dDesc and walks along
+    // dimension 1 in steps of warpSize * GredAccessesPerThreadInWarp until
+    // src2dDesc.GetLength(1) has been covered. Per step every thread loads
+    // GredAccessesPerThreadInWarp elements, applies preUnaryOp to them and hands them to
+    // warpwise_reduce::Reduce2 together with the running value, running index and the
+    // current indexOffset. Afterwards the thread with thread_inwarp_id == 0 applies
+    // alpha/beta and stores value and index at position `warp_global_1d_id` of the
+    // destination.
+    //
+    // Parameters:
+    //   src2dDesc         - descriptor used to address p_src_global
+    //   dst1dDesc         - descriptor used to address p_dst_global and indices_global
+    //   origReduceLen     - forwarded to the preUnaryOpType constructor as `divider`
+    //   alpha             - multiplied into the reduced value unless float_equal_one
+    //                       accepts it
+    //   p_src_global      - source data
+    //   beta              - when float_equal_zero rejects it, the prior destination value
+    //                       times beta is added to the reduced value
+    //   p_dst_global      - destination for the reduced values
+    //   ws_indices_global - not used by this specialization
+    //   indices_global    - destination for the reduced indices
+    template <>
+    __device__ static void Run<2>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)ws_indices_global;
+
+        auto zeroVal = opReduce::GetZeroVal();
+
+        // The source buffer is created with zeroVal as an extra argument.
+        // NOTE(review): presumably this is the value yielded for invalid/out-of-range
+        // elements, so they do not disturb the reduction - confirm in make_dynamic_buffer.
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            indices_global, dst1dDesc.GetElementSpaceSize());
+
+        // per-thread staging buffer for the elements loaded in one step
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
+            in_thread_buf;
+
+        using warpwise_reduce =
+            WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
+
+        // running reduced value and its index, carried across all steps
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
+
+        accuValue_buf(I0) = zeroVal;
+        accuIndex_buf(I0) = 0;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+
+        using ThreadBufferLengths       = Sequence<1, GredAccessesPerThreadInWarp>;
+        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
+
+        // split the global thread id into a warp id (selects the row) and a lane id
+        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
+        index_t warp_global_1d_id   = thread_global_1d_id / warpSize;
+        index_t thread_inwarp_id    = thread_global_1d_id % warpSize;
+
+        // each lane starts at its own GredAccessesPerThreadInWarp-wide slot of the warp's row
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
+                                                                    compType,
+                                                                    src2dDescType,
+                                                                    decltype(ThreadBufferDesc),
+                                                                    ThreadBufferLengths,
+                                                                    Sequence<0, 1>,
+                                                                    1,
+                                                                    1,
+                                                                    1,
+                                                                    false>(
+            src2dDesc,
+            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
+
+        // one step moves the window by everything the whole warp consumed
+        constexpr auto in_thread_copy_step =
+            make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
+
+        // indexOffset grows by the same amount as the window moves and is passed to Reduce2.
+        // NOTE(review): presumably the base position of the current step along the reduced
+        // dimension - confirm against WarpReduce::Reduce2.
+        index_t indexOffset = 0;
+        for(index_t reducedLength = 0; reducedLength < toReduceLength;
+            reducedLength += warpSize * GredAccessesPerThreadInWarp)
+        {
+            threadwise_src_load.Run(
+                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
+
+            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
+            // done here
+            warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
+
+            // do the warp-wise reduction on data of all thread buffers
+            warpwise_reduce::Reduce2(
+                in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset);
+
+            indexOffset += warpSize * GredAccessesPerThreadInWarp;
+
+            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+        }
+
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // The first thread in the warp stores the reduced result to the global location
+        // representing the Warp
+        if(thread_inwarp_id == 0)
+        {
+            // scale the reduced value by alpha (skipped when alpha compares equal to one)
+            if(!float_equal_one{}(alpha))
+                accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+            // blend in beta * (value already stored at the destination); the destination is
+            // only read when beta is non-zero
+            if(!float_equal_zero{}(beta))
+            {
+                auto threadwise_dst_load =
+                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                     dstDataType,
+                                                     dst1dDescType,
+                                                     decltype(ReducedDataDesc),
+                                                     Sequence<1>,
+                                                     Sequence<0>,
+                                                     0,
+                                                     1,
+                                                     1,
+                                                     true>(dst1dDesc,
+                                                           make_multi_index(warp_global_1d_id));
+
+                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+                threadwise_dst_load.Run(dst1dDesc,
+                                        dst_global_val_buf,
+                                        ReducedDataDesc,
+                                        make_tuple(I0),
+                                        priorDstValue_buf);
+
+                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+            }
+
+            // value and index are written with two separate single-element stores, both
+            // positioned at warp_global_1d_id of dst1dDesc
+            auto threadwise_dst_val_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   dstDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(dst1dDesc,
+                                                         make_multi_index(warp_global_1d_id));
+
+            auto threadwise_dst_idx_store =
+                ThreadwiseTensorSliceTransfer_v1r3<int,
+                                                   int,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(dst1dDesc,
+                                                         make_multi_index(warp_global_1d_id));
+
+            threadwise_dst_val_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
+            threadwise_dst_idx_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
+        }
+    };
+
+    // Run<3>: warp-wise reduction over (value, index) pairs read from workspace buffers.
+    //
+    // Values come from ws_values_global and indices from ws_indices_global; both are
+    // addressed through src2dDesc with identical origins and window moves. Each warp
+    // starts at row `warp_global_1d_id` and walks along dimension 1 in steps of
+    // warpSize * GredAccessesPerThreadInWarp until src2dDesc.GetLength(1) is covered,
+    // feeding every step into WarpReduceWithIndicesInput::Reduce. No pre-unary operation
+    // is applied here. Afterwards the thread with thread_inwarp_id == 0 applies alpha/beta
+    // and stores value and index at position `warp_global_1d_id` of the destination.
+    //
+    // Parameters:
+    //   src2dDesc         - descriptor used to address both workspace inputs
+    //   dst1dDesc         - descriptor used to address p_dst_global and indices_global
+    //   origReduceLen     - not used by this specialization
+    //   alpha             - multiplied into the reduced value unless float_equal_one
+    //                       accepts it
+    //   ws_values_global  - input values
+    //   beta              - when float_equal_zero rejects it, the prior destination value
+    //                       times beta is added to the reduced value
+    //   p_dst_global      - destination for the reduced values
+    //   ws_indices_global - input indices accompanying ws_values_global
+    //   indices_global    - destination for the reduced indices
+    template <>
+    __device__ static void Run<3>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ ws_values_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)origReduceLen;
+
+        auto zeroVal = opReduce::GetZeroVal();
+
+        // Only the value buffer receives zeroVal as an extra argument; the index buffer is
+        // created without one.
+        // NOTE(review): presumably the value yielded for invalid/out-of-range elements -
+        // confirm in make_dynamic_buffer.
+        const auto src_global_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
+                                                            src2dDesc.GetElementSpaceSize(),
+                                                            type_convert<srcDataType>{}(zeroVal));
+        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            ws_indices_global, src2dDesc.GetElementSpaceSize());
+        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_dst_global, dst1dDesc.GetElementSpaceSize());
+        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            indices_global, dst1dDesc.GetElementSpaceSize());
+
+        // per-thread staging buffers for the values and indices loaded in one step
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
+            in_thread_val_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredAccessesPerThreadInWarp, true>
+            in_thread_idx_buf;
+
+        using warpwise_reduce = WarpReduceWithIndicesInput<decltype(in_thread_val_buf),
+                                                           decltype(in_thread_idx_buf),
+                                                           BlockSize,
+                                                           opReduce,
+                                                           nanPropaOpt>;
+
+        // running reduced value and its index, carried across all steps
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
+
+        accuValue_buf(I0) = zeroVal;
+        accuIndex_buf(I0) = 0;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+
+        using ThreadBufferLengths       = Sequence<1, GredAccessesPerThreadInWarp>;
+        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
+
+        // split the global thread id into a warp id (selects the row) and a lane id
+        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
+        index_t warp_global_1d_id   = thread_global_1d_id / warpSize;
+        index_t thread_inwarp_id    = thread_global_1d_id % warpSize;
+
+        // value loader and index loader share the same origin so that element k of
+        // in_thread_val_buf and element k of in_thread_idx_buf come from the same position
+        auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
+                                                                        compType,
+                                                                        src2dDescType,
+                                                                        decltype(ThreadBufferDesc),
+                                                                        ThreadBufferLengths,
+                                                                        Sequence<0, 1>,
+                                                                        1,
+                                                                        1,
+                                                                        1,
+                                                                        false>(
+            src2dDesc,
+            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
+
+        auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
+                                                                        int,
+                                                                        src2dDescType,
+                                                                        decltype(ThreadBufferDesc),
+                                                                        ThreadBufferLengths,
+                                                                        Sequence<0, 1>,
+                                                                        1,
+                                                                        1,
+                                                                        1,
+                                                                        false>(
+            src2dDesc,
+            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
+
+        // one step moves the window by everything the whole warp consumed
+        constexpr auto in_thread_copy_step =
+            make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
+
+        for(index_t reducedLength = 0; reducedLength < toReduceLength;
+            reducedLength += warpSize * GredAccessesPerThreadInWarp)
+        {
+            threadwise_src_val_load.Run(src2dDesc,
+                                        src_global_val_buf,
+                                        ThreadBufferDesc,
+                                        make_tuple(I0, I0),
+                                        in_thread_val_buf);
+            threadwise_src_idx_load.Run(src2dDesc,
+                                        src_global_idx_buf,
+                                        ThreadBufferDesc,
+                                        make_tuple(I0, I0),
+                                        in_thread_idx_buf);
+
+            // do the warp-wise reduction on data of all thread buffers
+            warpwise_reduce::Reduce(
+                in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));
+
+            // both windows must advance together to keep values and indices paired
+            threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+            threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
+        }
+
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // The first thread in the warp stores the reduced result to the global location
+        // representing the Warp
+        if(thread_inwarp_id == 0)
+        {
+            // scale the reduced value by alpha (skipped when alpha compares equal to one)
+            if(!float_equal_one{}(alpha))
+                accuValue_buf(I0) *= type_convert<compType>{}(alpha);
+
+            // blend in beta * (value already stored at the destination); the destination is
+            // only read when beta is non-zero
+            if(!float_equal_zero{}(beta))
+            {
+                auto threadwise_dst_load =
+                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
+                                                     dstDataType,
+                                                     dst1dDescType,
+                                                     decltype(ReducedDataDesc),
+                                                     Sequence<1>,
+                                                     Sequence<0>,
+                                                     0,
+                                                     1,
+                                                     1,
+                                                     true>(dst1dDesc,
+                                                           make_multi_index(warp_global_1d_id));
+
+                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
+
+                threadwise_dst_load.Run(dst1dDesc,
+                                        dst_global_val_buf,
+                                        ReducedDataDesc,
+                                        make_tuple(I0),
+                                        priorDstValue_buf);
+
+                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
+            }
+
+            // value and index are written with two separate single-element stores, both
+            // positioned at warp_global_1d_id of dst1dDesc
+            auto threadwise_dst_val_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   dstDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(dst1dDesc,
+                                                         make_multi_index(warp_global_1d_id));
+
+            auto threadwise_dst_idx_store =
+                ThreadwiseTensorSliceTransfer_v1r3<int,
+                                                   int,
+                                                   decltype(ReducedDataDesc),
+                                                   dst1dDescType,
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(dst1dDesc,
+                                                         make_multi_index(warp_global_1d_id));
+
+            threadwise_dst_val_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
+            threadwise_dst_idx_store.Run(
+                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
+        }
+    };
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
+#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_blockwise.hpp"
+
+#include "blockwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename srcDataType,
+          typename dstDataType, // not used together with the beta input
+          typename compType,
+          typename src2dDescType,
+          typename dst1dDescType,
+          ReduceTensorOp_t op,
+          NanPropagation_t nanPropaOpt,
+          ReduceTensorIndices_t reduceIndicesOpt,
+          index_t GredAccessesPerThreadInBlock>
+struct GridwiseReduction_xy_to_x_multiblock
+{
+    using opReduce       = typename reduce_binary_operator<compType, op>::opType;
+    using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
+    using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;
+
+    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
+    using blockwise_reduce =
+        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
+
+    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
+
+    static constexpr auto I0 = Number<0>{};
+
+    // Primary declaration of the multi-block reduction entry point. It has no body of its
+    // own; the work is done by the explicit specializations selected through RunId
+    // (Run<1> and Run<2> follow below).
+    //
+    // Parameters:
+    //   src2dDesc         - descriptor used to address p_src_global
+    //   dst1dDesc         - its length along dimension 0, times BlkGroupSize, sizes the
+    //                       workspace buffers
+    //   origReduceLen     - forwarded to the pre-unary operator as `divider`
+    //   BlkGroupSize      - number of consecutive blocks that form one block group; the
+    //                       block id is divided by it to select the source row
+    //   alpha, beta       - cast to void by Run<1> and Run<2>
+    //   p_src_global      - source data
+    //   ws_values_global  - workspace receiving per-block reduced values
+    //   ws_indices_global - workspace for per-block indices (cast to void by Run<1>)
+    template <int RunId>
+    __device__ static void Run(const src2dDescType& src2dDesc,
+                               const dst1dDescType& dst1dDesc,
+                               int origReduceLen,
+                               int BlkGroupSize,
+                               srcDataType alpha,
+                               const srcDataType* const __restrict__ p_src_global,
+                               dstDataType beta,
+                               srcDataType* const __restrict__ ws_values_global,
+                               int* const __restrict__ ws_indices_global);
+
+    // Run<1>: multi-block reduction without index tracking.
+    //
+    // BlkGroupSize consecutive blocks share one source row (blkgroup_id); each block
+    // reduces its own reduceSizePerBlock-wide segment of that row and thread 0 writes the
+    // block's result to ws_values_global at position block_global_id. Only preUnaryOp is
+    // applied here; alpha, beta and the post-unary operator are not used in this function.
+    //
+    // Parameters:
+    //   src2dDesc         - descriptor used to address p_src_global
+    //   dst1dDesc         - its length along dimension 0, times BlkGroupSize, gives the
+    //                       workspace size
+    //   origReduceLen     - forwarded to the preUnaryOpType constructor as `divider`
+    //   BlkGroupSize      - number of blocks cooperating on one source row
+    //   alpha, beta       - not used by this specialization
+    //   p_src_global      - source data
+    //   ws_values_global  - workspace receiving one reduced value per block
+    //   ws_indices_global - not used by this specialization
+    template <>
+    __device__ static void Run<1>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  int BlkGroupSize,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  srcDataType* const __restrict__ ws_values_global,
+                                  int* const __restrict__ ws_indices_global)
+    {
+        (void)ws_indices_global;
+
+        (void)alpha; // unused
+        (void)beta;  // unused
+
+        auto zeroVal = opReduce::GetZeroVal();
+
+        // LDS
+        __shared__ compType p_in_block_buffer[BlockBufferSize];
+
+        // The source buffer is created with zeroVal as an extra argument.
+        // NOTE(review): presumably the value yielded for invalid/out-of-range elements, which
+        // matters because reduceSizePerBlock is rounded up below - confirm in
+        // make_dynamic_buffer.
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
+
+        auto in_block_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+
+        accuValue_buf(I0) = zeroVal;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+
+        // blkgroup_id selects the source row, block_local_id the segment within that row
+        const index_t thread_local_id = get_thread_local_1d_id();
+        const index_t block_global_id = get_block_1d_id();
+        const index_t blkgroup_id     = block_global_id / BlkGroupSize;
+        const index_t block_local_id  = block_global_id % BlkGroupSize;
+
+        // ceil(toReduceLength / BlkGroupSize), rounded up to a multiple of BlockBufferSize
+        const index_t reduceSizePerBlock =
+            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
+             BlockBufferSize) *
+            BlockBufferSize;
+
+        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
+
+        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
+        using ThreadClusterLengths = Sequence<1, BlockSize>;
+
+        // block-wide copy from the global source into the LDS buffer; the source window
+        // starts at this block's segment of row blkgroup_id
+        auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
+                                                                  InMemoryDataOperationEnum_t::Set,
+                                                                  Sequence<1, BlockBufferSize>,
+                                                                  ThreadSliceLengths,
+                                                                  ThreadClusterLengths,
+                                                                  Sequence<0, 1>,
+                                                                  srcDataType,
+                                                                  compType,
+                                                                  src2dDescType,
+                                                                  decltype(in_block_desc),
+                                                                  Sequence<0, 1>,
+                                                                  Sequence<0, 1>,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  false,
+                                                                  true>(
+            src2dDesc,
+            make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
+            in_block_desc,
+            make_multi_index(0, 0));
+
+        // each iteration consumes one full LDS buffer worth of source elements
+        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
+
+        // number of BlockSize-wide chunks in this block's segment
+        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
+
+        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
+            reducedBlocks += GredAccessesPerThreadInBlock)
+        {
+            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
+            blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
+            // block-wide barrier between filling in_block_buf and operating on it
+            __syncthreads();
+
+            // do element-wise pre-reduction operation
+            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
+
+            // chunks handed to Reduce in this iteration: a full GredAccessesPerThreadInBlock,
+            // or whatever is left on the final iteration
+            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
+                                        ? GredAccessesPerThreadInBlock
+                                        : toReduceBlocks - reducedBlocks;
+            blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
+
+            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
+        }
+
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // flat 1-d view of the workspace: one slot per block
+        const auto workspace_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
+
+        // The first thread in the block stores the reduced result to the global location
+        // representing the block
+        if(thread_local_id == 0)
+        {
+            auto threadwise_workspace_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   srcDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   decltype(workspace_desc),
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(workspace_desc,
+                                                         make_multi_index(block_global_id));
+
+            threadwise_workspace_store.Run(ReducedDataDesc,
+                                           make_tuple(I0),
+                                           accuValue_buf,
+                                           workspace_desc,
+                                           workspace_global_buf);
+        }
+    };
+
+    // Specialization Run<2>: block-wise reduction that tracks the reduced value together with
+    // the index of the element that produced it.
+    //
+    // Every block reduces one segment of a single row of src2dDesc: the row is
+    // block_global_id / BlkGroupSize and the segment starts at column
+    // (block_global_id % BlkGroupSize) * reduceSizePerBlock. Thread 0 of the block then stores
+    // the block's partial value and partial index into ws_values_global / ws_indices_global at
+    // position block_global_id.
+    //
+    // Parameters:
+    //   src2dDesc         - descriptor of the 2D source view; dimension 1 is the one reduced
+    //   dst1dDesc         - descriptor of the 1D destination; only its length is used here, to
+    //                       size the workspace buffers
+    //   origReduceLen     - forwarded to the pre-unary operator as its divider
+    //   BlkGroupSize      - number of blocks that share one source row
+    //   alpha, beta       - ignored by this specialization
+    //   p_src_global      - source data
+    //   ws_values_global  - workspace receiving one partial value per block
+    //   ws_indices_global - workspace receiving one partial index per block
+    template <>
+    __device__ static void Run<2>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  int BlkGroupSize,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  srcDataType* const __restrict__ ws_values_global,
+                                  int* const __restrict__ ws_indices_global)
+    {
+        (void)alpha; // unused
+        (void)beta;  // unused
+
+        // Starting value of the accumulator, as supplied by the reduction operator
+        auto zeroVal = opReduce::GetZeroVal();
+
+        // LDS
+        // One tile of values plus a parallel tile holding the index of each value
+        __shared__ compType p_in_block_values_buffer[BlockBufferSize];
+        __shared__ int p_in_block_indices_buffer[BlockBufferSize];
+
+        // NOTE(review): the third argument is presumably the value substituted for invalid
+        // (out-of-range) source elements - confirm against make_dynamic_buffer
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
+        // Both workspace buffers hold dst1dDesc.GetLength(I0) * BlkGroupSize elements
+        auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
+        auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
+
+        auto in_block_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_values_buffer, BlockBufferSize);
+        auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+            p_in_block_indices_buffer, BlockBufferSize);
+        // Single-element accumulators for the running value and its index
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
+        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
+
+        accuValue_buf(I0) = zeroVal;
+        accuIndex_buf(I0) = 0;
+
+        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
+        const int divider         = origReduceLen;
+
+        const preUnaryOpType preUnaryOp(divider);
+
+        const index_t thread_local_id = get_thread_local_1d_id();
+        const index_t block_global_id = get_block_1d_id();
+        // Row handled by this block, and the position of the block among the blocks of that row
+        const index_t blkgroup_id     = block_global_id / BlkGroupSize;
+        const index_t block_local_id  = block_global_id % BlkGroupSize;
+
+        // Columns assigned to each block: ceil(toReduceLength / BlkGroupSize), rounded up to a
+        // multiple of BlockBufferSize
+        const index_t reduceSizePerBlock =
+            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
+             BlockBufferSize) *
+            BlockBufferSize;
+
+        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
+
+        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
+        using ThreadClusterLengths = Sequence<1, BlockSize>;
+
+        // Block-wide copy of a 1 x BlockBufferSize tile; the source window starts at
+        // (blkgroup_id, block_local_id * reduceSizePerBlock), the destination at (0, 0)
+        auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
+                                                                  InMemoryDataOperationEnum_t::Set,
+                                                                  Sequence<1, BlockBufferSize>,
+                                                                  ThreadSliceLengths,
+                                                                  ThreadClusterLengths,
+                                                                  Sequence<0, 1>,
+                                                                  srcDataType,
+                                                                  compType,
+                                                                  src2dDescType,
+                                                                  decltype(in_block_desc),
+                                                                  Sequence<0, 1>,
+                                                                  Sequence<0, 1>,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  1,
+                                                                  false,
+                                                                  true>(
+            src2dDesc,
+            make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
+            in_block_desc,
+            make_multi_index(0, 0));
+
+        // The source window advances by one tile (BlockBufferSize columns) per iteration
+        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
+
+        // Number of BlockSize-wide chunks contained in this block's segment
+        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
+
+        // Start column of the current tile; handed to init_buffer_indices as the index base
+        int indexOffset = block_local_id * reduceSizePerBlock;
+
+        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
+            reducedBlocks += GredAccessesPerThreadInBlock)
+        {
+            blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
+
+            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
+            blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
+
+            // Make the freshly written tile visible to all threads of the block
+            __syncthreads();
+
+            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
+            // done here
+            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
+
+            // GredAccessesPerThreadInBlock chunks per step, or whatever is left on the last step
+            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
+                                        ? GredAccessesPerThreadInBlock
+                                        : toReduceBlocks - reducedBlocks;
+
+            blockwise_reduce::Reduce2(in_block_val_buf,
+                                      in_block_idx_buf,
+                                      BlocksInOneOp,
+                                      accuValue_buf(I0),
+                                      accuIndex_buf(I0));
+
+            indexOffset += BlockBufferSize;
+
+            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
+        }
+
+        // Descriptor of the single reduced element held by a thread
+        constexpr auto ReducedDataDesc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
+
+        // 1D descriptor shared by the value workspace and the index workspace
+        const auto workspace_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
+
+        // The first thread in the block stores the reduced result to the global location
+        // representing the block
+        if(thread_local_id == 0)
+        {
+            // Stores the value accumulator (compType -> srcDataType) at slot block_global_id
+            auto threadwise_workspace_val_store =
+                ThreadwiseTensorSliceTransfer_v1r3<compType,
+                                                   srcDataType,
+                                                   decltype(ReducedDataDesc),
+                                                   decltype(workspace_desc),
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(workspace_desc,
+                                                         make_multi_index(block_global_id));
+
+            // Stores the index accumulator (int -> int) at the same slot
+            auto threadwise_workspace_idx_store =
+                ThreadwiseTensorSliceTransfer_v1r3<int,
+                                                   int,
+                                                   decltype(ReducedDataDesc),
+                                                   decltype(workspace_desc),
+                                                   Sequence<1>,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   InMemoryDataOperationEnum_t::Set,
+                                                   1,
+                                                   true>(workspace_desc,
+                                                         make_multi_index(block_global_id));
+
+            threadwise_workspace_val_store.Run(ReducedDataDesc,
+                                               make_tuple(I0),
+                                               accuValue_buf,
+                                               workspace_desc,
+                                               workspace_global_val_buf);
+            threadwise_workspace_idx_store.Run(ReducedDataDesc,
+                                               make_tuple(I0),
+                                               accuIndex_buf,
+                                               workspace_desc,
+                                               workspace_global_idx_buf);
+        }
+    };
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp
+++ b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
+#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
+
+#include "data_type.hpp"
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_binop.hpp"
+
+namespace ck {
+
+template <typename buffer2dDescType,
+          bool blockIsOneRow,
+          typename opReduce,
+          NanPropagation_t nanPropaOpt>
+struct BlockwiseReduction_2d_block_buffer
+{
+    using compType = typename opReduce::dataType;
+
+    static constexpr auto buffer2dDesc = buffer2dDescType{};
+
+    static constexpr index_t BlockSize =
+        blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
+    static constexpr index_t NumBlocks =
+        blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});
+    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    // Block-wide reduction of data values only (indices are not tracked).
+    //
+    // Every thread first folds its own toReduceBlocks elements of block_buffer into a private
+    // partial and stores that partial in its slot of block 0. The per-thread partials are then
+    // combined by a halving tree across the threads, and thread 0 finally merges the element at
+    // offset 0 into accuData. accuData is left untouched in all other threads, and the contents
+    // of block_buffer are overwritten along the way.
+    template <typename BufferType>
+    __device__ static void
+    Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
+    {
+        const index_t tid = get_thread_local_1d_id();
+
+        // Buffer offset of the element owned by thread `threadInd` inside block `blockInd`
+        const auto elemOffset = [](index_t blockInd, index_t threadInd) -> index_t {
+            if constexpr(blockIsOneRow)
+                return buffer2dDesc.CalculateOffset(make_tuple(blockInd, threadInd));
+            else
+                return buffer2dDesc.CalculateOffset(make_tuple(threadInd, blockInd));
+        };
+
+        // Stage 1: each thread folds the elements it owns
+        compType partial = opReduce::GetZeroVal();
+
+        for(index_t blk = 0; blk < toReduceBlocks; ++blk)
+        {
+            compType elem = type_convert<compType>{}(block_buffer[elemOffset(blk, tid)]);
+
+            binop::calculate(partial, elem);
+        }
+
+        block_buffer(elemOffset(0, tid)) = partial;
+
+        __syncthreads();
+
+        // Stage 2: halving tree over the per-thread partials
+        index_t span = BlockSize / 2;
+
+        while(span > 0)
+        {
+            if(tid < span)
+            {
+                const index_t dstOffset = elemOffset(0, tid);
+                const index_t srcOffset = elemOffset(0, tid + span);
+
+                compType dstVal = type_convert<compType>{}(block_buffer[dstOffset]);
+                compType srcVal = type_convert<compType>{}(block_buffer[srcOffset]);
+
+                binop::calculate(dstVal, srcVal);
+                block_buffer(dstOffset) = type_convert<compType>{}(dstVal);
+            }
+
+            __syncthreads();
+
+            span /= 2;
+        }
+
+        // Stage 3: thread 0 folds the block result into the caller's accumulator
+        if(tid == 0)
+        {
+            compType blockResult = type_convert<compType>{}(block_buffer[0]);
+
+            binop::calculate(accuData, blockResult);
+        }
+    };
+
+    // Block-wide reduction that carries an index along with every data value.
+    //
+    // block_buffer holds the values and block_indices_buffer the matching indices; both are
+    // overwritten while reducing. Thread 0 merges the block result into accuData / accuIndex;
+    // the accumulators of all other threads are left untouched.
+    //
+    // blockIsOneRow:  each of the toReduceBlocks blocks is tree-reduced across the threads,
+    //                 then thread 0 folds the per-block results found at thread slot 0.
+    // !blockIsOneRow: each thread folds its own toReduceBlocks elements first, then the
+    //                 per-thread partials are tree-reduced across the threads.
+    template <typename BufferType, typename IdxBufferType>
+    __device__ static void Reduce2(BufferType& block_buffer,
+                                   IdxBufferType& block_indices_buffer,
+                                   index_t toReduceBlocks,
+                                   compType& accuData,
+                                   int& accuIndex)
+    {
+        const index_t tid = get_thread_local_1d_id();
+
+        compType partialVal = opReduce::GetZeroVal();
+        int partialIdx      = 0;
+
+        // Combine the (value, index) pair stored at srcOffset into the pair stored at dstOffset
+        const auto mergeInto = [&](index_t dstOffset, index_t srcOffset) {
+            compType dstVal = type_convert<compType>{}(block_buffer[dstOffset]);
+            compType srcVal = type_convert<compType>{}(block_buffer[srcOffset]);
+            int dstIdx      = block_indices_buffer[dstOffset];
+            int srcIdx      = block_indices_buffer[srcOffset];
+
+            binop::calculate(dstVal, srcVal, dstIdx, srcIdx);
+
+            block_buffer(dstOffset)         = type_convert<compType>{}(dstVal);
+            block_indices_buffer(dstOffset) = dstIdx;
+        };
+
+        if constexpr(blockIsOneRow)
+        {
+            // Tree-reduce every block on its own; the result ends up at thread slot 0
+            for(index_t blk = 0; blk < toReduceBlocks; ++blk)
+            {
+                for(index_t span = 1; span < BlockSize; span *= 2)
+                {
+                    if(tid % (span * 2) == 0)
+                    {
+                        mergeInto(buffer2dDesc.CalculateOffset(make_tuple(blk, tid)),
+                                  buffer2dDesc.CalculateOffset(make_tuple(blk, tid + span)));
+                    }
+
+                    __syncthreads();
+                }
+            }
+
+            // Thread 0 folds the per-block results and merges them into the accumulators
+            if(tid == 0)
+            {
+                for(index_t blk = 0; blk < toReduceBlocks; ++blk)
+                {
+                    const index_t headOffset = buffer2dDesc.CalculateOffset(make_tuple(blk, 0));
+
+                    compType headVal = type_convert<compType>{}(block_buffer[headOffset]);
+                    int headIdx      = block_indices_buffer[headOffset];
+
+                    binop::calculate(partialVal, headVal, partialIdx, headIdx);
+                }
+
+                binop::calculate(accuData, partialVal, accuIndex, partialIdx);
+            }
+        }
+        else
+        {
+            // Each thread folds the elements it owns
+            for(index_t blk = 0; blk < toReduceBlocks; ++blk)
+            {
+                const index_t ownOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, blk));
+
+                compType ownVal = type_convert<compType>{}(block_buffer[ownOffset]);
+                int ownIdx      = block_indices_buffer[ownOffset];
+
+                binop::calculate(partialVal, ownVal, partialIdx, ownIdx);
+            }
+
+            // Publish the per-thread partial in the thread's first slot
+            const index_t slotOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, 0));
+
+            block_buffer(slotOffset)         = partialVal;
+            block_indices_buffer(slotOffset) = partialIdx;
+
+            __syncthreads();
+
+            // Tree-reduce the per-thread partials
+            for(index_t span = 1; span < BlockSize; span *= 2)
+            {
+                if(tid % (span * 2) == 0)
+                {
+                    mergeInto(buffer2dDesc.CalculateOffset(make_tuple(tid, 0)),
+                              buffer2dDesc.CalculateOffset(make_tuple(tid + span, 0)));
+                }
+
+                __syncthreads();
+            }
+
+            // Thread 0 merges the block result into the accumulators
+            if(tid == 0)
+            {
+                compType blockVal = type_convert<compType>{}(block_buffer[0]);
+                int blockIdx      = block_indices_buffer[0];
+
+                binop::calculate(accuData, blockVal, accuIndex, blockIdx);
+            }
+        }
+    };
+
+    // Set every element of the block buffer to `value`.
+    //
+    // Each thread writes the NumBlocks elements it owns (one per block). All threads of the
+    // block must call this function, since it ends with a block-wide barrier; on return the
+    // writes of all threads are visible to the whole block.
+    template <typename BufferType>
+    __device__ static void set_buffer_value(BufferType& block_buffer, compType value)
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+
+        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
+        {
+            const index_t offset =
+                blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
+                              : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
+
+            block_buffer(offset) = value;
+        }
+
+        // A thread only touches its own elements inside the loop, so a single barrier after the
+        // loop is enough; a barrier per iteration would only add synchronization cost.
+        __syncthreads();
+    };
+
+    // Initialize the block-wise indices buffer: the index stored for an element is its offset in
+    // the buffer plus the global starting index `indexStart`.
+    //
+    // Each thread writes the NumBlocks entries it owns (one per block). All threads of the
+    // block must call this function, since it ends with a block-wide barrier; on return the
+    // writes of all threads are visible to the whole block.
+    template <typename IdxBufferType>
+    __device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+
+        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
+        {
+            const index_t offset =
+                blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
+                              : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
+
+            block_indices_buffer(offset) = offset + indexStart;
+        }
+
+        // A thread only touches its own entries inside the loop, so a single barrier after the
+        // loop is enough; a barrier per iteration would only add synchronization cost.
+        __syncthreads();
+    };
+
+    // Execute a unary operation in place on the block buffer elements.
+    //
+    // Each thread transforms the NumBlocks elements it owns (one per block) by replacing an
+    // element x with unary_op(x). The caller has to make sure the buffer contents are visible
+    // to the block before calling. All threads of the block must call this function, since it
+    // ends with a block-wide barrier; on return the writes of all threads are visible to the
+    // whole block.
+    template <typename unary_op_type, typename BufferType>
+    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+
+        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
+        {
+            const index_t offset =
+                blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
+                              : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
+
+            block_buffer(offset) = unary_op(block_buffer[offset]);
+        }
+
+        // A thread only reads and writes its own elements inside the loop, so a single barrier
+        // after the loop is enough; a barrier per iteration would only add synchronization cost.
+        __syncthreads();
+    };
+};
+
+}; // end of namespace ck
+
+#endif
--- a/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp
+++ b/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
+#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
+
+#include "data_type.hpp"
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_binop.hpp"
+
+namespace ck {
+
+// Reduction helpers that operate on a single thread's StaticBuffer. The element type of the
+// buffer has to be the compute type of opReduce; binary operations go through
+// binop_with_nan_check, parameterized by nanPropaOpt.
+template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
+struct ThreadReduce
+{
+    using compType = typename opReduce::dataType;
+    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");
+
+    static_assert(
+        std::is_same<typename BufferType::type, compType>::value,
+        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
+
+    // Number of elements held by the per-thread buffer
+    static constexpr index_t ThreadBufferLen = BufferType::Size();
+
+    // Fold all elements of thread_buffer into accuData; indices are not tracked
+    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
+    {
+        const auto foldElement = [&](auto iElem) {
+            binop::calculate(accuData, thread_buffer[iElem]);
+        };
+
+        static_for<0, ThreadBufferLen, 1>{}(foldElement);
+    };
+
+    // Fold all elements of thread_buffer into accuData / accuIndex. Element i of the buffer is
+    // given the index i + indexStart. Called by the Direct_ThreadWise reduction method at
+    // first-time reduction.
+    __device__ static void
+    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
+    {
+        const auto foldElement = [&](auto iElem) {
+            int elemIndex = iElem + indexStart;
+
+            binop::calculate(accuData, thread_buffer[iElem], accuIndex, elemIndex);
+        };
+
+        static_for<0, ThreadBufferLen, 1>{}(foldElement);
+    };
+
+    // Assign `value` to every element of the per-thread buffer
+    // cppcheck-suppress constParameter
+    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
+    {
+        const auto assignElement = [&](auto iElem) { thread_buffer(iElem) = value; };
+
+        static_for<0, ThreadBufferLen, 1>{}(assignElement);
+    };
+
+    // Replace every element x of the per-thread buffer with unary_op(x)
+    template <typename unary_op_type>
+    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
+    {
+        const auto transformElement = [&](auto iElem) {
+            thread_buffer(iElem) = unary_op(thread_buffer[iElem]);
+        };
+
+        static_for<0, ThreadBufferLen, 1>{}(transformElement);
+    };
+};
+
+// Reduction helpers for a single thread whose input already carries indices: one StaticBuffer
+// holds the values (element type compType) and a second StaticBuffer of the same size holds the
+// matching indices (element type index_t).
+template <typename BufferType,
+          typename IdxBufferType,
+          typename opReduce,
+          NanPropagation_t nanPropaOpt>
+struct ThreadReduceWithIndicesInput
+{
+    using compType = typename opReduce::dataType;
+    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");
+    static_assert(IdxBufferType::IsStaticBuffer(),
+                  "Thread-wise reduction needs use StaticBuffer for indices!");
+
+    static_assert(
+        std::is_same<typename BufferType::type, compType>::value,
+        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
+    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
+                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");
+
+    static_assert(BufferType::Size() == IdxBufferType::Size(),
+                  "StaticBuffers for data and indices should have the same sizes!");
+
+    // Number of elements held by each of the two per-thread buffers
+    static constexpr index_t ThreadBufferLen = BufferType::Size();
+
+    // Fold all (value, index) pairs of the two buffers into accuData / accuIndex. Called by the
+    // Direct_ThreadWise reduction method at second-time reduction.
+    __device__ static void Reduce(const BufferType& thread_buffer,
+                                  const IdxBufferType& thread_indices_buffer,
+                                  compType& accuData,
+                                  int& accuIndex)
+    {
+        const auto foldPair = [&](auto iElem) {
+            binop::calculate(
+                accuData, thread_buffer[iElem], accuIndex, thread_indices_buffer[iElem]);
+        };
+
+        static_for<0, ThreadBufferLen, 1>{}(foldPair);
+    };
+
+    // Assign `value` to every element of the per-thread value buffer
+    // cppcheck-suppress constParameter
+    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
+    {
+        const auto assignElement = [&](auto iElem) { thread_buffer(iElem) = value; };
+
+        static_for<0, ThreadBufferLen, 1>{}(assignElement);
+    };
+
+    // Replace every element x of the per-thread value buffer with unary_op(x)
+    template <typename unary_op_type>
+    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
+    {
+        const auto transformElement = [&](auto iElem) {
+            thread_buffer(iElem) = unary_op(thread_buffer[iElem]);
+        };
+
+        static_for<0, ThreadBufferLen, 1>{}(transformElement);
+    };
+};
+
+}; // end of namespace ck
+
+#endif
--- a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp
+++ b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
+#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
+
+#include "data_type.hpp"
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_binop.hpp"
+
+namespace ck {
+
+template <typename BufferType, index_t BlockSize, typename opReduce, NanPropagation_t nanPropaOpt>
+struct WarpReduce
+{
+    using compType = typename opReduce::dataType;
+    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    static_assert(BufferType::IsStaticBuffer(),
+                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");
+    static_assert(std::is_same<typename BufferType::type, compType>::value,
+                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
+                  "the compType!");
+
+    static constexpr index_t ThreadBufferLen = BufferType::Size();
+    static constexpr bool have_builtin_shuffle =
+        std::is_same<compType, float>::value || std::is_same<compType, double>::value;
+
+    // Warp-wise reduction of data values only (indices are not tracked). Dispatches at compile
+    // time: ReduceImpl1 when compType is float or double (have_builtin_shuffle), ReduceImpl2
+    // for every other compType.
+    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
+    {
+        if constexpr(!have_builtin_shuffle)
+        {
+            ReduceImpl2(thread_buffer, accuData);
+        }
+        else
+        {
+            ReduceImpl1(thread_buffer, accuData);
+        }
+    };
+
+    // Warp-wise reduction of data values using the HIP built-in device shuffling functions.
+    //
+    // Each thread folds its own buffer into lAccuData, then the per-thread results are combined
+    // with __shfl_down using strides warpSize/2, warpSize/4, ..., 1. Finally every thread folds
+    // its lAccuData into accuData.
+    // NOTE(review): unlike ReduceImpl2 and Reduce2Impl1, the final fold is not restricted to
+    // thread_inwarp_id == 0, so lanes other than lane 0 accumulate their partial shuffle result.
+    // Presumably callers only consume accuData of the first lane of a warp - confirm.
+    __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
+    {
+        compType lAccuData = opReduce::GetZeroVal();
+
+        // per-thread reduction of the thread's own buffer
+        static_for<0, ThreadBufferLen, 1>{}(
+            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
+
+        // synchronize among all threads in this warp
+        // NOTE(review): __all() is a warp vote function; confirm it is intended as a
+        // synchronization point here
+        __all(1);
+
+        // each step pulls the value held by the lane `stride` positions further down
+        for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
+        {
+            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
+            binop::calculate(lAccuData, tmpVal);
+            __all(1);
+        }
+
+        binop::calculate(accuData, lAccuData);
+    };
+
+    // Warp-wise reduction of data values that does not rely on the HIP built-in device shuffling
+    // functions (HIP does not provide them for fp16).
+    //
+    // Each thread folds its own buffer into a private partial and stores it in a __shared__
+    // staging array of BlockSize elements; every warp uses its own warpSize-wide section of
+    // that array. The section is reduced by a halving tree, and the first lane of each warp
+    // merges the result into accuData. All threads of the block must call this function, since
+    // it uses block-wide barriers.
+    __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
+    {
+        __shared__ compType shuffle_buffer[BlockSize];
+
+        const index_t tid  = get_thread_local_1d_id();
+        const index_t lane = tid % warpSize;
+
+        // Section of the staging array that belongs to this thread's warp
+        compType* warpSlots = &shuffle_buffer[(tid / warpSize) * warpSize];
+
+        // Per-thread reduction of the thread's own buffer
+        compType partial = opReduce::GetZeroVal();
+
+        const auto foldElement = [&](auto iElem) {
+            binop::calculate(partial, thread_buffer[iElem]);
+        };
+        static_for<0, ThreadBufferLen, 1>{}(foldElement);
+
+        __syncthreads();
+
+        warpSlots[lane] = partial;
+
+        __syncthreads();
+
+        // Halving tree inside the warp's section
+        index_t stride = warpSize / 2;
+
+        while(stride > 0)
+        {
+            if(lane < stride)
+            {
+                compType lowerVal = warpSlots[lane];
+                compType upperVal = warpSlots[lane + stride];
+
+                binop::calculate(lowerVal, upperVal);
+
+                warpSlots[lane] = lowerVal;
+            }
+
+            __syncthreads();
+
+            stride /= 2;
+        }
+
+        // The first lane of the warp merges the warp result into the accumulator
+        if(lane == 0)
+        {
+            binop::calculate(accuData, warpSlots[0]);
+        }
+    };
+
+    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
+    // reduction method at first-time reduction
+    __device__ static void
+    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
+    {
+        // Compile-time dispatch on have_builtin_shuffle: Reduce2Impl1 is the shuffle-based
+        // variant, Reduce2Impl2 is the shared-memory variant.
+        if constexpr(!have_builtin_shuffle)
+        {
+            Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
+        }
+        else
+        {
+            Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
+        }
+    };
+
+    // This interface implementation uses HIP built-in device shuffling functions
+    __device__ static void Reduce2Impl1(const BufferType& thread_buffer,
+                                        compType& accuData,
+                                        int& accuIndex,
+                                        int indexStart)
+    {
+        const index_t lane = get_thread_local_1d_id() % warpSize;
+
+        compType bestVal = opReduce::GetZeroVal();
+        int bestIdx      = 0;
+
+        // Per-thread pass: element iElem of this lane gets the index
+        // lane * ThreadBufferLen + iElem + indexStart.
+        static_for<0, ThreadBufferLen, 1>{}([&](auto iElem) {
+            int elemIndex = lane * ThreadBufferLen + iElem + indexStart;
+            binop::calculate(bestVal, thread_buffer[iElem], bestIdx, elemIndex);
+        });
+
+        // synchronize among all threads in this warp
+        __all(1);
+
+        // Cross-lane pass: the lane distance doubles every round (1, 2, 4, ...), and the value
+        // and its index are shuffled together.
+        index_t stride = 1;
+
+        while(stride < warpSize)
+        {
+            compType otherVal = __shfl_down(bestVal, stride, warpSize);
+            int otherIdx      = __shfl_down(bestIdx, stride, warpSize);
+
+            binop::calculate(bestVal, otherVal, bestIdx, otherIdx);
+            __all(1);
+
+            stride *= 2;
+        }
+
+        // Only lane 0 merges the warp result into the caller's accumulators
+        if(lane == 0)
+        {
+            binop::calculate(accuData, bestVal, accuIndex, bestIdx);
+        }
+    };
+
+    // This interface implementation does not use HIP built-in device shuffling functions, since
+    // HIP does not provide built-in shuffling functions for fp16
+    __device__ static void Reduce2Impl2(const BufferType& thread_buffer,
+                                        compType& accuData,
+                                        int& accuIndex,
+                                        int indexStart)
+    {
+        compType lAccuData       = opReduce::GetZeroVal();
+        int lAccuIndex           = 0;
+        index_t thread_id        = get_thread_local_1d_id();
+        index_t warpId           = thread_id / warpSize;
+        index_t thread_inwarp_id = thread_id % warpSize;
+
+        // Per-thread pass: element I of this lane gets the index
+        // thread_inwarp_id * ThreadBufferLen + I + indexStart.
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
+            int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
+            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
+        });
+
+        // One slot per thread of the block; each warp works on its own warpSize-wide slice
+        __shared__ compType shuffle_data_buffer[BlockSize];
+        __shared__ int shuffle_indices_buffer[BlockSize];
+
+        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
+        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];
+
+        myDataBuffer[thread_inwarp_id]    = lAccuData;
+        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
+
+        __syncthreads();
+
+        for(index_t stride = 1; stride < warpSize; stride *= 2)
+        {
+            // A lane only has a partner while the partner slot is still inside this warp's
+            // slice. Without this check the upper lanes read the next warp's slots, and the
+            // last warp reads past the end of the BlockSize-element shared arrays. Lane 0 only
+            // ever depends on lanes whose partner is in range, so its result is unaffected.
+            const bool hasPartner = (thread_inwarp_id + stride) < warpSize;
+
+            compType partnerVal = lAccuData;
+            int partnerIndex    = lAccuIndex;
+
+            if(hasPartner)
+            {
+                partnerVal   = myDataBuffer[thread_inwarp_id + stride];
+                partnerIndex = myIndicesBuffer[thread_inwarp_id + stride];
+            }
+
+            // All reads of this round must finish before any slot is overwritten: the slot read
+            // by lane t is the one written by lane t + stride.
+            __syncthreads();
+
+            if(hasPartner)
+            {
+                // lAccuData/lAccuIndex mirror this lane's own slot, which no other lane writes
+                binop::calculate(lAccuData, partnerVal, lAccuIndex, partnerIndex);
+
+                myDataBuffer[thread_inwarp_id]    = lAccuData;
+                myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
+            }
+
+            __syncthreads();
+        }
+
+        // Slot 0 of the slice holds the warp's result; only lane 0 updates its accumulators
+        if(thread_inwarp_id == 0)
+            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
+    };
+
+    // cppcheck-suppress constParameter
+    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
+    {
+        // Overwrite every one of the ThreadBufferLen elements of the per-thread buffer
+        static_for<0, ThreadBufferLen, 1>{}([&](auto iElem) {
+            thread_buffer(iElem) = value;
+        });
+
+        __all(1);
+    };
+
+    // Execute unary operation on the per-thread buffer elements
+    template <typename unary_op_type>
+    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
+    {
+        // Replace each element in place with unary_op applied to it
+        static_for<0, ThreadBufferLen, 1>{}([&](auto iElem) {
+            thread_buffer(iElem) = unary_op(thread_buffer[iElem]);
+        });
+
+        __all(1);
+    };
+};
+
+// Warp-wise reduction over per-thread static buffers where every data element already carries
+// an index (held in a second static buffer of the same size). The reduction tracks both the
+// reduced value and the index that goes with it.
+template <typename BufferType,
+          typename IdxBufferType,
+          index_t BlockSize,
+          typename opReduce,
+          NanPropagation_t nanPropaOpt>
+struct WarpReduceWithIndicesInput
+{
+    using compType = typename opReduce::dataType;
+    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    static_assert(BufferType::IsStaticBuffer(),
+                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");
+    static_assert(IdxBufferType::IsStaticBuffer(),
+                  "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!");
+
+    static_assert(std::is_same<typename BufferType::type, compType>::value,
+                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
+                  "the compType!");
+    static_assert(
+        std::is_same<typename IdxBufferType::type, index_t>::value,
+        "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!");
+
+    static_assert(BufferType::Size() == IdxBufferType::Size(),
+                  "StaticBuffers for data and indices should have the same sizes!");
+
+    static constexpr index_t ThreadBufferLen = BufferType::Size();
+
+    // The shuffle-based implementation is only selected for float and double
+    static constexpr bool have_builtin_shuffle =
+        std::is_same<compType, float>::value || std::is_same<compType, double>::value;
+
+    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
+    // reduction method at second-time reduction
+    __device__ static void Reduce(const BufferType& thread_buffer,
+                                  const IdxBufferType& thread_indices_buffer,
+                                  compType& accuData,
+                                  int& accuIndex)
+    {
+        if constexpr(have_builtin_shuffle)
+            ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
+        else
+            ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
+    };
+
+    // This interface implementation uses HIP built-in device shuffling functions
+    __device__ static void ReduceImpl1(const BufferType& thread_buffer,
+                                       const IdxBufferType& thread_indices_buffer,
+                                       compType& accuData,
+                                       int& accuIndex)
+    {
+        compType lAccuData = opReduce::GetZeroVal();
+        int lAccuIndex     = 0;
+
+        // per-thread pass over the private data/index buffers
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
+            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
+        });
+
+        // synchronize among all threads in this warp
+        __all(1);
+
+        // cross-lane pass: value and index are shuffled together, lane distance doubles
+        for(index_t stride = 1; stride < warpSize; stride *= 2)
+        {
+            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
+            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);
+
+            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
+            __all(1);
+        }
+
+        // every calling thread merges its own combined value (there is no lane filter here)
+        binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
+    };
+
+    // This interface implementation does not use HIP built-in device shuffling functions,
+    // since HIP does not provide built-in shuffling functions for fp16
+    __device__ static void ReduceImpl2(const BufferType& thread_buffer,
+                                       const IdxBufferType& thread_indices_buffer,
+                                       compType& accuData,
+                                       int& accuIndex)
+    {
+        compType lAccuData       = opReduce::GetZeroVal();
+        int lAccuIndex           = 0;
+        index_t thread_id        = get_thread_local_1d_id();
+        index_t warpId           = thread_id / warpSize;
+        index_t thread_inwarp_id = thread_id % warpSize;
+
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
+            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
+        });
+
+        // One slot per thread of the block; each warp works on its own warpSize-wide slice
+        __shared__ compType shuffle_data_buffer[BlockSize];
+        __shared__ int shuffle_indices_buffer[BlockSize];
+
+        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
+        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];
+
+        myDataBuffer[thread_inwarp_id]    = lAccuData;
+        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
+
+        __syncthreads();
+
+        for(index_t stride = 1; stride < warpSize; stride *= 2)
+        {
+            // A lane only has a partner while the partner slot is still inside this warp's
+            // slice. Without this check the upper lanes read the next warp's slots, and the
+            // last warp reads past the end of the BlockSize-element shared arrays. Lane 0 only
+            // ever depends on lanes whose partner is in range, so its result is unaffected.
+            const bool hasPartner = (thread_inwarp_id + stride) < warpSize;
+
+            compType partnerVal = lAccuData;
+            int partnerIndex    = lAccuIndex;
+
+            if(hasPartner)
+            {
+                partnerVal   = myDataBuffer[thread_inwarp_id + stride];
+                partnerIndex = myIndicesBuffer[thread_inwarp_id + stride];
+            }
+
+            // All reads of this round must finish before any slot is overwritten: the slot read
+            // by lane t is the one written by lane t + stride.
+            __syncthreads();
+
+            if(hasPartner)
+            {
+                // lAccuData/lAccuIndex mirror this lane's own slot, which no other lane writes
+                binop::calculate(lAccuData, partnerVal, lAccuIndex, partnerIndex);
+
+                myDataBuffer[thread_inwarp_id]    = lAccuData;
+                myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
+            }
+
+            __syncthreads();
+        }
+
+        // Slot 0 of the slice holds the warp's result; only lane 0 updates its accumulators
+        if(thread_inwarp_id == 0)
+            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
+    };
+
+    // Set every element of the per-thread buffer to the given value
+    // cppcheck-suppress constParameter
+    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
+    {
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
+
+        __all(1);
+    };
+
+    // Execute unary operation on the per-thread buffer elements
+    template <typename unary_op_type>
+    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
+    {
+        static_for<0, ThreadBufferLen, 1>{}(
+            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
+
+        __all(1);
+    };
+};
+
+}; // end of namespace ck
+
+#endif
--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
@@ -713,9 +713,6 @@ struct ThreadwiseTensorSliceTransfer_v3
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
    {
-        // TODO: fix this
-        static_assert(is_same<SrcData, DstData>::value,
-                      "wrong! current implementation assume SrcData and DstData are same type");
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -985,7 +982,8 @@ struct ThreadwiseTensorSliceTransfer_v3
                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

-                dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
+                dst_tmp_vector.template AsType<DstData>()(i) =
+                    type_convert<DstData>{}(buffer_[Number<buffer_offset>{}]);
            });

            using dst_vector_t = typename decltype(dst_tmp_vector)::type;

--- a/composable_kernel/include/utility/dynamic_buffer.hpp
+++ b/composable_kernel/include/utility/dynamic_buffer.hpp
@@ -38,6 +38,10 @@ struct DynamicBuffer
        return BufferAddressSpace;
    }

+    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
+
+    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
+
    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,

--- a/composable_kernel/include/utility/reduction_common.hpp
+++ b/composable_kernel/include/utility/reduction_common.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_COMMON_HPP
+#define CK_REDUCTION_COMMON_HPP
+
+// this enumerate should be synchronized with include/miopen/reduce_common.hpp
+namespace ck {
+// Identifiers of the available reduction methods.
+// NOTE(review): the names suggest the granularity at which the reduction is carried out
+// (thread / warp / block / multiple blocks) -- confirm against the kernel selection code.
+enum class ReductionMethod_t
+{
+    DirectThreadWise = 1,
+    DirectWarpWise   = 2,
+    BlockWise        = 3,
+    MultiBlock       = 4
+}; // end of enum ReductionMethod_t
+
+// Identifiers of the supported reduction operations. The value 8 is intentionally left unused
+// (see the commented-out MUL_NO_ZEROS entry).
+enum class ReduceTensorOp_t
+{
+    ADD   = 0,
+    MUL   = 1,
+    MIN   = 2,
+    MAX   = 3,
+    AMAX  = 4,
+    AVG   = 5,
+    NORM1 = 6,
+    NORM2 = 7,
+    // MUL_NO_ZEROS = 8,
+};
+
+// Selects how NaN inputs are treated during accumulation.
+// NOTE(review): presumably PROPAGATE_NAN makes a NaN input overwrite the accumulator, while
+// NOT_PROPAGATE_NAN applies the operator without a NaN check -- confirm against the users of
+// this enum.
+enum class NanPropagation_t
+{
+    NOT_PROPAGATE_NAN = 0,
+    PROPAGATE_NAN     = 1,
+};
+
+// Selects whether indices are produced along with the reduced values.
+// NOTE(review): presumably FLATTENED_INDICES means indices into the flattened reduced
+// dimensions -- confirm against the callers.
+enum class ReduceTensorIndices_t
+{
+    NO_INDICES        = 0,
+    FLATTENED_INDICES = 1,
+};
+
+// Identifiers for the width of index values. Note that the numeric values are not ordered by
+// width (32-bit is 0, 64-bit is 1, 16-bit is 2, 8-bit is 3).
+enum class IndicesType_t
+{
+    INDICES_32BIT = 0,
+    INDICES_64BIT = 1,
+    INDICES_16BIT = 2,
+    INDICES_8BIT  = 3,
+};
+
+// Predicate that tests whether a value equals 1. The test is written as a pair of ordered
+// comparisons (<= and >=) against 1.0f converted to T, instead of a single ==.
+struct float_equal_one
+{
+    template <class T>
+    __device__ static inline bool apply(T x)
+    {
+        const T one = type_convert<T>{}(1.0f);
+
+        return (x <= one) and (x >= one);
+    }
+
+    // Functor form; forwards to apply()
+    template <class T>
+    __device__ inline bool operator()(T x)
+    {
+        return apply(x);
+    };
+};
+
+// Predicate that tests whether a value equals 0. The test is written as a pair of ordered
+// comparisons (<= and >=) against 0.0f converted to T, instead of a single ==.
+struct float_equal_zero
+{
+    template <class T>
+    __device__ static inline bool apply(T x)
+    {
+        const T zero = type_convert<T>{}(0.0f);
+
+        return (x <= zero) and (x >= zero);
+    }
+
+    // Functor form; forwards to apply()
+    template <class T>
+    __device__ inline bool operator()(T x)
+    {
+        return apply(x);
+    };
+};
+
+}; // end of namespace ck
+
+#endif
--- a/composable_kernel/include/utility/reduction_functions_binop.hpp
+++ b/composable_kernel/include/utility/reduction_functions_binop.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
+#define CK_REDUCTION_FUNCTIONS_BINOP_HPP
+
+#include "data_type.hpp"
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+
+namespace ck {
+namespace detail {
+
+// NaN test for half_t values; forwards to the __hisnan intrinsic
+static inline __device__ bool isnan(half_t x)
+{
+    const bool is_nan = __hisnan(x);
+
+    return is_nan;
+}
+
+// Primary template, declared only. The specializations provide calculate() overloads that
+// accumulate a value (and optionally its index) with opReduce.
+template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
+struct binop_with_nan_check;
+
+// Accumulation without any NaN inspection: the operator is applied to every input as-is.
+template <typename opReduce, typename compType>
+struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
+{
+    // Value-only form: accumulate currVal into accuVal with opReduce
+    // cppcheck-suppress constParameter
+    __device__ static inline void calculate(compType& accuVal, compType currVal)
+    {
+        opReduce{}(accuVal, currVal);
+    };
+
+    // The method is called when the opReduce is indexable and the user asked for indices.
+    // accuIndex follows accuVal: it is replaced by currIndex only when opReduce reports that
+    // the accumulated value changed.
+    __device__ static inline void
+    // cppcheck-suppress constParameter
+    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+    {
+        bool accuUpdated = false;
+
+        opReduce{}(accuVal, currVal, accuUpdated);
+
+        if(!accuUpdated)
+            return;
+
+        accuIndex = currIndex;
+    };
+};
+
+// Accumulation that lets NaN inputs through: a NaN input overwrites the accumulator instead of
+// being passed to the operator.
+template <typename opReduce, typename compType>
+struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
+{
+    // Value-only form
+    __device__ static inline void calculate(compType& accuVal, compType currVal)
+    {
+        if(!isnan(currVal))
+        {
+            opReduce{}(accuVal, currVal);
+            return;
+        }
+
+        accuVal = currVal;
+    };
+
+    // The method is called when the opReduce is indexable and the user asked for indices.
+    // A NaN input replaces both the accumulated value and the accumulated index.
+    __device__ static inline void
+    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+    {
+        if(!isnan(currVal))
+        {
+            bool accuUpdated = false;
+
+            opReduce{}(accuVal, currVal, accuUpdated);
+
+            if(accuUpdated)
+                accuIndex = currIndex;
+
+            return;
+        }
+
+        accuVal   = currVal;
+        accuIndex = currIndex;
+    };
+};
+
+}; // namespace detail
+}; // end of namespace ck
+
+#endif
--- a/composable_kernel/include/utility/reduction_operator.hpp
+++ b/composable_kernel/include/utility/reduction_operator.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_OPERATOR_HPP
+#define CK_REDUCTION_OPERATOR_HPP
+
+#include "reduction_common.hpp"
+
+namespace ck {
+
+namespace reduce {
+
+// Every binary operator used in reduction is represented by a templated functor class. Each
+// functor class must provide at least three members:
+// 1) GetZeroVal() -- the interface to return the "identity element" for the binary operator.
+//                    The "identity element" is the unique element in the algebraic space that
+//                    doesn't affect the value of other elements when operated with any of them.
+// 2) indexable -- boolean value indicating whether indices of the operated elements could be
+//                 recorded. Usually, the Min/Max operators may need to record the indices of
+//                 elements. For operators like Add/Mul, there is no need to record the indices.
+// 3) operator() -- the first argument of the operator must be both an input & output, and the
+//                  corresponding variable usually stores the accumulated result of many
+//                  operator() calls; the second argument is only an input. For an indexable
+//                  binary operator, the second version of operator() has a third argument (which
+//                  is an output) to indicate whether the accumulated value (the first argument)
+//                  has changed, in which case the recorded accumulated index also needs to be
+//                  changed.
+
+// Addition functor
+template <class T>
+struct Add
+{
+    using dataType = T;
+
+    // no index is recorded for this operator
+    static constexpr bool indexable = false;
+
+    // identity element: 0
+    __device__ static T GetZeroVal()
+    {
+        return type_convert<T>{}(0.0f);
+    };
+
+    // a <- a + b
+    __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        a = a + b;
+    }
+};
+
+// Multiplication functor
+template <class T>
+struct Mul
+{
+    using dataType = T;
+
+    // no index is recorded for this operator
+    static constexpr bool indexable = false;
+
+    // identity element: 1
+    __device__ static T GetZeroVal()
+    {
+        return type_convert<T>{}(1.0f);
+    };
+
+    // a <- a * b
+    __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        a = a * b;
+    }
+};
+
+// Maximum functor; supports index tracking through the three-argument operator()
+template <class T>
+struct Max
+{
+    using dataType = T;
+
+    static constexpr bool indexable = true;
+
+    // NOTE(review): for floating-point T, std::numeric_limits<T>::min() is the smallest
+    // positive normalized value, not the most negative one (that is lowest()). As a starting
+    // value for a maximum this looks wrong for inputs that are all negative -- confirm. The
+    // AMAX operation also uses this functor, so any change here must be checked against
+    // that path as well.
+    __device__ static T GetZeroVal()
+    {
+        return std::numeric_limits<T>::min();
+    };
+
+    // a <- b when a < b; otherwise a is left untouched
+    __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        if(!(a < b))
+            return;
+
+        a = b;
+    }
+
+    // Same as above, additionally setting `changed` to true when a was replaced. `changed` is
+    // never reset to false here.
+    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    {
+        if(!(a < b))
+            return;
+
+        a       = b;
+        changed = true;
+    }
+};
+
+// Minimum functor; supports index tracking through the three-argument operator()
+template <class T>
+struct Min
+{
+    using dataType = T;
+
+    static constexpr bool indexable = true;
+
+    // starting value: the largest finite value of T
+    __device__ static T GetZeroVal()
+    {
+        return std::numeric_limits<T>::max();
+    };
+
+    // a <- b when a > b; otherwise a is left untouched
+    __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        if(!(a > b))
+            return;
+
+        a = b;
+    }
+
+    // Same as above, additionally setting `changed` to true when a was replaced. `changed` is
+    // never reset to false here.
+    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    {
+        if(!(a > b))
+            return;
+
+        a       = b;
+        changed = true;
+    }
+};
+
+// Explicit specialization of Max<half_t>::GetZeroVal(): the float limit is converted to half_t
+// through type_convert. Declared inline because an explicit specialization of a function is an
+// ordinary (non-template) definition; without inline, including this header from more than one
+// translation unit that is linked together would define the symbol twice.
+// NOTE(review): std::numeric_limits<float>::min() is the smallest positive normalized float,
+// not the most negative value -- confirm this is the intended starting value for a maximum.
+template <>
+__device__ inline half_t Max<half_t>::GetZeroVal()
+{
+    return type_convert<half_t>{}(std::numeric_limits<float>::min());
+};
+
+// Explicit specialization of Min<half_t>::GetZeroVal(): the largest finite float is converted
+// to half_t through type_convert. Declared inline because an explicit specialization of a
+// function is an ordinary (non-template) definition; without inline, including this header
+// from more than one translation unit that is linked together would define the symbol twice.
+template <>
+__device__ inline half_t Min<half_t>::GetZeroVal()
+{
+    return type_convert<half_t>{}(std::numeric_limits<float>::max());
+};
+
+// Unary operators are usually called element-wisely before the reduction is executed on the
+// elements.
+// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
+// Identity with scaling: returns its argument multiplied by 1/divider. The <T, false>
+// specialization is the variant without scaling.
+template <class T, bool hasDividing>
+struct unary_identic
+{
+    __device__ unary_identic(const int divider = 1)
+        : scaler(1.0f / static_cast<float>(divider))
+    {
+    }
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        return a * type_convert<T>{}(scaler);
+    };
+
+    // reciprocal of the divider passed to the constructor
+    float scaler = 1.0f;
+};
+
+// Pure identity: the divider is accepted for interface compatibility and ignored
+template <class T>
+struct unary_identic<T, false>
+{
+    __device__ unary_identic(const int /*divider*/ = 1) {}
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        return a;
+    };
+};
+
+// Square with scaling: returns (a * a) multiplied by 1/divider. The <T, false> specialization
+// is the variant without scaling.
+template <class T, bool hasDividing>
+struct unary_square
+{
+    __device__ unary_square(const int divider = 1)
+        : scaler(1.0f / static_cast<float>(divider))
+    {
+    }
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        const T squared = a * a;
+
+        return squared * type_convert<T>{}(scaler);
+    };
+
+    // reciprocal of the divider passed to the constructor
+    float scaler = 1.0f;
+};
+
+// Plain square: the divider is accepted for interface compatibility and ignored
+template <class T>
+struct unary_square<T, false>
+{
+    __device__ unary_square(const int /*divider*/ = 1) {}
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        return a * a;
+    };
+};
+
+// Absolute value with scaling: returns abs(a) multiplied by 1/divider. The <T, false>
+// specialization is the variant without scaling.
+template <class T, bool hasDividing>
+struct unary_abs
+{
+    __device__ unary_abs(const int divider = 1)
+        : scaler(1.0f / static_cast<float>(divider))
+    {
+    }
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        const T magnitude = abs(a);
+
+        return magnitude * type_convert<T>{}(scaler);
+    };
+
+    // reciprocal of the divider passed to the constructor
+    float scaler = 1.0f;
+};
+
+// Plain absolute value: the divider is accepted for interface compatibility and ignored
+template <class T>
+struct unary_abs<T, false>
+{
+    __device__ unary_abs(const int /*divider*/ = 1) {}
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        return abs(a);
+    };
+};
+
+// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
+// Let's assume that __habs() exists since 3.5.
+#if HIP_PACKAGE_VERSION_FLAT < 3005000000
+// Fallback __habs() for HIP versions below the threshold above: reinterprets the __half as a
+// 16-bit unsigned integer through a union and masks with 0x7fff, which clears the top bit.
+inline __device__ __half __habs(__half x)
+{
+    union
+    {
+        __half half;
+        unsigned short u16;
+    } val;
+    val.half = x;
+    // keep the low 15 bits, drop bit 15
+    val.u16  = val.u16 & 0x7fff;
+    return val.half;
+}
+#endif
+
+// half_t absolute value with scaling: uses __habs() and multiplies the result by 1/divider
+template <bool hasDividing>
+struct unary_abs<half_t, hasDividing>
+{
+    __device__ unary_abs(const int divider = 1)
+        : scaler(1.0f / static_cast<float>(divider))
+    {
+    }
+
+    __device__ inline half_t operator()(half_t a) const
+    {
+        const half_t magnitude = static_cast<half_t>(__habs(a));
+
+        return magnitude * type_convert<half_t>{}(scaler);
+    };
+
+    // reciprocal of the divider passed to the constructor
+    float scaler = 1.0f;
+};
+
+// half_t absolute value without scaling: the divider is accepted and ignored
+template <>
+struct unary_abs<half_t, false>
+{
+    __device__ unary_abs(const int /*divider*/ = 1) {}
+
+    __device__ inline half_t operator()(half_t a) const
+    {
+        return static_cast<half_t>(__habs(a));
+    };
+};
+
+// Square root: the divider is accepted for interface compatibility and ignored.
+// NOTE(review): sqrtf() is the single-precision routine and is used for every T, so a double
+// argument would be computed in float precision -- confirm whether T = double is expected.
+template <class T>
+struct unary_sqrt
+{
+    __device__ unary_sqrt(const int /*divider*/ = 1) {}
+
+    __device__ inline T operator()(T a) const
+    {
+        const T root = sqrtf(a);
+
+        return root;
+    };
+};
+
+// half_t square root through hsqrt(); the divider is accepted and ignored
+template <>
+struct unary_sqrt<half_t>
+{
+    __device__ unary_sqrt(const int /*divider*/ = 1) {}
+
+    __device__ inline half_t operator()(half_t a) const
+    {
+        return static_cast<half_t>(hsqrt(a));
+    };
+};
+
+}; // end of namespace reduce
+
+// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
+// respective functor classes.
+// The "GetZeroVal()" interface and boolean member "indexable" are also provided in
+// reduce_binary_operator for easier checking by the upper-layer codes in the kernels.
+
+// Primary template, declared only; one specialization exists per ReduceTensorOp_t value
+template <typename T, ReduceTensorOp_t op>
+struct reduce_binary_operator;
+
+// ADD is reduced with reduce::Add
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
+{
+    using dataType = T;
+    using opType   = reduce::Add<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// MUL is reduced with reduce::Mul
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
+{
+    using dataType = T;
+    using opType   = reduce::Mul<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// MIN is reduced with reduce::Min
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
+{
+    using dataType = T;
+    using opType   = reduce::Min<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// MAX is reduced with reduce::Max
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
+{
+    using dataType = T;
+    using opType   = reduce::Max<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// AMAX is reduced with reduce::Max, the same functor as MAX
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
+{
+    using dataType = T;
+    using opType   = reduce::Max<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// AVG is reduced with reduce::Add
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
+{
+    using dataType = T;
+    using opType   = reduce::Add<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// NORM1 is reduced with reduce::Add
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
+{
+    using dataType = T;
+    using opType   = reduce::Add<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// NORM2 is reduced with reduce::Add
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
+{
+    using dataType = T;
+    using opType   = reduce::Add<T>;
+
+    // forwarded from the functor
+    static constexpr bool indexable = opType::indexable;
+
+    __device__ static T GetZeroVal()
+    {
+        return opType::GetZeroVal();
+    };
+};
+
+// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
+// functor classes.
+// The two unary functors are called before and after the Reduction is executed respectively
+// Default mapping: both the pre- and the post-reduction functor are the plain identity
+template <typename T, ReduceTensorOp_t op, bool isFirstReduce, bool isLastReduce>
+struct reduce_unary_operator
+{
+    using posUnaryOp = reduce::unary_identic<T, false>;
+    using preUnaryOp = reduce::unary_identic<T, false>;
+};
+
+// AVG with the last template argument true: no pre-op; the post-op is the identity with
+// scaling enabled (unary_identic<T, true>)
+template <typename T, bool isFirstReduce>
+struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
+{
+    using preUnaryOp = reduce::unary_identic<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, true>;
+};
+
+// NORM1 with the third template argument true: absolute value is applied before the
+// reduction; the post-op is the plain identity
+template <typename T, bool isLastReduce>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
+{
+    using preUnaryOp = reduce::unary_abs<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, false>;
+};
+
+// AMAX with the third template argument true: absolute value is applied before the
+// reduction; the post-op is the plain identity
+template <typename T, bool isLastReduce>
+struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
+{
+    using preUnaryOp = reduce::unary_abs<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, false>;
+};
+
+// NORM2 with template arguments <true, false>: elements are squared before the reduction;
+// the post-op is the plain identity
+template <typename T>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
+{
+    using preUnaryOp = reduce::unary_square<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, false>;
+};
+
+// NORM2 with template arguments <true, true>: elements are squared before the reduction and
+// the square root is taken afterwards
+template <typename T>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
+{
+    using preUnaryOp = reduce::unary_square<T, false>;
+    using posUnaryOp = reduce::unary_sqrt<T>;
+};
+
+// NORM2 with template arguments <false, true>: no pre-op; the square root is taken after the
+// reduction
+template <typename T>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
+{
+    using preUnaryOp = reduce::unary_identic<T, false>;
+    using posUnaryOp = reduce::unary_sqrt<T>;
+};
+
+} // end of namespace ck
+
+#endif
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_blockwise.hpp"
+
+using namespace ck;
+
+// Compile-time configuration of this translation unit. Every value below is taken from a
+// CK_PARAM_* macro (presumably supplied on the compiler command line -- confirm with the
+// host-side build code).
+
+// Element types of the source tensor, the destination tensor and the accumulation.
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+// Number of dimensions of the source and destination tensors.
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+// Indices of the source dimensions that are reduced.
+using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+
+constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+// Any non-zero CK_PARAM_NAN_PROPAGATE selects PROPAGATE_NAN.
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+// Any non-zero CK_PARAM_REDUCE_INDICES selects FLATTENED_INDICES.
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ? ReduceTensorIndices_t::NO_INDICES
+                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
+
+// Select between the padded and unpadded descriptor types used by the kernels below.
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+// There are no invariant dimensions in this file, so the merged list is just toReduceDims.
+using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
+
+// toReduceDims must name every source dimension.
+// NOTE(review): is_valid_sequence_map presumably checks for a permutation of 0..N-1 -- confirm.
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// All source dimensions are reduced here, so the destination must have exactly one dimension.
+static_assert(dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+
+// Indices are produced only when the operation is indexable and indices were requested.
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
+
+// Variadic helpers that build tuples from runtime arrays or compile-time sequences.
+//
+// Returns a tuple holding lengths[Ns]... converted to index_t, in the order given by the
+// sequence argument. The caller must make sure every index in Ns is valid for `lengths`.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
+{
+    return make_tuple(static_cast<index_t>(lengths[Ns])...);
+}
+
+// Returns a tuple with the first `arraySize` entries of `lengths` (as index_t).
+// arraySize is restricted to 1..6 at compile time.
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
+{
+    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    // Sequence<0, 1, ..., arraySize - 1>
+    using LeadingIndices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, LeadingIndices{});
+}
+
+// Expands the values of a compile-time Sequence into a tuple of those values.
+template <index_t... Ns>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
+{
+    return make_tuple(Ns...);
+}
+
+// Descriptor-preparation kernel for the case where every source dimension is reduced.
+//
+// Builds a 2-D view of the source tensor and a 1-D view of the destination tensor from the
+// runtime lengths/strides and stores both in the workspace: the source descriptor at byte 0
+// of ws_global, the destination descriptor at byte 2048.
+//
+// Only the first srcDims (resp. dstDims) length/stride arguments are read.
+// GridSize and BlkGroupSize are unused. Every thread builds the descriptors, but only
+// threads with hipThreadIdx_x == 0 store them.
+//
+// NOTE(review): nothing here checks that the source descriptor fits in the 2048 bytes before
+// the destination slot, or that ws_global is large enough -- confirm on the host side.
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)GridSize;
+    (void)BlkGroupSize;
+
+    // Workspace slots for the two descriptors.
+    void* p_src2dDesc = ws_global;
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    // Keep only as many entries as the tensors actually have dimensions.
+    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
+
+    // Merge all source dimensions into one ...
+    const auto one_dim_srcDesc = transform_tensor_descriptor(
+        srcDesc,
+        make_tuple(make_merge_transform(tupleSrcLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    // ... then split it into (1, total): invariant length 1, everything else is reduced.
+    auto src2dDesc = transform_tensor_descriptor(
+        one_dim_srcDesc,
+        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
+        make_tuple(Sequence<0>{}),
+        make_tuple(Sequence<0, 1>{}));
+
+    // Merge all destination dimensions into one.
+    auto dst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(tupleDstLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
+
+    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
+
+    // The type of the object stored below has to be the same type the reduce kernel
+    // reinterprets these bytes as (refType_src2dDesc / refType_src2dDesc_padded_34).
+    if constexpr(src2d_need_padding)
+    {
+        // Pad the reduce dimension on the right up to the next multiple of copySliceLen.
+        const auto srcPad =
+            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
+
+        auto src2dDesc_2 =
+            transform_tensor_descriptor(src2dDesc,
+                                        make_tuple(make_pass_through_transform(invariantLen),
+                                                   make_pad_transform(toReduceLen, 0, srcPad)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
+    }
+
+    // NOTE(review): the destination descriptor is always stored unpadded, while the reduce
+    // kernel reads it as the padded type when dst1d_need_padding is true -- confirm that the
+    // host never sets CK_PARAM_DST1D_PADDING for this kernel.
+    if(hipThreadIdx_x == 0)
+        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
+};
+
+// Compile-time "reference" descriptors whose only purpose is to provide the *types* of the
+// descriptors that gridwise_generic_reduce_1_prepare stores in the workspace, so that the
+// reduce kernel can reinterpret the stored bytes. The lengths, strides and pad amounts used
+// here are placeholders; each construction mirrors the corresponding one in the prepare
+// kernel step by step so that the resulting types are identical.
+//
+// Template parameters:
+//   srcDims      - number of dimensions of the source tensor
+//   dstDims      - number of dimensions of the destination tensor
+//   toReduceDims - Sequence of reduced dimensions (not needed by the body because all
+//                  dimensions are reduced; kept so the uses below stay as written)
+//
+// There are no invariant dimensions in this file, so the template takes exactly the three
+// arguments that every use site passes.
+template <index_t srcDims, index_t dstDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
+    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
+
+    // don't have to use accurate strides to get an expected reference type
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+
+    // All source dimensions merged into one ...
+    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    // ... then split into (1, total).
+    static constexpr auto ref_src2dDesc =
+        transform_tensor_descriptor(ref_one_dim_srcDesc,
+                                    make_tuple(make_unmerge_transform(
+                                        make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
+                                    make_tuple(Sequence<0>{}),
+                                    make_tuple(Sequence<0, 1>{}));
+
+    // All destination dimensions merged into one.
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // used by the BlockWise and MultiBlock method
+    // (the pad amount 2 is a placeholder; only the resulting type is used)
+    using refType_src2dDesc_padded_34 = decltype(
+        transform_tensor_descriptor(ref_src2dDesc,
+                                    make_tuple(make_pass_through_transform(ref_invariantLen),
+                                               make_pad_transform(ref_toReduceLen, 0, 2)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+};
+
+// Descriptor types for this translation unit's configuration. The accessor functions below
+// use them to reinterpret the descriptor bytes stored in the workspace.
+using refType_src2dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
+using refType_dst1dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
+using refType_src2dDesc_padded_34 =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
+using refType_dst1dDesc_padded =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
+
+// Returns a copy of the 2-D source descriptor stored at p_src2dDesc.
+//
+// need_padding selects the type the bytes are read as: refType_src2dDesc_padded_34 when
+// true, refType_src2dDesc otherwise. It must agree with the type that was stored there.
+//
+// need_padding is the only template parameter, so the function can be called as
+// get_reduction_src2d_descriptor<src2d_need_padding>(ptr), which is how the kernel in this
+// file calls it.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(need_padding)
+        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
+    else
+        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
+};
+
+// Returns a copy of the 1-D destination descriptor stored at p_dst1dDesc.
+//
+// need_padding selects the type the bytes are read as: refType_dst1dDesc_padded when true,
+// refType_dst1dDesc otherwise. It must agree with the type that was stored there.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(need_padding)
+    {
+        using StoredDesc = refType_dst1dDesc_padded;
+        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
+    }
+    else
+    {
+        using StoredDesc = refType_dst1dDesc;
+        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
+    }
+}
+
+// Blockwise reduction kernel for the case where every source dimension is reduced.
+//
+// Reads the source/destination descriptors from the workspace (source at byte 0,
+// destination at byte 2048) and forwards to GridwiseReduction_xy_to_x_blockwise::Run.
+//
+//   origReduceLen        - forwarded to Run
+//   alpha, beta          - forwarded to Run
+//   p_src_global         - source data, read as srcDataType
+//   p_dst_global         - destination data, written as dstDataType
+//   ws_global            - workspace holding the two descriptors
+//   indices_global       - output indices, passed as int*
+//   BlkGroupSize, ws_buf2_bytes_offset - unused here
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    (void)BlkGroupSize;
+    (void)ws_buf2_bytes_offset;
+
+    const char* const ws_bytes = static_cast<const char*>(ws_global);
+
+    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
+    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);
+
+    using Src2dDescType = decltype(src2dDesc);
+    using Dst1dDescType = decltype(dst1dDesc);
+
+    // NOTE(review): the two `true` arguments are presumably the first-call / last-call flags
+    // -- confirm against GridwiseReduction_xy_to_x_blockwise.
+    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
+                                                                   srcDataType,
+                                                                   dstDataType,
+                                                                   compType,
+                                                                   Src2dDescType,
+                                                                   Dst1dDescType,
+                                                                   op,
+                                                                   nanPropaOpt,
+                                                                   reduceIndicesOpt,
+                                                                   true,
+                                                                   true,
+                                                                   GredAccessesPerThreadInBlock>;
+
+    // Run<2> when indices have to be produced, Run<1> otherwise.
+    constexpr int RunId = need_indices ? 2 : 1;
+
+    gridwise_2d_reduce::template Run<RunId>(
+        src2dDesc,
+        dst1dDesc,
+        origReduceLen,
+        alpha,
+        static_cast<const srcDataType* const __restrict__>(p_src_global),
+        beta,
+        static_cast<dstDataType* const __restrict__>(p_dst_global),
+        static_cast<const int* const __restrict__>(nullptr), // no input indices
+        static_cast<int* const __restrict__>(indices_global));
+}
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_blockwise.hpp"
+
+using namespace ck;
+
+// Compile-time configuration of this translation unit. Every value below is taken from a
+// CK_PARAM_* macro (presumably supplied on the compiler command line -- confirm with the
+// host-side build code).
+
+// Element types of the source tensor, the destination tensor and the accumulation.
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+// Number of dimensions of the source and destination tensors.
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+// Indices of the source dimensions that are reduced / kept.
+using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
+
+constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+// Any non-zero CK_PARAM_NAN_PROPAGATE selects PROPAGATE_NAN.
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+// Any non-zero CK_PARAM_REDUCE_INDICES selects FLATTENED_INDICES.
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ? ReduceTensorIndices_t::NO_INDICES
+                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
+
+// Select between the padded and unpadded descriptor types used by the kernels below.
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
+
+// Invariant and reduced dimensions together must name every source dimension.
+// NOTE(review): is_valid_sequence_map presumably checks for a permutation of 0..N-1 -- confirm.
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// The number of invariant dimensions can be zero if all dimensions are to be reduced;
+// in that case the destination must have exactly one dimension.
+static_assert(invariantDims::Size() > 0 || dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+
+// Indices are produced only when the operation is indexable and indices were requested.
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
+
+// Variadic helpers that build tuples from runtime arrays or compile-time sequences.
+//
+// Returns a tuple holding lengths[Ns]... converted to index_t, in the order given by the
+// sequence argument. The caller must make sure every index in Ns is valid for `lengths`.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
+{
+    return make_tuple(static_cast<index_t>(lengths[Ns])...);
+}
+
+// Returns a tuple with the first `arraySize` entries of `lengths` (as index_t).
+// arraySize is restricted to 1..6 at compile time.
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
+{
+    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    // Sequence<0, 1, ..., arraySize - 1>
+    using LeadingIndices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, LeadingIndices{});
+}
+
+// Expands the values of a compile-time Sequence into a tuple of those values.
+template <index_t... Ns>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
+{
+    return make_tuple(Ns...);
+}
+
+// Descriptor-preparation kernel for the case where only some source dimensions are reduced.
+//
+// Builds a 2-D (invariant, to-reduce) view of the source tensor and a 1-D view of the
+// destination tensor from the runtime lengths/strides and stores both in the workspace:
+// the source descriptor at byte 0 of ws_global, the destination descriptor at byte 2048.
+//
+// Only the first srcDims (resp. dstDims) length/stride arguments are read.
+// GridSize and BlkGroupSize are unused. Every thread builds the descriptors, but only
+// threads with hipThreadIdx_x == 0 store them.
+//
+// NOTE(review): nothing here checks that the source descriptor fits in the 2048 bytes before
+// the destination slot, or that ws_global is large enough -- confirm on the host side.
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)GridSize;
+    (void)BlkGroupSize;
+
+    // Workspace slots for the two descriptors.
+    void* p_src2dDesc = ws_global;
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    // Keep only as many entries as the tensors actually have dimensions.
+    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
+
+    // Source lengths picked out for the reduced and for the invariant dimensions.
+    const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
+    const auto invariantDimLengths =
+        make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
+
+    // Dimension 0: all invariant dimensions merged; dimension 1: all reduced dimensions merged.
+    auto src2dDesc =
+        transform_tensor_descriptor(srcDesc,
+                                    make_tuple(make_merge_transform(invariantDimLengths),
+                                               make_merge_transform(toReduceDimLengths)),
+                                    make_tuple(invariantDims{}, toReduceDims{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    // Merge all destination dimensions into one.
+    auto dst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(tupleDstLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
+
+    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
+
+    // The type of the object stored below has to be the same type the reduce kernel
+    // reinterprets these bytes as (refType_src2dDesc / refType_src2dDesc_padded_34).
+    if constexpr(src2d_need_padding)
+    {
+        // Pad the reduce dimension on the right up to the next multiple of copySliceLen.
+        const auto srcPad =
+            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
+
+        auto src2dDesc_2 =
+            transform_tensor_descriptor(src2dDesc,
+                                        make_tuple(make_pass_through_transform(invariantLen),
+                                                   make_pad_transform(toReduceLen, 0, srcPad)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
+    }
+
+    // NOTE(review): the destination descriptor is always stored unpadded, while the reduce
+    // kernel reads it as the padded type when dst1d_need_padding is true -- confirm that the
+    // host never sets CK_PARAM_DST1D_PADDING for this kernel.
+    if(hipThreadIdx_x == 0)
+        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
+};
+
+// Compile-time "reference" descriptors whose only purpose is to provide the *types* of the
+// descriptors that gridwise_generic_reduce_1_prepare stores in the workspace, so that the
+// reduce kernel can reinterpret the stored bytes. The lengths, strides and pad amounts used
+// here are placeholders; each construction mirrors the corresponding one in the prepare
+// kernel step by step so that the resulting types are identical.
+//
+// Template parameters:
+//   srcDims       - number of dimensions of the source tensor
+//   dstDims       - number of dimensions of the destination tensor
+//   invariantDims - Sequence of source dimensions that are kept
+//   toReduceDims  - Sequence of source dimensions that are reduced
+template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    static constexpr auto ref_toReduceDimLengths =
+        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
+    static constexpr auto ref_invariantDimLengths =
+        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
+
+    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
+    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
+
+    // don't have to use accurate strides to get an expected reference type
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+
+    // Dimension 0: invariant dimensions merged; dimension 1: reduced dimensions merged.
+    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
+                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
+        make_tuple(invariantDims{}, toReduceDims{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    // All destination dimensions merged into one.
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // used by the BlockWise and MultiBlock method
+    // (the pad amount 2 is a placeholder; only the resulting type is used)
+    using refType_src2dDesc_padded_34 = decltype(
+        transform_tensor_descriptor(ref_src2dDesc,
+                                    make_tuple(make_pass_through_transform(ref_invariantLen),
+                                               make_pad_transform(ref_toReduceLen, 0, 2)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+};
+
+// Descriptor types for this translation unit's configuration. The accessor functions below
+// use them to reinterpret the descriptor bytes stored in the workspace.
+using refType_src2dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
+using refType_dst1dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
+using refType_src2dDesc_padded_34 =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
+        refType_src2dDesc_padded_34;
+using refType_dst1dDesc_padded =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
+        refType_dst1dDesc_padded;
+
+// Returns a copy of the 2-D source descriptor stored at p_src2dDesc.
+//
+// need_padding selects the type the bytes are read as: refType_src2dDesc_padded_34 when
+// true, refType_src2dDesc otherwise. It must agree with the type that was stored there.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(need_padding)
+    {
+        using StoredDesc = refType_src2dDesc_padded_34;
+        return *reinterpret_cast<const StoredDesc*>(p_src2dDesc);
+    }
+    else
+    {
+        using StoredDesc = refType_src2dDesc;
+        return *reinterpret_cast<const StoredDesc*>(p_src2dDesc);
+    }
+}
+
+// Returns a copy of the 1-D destination descriptor stored at p_dst1dDesc.
+//
+// need_padding selects the type the bytes are read as: refType_dst1dDesc_padded when true,
+// refType_dst1dDesc otherwise. It must agree with the type that was stored there.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(need_padding)
+    {
+        using StoredDesc = refType_dst1dDesc_padded;
+        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
+    }
+    else
+    {
+        using StoredDesc = refType_dst1dDesc;
+        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
+    }
+}
+
+// Blockwise reduction kernel for the case where only some source dimensions are reduced.
+//
+// Reads the source/destination descriptors from the workspace (source at byte 0,
+// destination at byte 2048) and forwards to GridwiseReduction_xy_to_x_blockwise::Run.
+//
+//   origReduceLen        - forwarded to Run
+//   alpha, beta          - forwarded to Run
+//   p_src_global         - source data, read as srcDataType
+//   p_dst_global         - destination data, written as dstDataType
+//   ws_global            - workspace holding the two descriptors
+//   indices_global       - output indices, passed as int*
+//   BlkGroupSize, ws_buf2_bytes_offset - unused here
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    (void)BlkGroupSize;
+    (void)ws_buf2_bytes_offset;
+
+    const char* const ws_bytes = static_cast<const char*>(ws_global);
+
+    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
+    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);
+
+    using Src2dDescType = decltype(src2dDesc);
+    using Dst1dDescType = decltype(dst1dDesc);
+
+    // NOTE(review): the two `true` arguments are presumably the first-call / last-call flags
+    // -- confirm against GridwiseReduction_xy_to_x_blockwise.
+    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
+                                                                   srcDataType,
+                                                                   dstDataType,
+                                                                   compType,
+                                                                   Src2dDescType,
+                                                                   Dst1dDescType,
+                                                                   op,
+                                                                   nanPropaOpt,
+                                                                   reduceIndicesOpt,
+                                                                   true,
+                                                                   true,
+                                                                   GredAccessesPerThreadInBlock>;
+
+    // Run<2> when indices have to be produced, Run<1> otherwise.
+    constexpr int RunId = need_indices ? 2 : 1;
+
+    gridwise_2d_reduce::template Run<RunId>(
+        src2dDesc,
+        dst1dDesc,
+        origReduceLen,
+        alpha,
+        static_cast<const srcDataType* const __restrict__>(p_src_global),
+        beta,
+        static_cast<dstDataType* const __restrict__>(p_dst_global),
+        static_cast<const int* const __restrict__>(nullptr), // no input indices
+        static_cast<int* const __restrict__>(indices_global));
+}
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_multiblock.hpp"
+
+using namespace ck;
+
+// Compile-time configuration of this kernel wrapper. Every value below is taken from a
+// CK_PARAM_* macro (presumably supplied on the compiler command line by the host side --
+// confirm against the build/JIT code).
+
+// Element types of the source tensor, the destination tensor and the computation.
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+// Number of dimensions of the source and destination tensors.
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty
+
+constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+// A value of 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+// A value of 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ? ReduceTensorIndices_t::NO_INDICES
+                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
+
+// Select between the padded and unpadded descriptor types when the descriptors are stored to
+// and read back from the workspace.
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+// In this all-dims variant the invariant part is the empty Sequence<>, so specDims consists of
+// toReduceDims only and must therefore cover all srcDims source dimensions.
+using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
+
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// All source dimensions are reduced in this variant, so the destination must be one-dimensional
+static_assert(dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+
+// Indices are only produced when the operation is indexable and indices were requested.
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
+
+// helper functions using variadic template arguments
+// Builds a tuple of index_t whose elements are lengths[Ns]..., i.e. one array entry for each
+// index listed in the Sequence argument, in the order given by the Sequence.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
+{
+    using element_t = index_t;
+
+    return make_tuple(static_cast<element_t>(lengths[Ns])...);
+}
+
+// Builds a tuple of index_t from the first arraySize entries of lengths (entries 0, 1, ...,
+// arraySize - 1). arraySize is limited to the range [1, 6] at compile time.
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
+{
+    static_assert(!(arraySize < 1 || arraySize > 6), "The tensor should have 1 to 6 dimensions");
+
+    // Sequence<0, 1, ..., arraySize - 1>
+    using leading_indices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, leading_indices{});
+}
+
+// Turns the compile-time values carried by a Sequence into a tuple holding the same values.
+template <index_t... Values>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Values...>)
+{
+    return make_tuple(Values...);
+}
+
+// Preparation kernel for the multi-block "reduce all dims" path.
+//
+// Builds the source 2D descriptor and the destination 1D descriptor from the runtime lengths
+// and strides, and stores them into the workspace: the source descriptor at byte 0 of
+// ws_global and the destination descriptor at byte 2048. gridwise_generic_reduce_1 in this file
+// reads them back from the same offsets.
+//
+// Parameters:
+//   GridSize                 - unused by this kernel.
+//   BlkGroupSize             - divisor used to compute the per-block reduce size (and hence
+//                              the padding of the reduced dimension).
+//   inLength0..5/inStride0..5   - source lengths/strides; only the first srcDims are used.
+//   outLength0..5/outStride0..5 - destination lengths/strides; only the first dstDims are used.
+//   ws_global                - workspace receiving the two descriptors.
+//
+// NOTE(review): the destination descriptor is always stored unpadded here, while
+// get_reduction_dst1d_descriptor reads it as refType_dst1dDesc_padded when dst1d_need_padding
+// is true. Presumably CK_PARAM_DST1D_PADDING is never set for this kernel -- confirm with the
+// host-side code.
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)GridSize;
+
+    // Storage locations of the two descriptors inside the workspace.
+    void* p_src2dDesc = ws_global;
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    // Keep only as many entries as the tensors actually have dimensions.
+    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
+
+    // Merge all source dimensions into a single dimension ...
+    const auto one_dim_srcDesc = transform_tensor_descriptor(
+        srcDesc,
+        make_tuple(make_merge_transform(tupleSrcLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    // ... then split it into two dimensions of lengths (1, merged length): dimension 0 is the
+    // invariant dimension and dimension 1 the dimension to reduce.
+    auto src2dDesc = transform_tensor_descriptor(
+        one_dim_srcDesc,
+        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
+        make_tuple(Sequence<0>{}),
+        make_tuple(Sequence<0, 1>{}));
+
+    // Merge all destination dimensions into a single dimension.
+    auto dst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(tupleDstLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
+
+    // ceil(toReduceLen / BlkGroupSize), rounded up to a multiple of copySliceLen.
+    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
+    const index_t reduceSizePerBlock =
+        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
+        copySliceLen;
+
+    if constexpr(src2d_need_padding)
+    {
+        // Amount needed to bring dimension 1 up to reduceSizePerBlock * BlkGroupSize.
+        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
+
+        auto src2dDesc_2 =
+            transform_tensor_descriptor(src2dDesc,
+                                        make_tuple(make_pass_through_transform(invariantLen),
+                                                   make_pad_transform(toReduceLen, 0, srcPad)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        // Only the thread with hipThreadIdx_x == 0 stores the descriptor.
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
+    }
+
+    if(hipThreadIdx_x == 0)
+        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
+};
+
+// Compile-time helper that names the descriptor types stored in the workspace.
+//
+// It repeats the descriptor construction done in gridwise_generic_reduce_1_prepare, but with
+// placeholder constant lengths, so that decltype() can be applied to the results. The reading
+// kernel reinterprets the workspace bytes as these types, so the two constructions must be
+// kept in sync. The toReduceDims template parameter is not referenced inside this struct.
+template <index_t srcDims, index_t dstDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    // Placeholder lengths: 8 for every source dimension, 1 for every destination dimension.
+    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
+    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
+
+    // don't have to use accurate strides to get an expected reference type
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+
+    // All source dimensions merged into one dimension.
+    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    // The merged dimension split into two dimensions of lengths (1, merged length).
+    static constexpr auto ref_src2dDesc =
+        transform_tensor_descriptor(ref_one_dim_srcDesc,
+                                    make_tuple(make_unmerge_transform(
+                                        make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
+                                    make_tuple(Sequence<0>{}),
+                                    make_tuple(Sequence<0, 1>{}));
+
+    // All destination dimensions merged into one dimension.
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // used by the BlockWise and MultiBlock method
+    // (the pad amount 2 is a placeholder value; only the resulting type is used)
+    using refType_src2dDesc_padded_34 = decltype(
+        transform_tensor_descriptor(ref_src2dDesc,
+                                    make_tuple(make_pass_through_transform(ref_invariantLen),
+                                               make_pad_transform(ref_toReduceLen, 0, 2)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    // Type of the destination 1D descriptor with a pad transform applied on top.
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+
+    // Types of the unpadded descriptors.
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+};
+
+// File-scope shortcuts for the descriptor types of this kernel configuration; they are the
+// types the workspace bytes are reinterpreted as by the get_reduction_*_descriptor helpers.
+using refType_src2dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
+using refType_dst1dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
+using refType_src2dDesc_padded_34 =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
+using refType_dst1dDesc_padded =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
+
+// Returns a copy of the source 2D descriptor stored at p_src2dDesc. The bytes are interpreted
+// as the padded descriptor type when need_padding is true and as the unpadded type otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = static_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = static_cast<const refType_src2dDesc*>(p_src2dDesc);
+        return *p_plain;
+    }
+}
+
+// Returns a copy of the destination 1D descriptor stored at p_dst1dDesc. The bytes are
+// interpreted as the padded descriptor type when need_padding is true and as the unpadded type
+// otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = static_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = static_cast<const refType_dst1dDesc*>(p_dst1dDesc);
+        return *p_plain;
+    }
+}
+
+// Kernel entry point that runs the multi-block 2D reduction.
+//
+// Workspace layout as used here: source 2D descriptor at byte 0, destination 1D descriptor at
+// byte 2048, and a buffer starting at byte 4096 that is passed to Run as a srcDataType
+// pointer. When ws_buf2_bytes_offset is positive, a second buffer located that many bytes
+// after byte 4096 is passed as an int pointer; otherwise a null pointer is passed.
+// p_dst_global and indices_global are part of the common kernel signature but are not used by
+// this variant.
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    (void)p_dst_global;
+    (void)indices_global;
+
+    char* const ws_bytes = static_cast<char*>(ws_global);
+
+    // Both copies stay const-qualified so decltype() yields the same template arguments.
+    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
+    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);
+
+    void* const ws_values = ws_bytes + 4096;
+
+    void* ws_indices = nullptr;
+    if(ws_buf2_bytes_offset > 0)
+        ws_indices = ws_bytes + 4096 + ws_buf2_bytes_offset;
+
+    using reducer = GridwiseReduction_xy_to_x_multiblock<BlockSize,
+                                                         srcDataType,
+                                                         dstDataType,
+                                                         compType,
+                                                         decltype(srcDesc2d),
+                                                         decltype(dstDesc1d),
+                                                         op,
+                                                         nanPropaOpt,
+                                                         reduceIndicesOpt,
+                                                         GredAccessesPerThreadInBlock>;
+
+    // Run<2> is selected when indices have to be produced, Run<1> otherwise.
+    constexpr int run_variant = need_indices ? 2 : 1;
+
+    reducer::template Run<run_variant>(
+        srcDesc2d,
+        dstDesc1d,
+        origReduceLen,
+        BlkGroupSize,
+        alpha,
+        static_cast<const srcDataType* const __restrict__>(p_src_global),
+        beta,
+        static_cast<srcDataType* const __restrict__>(ws_values),
+        static_cast<int* const __restrict__>(ws_indices));
+}
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_multiblock.hpp"
+
+using namespace ck;
+
+// Compile-time configuration of this kernel wrapper. Every value below is taken from a
+// CK_PARAM_* macro (presumably supplied on the compiler command line by the host side --
+// confirm against the build/JIT code).
+
+// Element types of the source tensor, the destination tensor and the computation.
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+// Number of dimensions of the source and destination tensors.
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+// Source dimensions that are reduced, and source dimensions that are kept.
+using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
+
+constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+// A value of 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+// A value of 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ? ReduceTensorIndices_t::NO_INDICES
+                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
+
+// Select between the padded and unpadded descriptor types when the descriptors are stored to
+// and read back from the workspace.
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+// invariantDims followed by toReduceDims must together list srcDims dimensions and pass
+// is_valid_sequence_map.
+using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
+
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// The number of invariant dimensions can be zero if all dimensions are to be reduced
+static_assert(invariantDims::Size() > 0 || dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+
+// Indices are only produced when the operation is indexable and indices were requested.
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
+
+// helper functions using variadic template arguments
+// Builds a tuple of index_t whose elements are lengths[Ns]..., i.e. one array entry for each
+// index listed in the Sequence argument, in the order given by the Sequence.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
+{
+    using element_t = index_t;
+
+    return make_tuple(static_cast<element_t>(lengths[Ns])...);
+}
+
+// Builds a tuple of index_t from the first arraySize entries of lengths (entries 0, 1, ...,
+// arraySize - 1). arraySize is limited to the range [1, 6] at compile time.
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
+{
+    static_assert(!(arraySize < 1 || arraySize > 6), "The tensor should have 1 to 6 dimensions");
+
+    // Sequence<0, 1, ..., arraySize - 1>
+    using leading_indices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, leading_indices{});
+}
+
+// Turns the compile-time values carried by a Sequence into a tuple holding the same values.
+template <index_t... Values>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Values...>)
+{
+    return make_tuple(Values...);
+}
+
+// Preparation kernel for the multi-block "reduce partial dims" path.
+//
+// Builds the source 2D descriptor and the destination 1D descriptor from the runtime lengths
+// and strides, and stores them into the workspace: the source descriptor at byte 0 of
+// ws_global and the destination descriptor at byte 2048. gridwise_generic_reduce_1 in this file
+// reads them back from the same offsets.
+//
+// Parameters:
+//   GridSize                 - unused by this kernel.
+//   BlkGroupSize             - divisor used to compute the per-block reduce size (and hence
+//                              the padding of the reduced dimension).
+//   inLength0..5/inStride0..5   - source lengths/strides; only the first srcDims are used.
+//   outLength0..5/outStride0..5 - destination lengths/strides; only the first dstDims are used.
+//   ws_global                - workspace receiving the two descriptors.
+//
+// NOTE(review): the destination descriptor is always stored unpadded here, while
+// get_reduction_dst1d_descriptor reads it as refType_dst1dDesc_padded when dst1d_need_padding
+// is true. Presumably CK_PARAM_DST1D_PADDING is never set for this kernel -- confirm with the
+// host-side code.
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)GridSize;
+
+    // Storage locations of the two descriptors inside the workspace.
+    void* p_src2dDesc = ws_global;
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    // Keep only as many entries as the tensors actually have dimensions.
+    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
+
+    // Source lengths picked out for the reduced and for the invariant dimensions.
+    const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
+    const auto invariantDimLengths =
+        make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
+
+    // Merge the invariant dimensions into dimension 0 and the reduced dimensions into
+    // dimension 1 of the 2D view.
+    auto src2dDesc =
+        transform_tensor_descriptor(srcDesc,
+                                    make_tuple(make_merge_transform(invariantDimLengths),
+                                               make_merge_transform(toReduceDimLengths)),
+                                    make_tuple(invariantDims{}, toReduceDims{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    // Merge all destination dimensions into a single dimension.
+    auto dst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(tupleDstLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
+
+    // ceil(toReduceLen / BlkGroupSize), rounded up to a multiple of copySliceLen.
+    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
+    const index_t reduceSizePerBlock =
+        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
+        copySliceLen;
+
+    if constexpr(src2d_need_padding)
+    {
+        // Amount needed to bring dimension 1 up to reduceSizePerBlock * BlkGroupSize.
+        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
+
+        auto src2dDesc_2 =
+            transform_tensor_descriptor(src2dDesc,
+                                        make_tuple(make_pass_through_transform(invariantLen),
+                                                   make_pad_transform(toReduceLen, 0, srcPad)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        // Only the thread with hipThreadIdx_x == 0 stores the descriptor.
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
+    }
+
+    if(hipThreadIdx_x == 0)
+        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
+};
+
+// Compile-time helper that names the descriptor types stored in the workspace.
+//
+// It repeats the descriptor construction done in gridwise_generic_reduce_1_prepare, but with
+// placeholder constant lengths, so that decltype() can be applied to the results. The reading
+// kernel reinterprets the workspace bytes as these types, so the two constructions must be
+// kept in sync.
+template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    // Placeholder lengths (8) for the reduced and the invariant dimensions.
+    static constexpr auto ref_toReduceDimLengths =
+        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
+    static constexpr auto ref_invariantDimLengths =
+        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
+
+    // Placeholder lengths (8) for every source and destination dimension.
+    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
+    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
+
+    // don't have to use accurate strides to get an expected reference type
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+
+    // Invariant dimensions merged into dimension 0, reduced dimensions into dimension 1.
+    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
+                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
+        make_tuple(invariantDims{}, toReduceDims{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    // All destination dimensions merged into one dimension.
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // used by the BlockWise and MultiBlock method
+    // (the pad amount 2 is a placeholder value; only the resulting type is used)
+    using refType_src2dDesc_padded_34 = decltype(
+        transform_tensor_descriptor(ref_src2dDesc,
+                                    make_tuple(make_pass_through_transform(ref_invariantLen),
+                                               make_pad_transform(ref_toReduceLen, 0, 2)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    // Type of the destination 1D descriptor with a pad transform applied on top.
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+
+    // Types of the unpadded descriptors.
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+};
+
+// File-scope shortcuts for the descriptor types of this kernel configuration; they are the
+// types the workspace bytes are reinterpreted as by the get_reduction_*_descriptor helpers.
+using refType_src2dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
+using refType_dst1dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
+using refType_src2dDesc_padded_34 =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
+        refType_src2dDesc_padded_34;
+using refType_dst1dDesc_padded =
+    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
+        refType_dst1dDesc_padded;
+
+// Returns a copy of the source 2D descriptor stored at p_src2dDesc. The bytes are interpreted
+// as the padded descriptor type when need_padding is true and as the unpadded type otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = static_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = static_cast<const refType_src2dDesc*>(p_src2dDesc);
+        return *p_plain;
+    }
+}
+
+// Returns a copy of the destination 1D descriptor stored at p_dst1dDesc. The bytes are
+// interpreted as the padded descriptor type when need_padding is true and as the unpadded type
+// otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = static_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = static_cast<const refType_dst1dDesc*>(p_dst1dDesc);
+        return *p_plain;
+    }
+}
+
+// Kernel entry point that runs the multi-block 2D reduction.
+//
+// Workspace layout as used here: source 2D descriptor at byte 0, destination 1D descriptor at
+// byte 2048, and a buffer starting at byte 4096 that is passed to Run as a srcDataType
+// pointer. When ws_buf2_bytes_offset is positive, a second buffer located that many bytes
+// after byte 4096 is passed as an int pointer; otherwise a null pointer is passed.
+// p_dst_global and indices_global are part of the common kernel signature but are not used by
+// this variant.
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    (void)p_dst_global;
+    (void)indices_global;
+
+    char* const ws_bytes = static_cast<char*>(ws_global);
+
+    // Both copies stay const-qualified so decltype() yields the same template arguments.
+    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
+    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);
+
+    void* const ws_values = ws_bytes + 4096;
+
+    void* ws_indices = nullptr;
+    if(ws_buf2_bytes_offset > 0)
+        ws_indices = ws_bytes + 4096 + ws_buf2_bytes_offset;
+
+    using reducer = GridwiseReduction_xy_to_x_multiblock<BlockSize,
+                                                         srcDataType,
+                                                         dstDataType,
+                                                         compType,
+                                                         decltype(srcDesc2d),
+                                                         decltype(dstDesc1d),
+                                                         op,
+                                                         nanPropaOpt,
+                                                         reduceIndicesOpt,
+                                                         GredAccessesPerThreadInBlock>;
+
+    // Run<2> is selected when indices have to be produced, Run<1> otherwise.
+    constexpr int run_variant = need_indices ? 2 : 1;
+
+    reducer::template Run<run_variant>(
+        srcDesc2d,
+        dstDesc1d,
+        origReduceLen,
+        BlkGroupSize,
+        alpha,
+        static_cast<const srcDataType* const __restrict__>(p_src_global),
+        beta,
+        static_cast<srcDataType* const __restrict__>(ws_values),
+        static_cast<int* const __restrict__>(ws_indices));
+}
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
+
+using namespace ck;
+/* Compile-time configuration of this kernel wrapper. Every value is derived from a CK_PARAM_*
+ * macro that is not defined in this file; presumably the macros are supplied on the compiler
+ * command line when the kernel is built -- TODO confirm.
+ */
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+
+constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ? ReduceTensorIndices_t::NO_INDICES
+                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
+// Select which descriptor variant (padded or not) is stored in and read from the workspace
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
+
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// This wrapper has no invariant dimensions (all dimensions are reduced), so dst must be 1-D
+static_assert(dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+// Indices are produced only when the operation is indexable and index output is requested
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
+
+// helper functions using variadic template arguments
+// Builds a tuple from the entries lengths[Ns]..., each converted to index_t, in the order
+// given by the index sequence.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
+{
+    // one tuple element per index in the pack
+    return make_tuple(index_t(lengths[Ns])...);
+}
+
+// Converts the leading arraySize entries of lengths into a tuple of index_t values.
+// arraySize is checked at compile time to lie in [1, 6].
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
+{
+    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    // index sequence generated from (0, arraySize, 1)
+    using LeadingIndices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, LeadingIndices{});
+}
+
+// Returns a tuple whose elements are the values carried by the given Sequence, in order.
+template <index_t... Ns>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
+{
+    return make_tuple(Ns...);
+}
+/* Builds the tensor descriptors for the case where every source dimension is reduced and
+ * stores them in the workspace, where gridwise_generic_reduce_1 reads them back.
+ *
+ * Parameters:
+ *   GridSize      - combined with BlockSize to compute the dim-0 pad amounts
+ *   BlkGroupSize  - unused by this kernel
+ *   inLength0..5, inStride0..5   - source lengths/strides; only the first srcDims are read
+ *   outLength0..5, outStride0..5 - destination lengths/strides; only the first dstDims are read
+ *   ws_global     - workspace; the source 2-D descriptor is written at byte offset 0 and the
+ *                   destination 1-D descriptor at byte offset 2048
+ *
+ * When src2d_need_padding is set, the stored source descriptor carries two pad transforms whose
+ * last arguments are the amount dim 0 lacks to reach GridSize * BlockSize and the amount dim 1
+ * lacks to reach the next multiple of GredThreadBufferLength. When dst1d_need_padding is set,
+ * the destination descriptor carries a pad transform with the same dim-0 amount.
+ *
+ * Only threads with hipThreadIdx_x == 0 perform the stores.
+ *
+ * NOTE(review): nothing here checks that a descriptor fits into its 2048-byte slot -- confirm
+ * the sizes hold for the largest supported configuration.
+ */
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)BlkGroupSize;
+
+    void* p_src2dDesc = ws_global;
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
+    // Merge all source dimensions into a single dimension ...
+    const auto one_dim_srcDesc = transform_tensor_descriptor(
+        srcDesc,
+        make_tuple(make_merge_transform(tupleSrcLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+    // ... then unmerge it into two dimensions with lengths (1, merged length)
+    auto src2dDesc = transform_tensor_descriptor(
+        one_dim_srcDesc,
+        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
+        make_tuple(Sequence<0>{}),
+        make_tuple(Sequence<0, 1>{}));
+    // Merge all destination dimensions into a single dimension
+    auto dst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(tupleDstLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
+
+    constexpr auto copySliceLen = GredThreadBufferLength;
+
+    if constexpr(src2d_need_padding)
+    {
+        const auto srcPad1 = GridSize * BlockSize - invariantLen;
+        const auto srcPad2 =
+            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
+        auto src2dDesc_2 =
+            transform_tensor_descriptor(src2dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
+                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
+    }
+
+    if constexpr(dst1d_need_padding)
+    {
+        const auto dstPad = GridSize * BlockSize - invariantLen;
+        auto dst1dDesc_2 =
+            transform_tensor_descriptor(dst1dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
+    }
+};
+/* Compile-time mirror of the descriptor construction in gridwise_generic_reduce_1_prepare.
+ * It repeats the same transform steps on constexpr placeholder lengths so that decltype yields
+ * the types the descriptor getters cast the workspace slots to. The placeholder values (8, 1
+ * and the pad amount 2) are presumably irrelevant to the resulting types -- TODO confirm.
+ *
+ * The template parameters shadow the file-scope names srcDims, dstDims and toReduceDims;
+ * toReduceDims is not referenced inside this struct.
+ */
+template <index_t srcDims, index_t dstDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
+    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
+
+    // accurate strides are not needed to obtain the expected reference type
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+    // same merge-all / unmerge-to-(1, N) steps as in the prepare kernel
+    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto ref_src2dDesc =
+        transform_tensor_descriptor(ref_one_dim_srcDesc,
+                                    make_tuple(make_unmerge_transform(
+                                        make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
+                                    make_tuple(Sequence<0>{}),
+                                    make_tuple(Sequence<0, 1>{}));
+
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // used by the DirectThreadWise and DirectWarpWise method
+    using refType_src2dDesc_padded_12 =
+        decltype(transform_tensor_descriptor(ref_src2dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
+                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+    // unpadded variants
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+};
+/* File-scope shorthands for the member types of get_ref_desc_types instantiated with the
+ * file-scope srcDims, dstDims and toReduceDims. The descriptor getter functions that follow
+ * cast the workspace descriptor slots to these types.
+ */
+using refType_src2dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
+using refType_dst1dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
+using refType_src2dDesc_padded_12 =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_12;
+using refType_dst1dDesc_padded =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
+
+// Reads the 2-D source descriptor stored at p_src2dDesc and returns it by value.
+// need_padding selects, at compile time, the type the bytes are interpreted as:
+// refType_src2dDesc_padded_12 when true, refType_src2dDesc otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(!need_padding)
+    {
+        const auto* p_desc = static_cast<const refType_src2dDesc*>(p_src2dDesc);
+        return *p_desc;
+    }
+    else
+    {
+        const auto* p_desc = static_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc);
+        return *p_desc;
+    }
+}
+
+// Reads the 1-D destination descriptor stored at p_dst1dDesc and returns it by value.
+// need_padding selects, at compile time, the type the bytes are interpreted as:
+// refType_dst1dDesc_padded when true, refType_dst1dDesc otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(!need_padding)
+    {
+        const auto* p_desc = static_cast<const refType_dst1dDesc*>(p_dst1dDesc);
+        return *p_desc;
+    }
+    else
+    {
+        const auto* p_desc = static_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
+        return *p_desc;
+    }
+}
+
+// Kernel entry point that forwards to GridwiseReduction_xy_to_x_direct_threadwise::Run.
+//
+// The source 2-D descriptor is read from byte offset 0 of ws_global and the destination 1-D
+// descriptor from byte offset 2048. Results go to p_dst_global and indices_global; a null
+// pointer is passed for the const int* argument of Run().
+//
+// BlkGroupSize and ws_buf2_bytes_offset are not used by this kernel.
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    (void)BlkGroupSize;
+    (void)ws_buf2_bytes_offset;
+
+    char* const ws_bytes = static_cast<char*>(ws_global);
+
+    // The two descriptors are copied out of the first two workspace slots.
+    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
+    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);
+
+    using ThreadwiseReduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
+                                                                         srcDataType,
+                                                                         dstDataType,
+                                                                         compType,
+                                                                         decltype(src2dDesc),
+                                                                         decltype(dst1dDesc),
+                                                                         op,
+                                                                         nanPropaOpt,
+                                                                         reduceIndicesOpt,
+                                                                         true,
+                                                                         true,
+                                                                         GredThreadBufferLength>;
+
+    // Run<2> is selected when indices are needed, Run<1> otherwise.
+    constexpr int RunId = need_indices ? 2 : 1;
+
+    ThreadwiseReduce::template Run<RunId>(
+        src2dDesc,
+        dst1dDesc,
+        origReduceLen,
+        alpha,
+        static_cast<const srcDataType* const __restrict__>(p_src_global),
+        beta,
+        static_cast<dstDataType* const __restrict__>(p_dst_global),
+        static_cast<const int* const __restrict__>(nullptr),
+        static_cast<int* const __restrict__>(indices_global));
+}
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
+
+using namespace ck;
+/* Compile-time configuration of this kernel wrapper. Every value is derived from a CK_PARAM_*
+ * macro that is not defined in this file; presumably the macros are supplied on the compiler
+ * command line when the kernel is built -- TODO confirm.
+ */
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+// Source dimension indices that are reduced / kept
+using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
+
+constexpr ReduceTensorOp_t op          = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ? ReduceTensorIndices_t::NO_INDICES
+                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
+// Select which descriptor variant (padded or not) is stored in and read from the workspace
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
+
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// The number of invariant dimensions can be zero if all dimensions are to be reduced
+static_assert(invariantDims::Size() > 0 || dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+// Indices are produced only when the operation is indexable and index output is requested
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
+
+// helper functions using variadic template arguments
+// Builds a tuple from the entries lengths[Ns]..., each converted to index_t, in the order
+// given by the index sequence.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
+{
+    // one tuple element per index in the pack
+    return make_tuple(index_t(lengths[Ns])...);
+}
+
+// Converts the leading arraySize entries of lengths into a tuple of index_t values.
+// arraySize is checked at compile time to lie in [1, 6].
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
+{
+    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    // index sequence generated from (0, arraySize, 1)
+    using LeadingIndices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, LeadingIndices{});
+}
+
+// Returns a tuple whose elements are the values carried by the given Sequence, in order.
+template <index_t... Ns>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
+{
+    return make_tuple(Ns...);
+}
+/* Builds the tensor descriptors for a reduction over a subset of the source dimensions and
+ * stores them in the workspace, where gridwise_generic_reduce_1 reads them back.
+ *
+ * Parameters:
+ *   GridSize      - combined with BlockSize to compute the dim-0 pad amounts
+ *   BlkGroupSize  - unused by this kernel
+ *   inLength0..5, inStride0..5   - source lengths/strides; only the first srcDims are read
+ *   outLength0..5, outStride0..5 - destination lengths/strides; only the first dstDims are read
+ *   ws_global     - workspace; the source 2-D descriptor is written at byte offset 0 and the
+ *                   destination 1-D descriptor at byte offset 2048
+ *
+ * The source 2-D descriptor merges the invariantDims into dimension 0 and the toReduceDims
+ * into dimension 1. When src2d_need_padding is set, the stored descriptor additionally carries
+ * two pad transforms whose last arguments are the amount dim 0 lacks to reach
+ * GridSize * BlockSize and the amount dim 1 lacks to reach the next multiple of
+ * GredThreadBufferLength. When dst1d_need_padding is set, the destination descriptor carries
+ * a pad transform with the same dim-0 amount.
+ *
+ * Only threads with hipThreadIdx_x == 0 perform the stores.
+ *
+ * NOTE(review): nothing here checks that a descriptor fits into its 2048-byte slot -- confirm
+ * the sizes hold for the largest supported configuration.
+ */
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)BlkGroupSize;
+
+    void* p_src2dDesc = ws_global;
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
+    // Source lengths picked out by the reduced / invariant dimension index sequences
+    const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
+    const auto invariantDimLengths =
+        make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
+    // Dimension 0 <- merged invariant dims, dimension 1 <- merged reduced dims
+    auto src2dDesc =
+        transform_tensor_descriptor(srcDesc,
+                                    make_tuple(make_merge_transform(invariantDimLengths),
+                                               make_merge_transform(toReduceDimLengths)),
+                                    make_tuple(invariantDims{}, toReduceDims{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+    // Merge all destination dimensions into a single dimension
+    auto dst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(tupleDstLengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});
+
+    constexpr auto copySliceLen = GredThreadBufferLength;
+
+    if constexpr(src2d_need_padding)
+    {
+        const auto srcPad1 = GridSize * BlockSize - invariantLen;
+        const auto srcPad2 =
+            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
+        auto src2dDesc_2 =
+            transform_tensor_descriptor(src2dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
+                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
+    }
+
+    if constexpr(dst1d_need_padding)
+    {
+        const auto dstPad = GridSize * BlockSize - invariantLen;
+        auto dst1dDesc_2 =
+            transform_tensor_descriptor(dst1dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
+    }
+    else
+    {
+        if(hipThreadIdx_x == 0)
+            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
+    }
+};
+/* Compile-time mirror of the descriptor construction in gridwise_generic_reduce_1_prepare.
+ * It repeats the same transform steps on constexpr placeholder lengths so that decltype yields
+ * the types the descriptor getters cast the workspace slots to. The placeholder values (8 and
+ * the pad amount 2) are presumably irrelevant to the resulting types -- TODO confirm.
+ *
+ * The template parameters shadow the file-scope names srcDims, dstDims, invariantDims and
+ * toReduceDims.
+ */
+template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    static constexpr auto ref_toReduceDimLengths =
+        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
+    static constexpr auto ref_invariantDimLengths =
+        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
+
+    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
+    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
+
+    // accurate strides are not needed to obtain the expected reference type
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+    // same two-merge step as in the prepare kernel: invariant dims -> 0, reduced dims -> 1
+    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
+                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
+        make_tuple(invariantDims{}, toReduceDims{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // used by the DirectThreadWise and DirectWarpWise method
+    using refType_src2dDesc_padded_12 =
+        decltype(transform_tensor_descriptor(ref_src2dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
+                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+    // unpadded variants
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+};
+
+// Reference descriptor types instantiated for this file's compile-time dimension settings.
+using selected_ref_desc_types =
+    get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>;
+
+// Un-padded 2-D source / 1-D destination descriptor types.
+using refType_src2dDesc = selected_ref_desc_types::refType_src2dDesc;
+using refType_dst1dDesc = selected_ref_desc_types::refType_dst1dDesc;
+
+// Padded counterparts of the two types above.
+using refType_src2dDesc_padded_12 = selected_ref_desc_types::refType_src2dDesc_padded_12;
+using refType_dst1dDesc_padded    = selected_ref_desc_types::refType_dst1dDesc_padded;
+
+// Reinterprets the memory at p_src2dDesc as a 2-D source descriptor and returns it by value.
+// need_padding picks, at compile time, the padded or the un-padded reference type.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc);
+        return *p_plain;
+    }
+}
+
+// Reinterprets the memory at p_dst1dDesc as a 1-D destination descriptor and returns it by
+// value. need_padding picks, at compile time, the padded or the un-padded reference type.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc);
+        return *p_plain;
+    }
+}
+
+// Kernel entry point: reads the source and destination descriptors stored in ws_global and
+// invokes GridwiseReduction_xy_to_x_direct_threadwise::Run with them.
+//   origReduceLen        - forwarded unchanged to Run
+//   BlkGroupSize         - unused by this kernel
+//   alpha, beta          - forwarded unchanged to Run
+//   p_src_global         - source data, cast to srcDataType
+//   p_dst_global         - destination data, cast to dstDataType
+//   ws_global            - workspace holding the descriptors (source at offset 0,
+//                          destination at byte offset 2048)
+//   ws_buf2_bytes_offset - unused by this kernel
+//   indices_global       - index output buffer, cast to int
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    (void)ws_buf2_bytes_offset;
+    (void)BlkGroupSize;
+
+    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+    const void* p_src2dDesc = ws_global;
+
+    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
+    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
+
+    using src2dDescType = decltype(srcDesc2d);
+    using dst1dDescType = decltype(dstDesc1d);
+
+    using reduction_impl = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
+                                                                       srcDataType,
+                                                                       dstDataType,
+                                                                       compType,
+                                                                       src2dDescType,
+                                                                       dst1dDescType,
+                                                                       op,
+                                                                       nanPropaOpt,
+                                                                       reduceIndicesOpt,
+                                                                       true,
+                                                                       true,
+                                                                       GredThreadBufferLength>;
+
+    // Run<2> is selected when indices are needed, Run<1> otherwise.
+    constexpr int runVariant = need_indices ? 2 : 1;
+
+    reduction_impl::template Run<runVariant>(
+        srcDesc2d,
+        dstDesc1d,
+        origReduceLen,
+        alpha,
+        static_cast<const srcDataType* const __restrict__>(p_src_global),
+        beta,
+        static_cast<dstDataType* const __restrict__>(p_dst_global),
+        static_cast<const int* const __restrict__>(nullptr),
+        static_cast<int* const __restrict__>(indices_global));
+}
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
+
+using namespace ck;
+
+// Element types for the source tensor, the destination tensor and the reduction
+// computation, each selected by a CK_PARAM_* data-type enum value.
+using srcDataType =
+    get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+// Number of dimensions of the source and destination tensors.
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+// Source dimensions that get reduced.
+using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+
+constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+
+// A zero macro value disables the respective option.
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE != 0
+                                             ? NanPropagation_t::PROPAGATE_NAN
+                                             : NanPropagation_t::NOT_PROPAGATE_NAN;
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES != 0
+                                                       ? ReduceTensorIndices_t::FLATTENED_INDICES
+                                                       : ReduceTensorIndices_t::NO_INDICES;
+
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+// There are no invariant dimensions in this file, so only toReduceDims is merged in.
+using specDims = sequence_merge<Sequence<>, toReduceDims>::type;
+
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// All source dimensions are reduced here, so the destination must be one-dimensional.
+static_assert(dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+
+// Indices are produced only when the operator is indexable and indices were requested.
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES) && indexable;
+
+constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
+
+// helper functions using variadic template arguments
+// Builds a tuple from the entries of `lengths` picked, in order, by the compile-time
+// indices Ns; each entry is converted to index_t.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths,
+                                                           Sequence<Ns...> /*indices*/)
+{
+    return make_tuple(static_cast<index_t>(lengths[Ns])...);
+}
+
+// Builds a tuple from the first arraySize entries of `lengths` (arraySize must be 1..6).
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize> /*count*/)
+{
+    static_assert(1 <= arraySize && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    // Indices 0, 1, ..., arraySize - 1
+    using leadingIndices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, leadingIndices{});
+}
+
+// Turns the compile-time values carried by a Sequence into a tuple of those values.
+template <index_t... Ns>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...> /*values*/)
+{
+    return make_tuple(Ns...);
+}
+
+// Preparation kernel: builds the 2-D source descriptor and the 1-D destination descriptor
+// from the runtime lengths/strides and stores them into ws_global (source descriptor at
+// offset 0, destination descriptor at byte offset 2048). All source dimensions are merged
+// into one and then split as (1, total), so the invariant length is 1.
+//   GridSize               - used with BlockSize and warpSize to compute the invariant padding
+//   BlkGroupSize           - unused by this kernel
+//   inLength*/inStride*    - source tensor lengths/strides; the first srcDims are used
+//   outLength*/outStride*  - destination tensor lengths/strides; the first dstDims are used
+//   ws_global              - workspace receiving the two descriptors
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)BlkGroupSize;
+
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+    void* p_src2dDesc = ws_global;
+
+    // Only thread 0 of a block stores the descriptors.
+    const bool isWriterThread = (hipThreadIdx_x == 0);
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    const auto srcLengthTuple = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto srcStrideTuple = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto dstLengthTuple = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto dstStrideTuple = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(srcLengthTuple, srcStrideTuple);
+    const auto dstDesc = make_naive_tensor_descriptor(dstLengthTuple, dstStrideTuple);
+
+    // Merge every source dimension into a single one ...
+    const auto mergedSrcDesc = transform_tensor_descriptor(
+        srcDesc,
+        make_tuple(make_merge_transform(srcLengthTuple)),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    // ... then view it as a (1, total) 2-D tensor.
+    auto plainSrc2dDesc = transform_tensor_descriptor(
+        mergedSrcDesc,
+        make_tuple(make_unmerge_transform(make_tuple(1, mergedSrcDesc.GetLength(Number<0>{})))),
+        make_tuple(Sequence<0>{}),
+        make_tuple(Sequence<0, 1>{}));
+
+    auto plainDst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(dstLengthTuple)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = plainSrc2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = plainSrc2dDesc.GetLength(Number<1>{});
+
+    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
+
+    if constexpr(src2d_need_padding)
+    {
+        // Pad the invariant length up to GridSize * BlockSize / warpSize and the to-reduce
+        // length up to the next multiple of copySliceLen.
+        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
+        const auto srcPad2 =
+            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
+
+        auto paddedSrc2dDesc =
+            transform_tensor_descriptor(plainSrc2dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
+                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        using paddedSrc2dDescType = decltype(paddedSrc2dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<paddedSrc2dDescType*>(p_src2dDesc) = paddedSrc2dDesc;
+        }
+    }
+    else
+    {
+        using plainSrc2dDescType = decltype(plainSrc2dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<plainSrc2dDescType*>(p_src2dDesc) = plainSrc2dDesc;
+        }
+    }
+
+    if constexpr(dst1d_need_padding)
+    {
+        // Same invariant-length padding as for the source.
+        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
+
+        auto paddedDst1dDesc =
+            transform_tensor_descriptor(plainDst1dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+
+        using paddedDst1dDescType = decltype(paddedDst1dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<paddedDst1dDescType*>(p_dst1dDesc) = paddedDst1dDesc;
+        }
+    }
+    else
+    {
+        using plainDst1dDescType = decltype(plainDst1dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<plainDst1dDescType*>(p_dst1dDesc) = plainDst1dDesc;
+        }
+    }
+}
+
+// Derives, at compile time, the descriptor types that the reduction kernel uses to
+// reinterpret the workspace contents. Only the resulting types are of interest; the
+// lengths used here are placeholder values. The toReduceDims parameter is not referenced
+// inside this struct.
+template <index_t srcDims, index_t dstDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    // ---- source side -------------------------------------------------------------------
+    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
+
+    // The strides need not be accurate; only the resulting reference type matters
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+
+    // All source dimensions merged into one ...
+    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    // ... and then split into a (1, total) 2-D view.
+    static constexpr auto ref_src2dDesc =
+        transform_tensor_descriptor(ref_one_dim_srcDesc,
+                                    make_tuple(make_unmerge_transform(
+                                        make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
+                                    make_tuple(Sequence<0>{}),
+                                    make_tuple(Sequence<0, 1>{}));
+
+    // ---- destination side --------------------------------------------------------------
+    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
+
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    // ---- lengths of the 2-D source view ------------------------------------------------
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // ---- exported types ----------------------------------------------------------------
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+
+    // used by the DirectThreadWise and DirectWarpWise method
+    using refType_src2dDesc_padded_12 =
+        decltype(transform_tensor_descriptor(ref_src2dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
+                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+};
+
+// Concrete reference descriptor types for this file's compile-time dimension settings.
+// The descriptor getters cast the workspace memory to these types, so all four aliases
+// must be well-formed alias-declarations (`using name = type;`).
+using refType_src2dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
+using refType_dst1dDesc =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
+using refType_src2dDesc_padded_12 =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_12;
+using refType_dst1dDesc_padded =
+    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
+
+// Reinterprets the memory at p_src2dDesc as a 2-D source descriptor and returns it by value.
+// need_padding picks, at compile time, the padded or the un-padded reference type.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc);
+        return *p_plain;
+    }
+}
+
+// Reinterprets the memory at p_dst1dDesc as a 1-D destination descriptor and returns it by
+// value. need_padding picks, at compile time, the padded or the un-padded reference type.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(need_padding)
+    {
+        const auto* p_padded = reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
+        return *p_padded;
+    }
+    else
+    {
+        const auto* p_plain = reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc);
+        return *p_plain;
+    }
+}
+
+// Kernel entry point: reads the source and destination descriptors stored in ws_global and
+// invokes GridwiseReduction_xy_to_x_direct_warpwise::Run with them.
+//   origReduceLen        - forwarded unchanged to Run
+//   BlkGroupSize         - unused by this kernel
+//   alpha, beta          - forwarded unchanged to Run
+//   p_src_global         - source data, cast to srcDataType
+//   p_dst_global         - destination data, cast to dstDataType
+//   ws_global            - workspace holding the descriptors (source at offset 0,
+//                          destination at byte offset 2048)
+//   ws_buf2_bytes_offset - unused by this kernel
+//   indices_global       - index output buffer, cast to int
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    (void)ws_buf2_bytes_offset;
+    (void)BlkGroupSize;
+
+    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+    const void* p_src2dDesc = ws_global;
+
+    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
+    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
+
+    using src2dDescType = decltype(srcDesc2d);
+    using dst1dDescType = decltype(dstDesc1d);
+
+    using reduction_impl = GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
+                                                                     srcDataType,
+                                                                     dstDataType,
+                                                                     compType,
+                                                                     src2dDescType,
+                                                                     dst1dDescType,
+                                                                     op,
+                                                                     nanPropaOpt,
+                                                                     reduceIndicesOpt,
+                                                                     true,
+                                                                     true,
+                                                                     GredAccessesPerThreadInWarp>;
+
+    // Run<2> is selected when indices are needed, Run<1> otherwise.
+    constexpr int runVariant = need_indices ? 2 : 1;
+
+    reduction_impl::template Run<runVariant>(
+        srcDesc2d,
+        dstDesc1d,
+        origReduceLen,
+        alpha,
+        static_cast<const srcDataType* const __restrict__>(p_src_global),
+        beta,
+        static_cast<dstDataType* const __restrict__>(p_dst_global),
+        static_cast<const int* const __restrict__>(nullptr),
+        static_cast<int* const __restrict__>(indices_global));
+}
--- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
+
+using namespace ck;
+
+// Element types for the source tensor, the destination tensor and the reduction
+// computation, each selected by a CK_PARAM_* data-type enum value.
+using srcDataType =
+    get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+// Number of dimensions of the source and destination tensors.
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+// Source dimensions that get reduced / that are kept.
+using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
+
+constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+
+// A zero macro value disables the respective option.
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE != 0
+                                             ? NanPropagation_t::PROPAGATE_NAN
+                                             : NanPropagation_t::NOT_PROPAGATE_NAN;
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES != 0
+                                                       ? ReduceTensorIndices_t::FLATTENED_INDICES
+                                                       : ReduceTensorIndices_t::NO_INDICES;
+
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+// The invariant and to-reduce dimensions together must cover all srcDims source dimensions.
+using specDims = sequence_merge<invariantDims, toReduceDims>::type;
+
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// The number of invariant dimensions can be zero if all dimensions are to be reduced
+static_assert(invariantDims::Size() > 0 || dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension !!");
+
+// Indices are produced only when the operator is indexable and indices were requested.
+constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES) && indexable;
+
+constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
+
+// helper functions using variadic template arguments
+// Builds a tuple from the entries of `lengths` picked, in order, by the compile-time
+// indices Ns; each entry is converted to index_t.
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths,
+                                                           Sequence<Ns...> /*indices*/)
+{
+    return make_tuple(static_cast<index_t>(lengths[Ns])...);
+}
+
+// Builds a tuple from the first arraySize entries of `lengths` (arraySize must be 1..6).
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize> /*count*/)
+{
+    static_assert(1 <= arraySize && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    // Indices 0, 1, ..., arraySize - 1
+    using leadingIndices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;
+
+    return make_tuple_from_array_and_index_seq(lengths, leadingIndices{});
+}
+
+// Turns the compile-time values carried by a Sequence into a tuple of those values.
+template <index_t... Ns>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...> /*values*/)
+{
+    return make_tuple(Ns...);
+}
+
+// Preparation kernel: builds the 2-D source descriptor (invariant dimensions merged into
+// dimension 0, to-reduce dimensions merged into dimension 1) and the 1-D destination
+// descriptor from the runtime lengths/strides, and stores them into ws_global (source
+// descriptor at offset 0, destination descriptor at byte offset 2048).
+//   GridSize               - used with BlockSize and warpSize to compute the invariant padding
+//   BlkGroupSize           - unused by this kernel
+//   inLength*/inStride*    - source tensor lengths/strides; the first srcDims are used
+//   outLength*/outStride*  - destination tensor lengths/strides; the first dstDims are used
+//   ws_global              - workspace receiving the two descriptors
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)BlkGroupSize;
+
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+    void* p_src2dDesc = ws_global;
+
+    // Only thread 0 of a block stores the descriptors.
+    const bool isWriterThread = (hipThreadIdx_x == 0);
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
+    const int dstLengths[6] = {
+        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
+    const int dstStrides[6] = {
+        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
+
+    const auto srcLengthTuple = make_tuple_from_array(srcLengths, Number<srcDims>{});
+    const auto srcStrideTuple = make_tuple_from_array(srcStrides, Number<srcDims>{});
+    const auto dstLengthTuple = make_tuple_from_array(dstLengths, Number<dstDims>{});
+    const auto dstStrideTuple = make_tuple_from_array(dstStrides, Number<dstDims>{});
+
+    const auto srcDesc = make_naive_tensor_descriptor(srcLengthTuple, srcStrideTuple);
+    const auto dstDesc = make_naive_tensor_descriptor(dstLengthTuple, dstStrideTuple);
+
+    // Source lengths split by role.
+    const auto reducedLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
+    const auto keptLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
+
+    auto plainSrc2dDesc =
+        transform_tensor_descriptor(srcDesc,
+                                    make_tuple(make_merge_transform(keptLengths),
+                                               make_merge_transform(reducedLengths)),
+                                    make_tuple(invariantDims{}, toReduceDims{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    auto plainDst1dDesc = transform_tensor_descriptor(
+        dstDesc,
+        make_tuple(make_merge_transform(dstLengthTuple)),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    const auto invariantLen = plainSrc2dDesc.GetLength(Number<0>{});
+    const auto toReduceLen  = plainSrc2dDesc.GetLength(Number<1>{});
+
+    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
+
+    if constexpr(src2d_need_padding)
+    {
+        // Pad the invariant length up to GridSize * BlockSize / warpSize and the to-reduce
+        // length up to the next multiple of copySliceLen.
+        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
+        const auto srcPad2 =
+            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
+
+        auto paddedSrc2dDesc =
+            transform_tensor_descriptor(plainSrc2dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
+                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        using paddedSrc2dDescType = decltype(paddedSrc2dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<paddedSrc2dDescType*>(p_src2dDesc) = paddedSrc2dDesc;
+        }
+    }
+    else
+    {
+        using plainSrc2dDescType = decltype(plainSrc2dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<plainSrc2dDescType*>(p_src2dDesc) = plainSrc2dDesc;
+        }
+    }
+
+    if constexpr(dst1d_need_padding)
+    {
+        // Same invariant-length padding as for the source.
+        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
+
+        auto paddedDst1dDesc =
+            transform_tensor_descriptor(plainDst1dDesc,
+                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+
+        using paddedDst1dDescType = decltype(paddedDst1dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<paddedDst1dDescType*>(p_dst1dDesc) = paddedDst1dDesc;
+        }
+    }
+    else
+    {
+        using plainDst1dDescType = decltype(plainDst1dDesc);
+
+        if(isWriterThread)
+        {
+            *static_cast<plainDst1dDescType*>(p_dst1dDesc) = plainDst1dDesc;
+        }
+    }
+}
+
+// Builds compile-time "reference" tensor descriptors so that the C++ types of the 2d source and
+// 1d destination descriptors (plain and right-padded) can be named. Every dimension is given the
+// placeholder length 8 and the padded variants use the placeholder pad size 2.
+template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
+struct get_ref_desc_types
+{
+    // Sequence of N placeholder lengths, each equal to 8
+    template <index_t N>
+    using ref_uniform_lengths = typename uniform_sequence_gen<N, 8>::type;
+
+    static constexpr auto ref_srcLengths = ref_uniform_lengths<srcDims>{};
+    static constexpr auto ref_dstLengths = ref_uniform_lengths<dstDims>{};
+
+    static constexpr auto ref_invariantDimLengths = ref_uniform_lengths<invariantDims::Size()>{};
+    static constexpr auto ref_toReduceDimLengths  = ref_uniform_lengths<toReduceDims::Size()>{};
+
+    // the lengths are passed again as strides: accurate strides are not needed to get the
+    // expected reference type
+    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
+    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
+        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
+
+    // 2d view of the source: invariantDims merged into dim 0, toReduceDims merged into dim 1
+    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
+        ref_srcDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
+                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
+        make_tuple(invariantDims{}, toReduceDims{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    using refType_src2dDesc = decltype(ref_src2dDesc);
+
+    // 1d view of the destination: all dstDims dimensions merged into dim 0
+    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
+        ref_dstDesc,
+        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
+        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+    using refType_dst1dDesc = decltype(ref_dst1dDesc);
+
+    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
+    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});
+
+    // used by the DirectThreadWise and DirectWarpWise method
+    // (2d source descriptor with a right pad applied to both dimensions)
+    using refType_src2dDesc_padded_12 =
+        decltype(transform_tensor_descriptor(ref_src2dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
+                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                             make_tuple(Sequence<0>{}, Sequence<1>{})));
+
+    // 1d destination descriptor with a right pad applied to its single dimension
+    using refType_dst1dDesc_padded =
+        decltype(transform_tensor_descriptor(ref_dst1dDesc,
+                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
+                                             make_tuple(Sequence<0>{}),
+                                             make_tuple(Sequence<0>{})));
+};
+
+// File-scope shorthands for the reference descriptor types of this kernel configuration.
+// Plain (un-padded) descriptor types:
+using refType_src2dDesc =
+    get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
+using refType_dst1dDesc =
+    get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
+
+// Padded descriptor types:
+using refType_src2dDesc_padded_12 =
+    get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_12;
+using refType_dst1dDesc_padded =
+    get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;
+
+// Reads the 2d source descriptor stored at p_src2dDesc and returns it by value.
+// need_padding selects the type the buffer is interpreted as: the padded reference type when
+// true, the plain reference type otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
+{
+    if constexpr(!need_padding)
+    {
+        using DescType = refType_src2dDesc;
+        return *reinterpret_cast<const DescType*>(p_src2dDesc);
+    }
+    else
+    {
+        using DescType = refType_src2dDesc_padded_12;
+        return *reinterpret_cast<const DescType*>(p_src2dDesc);
+    }
+}
+
+// Reads the 1d destination descriptor stored at p_dst1dDesc and returns it by value.
+// need_padding selects the type the buffer is interpreted as: the padded reference type when
+// true, the plain reference type otherwise.
+template <bool need_padding>
+static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
+{
+    if constexpr(!need_padding)
+    {
+        using DescType = refType_dst1dDesc;
+        return *reinterpret_cast<const DescType*>(p_dst1dDesc);
+    }
+    else
+    {
+        using DescType = refType_dst1dDesc_padded;
+        return *reinterpret_cast<const DescType*>(p_dst1dDesc);
+    }
+}
+
+// Kernel entry point: loads the 2d source and 1d destination descriptors from the workspace
+// buffer and runs the direct warp-wise xy-to-x reduction with them.
+//
+//   origReduceLen        - forwarded unchanged to the reduction
+//   BlkGroupSize         - unused by this kernel
+//   alpha, beta          - forwarded unchanged to the reduction
+//   p_src_global         - source data, reinterpreted as srcDataType
+//   p_dst_global         - destination data, reinterpreted as dstDataType
+//   ws_global            - workspace; the source descriptor is read at byte offset 0 and the
+//                          destination descriptor at byte offset 2048
+//                          (NOTE(review): presumably stored there by a preceding prepare kernel
+//                          - confirm against the host-side launch sequence)
+//   ws_buf2_bytes_offset - unused by this kernel
+//   indices_global       - forwarded to the reduction as the int indices buffer
+extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
+                                                     int BlkGroupSize,
+                                                     float alpha,
+                                                     const void* __restrict__ p_src_global,
+                                                     float beta,
+                                                     void* __restrict__ p_dst_global,
+                                                     void* __restrict__ ws_global,
+                                                     long ws_buf2_bytes_offset,
+                                                     void* __restrict__ indices_global)
+{
+    // these parameters are part of the kernel signature but are not needed here
+    (void)ws_buf2_bytes_offset;
+    (void)BlkGroupSize;
+
+    // byte offset of the destination descriptor inside the workspace buffer
+    constexpr int dst1dDescByteOffset = 2048;
+
+    char* const p_ws_bytes = static_cast<char*>(ws_global);
+
+    const void* p_src2dDesc = ws_global;
+    const void* p_dst1dDesc = p_ws_bytes + dst1dDescByteOffset;
+
+    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
+    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
+
+    using Reduction = GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
+                                                                srcDataType,
+                                                                dstDataType,
+                                                                compType,
+                                                                decltype(srcDesc2d),
+                                                                decltype(dstDesc1d),
+                                                                op,
+                                                                nanPropaOpt,
+                                                                reduceIndicesOpt,
+                                                                true,
+                                                                true,
+                                                                GredAccessesPerThreadInWarp>;
+
+    // Run<2> is selected when indices are needed, Run<1> otherwise
+    constexpr int RunId = need_indices ? 2 : 1;
+
+    Reduction::template Run<RunId>(srcDesc2d,
+                                   dstDesc1d,
+                                   origReduceLen,
+                                   alpha,
+                                   static_cast<const srcDataType* const __restrict__>(p_src_global),
+                                   beta,
+                                   static_cast<dstDataType* const __restrict__>(p_dst_global),
+                                   static_cast<const int* const __restrict__>(nullptr),
+                                   static_cast<int* const __restrict__>(indices_global));
+}