Project: yangql/composable_kernel-1

Commit df0d6810, authored Sep 20, 2021 by Chao Liu

    Merge remote-tracking branch 'origin/develop' into CK_upstream

Parents: 31b40352, 9e80cdce

Showing 20 changed files with 5166 additions and 72 deletions
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp (+613 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp (+491 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp (+532 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp (+376 / -0)
composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp (+271 / -0)
composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp (+141 / -0)
composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp (+371 / -0)
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp (+2 / -4)
composable_kernel/include/utility/amd_buffer_addressing.hpp (+109 / -66)
composable_kernel/include/utility/data_type.hpp (+11 / -0)
composable_kernel/include/utility/dynamic_buffer.hpp (+11 / -2)
composable_kernel/include/utility/reduction_common.hpp (+104 / -0)
composable_kernel/include/utility/reduction_functions_binop.hpp (+100 / -0)
composable_kernel/include/utility/reduction_operator.hpp (+420 / -0)
composable_kernel/include/utility/type.hpp (+3 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp (+317 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp (+318 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp (+323 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp (+323 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp (+330 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp (new file, 0 → 100644) — diff collapsed, contents not shown
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp (new file, 0 → 100644) — diff collapsed, contents not shown
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp (new file, 0 → 100644) — diff collapsed, contents not shown
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {

template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType, // not used together with the beta input
          typename compType,
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_multiblock
{
    using opReduce       = typename reduce_binary_operator<compType, op>::opType;
    using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
    using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;

    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));

    using blockwise_reduce =
        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;

    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();

    static constexpr auto I0 = Number<0>{};

    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               int BlkGroupSize,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               srcDataType* const __restrict__ ws_values_global,
                               int* const __restrict__ ws_indices_global);

    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  int BlkGroupSize,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  srcDataType* const __restrict__ ws_values_global,
                                  int* const __restrict__ ws_indices_global)
    {
        (void)ws_indices_global;
        (void)alpha; // unused
        (void)beta;  // unused

        auto zeroVal = opReduce::GetZeroVal();

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));

        auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto in_block_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / BlkGroupSize;
        const index_t block_local_id  = block_global_id % BlkGroupSize;

        const index_t reduceSizePerBlock =
            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
             BlockBufferSize) *
            BlockBufferSize;

        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(
                src2dDesc,
                make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
                in_block_desc,
                make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_buf);

            __syncthreads();

            // do element-wise pre-reduction operation
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        const auto workspace_desc =
            make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            auto threadwise_workspace_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   srcDataType,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            threadwise_workspace_store.Run(ReducedDataDesc,
                                           make_tuple(I0),
                                           accuValue_buf,
                                           workspace_desc,
                                           workspace_global_buf);
        }
    };

    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  int BlkGroupSize,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  srcDataType* const __restrict__ ws_values_global,
                                  int* const __restrict__ ws_indices_global)
    {
        (void)alpha; // unused
        (void)beta;  // unused

        auto zeroVal = opReduce::GetZeroVal();

        // LDS
        __shared__ compType p_in_block_values_buffer[BlockBufferSize];
        __shared__ int p_in_block_indices_buffer[BlockBufferSize];

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));

        auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto in_block_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
            p_in_block_values_buffer, BlockBufferSize);
        auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
            p_in_block_indices_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / BlkGroupSize;
        const index_t block_local_id  = block_global_id % BlkGroupSize;

        const index_t reduceSizePerBlock =
            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
             BlockBufferSize) *
            BlockBufferSize;

        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(
                src2dDesc,
                make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
                in_block_desc,
                make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

        int indexOffset = block_local_id * reduceSizePerBlock;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);

            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);

            __syncthreads();

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce2(in_block_val_buf,
                                      in_block_idx_buf,
                                      BlocksInOneOp,
                                      accuValue_buf(I0),
                                      accuIndex_buf(I0));

            indexOffset += BlockBufferSize;

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        const auto workspace_desc =
            make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            auto threadwise_workspace_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   srcDataType,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            auto threadwise_workspace_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            threadwise_workspace_val_store.Run(ReducedDataDesc,
                                               make_tuple(I0),
                                               accuValue_buf,
                                               workspace_desc,
                                               workspace_global_val_buf);

            threadwise_workspace_idx_store.Run(ReducedDataDesc,
                                               make_tuple(I0),
                                               accuIndex_buf,
                                               workspace_desc,
                                               workspace_global_idx_buf);
        }
    };
};
} // namespace ck
#endif
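A note on the index math above: reduceSizePerBlock first splits the reduced dimension across the BlkGroupSize blocks that cooperate on one output element, then rounds each block's share up to a whole number of block buffers. A small host-side sketch with made-up sizes (the 4100 / 8 / 1024 / 256 values are illustrative only, not taken from the kernel) shows the rounding; it is not part of composable_kernel:

#include <cstdio>

// Illustrative stand-in for the device-side index math in Run<1>/Run<2>.
int main()
{
    const int toReduceLength  = 4100; // length of the reduced dimension (example value)
    const int BlkGroupSize    = 8;    // blocks cooperating on one output row (example value)
    const int BlockBufferSize = 1024; // GredAccessesPerThreadInBlock * BlockSize (example value)
    const int BlockSize       = 256;  // threads per block (example value)

    // Same formula as in the kernel: split the row across BlkGroupSize blocks,
    // then round each share up to a whole number of block buffers.
    const int reduceSizePerBlock =
        (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
         BlockBufferSize) *
        BlockBufferSize;

    const int toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

    // 4100 / 8 -> 513 (rounded up), then rounded up to 1024, so reduceSizePerBlock = 1024
    // and toReduceBlocks = 4.
    std::printf("reduceSizePerBlock = %d, toReduceBlocks = %d\n", reduceSizePerBlock, toReduceBlocks);
    return 0;
}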
composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {

template <typename buffer2dDescType,
          bool blockIsOneRow,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct BlockwiseReduction_2d_block_buffer
{
    using compType = typename opReduce::dataType;

    static constexpr auto buffer2dDesc = buffer2dDescType{};

    static constexpr index_t BlockSize =
        blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
    static constexpr index_t NumBlocks =
        blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface does not accumulate on indices
    template <typename BufferType>
    __device__ static void
    Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
        compType lAccuData            = opReduce::GetZeroVal();

        index_t offset;

        for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
        {
            offset = blockIsOneRow
                         ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
                         : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));

            compType opData = type_convert<compType>{}(block_buffer[offset]);

            binop::calculate(lAccuData, opData);
        }

        offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
                               : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));

        block_buffer(offset) = lAccuData;

        __syncthreads();

        for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
        {
            if(thread_local_id < indOffset)
            {
                index_t offset1 =
                    blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
                                  : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
                index_t offset2 =
                    blockIsOneRow
                        ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset))
                        : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));

                compType opData1 = type_convert<compType>{}(block_buffer[offset1]);
                compType opData2 = type_convert<compType>{}(block_buffer[offset2]);

                binop::calculate(opData1, opData2);

                block_buffer(offset1) = type_convert<compType>{}(opData1);
            }
            __syncthreads();
        }

        if(thread_local_id == 0)
        {
            compType tmpVal = type_convert<compType>{}(block_buffer[0]);

            binop::calculate(accuData, tmpVal);
        }
    };

    // This interface accumulates on both data values and indices
    template <typename BufferType, typename IdxBufferType>
    __device__ static void Reduce2(BufferType& block_buffer,
                                   IdxBufferType& block_indices_buffer,
                                   index_t toReduceBlocks,
                                   compType& accuData,
                                   int& accuIndex)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
        compType lAccuData            = opReduce::GetZeroVal();
        int lAccuIndex                = 0;

        if constexpr(blockIsOneRow)
        {
            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
            {
                for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
                {
                    if(thread_local_id % (indOffset * 2) == 0)
                    {
                        index_t offset1 =
                            buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id));
                        index_t offset2 = buffer2dDesc.CalculateOffset(
                            make_tuple(otherDimInd, thread_local_id + indOffset));

                        compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
                        compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
                        int currIndex1    = block_indices_buffer[offset1];
                        int currIndex2    = block_indices_buffer[offset2];

                        binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

                        block_buffer(offset1)         = type_convert<compType>{}(currVal1);
                        block_indices_buffer(offset1) = currIndex1;
                    }
                    __syncthreads();
                }
            }

            if(thread_local_id == 0)
            {
                for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
                {
                    index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));

                    compType tmpVal = type_convert<compType>{}(block_buffer[offset]);
                    int tmpIndex    = block_indices_buffer[offset];

                    binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
                }

                binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
            }
        }
        else
        {
            index_t offset;

            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
            {
                offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));

                compType currVal = type_convert<compType>{}(block_buffer[offset]);
                int currIndex    = block_indices_buffer[offset];

                binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
            }

            offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));

            block_buffer(offset)         = lAccuData;
            block_indices_buffer(offset) = lAccuIndex;

            __syncthreads();

            for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
            {
                if(thread_local_id % (indOffset * 2) == 0)
                {
                    index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
                    index_t offset2 =
                        buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));

                    compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
                    compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
                    int currIndex1    = block_indices_buffer[offset1];
                    int currIndex2    = block_indices_buffer[offset2];

                    binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

                    block_buffer(offset1)         = type_convert<compType>{}(currVal1);
                    block_indices_buffer(offset1) = currIndex1;
                }
                __syncthreads();
            }

            if(thread_local_id == 0)
            {
                compType tmpVal = type_convert<compType>{}(block_buffer[0]);
                int tmpIndex    = block_indices_buffer[0];

                binop::calculate(accuData, tmpVal, accuIndex, tmpIndex);
            }
        }
    };

    template <typename BufferType>
    __device__ static void set_buffer_value(BufferType& block_buffer, compType value)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset = blockIsOneRow
                                 ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                                 : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_buffer(offset) = value;

            __syncthreads();
        }
    };

    // Initialize the block-wise indices buffer, the index for each element in the block-wise data
    // buffer is calculated according to its position in the buffer and the global starting index
    template <typename IdxBufferType>
    __device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset = blockIsOneRow
                                 ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                                 : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_indices_buffer(offset) = offset + indexStart;

            __syncthreads();
        }
    };

    // Execute unary operation on the block buffer elements
    template <typename unary_op_type, typename BufferType>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset = blockIsOneRow
                                 ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                                 : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_buffer(offset) = unary_op(block_buffer[offset]);

            __syncthreads();
        }
    };
};
}; // end of namespace ck
#endif
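Reduce() above follows the standard stride-halving pattern: each thread first folds its strided elements into a private accumulator, the partials go to LDS, and then the number of active threads is halved each round until thread 0 holds the block-wide result. A minimal stand-alone HIP sketch of that pattern (plain float sum, hypothetical kernel name block_sum, assuming a 256-thread launch) is shown below as an illustration of the pattern only, not as part of composable_kernel:

#include <hip/hip_runtime.h>

// Minimal stride-halving block reduction (sum), mirroring the shape of
// BlockwiseReduction_2d_block_buffer::Reduce. Launch with 256 threads per block.
__global__ void block_sum(const float* in, float* out, int n)
{
    __shared__ float lds[256]; // one partial result per thread

    const int tid = threadIdx.x;

    // Step 1: every thread accumulates a strided slice of the input.
    float acc = 0.0f;
    for(int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x)
        acc += in[i];

    lds[tid] = acc;
    __syncthreads();

    // Step 2: halve the number of active threads each round,
    // combining lds[tid] with lds[tid + stride].
    for(int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
        if(tid < stride)
            lds[tid] += lds[tid + stride];
        __syncthreads();
    }

    // Step 3: thread 0 owns the block-wide result.
    if(tid == 0)
        out[blockIdx.x] = lds[0];
}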
composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {

template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface does not accumulate on indices
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(accuData, thread_buffer[I]); });
    };

    // This interface accumulates on both data values and indices and
    // is called by Direct_ThreadWise reduction method at first-time reduction
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = I + indexStart;
            binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex);
        });
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
    };
};

template <typename BufferType,
          typename IdxBufferType,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Thread-wise reduction needs use StaticBuffer for indices!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");

    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface accumulates on both data values and indices and
    // is called by Direct_ThreadWise reduction method at second-time reduction
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]);
        });
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
    };
};
}; // end of namespace ck
#endif
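ThreadReduce relies on ck::static_for to unroll the per-thread accumulation at compile time. For readers unfamiliar with that utility, a rough standard-C++ analogue of what Reduce() does over a fixed-size buffer can be written with std::index_sequence and a fold expression; this is only an illustration of the unrolling idea under that assumption, not the CK implementation:

#include <array>
#include <cstdio>
#include <utility>

// Compile-time-unrolled accumulation over a fixed-size per-thread buffer,
// roughly what ThreadReduce::Reduce does with ck::static_for and binop::calculate.
template <typename T, std::size_t N, std::size_t... Is>
T reduce_unrolled(const std::array<T, N>& buf, T acc, std::index_sequence<Is...>)
{
    // The comma fold plays the role of the unrolled binop::calculate chain (here: max).
    ((acc = acc > buf[Is] ? acc : buf[Is]), ...);
    return acc;
}

int main()
{
    std::array<float, 4> thread_buffer{1.0f, 7.0f, 3.0f, 5.0f};

    float acc = thread_buffer[0];
    acc       = reduce_unrolled(thread_buffer, acc, std::make_index_sequence<4>{});

    std::printf("max = %f\n", acc); // prints 7.0
    return 0;
}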
composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {

template <typename BufferType, index_t BlockSize, typename opReduce, NanPropagation_t nanPropaOpt>
struct WarpReduce
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");

    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
                  "the compType!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    static constexpr bool have_builtin_shuffle =
        std::is_same<compType, float>::value || std::is_same<compType, double>::value;

    // This interface does not accumulate on indices
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        if constexpr(have_builtin_shuffle)
            ReduceImpl1(thread_buffer, accuData);
        else
            ReduceImpl2(thread_buffer, accuData);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
    {
        compType lAccuData = opReduce::GetZeroVal();

        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });

        // synchronize among all threads in this warp
        __all(1);

        for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);

            binop::calculate(lAccuData, tmpVal);
            __all(1);
        }

        binop::calculate(accuData, lAccuData);
    };

    // This interface implementation does not use HIP built-in device shuffling functions
    // since for fp16, built-in shuffling functions is not provided by HIP
    __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
    {
        compType lAccuData = opReduce::GetZeroVal();

        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });

        __syncthreads();

        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        __shared__ compType shuffle_buffer[BlockSize];

        compType* myBuffer = &shuffle_buffer[warpId * warpSize];

        myBuffer[thread_inwarp_id] = lAccuData;

        __syncthreads();

        for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
        {
            if(thread_inwarp_id < stride)
            {
                compType currVal1 = myBuffer[thread_inwarp_id];
                compType currVal2 = myBuffer[thread_inwarp_id + stride];

                binop::calculate(currVal1, currVal2);

                myBuffer[thread_inwarp_id] = currVal1;
            }
            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myBuffer[0]);
    };

    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
    // reduction method at first-time reduction
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        if constexpr(have_builtin_shuffle)
            Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
        else
            Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void Reduce2Impl1(const BufferType& thread_buffer,
                                        compType& accuData,
                                        int& accuIndex,
                                        int indexStart)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
        });

        // synchronize among all threads in this warp
        __all(1);

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);

            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
            __all(1);
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
    };

    // This interface implementation does not use HIP built-in device shuffling functions since for
    // fp16, built-in shuffling functions is not provided by HIP
    __device__ static void Reduce2Impl2(const BufferType& thread_buffer,
                                        compType& accuData,
                                        int& accuIndex,
                                        int indexStart)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
        });

        __shared__ compType shuffle_data_buffer[BlockSize];
        __shared__ int shuffle_indices_buffer[BlockSize];

        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

        myDataBuffer[thread_inwarp_id]    = lAccuData;
        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

        __syncthreads();

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType currVal1 = myDataBuffer[thread_inwarp_id];
            compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
            int currIndex1    = myIndicesBuffer[thread_inwarp_id];
            int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

            binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

            myDataBuffer[thread_inwarp_id]    = currVal1;
            myIndicesBuffer[thread_inwarp_id] = currIndex1;

            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
    };

    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

        __all(1);
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

        __all(1);
    };
};

template <typename BufferType,
          typename IdxBufferType,
          index_t BlockSize,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct WarpReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!");

    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
                  "the compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!");

    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    static constexpr bool have_builtin_shuffle =
        std::is_same<compType, float>::value || std::is_same<compType, double>::value;

    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
    // reduction method at second-time reduction
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        if constexpr(have_builtin_shuffle)
            ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
        else
            ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void ReduceImpl1(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData = opReduce::GetZeroVal();
        int lAccuIndex     = 0;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        // synchronize among all threads in this warp
        __all(1);

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);

            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
            __all(1);
        }

        binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
    };

    // This interface implementation does not use HIP built-in device shuffling functions
    // since for fp16, built-in shuffling functions is not provided by HIP
    __device__ static void ReduceImpl2(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        __shared__ compType shuffle_data_buffer[BlockSize];
        __shared__ int shuffle_indices_buffer[BlockSize];

        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

        myDataBuffer[thread_inwarp_id]    = lAccuData;
        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

        __syncthreads();

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType currVal1 = myDataBuffer[thread_inwarp_id];
            compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
            int currIndex1    = myIndicesBuffer[thread_inwarp_id];
            int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

            binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

            myDataBuffer[thread_inwarp_id]    = currVal1;
            myIndicesBuffer[thread_inwarp_id] = currIndex1;

            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
    };

    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

        __all(1);
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

        __all(1);
    };
};
}; // end of namespace ck
#endif
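ReduceImpl1 above is the classic shuffle-based warp reduction: in every round each lane pulls the partial result held by the lane `stride` positions higher via __shfl_down and folds it in, so after log2(warpSize) rounds lane 0 of the warp holds the warp-wide value. A self-contained HIP sketch of the same pattern for a float sum (hypothetical function name warp_sum) follows; fp16 takes the LDS path in ReduceImpl2 because, as the comment in the file notes, HIP does not provide built-in shuffles for it:

#include <hip/hip_runtime.h>

// Shuffle-based warp reduction (sum), mirroring the loop in WarpReduce::ReduceImpl1.
// After the loop, lane 0 of the warp holds the warp-wide sum.
__device__ float warp_sum(float v)
{
    for(int stride = warpSize / 2; stride > 0; stride /= 2)
    {
        // Read the partial sum held by the lane `stride` positions above this one.
        v += __shfl_down(v, stride, warpSize);
    }
    return v; // valid in lane 0
}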
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp (modified)

@@ -709,9 +709,6 @@ struct ThreadwiseTensorSliceTransfer_v3
         : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
           dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
     {
-        // TODO: fix this
-        static_assert(is_same<SrcData, DstData>::value,
-                      "wrong! current implementation assume SrcData and DstData are same type");
     }
 
     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -981,7 +978,8 @@ struct ThreadwiseTensorSliceTransfer_v3
             constexpr index_t buffer_offset =
                 buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
 
-            dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
+            dst_tmp_vector.template AsType<DstData>()(i) =
+                type_convert<DstData>{}(buffer_[Number<buffer_offset>{}]);
         });
 
         using dst_vector_t = typename decltype(dst_tmp_vector)::type;
composable_kernel/include/utility/amd_buffer_addressing.hpp (modified)

@@ -209,13 +209,49 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
                                                                   index_t src_wave_addr_offset)
 {
-    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
-                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
-                  "wrong! not implemented");
+    static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+                  "wrong! not implemented");
 
-    if constexpr(is_same<T, float>::value)
+    if constexpr(is_same<T, double>::value)
+    {
+        // use fp32 load to mimic fp64 load
+        if constexpr(N == 1)
+        {
+            const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            return as_type<double>(tmp);
+        }
+        else if constexpr(N == 2)
+        {
+            const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            return as_type<double2_t>(tmp);
+        }
+        else if constexpr(N == 4)
+        {
+            const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            const float4_t f32_1 =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset + 4 * sizeof(float),
+                                                   0);
+
+            vector_type<double, 4> tmp;
+
+            tmp.AsType<double2_t>()(Number<0>{}) = as_type<double2_t>(f32_0);
+            tmp.AsType<double2_t>()(Number<1>{}) = as_type<double2_t>(f32_1);
+
+            return tmp.AsType<double4_t>()(Number<0>{});
+        }
+    }
+    else if constexpr(is_same<T, float>::value)
     {
         if constexpr(N == 1)
         {
@@ -267,25 +303,11 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
         }
         else if constexpr(N == 8)
         {
-#if 0
-            // use fp32 load to mimic fp16 load
-            vector_type<half_t, 8> tmp;
-
-            tmp.AsType<half4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp16x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
-
-            tmp.AsType<half4_t>()(Number<1>{}) =
-                llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
-                                                   src_thread_addr_offset,
-                                                   src_wave_addr_offset + 4 * sizeof(half_t),
-                                                   0);
-
-            return tmp.AsType<half8_t>()(Number<0>{});
-#else
             float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
                 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
 
             return as_type<half8_t>(tmp);
-#endif
         }
     }
     else if constexpr(is_same<T, int32_t>::value)
@@ -417,13 +439,34 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                        index_t dst_wave_addr_offset)
 {
-    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
-                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
-                  "wrong! not implemented");
+    static_assert((is_same<T, double>::value && (N == 1 || N == 2)) ||
+                      (is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+                  "wrong! not implemented");
 
-    if constexpr(is_same<T, float>::value)
+    if constexpr(is_same<T, double>::value)
+    {
+        // use fp32 store to mimic fp64 store
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32x2(as_type<float2_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+    }
+    else if constexpr(is_same<T, float>::value)
     {
         if constexpr(N == 1)
         {
@@ -450,6 +493,49 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 0);
         }
     }
+    else if constexpr(is_same<T, half_t>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              0);
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+        else if constexpr(N == 8)
+        {
+            vector_type<half_t, 8> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset + 4 * sizeof(half_t),
+                                                0);
+        }
+    }
     else if constexpr(is_same<T, int32_t>::value)
     {
         if constexpr(N == 1)
@@ -536,49 +622,6 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 0);
         }
     }
-    else if constexpr(is_same<T, half_t>::value)
-    {
-        if constexpr(N == 1)
-        {
-            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
-                                              dst_wave_buffer_resource,
-                                              dst_thread_addr_offset,
-                                              dst_wave_addr_offset,
-                                              0);
-        }
-        else if constexpr(N == 2)
-        {
-            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                0);
-        }
-        else if constexpr(N == 4)
-        {
-            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                0);
-        }
-        else if constexpr(N == 8)
-        {
-            vector_type<half_t, 8> tmp{src_thread_data};
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                0);
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset + 4 * sizeof(half_t),
-                                                0);
-        }
-    }
 }
 
 // buffer_load requires:
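The new double paths above do not use a dedicated fp64 buffer intrinsic; they move the same bytes through the existing fp32x2/fp32x4 intrinsics and reinterpret them, which is safe because only the bit pattern is transferred. A small host-side sketch of the same reinterpretation idea (using memcpy instead of the device-side as_type helper) illustrates why two floats can carry one double exactly:

#include <cstdio>
#include <cstring>

int main()
{
    // One double occupies the same 8 bytes as two floats; moving the bytes and
    // reinterpreting them preserves the value exactly (no numeric conversion happens).
    double d = 3.141592653589793;

    float as_two_floats[2];
    std::memcpy(as_two_floats, &d, sizeof(d)); // "store" through a 2 x fp32 view

    double back;
    std::memcpy(&back, as_two_floats, sizeof(back)); // "load" it back as fp64

    std::printf("%.15f\n", back); // prints 3.141592653589793
    return 0;
}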
composable_kernel/include/utility/data_type.hpp (modified)

@@ -73,6 +73,13 @@ struct scalar_type<vector_type<T, N>>
 };
 
 //
+template <>
+struct scalar_type<double>
+{
+    using type                           = double;
+    static constexpr index_t vector_size = 1;
+};
+
 template <>
 struct scalar_type<float>
 {
@@ -864,6 +871,10 @@ struct vector_type<T, 256>
     }
 };
 
+// fp64
+using double2_t = typename vector_type<double, 2>::type;
+using double4_t = typename vector_type<double, 4>::type;
+
 // fp32
 using float2_t = typename vector_type<float, 2>::type;
 using float4_t = typename vector_type<float, 4>::type;
composable_kernel/include/utility/dynamic_buffer.hpp (modified)

@@ -38,6 +38,10 @@ struct DynamicBuffer
         return BufferAddressSpace;
     }
 
+    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
+
+    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
+
     template <typename X,
               typename enable_if<
                   is_same<typename scalar_type<remove_cv_t<remove_reference_t<X>>>::type,
@@ -234,9 +238,14 @@ __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize el
     return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
 }
 
-template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
+template <AddressSpaceEnum_t BufferAddressSpace,
+          typename T,
+          typename ElementSpaceSize,
+          typename X,
+          typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type =
+              false>
 __host__ __device__ constexpr auto
-make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, T invalid_element_value)
+make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
 {
     return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
         p, element_space_size, invalid_element_value};
composable_kernel/include/utility/reduction_common.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_COMMON_HPP
#define CK_REDUCTION_COMMON_HPP
// this enumerate should be synchronized with include/miopen/reduce_common.hpp
namespace
ck
{
enum
class
ReductionMethod_t
{
DirectThreadWise
=
1
,
DirectWarpWise
=
2
,
BlockWise
=
3
,
MultiBlock
=
4
};
// end of namespace ck
enum
class
ReduceTensorOp_t
{
ADD
=
0
,
MUL
=
1
,
MIN
=
2
,
MAX
=
3
,
AMAX
=
4
,
AVG
=
5
,
NORM1
=
6
,
NORM2
=
7
,
// MUL_NO_ZEROS = 8,
};
enum
class
NanPropagation_t
{
NOT_PROPAGATE_NAN
=
0
,
PROPAGATE_NAN
=
1
,
};
enum
class
ReduceTensorIndices_t
{
NO_INDICES
=
0
,
FLATTENED_INDICES
=
1
,
};
enum
class
IndicesType_t
{
INDICES_32BIT
=
0
,
INDICES_64BIT
=
1
,
INDICES_16BIT
=
2
,
INDICES_8BIT
=
3
,
};
struct
float_equal_one
{
template
<
class
T
>
__device__
static
inline
bool
apply
(
T
x
)
{
return
x
<=
type_convert
<
T
>
{}(
1.0
f
)
and
x
>=
type_convert
<
T
>
{}(
1.0
f
);
}
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
{
return
(
float_equal_one
::
apply
(
x
));
};
};
struct
float_equal_zero
{
template
<
class
T
>
__device__
static
inline
bool
apply
(
T
x
)
{
return
x
<=
type_convert
<
T
>
{}(
0.0
f
)
and
x
>=
type_convert
<
T
>
{}(
0.0
f
);
}
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
{
return
(
float_equal_zero
::
apply
(
x
));
};
};
};
// end of namespace ck
#endif
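float_equal_one and float_equal_zero test "x <= c and x >= c" around a fixed constant, which is an exact-equality check spelled without operator==, presumably to sidestep direct floating-point equality comparisons; type_convert produces the constant in the element type T. A standalone host-side sketch of the same idiom, reduced to plain float with a hypothetical name:

#include <cassert>

template <class T>
bool float_equal_zero_sketch(T x)
{
    // "x <= 0 and x >= 0" holds exactly when x compares equal to zero;
    // a NaN input fails both comparisons and therefore returns false.
    return x <= static_cast<T>(0.0f) && x >= static_cast<T>(0.0f);
}

int main()
{
    assert(float_equal_zero_sketch(0.0f));
    assert(!float_equal_zero_sketch(1e-7f));
    return 0;
}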
composable_kernel/include/utility/reduction_functions_binop.hpp
0 → 100644
/* MIT License, Copyright (c) 2020 Advanced Micro Devices, Inc. (same license header as above) */
#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
#define CK_REDUCTION_FUNCTIONS_BINOP_HPP

#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"

namespace ck {
namespace detail {

static inline __device__ bool isnan(half_t x) { return __hisnan(x); };

template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
struct binop_with_nan_check;

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
{
    // cppcheck-suppress constParameter
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    // cppcheck-suppress constParameter
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        bool changed = false;

        opReduce{}(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    };
};

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
{
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed = false;

            opReduce{}(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        }
    };
};
}; // namespace detail
}; // end of namespace ck
#endif
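binop_with_nan_check dispatches on NanPropagation_t: in the PROPAGATE_NAN specialization a NaN input overwrites the accumulator (and its index) unconditionally, otherwise the plain binary op runs. The following is a host-side sketch of the same dispatch with plain float, std::isnan and a stand-in Max functor; binop_sketch and MaxOp are hypothetical names, not the CK types.

#include <cassert>
#include <cmath>

struct MaxOp
{
    void operator()(float& a, float b) const { if(a < b) a = b; }
};

template <bool PropagateNan, typename Op>
struct binop_sketch
{
    static void calculate(float& accu, float curr)
    {
        if(PropagateNan && std::isnan(curr))
            accu = curr;      // NaN wins and is carried forward in the accumulator
        else
            Op{}(accu, curr); // normal accumulation path
    }
};

int main()
{
    float acc = 1.0f;
    binop_sketch<true, MaxOp>::calculate(acc, std::nanf(""));
    assert(std::isnan(acc));

    acc = 1.0f;
    binop_sketch<false, MaxOp>::calculate(acc, 3.0f);
    assert(acc == 3.0f);
    return 0;
}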
composable_kernel/include/utility/reduction_operator.hpp
0 → 100644
/* MIT License, Copyright (c) 2020 Advanced Micro Devices, Inc. (same license header as above) */
#ifndef CK_REDUCTION_OPERATOR_HPP
#define CK_REDUCTION_OPERATOR_HPP

#include "reduction_common.hpp"

namespace ck {
namespace reduce {

// Every binary operator used in reduction is represented by a templated functor class. Each functor
// class must provide at least three members:
// 1) GetZeroVal() -- the interface to return the "identity element" for the binary operator,
//    "identity element" is the unique element in the algebraic space that doesn't affect the value of
//    other elements when operated with any of them.
// 2) indexable -- boolean value indicating whether indices of the operated elements could be
//    recorded. Usually, Min/Max operator could need to record the indices of elements. For operator
//    like Add/Mul, no need to record the indices.
// 3) operator() -- the first argument of the operator must be both an input & output, and the
//    corresponding variable usually stores the accumulated result of many operator() calls; the
//    second argument is only an input. For indexable binary operator, the second version of
//    operator() has third argument (which is an output) to indicate whether the accumulated value
//    (the first argument) has changed, in which case the recorded accumulated index also need be
//    changed.

template <class T>
struct Add
{
    using dataType = T;

    __device__ static T GetZeroVal() { return type_convert<T>{}(0.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }

    static constexpr bool indexable = false;
};

template <class T>
struct Mul
{
    using dataType = T;

    __device__ static T GetZeroVal() { return type_convert<T>{}(1.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }

    static constexpr bool indexable = false;
};

template <class T>
struct Max
{
    using dataType = T;

    __device__ static T GetZeroVal() { return std::numeric_limits<T>::min(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a < b)
            a = b;
    }

    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

template <class T>
struct Min
{
    using dataType = T;

    __device__ static T GetZeroVal() { return std::numeric_limits<T>::max(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a > b)
            a = b;
    }

    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a > b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

template <>
__device__ half_t Max<half_t>::GetZeroVal()
{
    return type_convert<half_t>{}(std::numeric_limits<float>::min());
};

template <>
__device__ half_t Min<half_t>::GetZeroVal()
{
    return type_convert<half_t>{}(std::numeric_limits<float>::max());
};

// Unary operators are usually called element-wisely before the reduction is executed on the elements.
// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2

template <class T, bool hasDividing>
struct unary_identic
{
    __device__ unary_identic(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const { return a * type_convert<T>{}(scaler); };

    float scaler = 1.0f;
};

template <class T>
struct unary_identic<T, false>
{
    __device__ unary_identic(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return a; };
};

template <class T, bool hasDividing>
struct unary_square
{
    __device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const
    {
        a = a * a;
        return a * type_convert<T>{}(scaler);
    };

    float scaler = 1.0f;
};

template <class T>
struct unary_square<T, false>
{
    __device__ unary_square(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return a * a; };
};

template <class T, bool hasDividing>
struct unary_abs
{
    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const
    {
        a = abs(a);
        return a * type_convert<T>{}(scaler);
    };

    float scaler = 1.0f;
};

template <class T>
struct unary_abs<T, false>
{
    __device__ unary_abs(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return abs(a); };
};

// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
// Let's assume that __habs() exists since 3.5.
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
inline __device__ __half __habs(__half x)
{
    union
    {
        __half half;
        unsigned short u16;
    } val;

    val.half = x;
    val.u16  = val.u16 & 0x7fff;
    return val.half;
}
#endif

template <bool hasDividing>
struct unary_abs<half_t, hasDividing>
{
    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline half_t operator()(half_t a) const
    {
        a = static_cast<half_t>(__habs(a));
        return a * type_convert<half_t>{}(scaler);
    };

    float scaler = 1.0f;
};

template <>
struct unary_abs<half_t, false>
{
    __device__ unary_abs(const int divider = 1) { (void)divider; };

    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
};

template <class T>
struct unary_sqrt
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; };

    __device__ inline T operator()(T a) const { return sqrtf(a); };
};

template <>
struct unary_sqrt<half_t>
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; };

    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
};

}; // end of namespace reduce

// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The "GetZeroVal()" interface and boolean member "indexable" are also provided in
// reduce_binary_operactor for easier checking by the upper-layer codes in the kernels.

template <typename T, ReduceTensorOp_t op>
struct reduce_binary_operator;

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
    using opType   = reduce::Mul<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Mul<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Mul<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
    using opType   = reduce::Min<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Min<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Min<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Max<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Max<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are called before and afer the Reduction is executed respectively

template <typename T, ReduceTensorOp_t op, bool isFirsReduce, bool isLastReduce>
struct reduce_unary_operator
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T, bool isFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_identic<T, true>;
};

template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
{
    using preUnaryOp = reduce::unary_abs<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
{
    using preUnaryOp = reduce::unary_abs<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
    using preUnaryOp = reduce::unary_square<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
    using preUnaryOp = reduce::unary_square<T, false>;
    using posUnaryOp = reduce::unary_sqrt<T>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_sqrt<T>;
};

} // end of namespace ck
#endif
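The header comment above spells out the three-member contract every binary reduction functor must follow: GetZeroVal() for the identity element, operator() that accumulates into its first argument (with an optional "changed" flag), and a static "indexable" flag. A host-side sketch of that contract, with a simplified Max-style functor and a small driver loop mirroring how the indexed calculate() path uses the flag; MaxSketch is a hypothetical name and std::numeric_limits<float>::lowest() is used here as the identity.

#include <cassert>
#include <limits>

struct MaxSketch
{
    // identity element: never beats a real value
    static float GetZeroVal() { return std::numeric_limits<float>::lowest(); }

    void operator()(float& a, float b) const
    {
        if(a < b)
            a = b;
    }

    void operator()(float& a, float b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

int main()
{
    const float data[4] = {1.0f, 7.0f, 3.0f, 5.0f};

    float accuVal = MaxSketch::GetZeroVal();
    int accuIndex = 0;

    for(int i = 0; i < 4; ++i)
    {
        bool changed = false;
        MaxSketch{}(accuVal, data[i], changed);
        if(changed)
            accuIndex = i; // same bookkeeping as the indexed calculate() above
    }

    assert(accuVal == 7.0f && accuIndex == 1);
    return 0;
}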
composable_kernel/include/utility/type.hpp
@@ -22,6 +22,9 @@ using remove_reference_t = typename std::remove_reference<T>::type;
template <typename T>
using remove_cv_t = typename std::remove_cv<T>::type;

template <typename T>
using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;

template <typename T>
inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
...
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
0 → 100644
/* MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc. (same license header as above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimension are to be reduced
static_assert(dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};

    // don't have to use accurate strides to get an expected referrence type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;

template <ReductionMethod_t impl, bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
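This wrapper is driven in two steps: gridwise_generic_reduce_1_prepare writes the src2d/dst1d tensor descriptors into the workspace (byte offsets 0 and 2048 above), and gridwise_generic_reduce_1 then reads them back and runs the blockwise reduction. Below is a hypothetical host-side launch sketch, not part of the commit: grid/block sizes, the example lengths/strides and the workspace size are made-up placeholders, and the wrapper itself is still configured at compile time through the CK_PARAM_* macros; only the kernel names and argument lists come from the code above.

#include <hip/hip_runtime.h>

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int, int,
    int, int, int, int, int, int,
    int, int, int, int, int, int,
    int, int, int, int, int, int,
    int, int, int, int, int, int,
    void* __restrict__);

extern "C" __global__ void gridwise_generic_reduce_1(
    int, int, float, const void* __restrict__, float, void* __restrict__,
    void* __restrict__, long, void* __restrict__);

// Assumed: this translation unit is compiled with hipcc and linked against the wrapper above.
void run_blockwise_reduce(const void* src, void* dst, void* ws, int gridSize, int blockSize,
                          int origReduceLen, hipStream_t stream)
{
    // Descriptors are stored at ws + 0 and ws + 2048, so ws must be at least 4096 bytes here.
    hipLaunchKernelGGL(gridwise_generic_reduce_1_prepare, dim3(1), dim3(blockSize), 0, stream,
                       gridSize, 1,
                       /* inLength0..5  */ 8, 8, 8, 8, 1, 1,
                       /* inStride0..5  */ 512, 64, 8, 1, 1, 1,
                       /* outLength0..5 */ 1, 1, 1, 1, 1, 1,
                       /* outStride0..5 */ 1, 1, 1, 1, 1, 1,
                       ws);

    // alpha = 1, beta = 0, no second workspace buffer, no indices output.
    hipLaunchKernelGGL(gridwise_generic_reduce_1, dim3(gridSize), dim3(blockSize), 0, stream,
                       origReduceLen, 1, 1.0f, src, 0.0f, dst, ws, 0L, nullptr);
}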
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp
0 → 100644
/* MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc. (same license header as above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimension are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths),
                   make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected referrence type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp
0 → 100644
/* MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc. (same license header as above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimension are to be reduced
static_assert(dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};

    // don't have to use accurate strides to get an expected referrence type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
    void* ws_buf1_global    = static_cast<char*>(ws_global) + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
};
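Unlike the blockwise wrapper, the MultiBlock first call does not write p_dst_global at all: it accumulates partial results into a workspace buffer at ws_global + 4096 (ws_buf1) and, when indices are requested, into a second buffer placed ws_buf2_bytes_offset bytes after ws_buf1. A hypothetical host-side helper (not in the commit) that only captures this pointer arithmetic:

#include <cstddef>

struct MultiBlockWorkspaceView
{
    void* p_src2dDesc; // descriptor blob written by gridwise_generic_reduce_1_prepare
    void* p_dst1dDesc; // second descriptor blob, 2048 bytes after the first
    void* ws_buf1;     // blockwise partial results consumed by the second reduction call
    void* ws_buf2;     // flattened indices, only when the op is indexable and indices are requested
};

inline MultiBlockWorkspaceView view_workspace(void* ws_global, long ws_buf2_bytes_offset)
{
    char* base = static_cast<char*>(ws_global);

    MultiBlockWorkspaceView v{};
    v.p_src2dDesc = base;
    v.p_dst1dDesc = base + 2048;
    v.ws_buf1     = base + 4096;
    v.ws_buf2     = ws_buf2_bytes_offset > 0
                        ? static_cast<void*>(base + 4096 + ws_buf2_bytes_offset)
                        : nullptr; // offset <= 0 means "no index buffer"
    return v;
}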
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp
0 → 100644
View file @
df0d6810
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
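// Editorial illustration (not part of the original file): for a hypothetical 3-D tensor with
// lengths = {4, 8, 16}, make_tuple_from_array(lengths, Number<3>{}) expands to
// make_tuple(4, 8, 16), while make_tuple_from_array_and_index_seq(lengths, Sequence<0, 2>{})
// picks out make_tuple(4, 16), and make_tuple_from_seq(Sequence<2, 5>{}) yields make_tuple(2, 5).
// The prepare kernel below uses these helpers to turn the runtime length/stride arguments
// into tuples consumable by make_naive_tensor_descriptor.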
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outLength0,
                                                             int outLength1,
                                                             int outLength2,
                                                             int outLength3,
                                                             int outLength4,
                                                             int outLength5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc =
        transform_tensor_descriptor(srcDesc,
                                    make_tuple(make_merge_transform(invariantDimLengths),
                                               make_merge_transform(toReduceDimLengths)),
                                    make_tuple(invariantDims{}, toReduceDims{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
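// Editorial worked example (illustrative values only, not part of the original file): with
// BlockSize = 256, GredAccessesPerThreadInBlock = 2, BlkGroupSize = 4 and toReduceLen = 3000,
// copySliceLen = 512 and reduceSizePerBlock = ceil(ceil(3000 / 4) / 512) * 512 = 1024, so the
// prepare kernel above pads the reduce dimension by srcPad = 1024 * 4 - 3000 = 1096 elements,
// giving every block group a full multiple of copySliceLen to work on.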
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dst1dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc = typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
    refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
    refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
    void* ws_buf1_global    = static_cast<char*>(ws_global) + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
};
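The prepare/reduce kernel pair above relies on the fact that the descriptor's C++ type is fully determined by the compile-time dimension and transform parameters, so the object written into ws_global by gridwise_generic_reduce_1_prepare can be recovered in gridwise_generic_reduce_1 simply by casting the same bytes back to that type. Below is a minimal standalone sketch of this store-then-recover-by-type pattern; it uses a hypothetical trivially copyable PodDesc stand-in rather than the real descriptor, and memcpy in place of the in-kernel pointer cast.

#include <cstring>
#include <iostream>

struct PodDesc // hypothetical stand-in for the compile-time tensor descriptor
{
    int invariantLen;
    int toReduceLen;
};

int main()
{
    alignas(PodDesc) unsigned char workspace[2048] = {}; // stand-in for ws_global

    // "prepare" step: build the descriptor and store it at a fixed byte offset (0 here)
    const PodDesc built{64, 3000};
    std::memcpy(workspace, &built, sizeof(built));

    // "reduce" step: recover the object purely by knowing its type and byte offset
    PodDesc recovered{};
    std::memcpy(&recovered, workspace, sizeof(recovered));

    std::cout << recovered.invariantLen << " x " << recovered.toReduceLen << '\n';
    return 0;
}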
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp
0 → 100644