gaoqiong / composable_kernel · Commits

Commit bb1f8082
Authored May 26, 2022 by root

Merge remote-tracking branch 'origin/develop' into myamlak/cgemm

Parents: 97ac5007, 82d7d993
Changes: 177
Showing 20 changed files with 1301 additions and 2341 deletions (+1301 / -2341)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp (+7 / -9)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp (+3 / -4)
include/ck/tensor_operation/gpu/device/device_base.hpp (+2 / -0)
include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp (+92 / -51)
include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp (+51 / -0)
include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp (+586 / -0)
include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp (+7 / -3)
include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp (+6 / -0)
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp (+6 / -0)
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp (+2 / -4)
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp (+22 / -23)
include/ck/tensor_operation/gpu/device/device_reduce.hpp (+6 / -23)
include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp (+0 / -374)
include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp (+0 / -328)
include/ck/tensor_operation/gpu/device/device_reduce_common.hpp (+9 / -9)
include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp (+200 / -113)
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp (+0 / -440)
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp (+71 / -74)
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp (+231 / -0)
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp (+0 / -886)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp → include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp

-#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
-#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
+#pragma once

 #include "common_header.hpp"
 #include "tensor_adaptor.hpp"
-#include "threadwise_tensor_slice_transfer_v2.hpp"
+#include "threadwise_tensor_slice_transfer_v4r1.hpp"
-#include "threadwise_contraction_dlops.hpp"
+#include "threadwise_contraction_dl.hpp"

 namespace ck {

@@ -41,7 +39,7 @@ template <index_t BlockSize,
           typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
                                  BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
                              bool>::type = false>
-struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
+struct BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
 {
     using AIndex = MultiIndex<3>;
     using BIndex = MultiIndex<3>;

@@ -148,7 +146,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
         MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});

     public:
-    __device__ BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
+    __device__ BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
         : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
               get_thread_local_1d_id())},
           a_thread_copy_{

@@ -175,6 +173,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
                       "wrong!");

         // TODO: remove this restriction
-        static_assert(BM0 == 2, "wrong");
+        static_assert(BM0 == 2 && BN0 == 2, "wrong");
     }

@@ -226,7 +225,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
                            b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());

         constexpr auto threadwise_contraction =
-            ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<FloatA,
+            ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<FloatA,
                                                                                              FloatB,
                                                                                              FloatC,

@@ -407,4 +406,3 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
 };

 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp

@@ -75,14 +75,13 @@ struct BlockwiseTensorSliceTransfer_v5r1
         }
     }

-    template <typename SrcBuffer, typename SrcStepHacks>
-    __device__ void
-    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
+    template <typename SrcBuffer>
+    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
            get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks);
+            threadwise_transfer_.RunRead(src_desc, src_buf);
         }
     }
include/ck/tensor_operation/gpu/device/device_base.hpp

@@ -40,6 +40,8 @@ struct BaseOperator
     virtual bool IsSupportedArgument(const BaseArgument*) { return false; }

     virtual std::string GetTypeString() const { return ""; }

+    virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }
+
     virtual ~BaseOperator() {}
 };
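The added GetWorkSpaceSize virtual gives every device operator a uniform, zero-by-default workspace query. Below is a standalone sketch of that pattern only; the two struct names and the sizes are illustrative stand-ins, not part of the library.

// Sketch of the virtual-workspace-query pattern; BaseOperatorLike / SplitKLikeOp are
// placeholder names for this example and do not exist in composable_kernel.
#include <cstddef>
#include <iostream>

struct BaseOperatorLike
{
    virtual ~BaseOperatorLike() = default;
    // Mirrors the new BaseOperator default: no workspace unless a derived operator overrides.
    virtual std::size_t GetWorkSpaceSize() const { return 0; }
};

struct SplitKLikeOp : BaseOperatorLike
{
    // An operator that accumulates partial results might need K * C * Y * X floats of scratch.
    std::size_t GetWorkSpaceSize() const override { return 256u * 192u * 3u * 3u * sizeof(float); }
};

int main()
{
    SplitKLikeOp op;
    const BaseOperatorLike& base = op;
    // Callers can size a scratch allocation through the base interface via dynamic dispatch.
    std::cout << "workspace bytes: " << base.GetWorkSpaceSize() << "\n";
}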
include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp

@@ -15,91 +15,107 @@ template <typename ADataType,
           typename CDataType,
           typename ComputeDataType,
           typename ElementwiseFunctor,
-          index_t Dim,
-          index_t ScalarPerVector>
+          index_t NDim,
+          index_t MPerThread,
+          index_t AScalarPerVector,
+          index_t BScalarPerVector,
+          index_t CScalarPerVector>
 struct DeviceBinaryElementwise : public BaseOperator
 {
     static constexpr auto I0 = Number<0>{};

-    template <typename Desc_M0>
-    static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
+    template <typename Desc_M>
+    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
     {
-        const auto m0           = desc_m0.GetLength(I0);
-        const index_t loop_step = gridSize * blockSize * ScalarPerVector;
-        const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
-        const auto desc_m0_pad =
-            transform_tensor_descriptor(desc_m0,
-                                        make_tuple(make_right_pad_transform(m0, pad)),
+        const auto M            = desc_m.GetLength(I0);
+        const index_t loop_step = gridSize * blockSize * MPerThread;
+        const auto pad          = math::integer_least_multiple(M, loop_step) - M;
+        const auto desc_m_pad =
+            transform_tensor_descriptor(desc_m,
+                                        make_tuple(make_right_pad_transform(M, pad)),
                                         make_tuple(Sequence<0>{}),
                                         make_tuple(Sequence<0>{}));
-        return desc_m0_pad;
+        return desc_m_pad;
     }

-    static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
-                                  const std::vector<index_t>& stride,
+    static auto MakeDescriptor_M(const std::vector<index_t>& lengths,
+                                 const std::vector<index_t>& strides,
                                  index_t gridSize,
                                  index_t blockSize)
     {
-        auto tupleOfShape  = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
-        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
+        auto tupleOfShape  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NDim>{});
+        auto tupleOfStride = generate_tuple([&](auto I) { return strides[I]; }, Number<NDim>{});

         // nd desc - [s0, s1, s2, ...]
         const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);

         // merge nd to 1d desc - [s0 * s1 * ...]
-        if constexpr(Dim > 1)
+        if constexpr(NDim > 1)
        {
-            const auto desc_m0 = transform_tensor_descriptor(
+            const auto desc_m = transform_tensor_descriptor(
                desc,
                make_tuple(make_merge_transform(tupleOfShape)),
-                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
+                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NDim>{})),
                make_tuple(Sequence<0>{}));

-            return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
+            return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
        }
        else
-            return PadDescriptor_M0_1d(desc, gridSize, blockSize);
+            return PadDescriptor_M_1d(desc, gridSize, blockSize);
    }

-    using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
+    using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
+    using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
+    using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));

    using GridwiseBinEltwise = GridwiseBinaryElementwise_1D<ADataType,
                                                            BDataType,
                                                            CDataType,
                                                            ComputeDataType,
-                                                            GridDesc_M0,
+                                                            AGridDesc_M,
+                                                            BGridDesc_M,
+                                                            CGridDesc_M,
                                                            ElementwiseFunctor,
-                                                            ScalarPerVector>;
+                                                            MPerThread,
+                                                            AScalarPerVector,
+                                                            BScalarPerVector,
+                                                            CScalarPerVector>;

    struct Argument : public BaseArgument
    {
        Argument(const ADataType* p_a,
                 const BDataType* p_b,
                 CDataType* p_c,
-                 const std::vector<index_t>& shape,
-                 const std::vector<index_t>& stride_a,
-                 const std::vector<index_t>& stride_b,
-                 const std::vector<index_t>& stride_c,
+                 const std::vector<index_t>& lengths,
+                 const std::vector<index_t>& a_strides,
+                 const std::vector<index_t>& b_strides,
+                 const std::vector<index_t>& c_strides,
                 ElementwiseFunctor functor)
            : p_a_(p_a),
              p_b_(p_b),
              p_c_(p_c),
-              shape_(shape),
+              lengths_(lengths),
+              a_strides_(a_strides),
+              b_strides_(b_strides),
+              c_strides_(c_strides),
              functor_(functor),
              blockSize_(256),
              gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
        {
-            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
-            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
-            c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, blockSize_);
+            a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_);
+            b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_);
+            c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_);
        }

        const ADataType* p_a_;
        const BDataType* p_b_;
        CDataType* p_c_;
-        std::vector<int> shape_;
-        GridDesc_M0 a_grid_desc_m0_;
-        GridDesc_M0 b_grid_desc_m0_;
-        GridDesc_M0 c_grid_desc_m0_;
+        std::vector<int> lengths_;
+        AGridDesc_M a_grid_desc_m_;
+        BGridDesc_M b_grid_desc_m_;
+        CGridDesc_M c_grid_desc_m_;
+        std::vector<index_t> a_strides_;
+        std::vector<index_t> b_strides_;
+        std::vector<index_t> c_strides_;
        ElementwiseFunctor functor_;
        index_t blockSize_;
        index_t gridSize_;

@@ -113,7 +129,9 @@ struct DeviceBinaryElementwise : public BaseOperator
                                                  ADataType,
                                                  BDataType,
                                                  CDataType,
-                                                  GridDesc_M0,
+                                                  AGridDesc_M,
+                                                  BGridDesc_M,
+                                                  CGridDesc_M,
                                                  ElementwiseFunctor>;

            float elapsed_time = launch_and_time_kernel(stream_config,

@@ -124,9 +142,9 @@ struct DeviceBinaryElementwise : public BaseOperator
                                                        arg.p_a_,
                                                        arg.p_b_,
                                                        arg.p_c_,
-                                                        arg.a_grid_desc_m0_,
-                                                        arg.b_grid_desc_m0_,
-                                                        arg.c_grid_desc_m0_,
+                                                        arg.a_grid_desc_m_,
+                                                        arg.b_grid_desc_m_,
+                                                        arg.c_grid_desc_m_,
                                                        arg.functor_);
            return elapsed_time;
        }

@@ -146,7 +164,30 @@ struct DeviceBinaryElementwise : public BaseOperator
        if(pArg == nullptr)
            return false;

-        if(pArg->shape_.back() % ScalarPerVector != 0)
-            return false;
+        if(pArg->lengths_.size() != NDim)
+            return false;
+
+        if(pArg->lengths_.back() % MPerThread != 0)
+            return false;
+
+        auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
+            bool ret = true;
+
+            if(!isLastDimensionCoalesced)
+                ret = scalarPerVector == 1;
+            else
+                ret = MPerThread % scalarPerVector == 0;
+
+            return ret;
+        };
+
+        if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector))
+            return false;
+
+        if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector))
+            return false;
+
+        if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector))
+            return false;

        return true;

@@ -155,19 +196,19 @@ struct DeviceBinaryElementwise : public BaseOperator
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                      const void* p_b,
                                                      void* p_c,
-                                                      std::vector<index_t> shape,
-                                                      std::vector<index_t> stride_a,
-                                                      std::vector<index_t> stride_b,
-                                                      std::vector<index_t> stride_c,
+                                                      std::vector<index_t> lengths,
+                                                      std::vector<index_t> a_strides,
+                                                      std::vector<index_t> b_strides,
+                                                      std::vector<index_t> c_strides,
                                                      ElementwiseFunctor functor)
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
                                          static_cast<CDataType*>(p_c),
-                                          shape,
-                                          stride_a,
-                                          stride_b,
-                                          stride_c,
+                                          lengths,
+                                          a_strides,
+                                          b_strides,
+                                          c_strides,
                                          functor);
    }

@@ -180,7 +221,7 @@ struct DeviceBinaryElementwise : public BaseOperator
        // clang-format off
        str << "DeviceBinaryElementwise"
            << "<"
-            << " ScalarPerVector = " << ScalarPerVector
+            << " MPerThread = " << MPerThread
            << ">";
        // clang-format on
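The reworked IsSupportedArgument above validates the vector width of A, B and C separately: a tensor may only use vectorized access when its innermost stride is 1, and the chosen width must divide the per-thread tile MPerThread. A self-contained sketch of that rule follows; the names and the value of MPerThread are illustrative, not library code.

// Standalone mirror of the per-tensor vector-width check added to IsSupportedArgument.
#include <cassert>

constexpr int MPerThread = 8; // per-thread tile along the flattened M dimension (example value)

// Vector loads/stores are allowed only on a contiguous last dimension; otherwise fall back to scalar.
bool is_scalar_per_vector_valid(bool last_dim_coalesced, int scalar_per_vector)
{
    if(!last_dim_coalesced)
        return scalar_per_vector == 1;
    return MPerThread % scalar_per_vector == 0;
}

int main()
{
    assert(is_scalar_per_vector_valid(true, 4));   // stride 1, 8 % 4 == 0 -> vectorized access OK
    assert(!is_scalar_per_vector_valid(true, 3));  // 8 % 3 != 0 -> rejected
    assert(!is_scalar_per_vector_valid(false, 4)); // strided last dim -> only scalar access allowed
    assert(is_scalar_per_vector_valid(false, 1));
}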
include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp

@@ -1175,6 +1175,57 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
         return str.str();
     }

+    template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
+    static size_t GetWorkSpaceSize(const Argument& arg)
+    {
+        size_t WorkSpaceSize = 0;
+
+        if(arg.k_batch_ > 1)
+        {
+            if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
+            {
+                WorkSpaceSize =
+                    arg.Conv_K_ * arg.Conv_C_ * arg.filter_spatial_lengths_[0] * sizeof(float);
+            }
+        }
+
+        return WorkSpaceSize;
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
+    static size_t GetWorkSpaceSize(const Argument& arg)
+    {
+        size_t WorkSpaceSize = 0;
+
+        if(arg.k_batch_ > 1)
+        {
+            if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
+            {
+                WorkSpaceSize = arg.Conv_K_ * arg.Conv_C_ * arg.filter_spatial_lengths_[0] *
+                                arg.filter_spatial_lengths_[1] * sizeof(float);
+            }
+        }
+
+        return WorkSpaceSize;
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
+    static size_t GetWorkSpaceSize(const Argument& arg)
+    {
+        size_t WorkSpaceSize = 0;
+
+        if(arg.k_batch_ > 1)
+        {
+            if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
+            {
+                WorkSpaceSize = arg.Conv_K_ * arg.Conv_C_ * arg.filter_spatial_lengths_[0] *
+                                arg.filter_spatial_lengths_[1] * arg.filter_spatial_lengths_[2] *
+                                sizeof(float);
+            }
+        }
+
+        return WorkSpaceSize;
+    }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override final
+    {
+        return GetWorkSpaceSize<NumDimSpatial>(*dynamic_cast<const Argument*>(p_arg));
+    }
 };

 } // namespace device
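The GetWorkSpaceSize overloads added above only request scratch memory for split-K runs (k_batch_ > 1) with bhalf_t input, sized as K * C * (product of the filter spatial lengths) float32 elements. A worked example of that arithmetic with made-up sizes:

// Worked example of the workspace-size rule; all numeric values are illustrative only.
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t K = 256, C = 128;   // output / input channel counts (example)
    const std::size_t filter[2] = {3, 3}; // 2-D filter spatial lengths Y, X (example)
    const int k_batch = 4;                // split-K factor (example)
    const bool input_is_bf16 = true;

    std::size_t ws_bytes = 0;
    if(k_batch > 1 && input_is_bf16)
        ws_bytes = K * C * filter[0] * filter[1] * sizeof(float); // float accumulation buffer

    std::cout << ws_bytes << " bytes\n"; // 256 * 128 * 3 * 3 * 4 = 1,179,648 bytes
}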
include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp (new file, mode 0 → 100644)

(This diff is collapsed and not shown here.)
include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp

-#ifndef DEVICE_GEMM_XDL_HPP
-#define DEVICE_GEMM_XDL_HPP
+#pragma once

 #include <iostream>
 #include <sstream>

@@ -12,6 +11,7 @@
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdlops_v2r3.hpp"
 #include "gemm_specialization.hpp"
+#include "device_prop.hpp"

 namespace ck {
 namespace tensor_operation {

@@ -408,6 +408,11 @@ struct DeviceGemmXdl
     static bool IsSupportedArgument(const Argument& arg)
     {
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        {
+            return false;
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
                                            arg.b_grid_desc_k0_n_k1_,
                                            arg.c_grid_desc_m_n_,

@@ -515,4 +520,3 @@ struct DeviceGemmXdl
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp

@@ -9,6 +9,7 @@
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdl_cshuffle_v1.hpp"
 #include "tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "device_prop.hpp"

 namespace ck {
 namespace tensor_operation {

@@ -558,6 +559,11 @@ struct DeviceGemm_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        {
+            return false;
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
                                            arg.b_grid_desc_bk0_n_bk1_,
                                            arg.c_grid_desc_m_n_,
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp

@@ -12,6 +12,7 @@
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdlops_v2r4.hpp"
 #include "gemm_specialization.hpp"
+#include "device_prop.hpp"

 #ifndef CK_RUN_KERNEL_AND_TIME
 #define CK_RUN_KERNEL_AND_TIME 1

@@ -528,6 +529,11 @@ struct DeviceGemmXdlSplitK
     static bool IsSupportedArgument(const Argument& arg)
     {
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        {
+            return false;
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                            arg.b_grid_desc_kbatch_k0_n_k1_,
                                            arg.c_grid_desc_m_n_,
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp

@@ -346,7 +346,6 @@ struct DeviceGroupedGemmXdl
             return block_2_ctile_map_.CheckValidity(c_grid_desc_m_n);
         }

-        private:
         typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
         ck::index_t BlockStart_;
     };

@@ -418,9 +417,8 @@ struct DeviceGroupedGemmXdl
                 DeviceGroupedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC);

             const index_t grid_size_grp =
-                typename GroupedGemmBlock2CTileMap::UnderlyingBlock2CTileMap(
-                    c_grid_desc_m_n_, M01, N01, 0)
-                    .CalculateGridSize(c_grid_desc_m_n_);
+                GroupedGemmBlock2CTileMap(c_grid_desc_m_n_, M01, N01)
+                    .block_2_ctile_map_.CalculateGridSize(c_grid_desc_m_n_);

             const index_t BlockStart = grid_size_;
             const index_t BlockEnd   = grid_size_ + grid_size_grp;
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp

@@ -17,7 +17,7 @@ template <typename InDataType,
           typename OutDataType,
           typename AccDataType,
           ck::ReduceTensorOp ReduceOpId,
-          bool NeedIndices,
+          bool OuputIndex,
           ck::index_t BlockSize,
           ck::index_t ReduceMThreadClusterSize,
           ck::index_t ReduceKThreadClusterSize,

@@ -44,8 +44,6 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
         typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
             AccElementwiseOperation;

-    static constexpr bool BetaIsZero = true;
-
     static constexpr index_t InSrcOutDstVectorDim =
         0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is
            // not reduced.

@@ -206,7 +204,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
                                                                          OutDataType,
                                                                          AccDataType,
                                                                          IndexDataType,

@@ -215,11 +214,9 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
                                                                          ReduceOperation,
                                                                          InElementwiseOperation,
                                                                          AccElementwiseOperation,
+                                                                         InMemoryDataOperationEnum::Set,
                                                                          false, // propagate_nan
-                                                                         BetaIsZero,
                                                                          BlockSize,
-                                                                         ReduceMThreadClusterSize,
-                                                                         ReduceKThreadClusterSize,
                                                                          ReduceMThreadSliceSize,
                                                                          ReduceKThreadSliceSize,
                                                                          InSrcOutDstVectorDim,

@@ -227,7 +224,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
                                                                          InSrcOutDstVectorSize>;

             const auto kernel = kernel_reduce_threadwise<gridwise_reduce,
-                                                         NeedIndices,
+                                                         OuputIndex,
+                                                         false, // don't have index input
                                                          InDataType,
                                                          OutDataType,
                                                          AccDataType,

@@ -252,6 +250,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
                                           arg.acc_element_op_,
                                           float(1),
                                           arg.p_in_dev_,
+                                          nullptr,
                                           float(0),
                                           arg.p_out_dev_,
                                           arg.p_out_indices_dev_);
include/ck/tensor_operation/gpu/device/device_reduce.hpp

@@ -16,35 +16,18 @@ namespace device {
 template <typename InElementwiseOperation, typename AccElementwiseOperation>
 struct DeviceReduce : public BaseOperator
 {
-    virtual long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
-                                                 const std::vector<int> reduceDims)
-    {
-        (void)inLengths;
-        (void)reduceDims;
-        return (0);
-    };
-
-    virtual bool HasFurtherCall() { return (false); };
-
-    virtual std::vector<int> GetWorkspace2dLengths(const BaseArgument* argPtr)
-    {
-        (void)argPtr;
-        return (std::vector<int>{0, 0});
-    };
-
     virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int> inLengths,
-                        const std::vector<int> inStrides,
-                        const std::vector<int> outLengths,
-                        const std::vector<int> outStrides,
+    MakeArgumentPointer(const std::vector<index_t> inLengths,
+                        const std::vector<index_t> inStrides,
+                        const std::vector<index_t> outLengths,
+                        const std::vector<index_t> outStrides,
                         const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
+                        const void* in_index_dev,
                         void* out_dev,
-                        void* out_indices_dev,
-                        void* workspace_dev,
+                        void* out_index_dev,
                         const InElementwiseOperation in_elementwise_op,
                         const AccElementwiseOperation acc_elementwise_op) = 0;
include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp (deleted, 100644 → 0; removed contents below)

#ifndef DEVICE_REDUCE_BLOCKWISE_HPP
#define DEVICE_REDUCE_BLOCKWISE_HPP

#include <iostream>
#include <sstream>

#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_blockwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim,
          typename ReduceOperation, typename InElementwiseOperation, typename AccElementwiseOperation,
          bool PropagateNan, bool NeedIndices,
          index_t BlockSize,
          index_t MThreadClusterSize, index_t KThreadClusterSize,
          index_t MThreadSliceSize, index_t KThreadSliceSize,
          index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize>
struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");

    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    using IndexDataType = int32_t;

    static constexpr bool BetaIsZero = NeedIndices;

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
    static constexpr index_t numSrcDim       = Rank;
    static constexpr index_t numDstDim       = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
    static constexpr bool reduceAllDim       = (NumInvariantDim == 0);

    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths, const std::vector<int>& inStrides)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDim)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
                using ReduceDims    = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

                const auto reduceDimLengths    = make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths = make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths), make_merge_transform(reduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const auto inPad_M = math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K = math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeDst1dDescriptor(const std::vector<int>& outLengths, const std::vector<int>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
        const auto inPad = math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(invariantLength, inPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));

        return (out_grid_desc_m_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<int> inLengths,
                 const std::vector<int> inStrides,
                 const std::vector<int> outLengths,
                 const std::vector<int> outStrides,
                 const std::vector<int> reduceDims,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_indices_dev,
                 AccDataType* workspace_dev,
                 const InElementwiseOperation in_elementwise_op,
                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
              out_dev_{out_dev},
              out_indices_dev_{out_indices_dev},
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
            (void)workspace_dev;

            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(inLengths_);

            if constexpr(NumInvariantDim == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths_[NumInvariantDim - 1];

            reduce_lowest_length = inLengths_[Rank - 1];

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m  = DeviceReduceBlockWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);

            using GridwiseReduce =
                GridwiseReduction_mk_to_m_blockwise<InDataType, OutDataType, AccDataType, IndexDataType,
                                                    InGridDesc_M_K, OutGridDesc_M,
                                                    ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
                                                    PropagateNan, BetaIsZero,
                                                    BlockSize, MThreadClusterSize, KThreadClusterSize,
                                                    MThreadSliceSize, KThreadSliceSize,
                                                    InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_reduce_blockwise<GridwiseReduce, NeedIndices,
                                                        InDataType, OutDataType, AccDataType, IndexDataType,
                                                        InGridDesc_M_K, OutGridDesc_M,
                                                        InElementwiseOperation, AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(stream_config, kernel,
                                              dim3(arg.gridSize), dim3(BlockSize), 0,
                                              in_grid_desc_m_k, out_grid_desc_m,
                                              arg.in_elementwise_op_, arg.acc_elementwise_op_,
                                              arg.alpha_, arg.in_dev_, arg.beta_, arg.out_dev_,
                                              nullptr, arg.out_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg, const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return (false);
            }
            else
            {
                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
                    return (false);

                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                    return (false);
            };
        }
        else
        {
            if(pArg->inStrides_[Rank - 1] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        // cases with very small reduce_total_length should be handled by the ThreadWise method
        if(pArg->reduce_total_length / KThreadSliceSize < 2)
            return (false);

        return (true);
    };

    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<int> inLengths,
                                                      const std::vector<int> inStrides,
                                                      const std::vector<int> outLengths,
                                                      const std::vector<int> outStrides,
                                                      const std::vector<int> reduceDims,
                                                      float alpha,
                                                      float beta,
                                                      const void* in_dev,
                                                      void* out_dev,
                                                      void* out_indices_dev,
                                                      void* workspace_dev,
                                                      const InElementwiseOperation in_elementwise_op,
                                                      const AccElementwiseOperation acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths, inStrides, outLengths, outStrides, reduceDims,
                                          alpha, beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op, acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override { return std::make_unique<Invoker>(); };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceBlockWise<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize
            << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
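For reference, the deleted operator sized its launch grid by rounding the invariant (non-reduced) extent up to whole M-side block tiles, and the surviving reduction paths use the same rule. A standalone sketch with illustrative numbers (integer_least_multiple is re-implemented locally here, not taken from the library):

// Sketch of gridSize = integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize.
#include <cstddef>
#include <iostream>

std::size_t integer_least_multiple(std::size_t x, std::size_t multiple)
{
    return ((x + multiple - 1) / multiple) * multiple; // smallest multiple of `multiple` that is >= x
}

int main()
{
    const std::size_t MThreadClusterSize = 32, MThreadSliceSize = 4;            // example config
    const std::size_t M_BlockTileSize    = MThreadClusterSize * MThreadSliceSize; // 128
    const std::size_t invariant_total_length = 1000;                             // example extent

    const std::size_t gridSize =
        integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize;

    std::cout << gridSize << "\n"; // 1024 / 128 = 8 blocks along M
}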
include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp (deleted, 100644 → 0; removed contents below)

#ifndef DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP
#define DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP

#include <iostream>
#include <sstream>

#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_blockwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim,
          typename ReduceOperation, typename InElementwiseOperation, typename AccElementwiseOperation,
          bool PropagateNan, bool NeedIndices,
          index_t BlockSize,
          index_t MThreadClusterSize, index_t KThreadClusterSize,
          index_t MThreadSliceSize, index_t KThreadSliceSize,
          index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize>
struct DeviceReduceBlockWiseSecondCall
    : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");

    static_assert((InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    using IndexDataType = int32_t;

    static constexpr bool BetaIsZero = NeedIndices;

    static_assert(std::is_same<InDataType, AccDataType>::value,
                  "InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!");

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
    static constexpr index_t numDstDim       = (NumInvariantDim == 0) ? 1 : NumInvariantDim;

    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths, const std::vector<int>& inStrides)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<2>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<2>{});

        const auto in_grid_desc_m_k = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const auto inPad_M = math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K = math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeDst1dDescriptor(const std::vector<int>& outLengths, const std::vector<int>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
        const auto outPad = math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(invariantLength, outPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));

        return (out_grid_desc_m_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<int>& inLengths,
                 const std::vector<int>& inStrides,
                 const std::vector<int>& outLengths,
                 const std::vector<int>& outStrides,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_indices_dev,
                 AccDataType* workspace_dev,
                 const InElementwiseOperation& in_elementwise_op,
                 const AccElementwiseOperation& acc_elementwise_op)
            : inLengths_(inLengths),
              inStrides_(inStrides),
              outLengths_(outLengths),
              outStrides_(outStrides),
              in_dev_{in_dev},
              out_dev_{out_dev},
              out_indices_dev_{out_indices_dev},
              in_elementwise_op_(in_elementwise_op),
              acc_elementwise_op_(acc_elementwise_op)
        {
            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            invariant_total_length = inLengths[0];
            reduce_total_length    = inLengths[1];

            invariant_lowest_length = inLengths[0];
            reduce_lowest_length    = inLengths[1];

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize;

            size_t ws_buf2_bytes_offset = math::integer_least_multiple(
                invariant_total_length * reduce_total_length * sizeof(AccDataType), 64);

            if constexpr(NeedIndices)
                workspace_indices_dev_ = reinterpret_cast<index_t*>(
                    reinterpret_cast<char*>(workspace_dev) + ws_buf2_bytes_offset);
            else
                workspace_indices_dev_ = nullptr;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;
        IndexDataType* workspace_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k =
                DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m =
                DeviceReduceBlockWiseSecondCall::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);

            using GridwiseReduce =
                GridwiseReduction_mk_to_m_blockwise<InDataType, OutDataType, AccDataType, IndexDataType,
                                                    InGridDesc_M_K, OutGridDesc_M,
                                                    ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
                                                    PropagateNan, BetaIsZero,
                                                    BlockSize, MThreadClusterSize, KThreadClusterSize,
                                                    MThreadSliceSize, KThreadSliceSize,
                                                    InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_reduce_blockwise_second_call<GridwiseReduce, NeedIndices,
                                                                    InDataType, OutDataType, AccDataType,
                                                                    IndexDataType, InGridDesc_M_K, OutGridDesc_M,
                                                                    InElementwiseOperation, AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(stream_config, kernel,
                                              dim3(arg.gridSize), dim3(BlockSize), 0,
                                              in_grid_desc_m_k, out_grid_desc_m,
                                              arg.in_elementwise_op_, arg.acc_elementwise_op_,
                                              arg.alpha_, arg.in_dev_, arg.beta_, arg.out_dev_,
                                              arg.workspace_indices_dev_, arg.out_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg, const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
            return (false);

        if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
            return (false);

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        // cases with very small reduce_total_length should be handled by the ThreadWise method
        if(pArg->reduce_total_length / KThreadSliceSize < 2)
            return (false);

        return (true);
    };

    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<int> inLengths,
                                                      const std::vector<int> inStrides,
                                                      const std::vector<int> outLengths,
                                                      const std::vector<int> outStrides,
                                                      const std::vector<int> reduceDims,
                                                      float alpha,
                                                      float beta,
                                                      const void* in_dev,
                                                      void* out_dev,
                                                      void* out_indices_dev,
                                                      void* workspace_dev,
                                                      const InElementwiseOperation in_elementwise_op,
                                                      const AccElementwiseOperation acc_elementwise_op) override
    {
        (void)reduceDims;

        return std::make_unique<Argument>(inLengths, inStrides, outLengths, outStrides,
                                          alpha, beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op, acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override { return std::make_unique<Invoker>(); };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceBlockWiseSecondCall<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize
            << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_reduce_common.hpp

@@ -14,13 +14,13 @@ namespace device {
 // here, inLengths[] is already shuffled so that lengths of invariant dims are included before those
 // of reduce dims
-template <int Rank, int NumReduceDim>
-std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
+template <index_t Rank, int NumReduceDim>
+std::pair<long_index_t, long_index_t> get_2d_lengths(const std::vector<index_t>& inLengths)
 {
     static_assert(Rank <= 6, "bigger Rank size not supported!");

-    size_t invariant_total_length = 1;
-    size_t reduce_total_length    = 1;
+    long_index_t invariant_total_length = 1;
+    long_index_t reduce_total_length    = 1;

     constexpr int NumInvariantDim = Rank - NumReduceDim;

@@ -35,13 +35,13 @@ std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
 // helper functions using variadic template arguments
 template <index_t... Ns>
-auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
+auto make_tuple_from_array_and_index_seq(const std::vector<index_t>& lengths, Sequence<Ns...>)
 {
     return make_tuple(static_cast<index_t>(lengths[Ns])...);
 };

 template <index_t arraySize>
-static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arraySize>)
+auto make_tuple_from_array(const std::vector<index_t>& lengths, Number<arraySize>)
 {
     static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

@@ -51,10 +51,10 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arrayS
 };

 template <index_t Rank, index_t NumReduceDim>
-std::vector<int> shuffle_tensor_dimensions(const std::vector<int>& origLengthsStrides,
-                                           const std::vector<int>& reduceDims)
+std::vector<index_t> shuffle_tensor_dimensions(const std::vector<index_t>& origLengthsStrides,
+                                               const std::vector<int>& reduceDims)
 {
-    std::vector<int> newLengthsStrides;
+    std::vector<index_t> newLengthsStrides;

     assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size());
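get_2d_lengths collapses an already-shuffled length vector into the two products that drive the 2-D reduction: the flattened invariant extent (M) and the flattened reduce extent (K). The sketch below is an illustrative re-implementation of that computation for clarity, not the library function itself.

// Sketch: after shuffle_tensor_dimensions moves invariant dims in front of reduced dims,
// the 2-D view is simply (product of invariant lengths) x (product of reduce lengths).
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

template <int Rank, int NumReduceDim>
std::pair<int64_t, int64_t> get_2d_lengths_sketch(const std::vector<int64_t>& inLengths)
{
    constexpr int NumInvariantDim = Rank - NumReduceDim;

    int64_t invariant_total_length = 1;
    int64_t reduce_total_length    = 1;

    for(int i = 0; i < NumInvariantDim; ++i)
        invariant_total_length *= inLengths[i];
    for(int i = NumInvariantDim; i < Rank; ++i)
        reduce_total_length *= inLengths[i];

    return {invariant_total_length, reduce_total_length};
}

int main()
{
    // Rank-4 tensor reduced over its last two (already shuffled) dimensions.
    auto mk = get_2d_lengths_sketch<4, 2>({8, 16, 32, 64});
    std::cout << "M = " << mk.first << ", K = " << mk.second << "\n"; // M = 128, K = 2048
}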
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp → include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp

(This diff is collapsed and not shown here.)
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
deleted
100644 → 0
View file @
97ac5007
#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP

#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          index_t Rank,
          index_t NumReduceDim,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan,
          bool NeedIndices,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceReduceMultiBlockPartialReduce
    : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");

    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");

    static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                      (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");

    using IndexDataType = int32_t;

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

    static constexpr index_t numSrcDim = Rank;
    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
    static constexpr bool reduceAllDim = (NumInvariantDim == 0);

    static constexpr int M_BlockTileSize   = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize   = KThreadClusterSize * KThreadSliceSize;
    static constexpr int MaxBlockGroupSize = 256;

    long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
                                         const std::vector<int> reduceDims) override
    {
        size_t invariant_total_length;
        size_t reduce_total_length;

        auto inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);

        std::tie(invariant_total_length, reduce_total_length) =
            get_2d_lengths<Rank, NumReduceDim>(inLengths_);

        int iterations = 1;
        while(true)
        {
            int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                   (K_BlockTileSize * iterations);

            if(testBlkGroupSize <= MaxBlockGroupSize)
                break;

            iterations++;
        };

        int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                           (K_BlockTileSize * iterations);

        long_index_t workspace_size = invariant_total_length * blkGroupSize;

        long_index_t wsSizeInBytes =
            !NeedIndices ? workspace_size * sizeof(AccDataType)
                         : workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int);

        return (wsSizeInBytes);
    };

    bool HasFurtherCall() override { return (true); };

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                    const std::vector<int>& inStrides,
                                    int blkGroupSize,
                                    int kBlockTileIterations)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDim)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

                const auto reduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(reduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
        const auto inPad_M =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeWorkspace2dDescriptor(int invariantLength, int blkGroupSize)
    {
        auto ws_desc_m_k =
            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));

        const auto wsPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto ws_desc_m_k_padded = transform_tensor_descriptor(
            ws_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, wsPad),
                       make_pass_through_transform(blkGroupSize)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (ws_desc_m_k_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<int> inLengths,
                 const std::vector<int> inStrides,
                 const std::vector<int> outLengths,
                 const std::vector<int> outStrides,
                 const std::vector<int> reduceDims,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_indices_dev,
                 AccDataType* workspace_dev,
                 const InElementwiseOperation in_elementwise_op,
                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
              out_dev_{out_dev},
              out_indices_dev_{out_indices_dev},
              workspace_dev_{workspace_dev},
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(inLengths_);

            if constexpr(NumInvariantDim == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths_[NumInvariantDim - 1];

            reduce_lowest_length = inLengths_[Rank - 1];

            int iterations = 1;
            while(true)
            {
                int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                       (K_BlockTileSize * iterations);

                if(testBlkGroupSize <= MaxBlockGroupSize)
                    break;

                iterations++;
            };

            blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                           (K_BlockTileSize * iterations);

            kBlockTileIterations = iterations;

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize * blkGroupSize;

            size_t ws_buf2_bytes_offset = math::integer_least_multiple(
                invariant_total_length * blkGroupSize * sizeof(AccDataType), 64);

            if constexpr(NeedIndices)
                workspace_indices_dev_ = reinterpret_cast<int*>(
                    reinterpret_cast<char*>(workspace_dev_) + ws_buf2_bytes_offset);
            else
                workspace_indices_dev_ = nullptr;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;
        AccDataType* workspace_dev_;
        IndexDataType* workspace_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        index_t blkGroupSize;
        index_t kBlockTileIterations;
        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
            const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor(
                arg.invariant_total_length, arg.blkGroupSize);

            using InGridDesc_M_K    = decltype(in_grid_desc_m_k);
            using WorkspaceDesc_M_K = decltype(ws_desc_m_k);

            using GridwiseReduce =
                GridwiseReduction_mk_to_mk_multiblock_partial_reduce<InDataType,
                                                                     AccDataType,
                                                                     IndexDataType,
                                                                     InGridDesc_M_K,
                                                                     WorkspaceDesc_M_K,
                                                                     ReduceOperation,
                                                                     InElementwiseOperation,
                                                                     AccElementwiseOperation,
                                                                     PropagateNan,
                                                                     BlockSize,
                                                                     MThreadClusterSize,
                                                                     KThreadClusterSize,
                                                                     MThreadSliceSize,
                                                                     KThreadSliceSize,
                                                                     InSrcVectorDim,
                                                                     InSrcVectorSize,
                                                                     OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_partial_reduce_multiblock<GridwiseReduce,
                                                                 NeedIndices,
                                                                 InDataType,
                                                                 AccDataType,
                                                                 IndexDataType,
                                                                 InGridDesc_M_K,
                                                                 WorkspaceDesc_M_K,
                                                                 InElementwiseOperation,
                                                                 AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(stream_config,
                                              kernel,
                                              dim3(arg.gridSize),
                                              dim3(BlockSize),
                                              0,
                                              in_grid_desc_m_k,
                                              ws_desc_m_k,
                                              arg.in_elementwise_op_,
                                              arg.acc_elementwise_op_,
                                              arg.blkGroupSize,
                                              arg.kBlockTileIterations,
                                              arg.in_dev_,
                                              arg.workspace_dev_,
                                              arg.workspace_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(OutDstVectorSize != 1)
            return (false);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return (false);
            }
            else
            {
                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
                    return (false);

                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                    return (false);
            };
        }
        else
        {
            if(pArg->inStrides_[Rank - 1] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // cases with small reduce_total_length should be handled by the BlockWise method
        if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
            return (false);

        return (true);
    };

    std::vector<int> GetWorkspace2dLengths(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        return (std::vector<int>{static_cast<int>(pArg->invariant_total_length),
                                 pArg->blkGroupSize});
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<int> inLengths,
                        const std::vector<int> inStrides,
                        const std::vector<int> outLengths,
                        const std::vector<int> outStrides,
                        const std::vector<int> reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,
                        void* out_dev,
                        void* out_indices_dev,
                        void* workspace_dev,
                        const InElementwiseOperation in_elementwise_op,
                        const AccElementwiseOperation acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          outLengths,
                                          outStrides,
                                          reduceDims,
                                          alpha,
                                          beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op,
                                          acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
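For readers tracking what this deletion removes: DeviceReduceMultiBlockPartialReduce never wrote the final result itself. HasFurtherCall() returns true, GetWorkspaceSizeInBytes sizes a per-block partial-result buffer (plus an index buffer when NeedIndices is set), and Invoker::Run launches kernel_partial_reduce_multiblock to fill that workspace, leaving a second reduction pass to finish the job. A rough caller-side sketch of that two-pass contract, using only the interface shown above; the buffer allocation and the second pass are placeholders, not code from this commit:

// Illustrative only; reduce_op is an instance of the (now deleted) DeviceReduceMultiBlockPartialReduce.
auto ws_bytes  = reduce_op.GetWorkspaceSizeInBytes(inLengths, reduceDims); // partial results (+ indices)
void* workspace = my_allocate_device_buffer(ws_bytes);                     // placeholder allocator
auto arg = reduce_op.MakeArgumentPointer(inLengths, inStrides, outLengths, outStrides, reduceDims,
                                         alpha, beta, in_dev, out_dev, out_indices_dev, workspace,
                                         in_elementwise_op, acc_elementwise_op);
auto invoker = reduce_op.MakeInvokerPointer();
invoker->Run(arg.get()); // pass 1: each block group writes one partial value per output element
// pass 2 (a separate "second call" device op in the old design) still has to reduce the
// [invariant_total_length, blkGroupSize] workspace reported by GetWorkspace2dLengths(arg.get()).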
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
...
@@ -6,6 +6,7 @@
 #include "device.hpp"
 #include "device_reduce.hpp"
 #include "device_reduce_common.hpp"
+#include "gridwise_2d_reduction_multiblock.hpp"
 #include "gridwise_2d_reduction_threadwise.hpp"

 namespace ck {
...
@@ -19,22 +20,19 @@ template <typename InDataType,
           index_t NumReduceDim,
           typename ReduceOperation,
           typename InElementwiseOperation,
-          typename OutElementwiseOperation,
+          typename AccElementwiseOperation,
           bool PropagateNan,
-          bool NeedIndices,
+          bool OutputIndex,
+          bool HaveIndexInputIfOutputIndex,
           index_t BlockSize,
-          index_t MThreadClusterSize,
-          index_t KThreadClusterSize,
           index_t MThreadSliceSize,
           index_t KThreadSliceSize,
           index_t InSrcVectorDim,
           index_t InSrcVectorSize,
           index_t OutDstVectorSize>
-struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutElementwiseOperation>
+struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
 {
     static_assert(Rank <= 6, "Bigger Rank size is not supported!");
-    static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1),
-                  "Threadwise can only be called with KThreadClusterSize be 1 !");
     static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                    (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
...
@@ -43,7 +41,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
     using IndexDataType = int32_t;

-    static constexpr bool BetaIsZero = NeedIndices;
+    static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex;

     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
...
@@ -51,11 +49,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
     static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
     static constexpr bool reduceAllDim = (NumInvariantDim == 0);

-    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
-    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+    static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize;

-    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
-                                    const std::vector<int>& inStrides)
+    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
+                                    const std::vector<index_t>& inStrides)
     {
         const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
         const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
...
@@ -114,8 +112,8 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
         return (in_grid_desc_m_k_padded);
     };

-    static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
-                                    const std::vector<int>& outStrides)
+    static auto MakeDst1dDescriptor(const std::vector<index_t>& outLengths,
+                                    const std::vector<index_t>& outStrides)
     {
         const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
         const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
...
@@ -143,30 +141,26 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int> inLengths,
-                 const std::vector<int> inStrides,
-                 const std::vector<int> outLengths,
-                 const std::vector<int> outStrides,
+        Argument(const std::vector<index_t> inLengths,
+                 const std::vector<index_t> inStrides,
+                 const std::vector<index_t> outLengths,
+                 const std::vector<index_t> outStrides,
                  const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
-                 IndexDataType* out_indices_dev,
-                 AccDataType* workspace_dev,
+                 IndexDataType* out_index_dev,
                  const InElementwiseOperation in_elementwise_op,
-                 const OutElementwiseOperation acc_elementwise_op)
+                 const AccElementwiseOperation acc_elementwise_op)
             : outLengths_{outLengths},
               outStrides_{outStrides},
               in_dev_{in_dev},
               out_dev_{out_dev},
-              out_indices_dev_{out_indices_dev},
+              out_index_dev_{out_index_dev},
               in_elementwise_op_{in_elementwise_op},
               acc_elementwise_op_{acc_elementwise_op}
         {
-            (void)workspace_dev;
             inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
             inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
...
@@ -183,30 +177,33 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
             reduce_lowest_length = inLengths_[Rank - 1];

+            numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
+
             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;
         }

-        std::vector<int> inLengths_;
-        std::vector<int> inStrides_;
-        std::vector<int> outLengths_;
-        std::vector<int> outStrides_;
+        std::vector<index_t> inLengths_;
+        std::vector<index_t> inStrides_;
+        std::vector<index_t> outLengths_;
+        std::vector<index_t> outStrides_;

         AccDataType alpha_;
         AccDataType beta_;

         const InDataType* in_dev_;
         OutDataType* out_dev_;
-        IndexDataType* out_indices_dev_;
+        IndexDataType* out_index_dev_;

         InElementwiseOperation in_elementwise_op_;
-        OutElementwiseOperation acc_elementwise_op_;
+        AccElementwiseOperation acc_elementwise_op_;

-        int invariant_lowest_length;
-        int reduce_lowest_length;
-        size_t invariant_total_length;
-        size_t reduce_total_length;
+        index_t invariant_lowest_length;
+        index_t reduce_lowest_length;
+        long_index_t invariant_total_length;
+        long_index_t reduce_total_length;

+        int numBlockTileIteration;
         size_t gridSize;
     };
...
@@ -221,7 +218,10 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
             using InGridDesc_M_K = decltype(in_grid_desc_m_k);
             using OutGridDesc_M  = decltype(out_grid_desc_m);

+            float avg_time = 0;
+
             using GridwiseReduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
                                                                         OutDataType,
                                                                         AccDataType,
                                                                         IndexDataType,
...
@@ -229,22 +229,19 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                                                         OutGridDesc_M,
                                                                         ReduceOperation,
                                                                         InElementwiseOperation,
-                                                                        OutElementwiseOperation,
+                                                                        AccElementwiseOperation,
+                                                                        InMemoryDataOperationEnum::Set,
                                                                         PropagateNan,
-                                                                        BetaIsZero,
                                                                         BlockSize,
-                                                                        MThreadClusterSize,
-                                                                        KThreadClusterSize,
                                                                         MThreadSliceSize,
                                                                         KThreadSliceSize,
                                                                         InSrcVectorDim,
                                                                         InSrcVectorSize,
                                                                         OutDstVectorSize>;

-            float avg_time = 0;

             const auto kernel = kernel_reduce_threadwise<GridwiseReduce,
-                                                         NeedIndices,
+                                                         OutputIndex,
+                                                         HaveIndexInput,
                                                          InDataType,
                                                          OutDataType,
                                                          AccDataType,
...
@@ -252,7 +249,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                                          InGridDesc_M_K,
                                                          OutGridDesc_M,
                                                          InElementwiseOperation,
-                                                         OutElementwiseOperation>;
+                                                         AccElementwiseOperation>;

             avg_time = launch_and_time_kernel(stream_config,
                                               kernel,
...
@@ -265,9 +262,10 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                               arg.acc_elementwise_op_,
                                               arg.alpha_,
                                               arg.in_dev_,
+                                              nullptr,
                                               arg.beta_,
                                               arg.out_dev_,
-                                              arg.out_indices_dev_);
+                                              arg.out_index_dev_);

             return (avg_time);
         };
...
@@ -276,7 +274,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };

     bool IsSupportedArgument(const BaseArgument* p_arg) override
...
@@ -311,9 +309,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
         if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
             return (false);

-        // cases with big reduce_total_length should be handled by Blockwise kernel
-        // case for bigger reduce_total_length size, we are supposed to use BlockWise method for
-        // better performance
+        // TODO: remove this. Should return true, as long as this DeviceOP instance support this
         if(pArg->reduce_total_length / KThreadSliceSize >= 32)
             return (false);
...
@@ -321,20 +317,22 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
     };

     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int> inLengths,
-                        const std::vector<int> inStrides,
-                        const std::vector<int> outLengths,
-                        const std::vector<int> outStrides,
+    MakeArgumentPointer(const std::vector<index_t> inLengths,
+                        const std::vector<index_t> inStrides,
+                        const std::vector<index_t> outLengths,
+                        const std::vector<index_t> outStrides,
                         const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
+                        const void* in_index_dev,
                         void* out_dev,
-                        void* out_indices_dev,
-                        void* workspace_dev,
+                        void* out_index_dev,
                         const InElementwiseOperation in_elementwise_op,
-                        const OutElementwiseOperation acc_elementwise_op) override
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
+        (void)in_index_dev;
+
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
                                           outLengths,
...
@@ -344,8 +342,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                           beta,
                                           static_cast<const InDataType*>(in_dev),
                                           static_cast<OutDataType*>(out_dev),
-                                          static_cast<IndexDataType*>(out_indices_dev),
-                                          static_cast<AccDataType*>(workspace_dev),
+                                          static_cast<IndexDataType*>(out_index_dev),
                                           in_elementwise_op,
                                           acc_elementwise_op);
     };
...
@@ -360,9 +357,9 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
         auto str = std::stringstream();

         // clang-format off
-        str << "DeviceReducceThreadWise<" << BlockSize << ",";
-        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
-        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
+        str << "DeviceReduceThreadWise<" << BlockSize << ",";
+        str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ",";
+        str << "K_C" << 1 << "_S" << KThreadSliceSize << ",";
         str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
         // clang-format on
...
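The net effect of this diff on users of DeviceReduceThreadWise is a reshuffled template parameter list: the cluster-size parameters are gone (the tile is BlockSize x 1 by construction), the second elementwise op is now named for the accumulator, and the single NeedIndices flag becomes OutputIndex plus HaveIndexInputIfOutputIndex. A hypothetical instantiation under the new list; the data types and operation types here are placeholders, only the parameter order reflects the diff above:

// Placeholder ops for illustration; ReduceMaxOp / PassThroughOp are not names from this commit.
using DeviceArgMaxSketch = ck::tensor_operation::device::DeviceReduceThreadWise<
    ck::half_t,    // InDataType
    float,         // AccDataType
    ck::half_t,    // OutDataType
    4,             // Rank
    1,             // NumReduceDim
    ReduceMaxOp,   // ReduceOperation             (placeholder)
    PassThroughOp, // InElementwiseOperation      (placeholder)
    PassThroughOp, // AccElementwiseOperation     (was OutElementwiseOperation)
    false,         // PropagateNan
    true,          // OutputIndex                 (was NeedIndices)
    false,         // HaveIndexInputIfOutputIndex (new)
    256,           // BlockSize (M_BlockTileSize = BlockSize * MThreadSliceSize)
    8,             // MThreadSliceSize
    1,             // KThreadSliceSize
    1,             // InSrcVectorDim
    1,             // InSrcVectorSize
    1>;            // OutDstVectorSize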
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp deleted 100644 → 0